Skip to content

Commit 1302740

Browse files
authored
fix(parser/html): regex literals in frontmatter (#9531)
1 parent 331dc0d commit 1302740

4 files changed

Lines changed: 262 additions & 18 deletions

File tree

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@biomejs/biome": patch
3+
---
4+
5+
Fixed [#9187](https://github.com/biomejs/biome/issues/9187): Astro frontmatter containing regex literals with quotes (`/'/`, `/"/`) or dashes (`/---/`) no longer causes parse errors.

crates/biome_html_parser/src/lexer/mod.rs

Lines changed: 164 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1555,9 +1555,10 @@ impl<'src> LexerWithCheckpoint<'src> for HtmlLexer<'src> {
15551555
}
15561556
}
15571557

1558-
/// Tracks whether the lexer is currently inside an open string literal while
1559-
/// scanning Astro frontmatter. Used to determine whether a `---` sequence is
1560-
/// a genuine closing fence or merely three dashes that appear inside a string.
1558+
/// Tracks whether the lexer is currently inside an open string literal, regex
1559+
/// literal, or comment while scanning Astro frontmatter. Used to determine
1560+
/// whether a `---` sequence is a genuine closing fence or merely three dashes
1561+
/// that appear inside a string, regex, or comment.
15611562
///
15621563
/// ## Design
15631564
///
@@ -1568,6 +1569,11 @@ impl<'src> LexerWithCheckpoint<'src> for HtmlLexer<'src> {
15681569
/// already open; it closes the string only when it **matches** the opening
15691570
/// quote. For example, a `'` inside a `"…"` string is treated as a literal
15701571
/// character, not as a new string opener.
1572+
/// - The **regex flag** (`in_regex`): set when a `/` is encountered in a
1573+
/// position where it starts a regex literal (determined by the previous
1574+
/// non-whitespace byte). While set, all bytes are consumed until an
1575+
/// unescaped `/` closes the regex. Quotes and dashes inside a regex are
1576+
/// not treated as string delimiters or fence markers.
15711577
/// - The **comment state** (`comment`): distinguishes single-line (`//`) from
15721578
/// multi-line (`/* … */`) comments, so that quote characters inside comments
15731579
/// are not counted as string delimiters.
@@ -1578,13 +1584,17 @@ impl<'src> LexerWithCheckpoint<'src> for HtmlLexer<'src> {
15781584
struct QuotesSeen {
15791585
/// The quote byte that opened the current string literal, or `None` when no
/// string is open.
15801586
current_quote: Option<u8>,
1587+
/// `true` while scanning the body of a regex literal (`/…/`): set once an
/// opening `/` has been classified as a regex start, cleared when an
/// unescaped `/` closes it.
1588+
in_regex: bool,
15811589
/// Which kind of comment, if any, the tracker is currently inside.
15821590
comment: QuotesSeenComment,
15831591
/// Whether the previous byte was an unescaped backslash, i.e. whether the
/// current byte is escaped.
15841592
escaped: bool,
15851593
/// The previous byte, needed to detect `//` and `/* */` comment markers
15861594
/// and the `*/` block-comment terminator. A lone `/` seen outside a string
/// is also parked here until the next byte decides whether it opened a
/// comment, opened a regex, or was a division operator.
15871595
prev_byte: Option<u8>,
1596+
/// The previous non-whitespace byte, used for the regex-start heuristic.
/// It is deliberately left untouched while a `/` is deferred, so that it
/// still holds the byte that preceded that `/` when the decision is made.
1597+
prev_non_ws_byte: Option<u8>,
15881598
}
15891599

15901600
/// Distinguishes the kind of comment the lexer is currently inside.
@@ -1602,9 +1612,11 @@ impl QuotesSeen {
16021612
/// Creates a tracker in its initial state: no open string, not inside a regex
/// or comment, no pending escape, and no previously seen bytes.
fn new() -> Self {
16031613
Self {
16041614
current_quote: None,
1615+
in_regex: false,
16051616
comment: QuotesSeenComment::None,
16061617
escaped: false,
16071618
prev_byte: None,
1619+
prev_non_ws_byte: None,
16081620
}
16091621
}
16101622

@@ -1617,13 +1629,21 @@ impl QuotesSeen {
16171629
self.comment = QuotesSeenComment::None;
16181630
}
16191631
self.prev_byte = Some(byte);
1632+
if !byte.is_ascii_whitespace() {
1633+
self.prev_non_ws_byte = Some(byte);
1634+
}
16201635
// Quotes inside comments are ignored.
16211636
return;
16221637
}
16231638
QuotesSeenComment::MultiLine => {
16241639
// Multi-line comment ends at `*/`.
16251640
if self.prev_byte == Some(b'*') && byte == b'/' {
16261641
self.comment = QuotesSeenComment::None;
1642+
// Use a neutral prev_byte so the closing `/` of `*/` is
1643+
// not mistaken for a potential regex or comment opener.
1644+
self.prev_byte = None;
1645+
self.prev_non_ws_byte = Some(b'/');
1646+
return;
16271647
}
16281648
self.prev_byte = Some(byte);
16291649
// Quotes inside comments are ignored.
@@ -1632,11 +1652,31 @@ impl QuotesSeen {
16321652
QuotesSeenComment::None => {}
16331653
}
16341654

1655+
// Inside a regex literal: consume bytes until an unescaped `/` closes it.
1656+
if self.in_regex {
1657+
if byte == b'\\' {
1658+
self.escaped = !self.escaped;
1659+
self.prev_byte = Some(byte);
1660+
} else if byte == b'/' && !self.escaped {
1661+
self.in_regex = false;
1662+
self.escaped = false;
1663+
// Use a neutral prev_byte so the closing `/` of the regex is
1664+
// not mistaken for a deferred slash (comment/regex opener).
1665+
self.prev_byte = None;
1666+
self.prev_non_ws_byte = Some(b'/');
1667+
} else {
1668+
self.escaped = false;
1669+
self.prev_byte = Some(byte);
1670+
}
1671+
return;
1672+
}
1673+
16351674
// Handle escape sequences: a `\` that is not itself escaped toggles the
16361675
// escape flag for the next character.
16371676
if byte == b'\\' {
16381677
self.escaped = !self.escaped;
16391678
self.prev_byte = Some(byte);
1679+
self.prev_non_ws_byte = Some(byte);
16401680
return;
16411681
}
16421682

@@ -1647,24 +1687,60 @@ impl QuotesSeen {
16471687

16481688
if was_escaped {
16491689
self.prev_byte = Some(byte);
1690+
if !byte.is_ascii_whitespace() {
1691+
self.prev_non_ws_byte = Some(byte);
1692+
}
16501693
return;
16511694
}
16521695

1653-
// Detect comment openers — only valid outside of open strings.
1696+
// Detect comment openers and regex literals — only valid outside of open strings.
1697+
if self.current_quote.is_none() && byte == b'/' {
1698+
// Check if the previous byte was also `/` → single-line comment.
1699+
if self.prev_byte == Some(b'/') {
1700+
self.comment = QuotesSeenComment::SingleLine;
1701+
self.prev_byte = Some(byte);
1702+
// Don't update prev_non_ws_byte — it was already preserved
1703+
// when we deferred the first `/`.
1704+
return;
1705+
}
1706+
1707+
// The `/` might start a comment (if followed by `/` or `*`), a
1708+
// regex literal, or be a division operator. We defer the decision:
1709+
// store it as prev_byte and decide on the *next* byte.
1710+
// Crucially, do NOT update prev_non_ws_byte here — we need to
1711+
// preserve the byte before the `/` for the regex heuristic.
1712+
self.prev_byte = Some(byte);
1713+
return;
1714+
}
1715+
1716+
// If the *previous* byte was `/` (outside a string), decide now whether
1717+
// it was a comment opener, a regex opener, or plain division.
16541718
if self.current_quote.is_none() && self.prev_byte == Some(b'/') {
1655-
match byte {
1656-
b'/' => {
1657-
self.comment = QuotesSeenComment::SingleLine;
1658-
self.prev_byte = Some(byte);
1659-
return;
1719+
if byte == b'*' {
1720+
self.comment = QuotesSeenComment::MultiLine;
1721+
self.prev_byte = Some(byte);
1722+
self.prev_non_ws_byte = Some(byte);
1723+
return;
1724+
}
1725+
1726+
// Not `//` or `/*`, so the previous `/` was either a regex opener
1727+
// or a division operator. Use the previous non-whitespace byte
1728+
// before the `/` to decide.
1729+
if self.slash_starts_regex() {
1730+
// The `/` opened a regex. The current byte is the first byte
1731+
// inside the regex body.
1732+
self.in_regex = true;
1733+
if byte == b'\\' {
1734+
self.escaped = true;
16601735
}
1661-
b'*' => {
1662-
self.comment = QuotesSeenComment::MultiLine;
1663-
self.prev_byte = Some(byte);
1664-
return;
1736+
self.prev_byte = Some(byte);
1737+
if !byte.is_ascii_whitespace() {
1738+
self.prev_non_ws_byte = Some(byte);
16651739
}
1666-
_ => {}
1740+
return;
16671741
}
1742+
// It was division; update prev_non_ws_byte to `/` now.
1743+
self.prev_non_ws_byte = Some(b'/');
16681744
}
16691745

16701746
// Track string delimiters.
@@ -1689,13 +1765,42 @@ impl QuotesSeen {
16891765
}
16901766

16911767
self.prev_byte = Some(byte);
1768+
if !byte.is_ascii_whitespace() {
1769+
self.prev_non_ws_byte = Some(byte);
1770+
}
16921771
}
16931772

1694-
/// Returns `true` when the tracker is not currently inside an open string literal
1695-
/// or a comment. Both states must be absent for a `---` fence to be a valid
1696-
/// frontmatter closing delimiter.
1773+
/// Returns whether a deferred `/` starts a regex literal, judged solely by
1774+
/// `prev_non_ws_byte` (the last non-whitespace byte recorded before that `/`).
1775+
/// If that byte is an ASCII letter, a digit, `_`, `$`, `)`, `]`, `+`, or `-`,
1776+
/// the `/` is treated as division. Any other byte — or no previous byte at
/// all — means the `/` starts a regex.
///
/// Only a single byte is inspected, so every `+` or `-` counts as
/// "division follows", whether it is the tail of `++`/`--` or a lone binary
/// operator.
1777+
fn slash_starts_regex(&self) -> bool {
1778+
match self.prev_non_ws_byte {
1779+
None => true,
1780+
Some(b) => !matches!(
1781+
b,
1782+
b'a'..=b'z'
1783+
| b'A'..=b'Z'
1784+
| b'0'..=b'9'
1785+
| b'_'
1786+
| b'$'
1787+
| b')'
1788+
| b']'
1789+
| b'+'
1790+
| b'-'
1791+
),
1792+
}
1793+
}
1794+
1795+
/// Returns `true` only when the tracker is in a neutral state, which is the
1796+
/// precondition for a `---` fence to be a valid frontmatter closing delimiter.
1797+
/// All four checks must hold: no string literal is open, no regex literal is
1798+
/// open, no comment is open, and the previous byte is not a `/` still waiting
/// to be classified as a comment opener, regex opener, or division.
16971799
fn is_empty(&self) -> bool {
1698-
self.current_quote.is_none() && self.comment == QuotesSeenComment::None
1800+
self.current_quote.is_none()
1801+
&& !self.in_regex
1802+
&& self.comment == QuotesSeenComment::None
1803+
&& self.prev_byte != Some(b'/')
16991804
}
17001805
}
17011806

@@ -1952,4 +2057,45 @@ const f = "something" "#;
19522057
"double backslash followed by closing quote must close the string"
19532058
);
19542059
}
2060+
2061+
// --- Tests for issue #9187: regex literals in frontmatter ---
2062+
2063+
/// Regression test for issue #9187: after `const test = /'/` and a trailing
2064+
/// newline have been tracked, the tracker must report empty. The `'` is part
/// of the regex body and must not be counted as a string delimiter.
2065+
#[test]
2066+
fn issue_9187_regex_with_single_quote() {
2067+
let source = "const test = /'/\n";
2068+
let mut quotes_seen = QuotesSeen::new();
2069+
track(source, &mut quotes_seen);
2070+
assert!(
2071+
quotes_seen.is_empty(),
2072+
"regex literal containing single quote must not open a string"
2073+
);
2074+
}
2075+
2076+
/// Regression test for issue #9187: after `const test = /"/` and a trailing
2077+
/// newline have been tracked, the tracker must report empty. The `"` is part
/// of the regex body and must not be counted as a string delimiter.
2078+
#[test]
2079+
fn issue_9187_regex_with_double_quote() {
2080+
let source = "const test = /\"/\n";
2081+
let mut quotes_seen = QuotesSeen::new();
2082+
track(source, &mut quotes_seen);
2083+
assert!(
2084+
quotes_seen.is_empty(),
2085+
"regex literal containing double quote must not open a string"
2086+
);
2087+
}
2088+
2089+
/// Regression test for issue #9187: after `const test = /---/` and a trailing
2090+
/// newline have been tracked, the tracker must report empty, so that a real
/// closing fence following the regex is still recognised.
2091+
#[test]
2092+
fn issue_9187_regex_with_dashes() {
2093+
let source = "const test = /---/\n";
2094+
let mut quotes_seen = QuotesSeen::new();
2095+
track(source, &mut quotes_seen);
2096+
assert!(
2097+
quotes_seen.is_empty(),
2098+
"regex literal containing dashes must not confuse the tracker"
2099+
);
2100+
}
19552101
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
const test1 = /'/
3+
const test2 = /"/
4+
const test3 = /---/
5+
---
6+
7+
<div></div>
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
---
2+
source: crates/biome_html_parser/tests/spec_test.rs
3+
expression: snapshot
4+
---
5+
6+
## Input
7+
8+
```astro
9+
---
10+
const test1 = /'/
11+
const test2 = /"/
12+
const test3 = /---/
13+
---
14+
15+
<div></div>
16+
17+
```
18+
19+
20+
## AST
21+
22+
```
23+
HtmlRoot {
24+
bom_token: missing (optional),
25+
frontmatter: AstroFrontmatterElement {
26+
l_fence_token: FENCE@0..3 "---" [] [],
27+
content: AstroEmbeddedContent {
28+
content_token: HTML_LITERAL@3..60 "const test1 = /'/\nconst test2 = /\"/\nconst test3 = /---/\n" [Newline("\n")] [],
29+
},
30+
r_fence_token: FENCE@60..63 "---" [] [],
31+
},
32+
directive: missing (optional),
33+
html: HtmlElementList [
34+
HtmlElement {
35+
opening_element: HtmlOpeningElement {
36+
l_angle_token: L_ANGLE@63..66 "<" [Newline("\n"), Newline("\n")] [],
37+
name: HtmlTagName {
38+
value_token: HTML_LITERAL@66..69 "div" [] [],
39+
},
40+
attributes: HtmlAttributeList [],
41+
r_angle_token: R_ANGLE@69..70 ">" [] [],
42+
},
43+
children: HtmlElementList [],
44+
closing_element: HtmlClosingElement {
45+
l_angle_token: L_ANGLE@70..71 "<" [] [],
46+
slash_token: SLASH@71..72 "/" [] [],
47+
name: HtmlTagName {
48+
value_token: HTML_LITERAL@72..75 "div" [] [],
49+
},
50+
r_angle_token: R_ANGLE@75..76 ">" [] [],
51+
},
52+
},
53+
],
54+
eof_token: EOF@76..77 "" [Newline("\n")] [],
55+
}
56+
```
57+
58+
## CST
59+
60+
```
61+
0: HTML_ROOT@0..77
62+
0: (empty)
63+
1: ASTRO_FRONTMATTER_ELEMENT@0..63
64+
0: FENCE@0..3 "---" [] []
65+
1: ASTRO_EMBEDDED_CONTENT@3..60
66+
0: HTML_LITERAL@3..60 "const test1 = /'/\nconst test2 = /\"/\nconst test3 = /---/\n" [Newline("\n")] []
67+
2: FENCE@60..63 "---" [] []
68+
2: (empty)
69+
3: HTML_ELEMENT_LIST@63..76
70+
0: HTML_ELEMENT@63..76
71+
0: HTML_OPENING_ELEMENT@63..70
72+
0: L_ANGLE@63..66 "<" [Newline("\n"), Newline("\n")] []
73+
1: HTML_TAG_NAME@66..69
74+
0: HTML_LITERAL@66..69 "div" [] []
75+
2: HTML_ATTRIBUTE_LIST@69..69
76+
3: R_ANGLE@69..70 ">" [] []
77+
1: HTML_ELEMENT_LIST@70..70
78+
2: HTML_CLOSING_ELEMENT@70..76
79+
0: L_ANGLE@70..71 "<" [] []
80+
1: SLASH@71..72 "/" [] []
81+
2: HTML_TAG_NAME@72..75
82+
0: HTML_LITERAL@72..75 "div" [] []
83+
3: R_ANGLE@75..76 ">" [] []
84+
4: EOF@76..77 "" [Newline("\n")] []
85+
86+
```

0 commit comments

Comments
 (0)