Skip to content

Commit 73a5f10

Browse files
fix: use byte-safe indexing in style encapsulation to prevent UTF-8 panics (#248)
fix: use byte-safe indexing in CSS style encapsulation to prevent UTF-8 panics

Several functions in the style encapsulation module used char indices as byte offsets when slicing UTF-8 strings, causing panics on selectors containing multibyte characters (e.g. `ü`, `é`, `─`). This commit fixes `split_by_combinators`, `find_pseudo_element_start`, `find_pseudo_class_start`, `find_matching_paren`, and `try_scope_pseudo_function_with_context` so that they use either byte offsets obtained from `char_indices()` or byte-level scanning for ASCII-only delimiters.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent dcae8a8 commit 73a5f10

1 file changed

Lines changed: 95 additions & 42 deletions

File tree

crates/oxc_angular_compiler/src/styles/encapsulation.rs

Lines changed: 95 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1148,15 +1148,15 @@ fn find_pseudo_function_before(before_hc: &str) -> (String, usize) {
11481148
/// Returns the index of the closing paren (exclusive).
11491149
fn find_matching_paren(s: &str, start: usize) -> Option<usize> {
11501150
let mut depth = 1;
1151-
let chars: Vec<char> = s[start..].chars().collect();
1151+
let bytes = s.as_bytes();
11521152

1153-
for (i, c) in chars.iter().enumerate() {
1154-
match c {
1155-
'(' => depth += 1,
1156-
')' => {
1153+
for i in start..bytes.len() {
1154+
match bytes[i] {
1155+
b'(' => depth += 1,
1156+
b')' => {
11571157
depth -= 1;
11581158
if depth == 0 {
1159-
return Some(start + i);
1159+
return Some(i);
11601160
}
11611161
}
11621162
_ => {}
@@ -2088,7 +2088,7 @@ fn try_scope_pseudo_function_with_context(
20882088
// Find all pseudo-function parts
20892089
let mut pseudo_parts: Vec<String> = Vec::new();
20902090
let mut last_end = 0;
2091-
let chars: Vec<char> = trimmed.chars().collect();
2091+
let bytes = trimmed.as_bytes();
20922092

20932093
let mut search_from = 0;
20942094
while let Some(mat) = find_where_or_is(trimmed, search_from) {
@@ -2106,13 +2106,13 @@ fn try_scope_pseudo_function_with_context(
21062106

21072107
// Find the matching closing paren
21082108
let paren_start = mat.end;
2109-
let mut paren_depth = 1;
2109+
let mut paren_depth: u32 = 1;
21102110
let mut paren_end = paren_start;
21112111

2112-
for i in paren_start..trimmed.len() {
2113-
match chars[i] {
2114-
'(' => paren_depth += 1,
2115-
')' => {
2112+
for i in paren_start..bytes.len() {
2113+
match bytes[i] {
2114+
b'(' => paren_depth += 1,
2115+
b')' => {
21162116
paren_depth -= 1;
21172117
if paren_depth == 0 {
21182118
paren_end = i;
@@ -2258,14 +2258,15 @@ fn contains_host_attr_at_top_level(selector: &str, host_attr: &str) -> bool {
22582258
/// Returns pairs of (selector_part, combinator_with_spaces).
22592259
fn split_by_combinators(selector: &str) -> Vec<(&str, &str)> {
22602260
let mut result = Vec::new();
2261-
let chars: Vec<char> = selector.chars().collect();
2262-
let mut start = 0;
2263-
let mut i = 0;
2261+
let char_indices: Vec<(usize, char)> = selector.char_indices().collect();
2262+
let mut start = 0_usize; // byte index into selector
2263+
let mut i = 0_usize; // index into char_indices
22642264
let mut paren_depth: u32 = 0;
22652265
let mut bracket_depth: u32 = 0;
22662266

2267-
while i < chars.len() {
2268-
match chars[i] {
2267+
while i < char_indices.len() {
2268+
let (byte_pos, ch) = char_indices[i];
2269+
match ch {
22692270
'(' => paren_depth += 1,
22702271
')' => paren_depth = paren_depth.saturating_sub(1),
22712272
'[' => bracket_depth += 1,
@@ -2276,9 +2277,10 @@ fn split_by_combinators(selector: &str) -> Vec<(&str, &str)> {
22762277
// A space following an escaped hex value and followed by another hex character
22772278
// (ie: ".\fc ber" for ".über") is not a separator between 2 selectors
22782279
// Check: if the part ends with an escape placeholder AND next char is hex
2279-
let part = &selector[start..i];
2280-
let next_char_is_hex =
2281-
i + 1 < chars.len() && chars[i] == ' ' && chars[i + 1].is_ascii_hexdigit();
2280+
let part = &selector[start..byte_pos];
2281+
let next_char_is_hex = i + 1 < char_indices.len()
2282+
&& ch == ' '
2283+
&& char_indices[i + 1].1.is_ascii_hexdigit();
22822284
let part_ends_with_esc_placeholder = part.contains("__esc-ph-");
22832285

22842286
if next_char_is_hex && part_ends_with_esc_placeholder {
@@ -2288,25 +2290,25 @@ fn split_by_combinators(selector: &str) -> Vec<(&str, &str)> {
22882290
}
22892291

22902292
// Found a potential combinator
2291-
let part_end = i;
2293+
let part_end = byte_pos;
22922294

22932295
// Collect the combinator (may include spaces around it)
2294-
let combinator_start = i;
2295-
while i < chars.len()
2296-
&& (chars[i] == ' '
2297-
|| chars[i] == '\n'
2298-
|| chars[i] == '\t'
2299-
|| chars[i] == '\r'
2300-
|| chars[i] == '>'
2301-
|| chars[i] == '+'
2302-
|| chars[i] == '~')
2296+
let combinator_start = byte_pos;
2297+
while i < char_indices.len()
2298+
&& matches!(char_indices[i].1, ' ' | '\n' | '\t' | '\r' | '>' | '+' | '~')
23032299
{
23042300
i += 1;
23052301
}
23062302

2303+
let combinator_end =
2304+
if i < char_indices.len() { char_indices[i].0 } else { selector.len() };
2305+
23072306
// Always push the part, even if empty (to preserve leading combinators)
2308-
result.push((&selector[start..part_end], &selector[combinator_start..i]));
2309-
start = i;
2307+
result.push((
2308+
&selector[start..part_end],
2309+
&selector[combinator_start..combinator_end],
2310+
));
2311+
start = combinator_end;
23102312
continue;
23112313
}
23122314
_ => {}
@@ -2491,16 +2493,20 @@ fn scope_after_host_with_context(selector: &str, ctx: &mut ScopingContext) -> St
24912493

24922494
/// Find the start position of a pseudo-element (::).
24932495
fn find_pseudo_element_start(s: &str) -> Option<usize> {
2496+
let char_indices: Vec<(usize, char)> = s.char_indices().collect();
24942497
let mut i = 0;
2495-
let chars: Vec<char> = s.chars().collect();
24962498
let mut in_brackets: u32 = 0;
24972499

2498-
while i < chars.len() {
2499-
match chars[i] {
2500+
while i < char_indices.len() {
2501+
let (byte_pos, ch) = char_indices[i];
2502+
match ch {
25002503
'[' => in_brackets += 1,
25012504
']' => in_brackets = in_brackets.saturating_sub(1),
2502-
':' if in_brackets == 0 && i + 1 < chars.len() && chars[i + 1] == ':' => {
2503-
return Some(i);
2505+
':' if in_brackets == 0
2506+
&& i + 1 < char_indices.len()
2507+
&& char_indices[i + 1].1 == ':' =>
2508+
{
2509+
return Some(byte_pos);
25042510
}
25052511
_ => {}
25062512
}
@@ -2512,20 +2518,21 @@ fn find_pseudo_element_start(s: &str) -> Option<usize> {
25122518
/// Find the start position of a pseudo-class (:), including pseudo-functions.
25132519
/// The caller decides how to handle pseudo-functions vs regular pseudo-classes.
25142520
fn find_pseudo_class_start(s: &str) -> Option<usize> {
2521+
let char_indices: Vec<(usize, char)> = s.char_indices().collect();
25152522
let mut i = 0;
2516-
let chars: Vec<char> = s.chars().collect();
25172523
let mut in_brackets: u32 = 0;
25182524

2519-
while i < chars.len() {
2520-
match chars[i] {
2525+
while i < char_indices.len() {
2526+
let (byte_pos, ch) = char_indices[i];
2527+
match ch {
25212528
'[' => in_brackets += 1,
25222529
']' => in_brackets = in_brackets.saturating_sub(1),
25232530
':' if in_brackets == 0 => {
25242531
// Check it's not :: (pseudo-element) - those are handled separately
2525-
if i + 1 < chars.len() && chars[i + 1] == ':' {
2532+
if i + 1 < char_indices.len() && char_indices[i + 1].1 == ':' {
25262533
return None;
25272534
}
2528-
return Some(i);
2535+
return Some(byte_pos);
25292536
}
25302537
_ => {}
25312538
}
@@ -3455,4 +3462,50 @@ mod tests {
34553462
result.len()
34563463
);
34573464
}
3465+
3466+
#[test]
3467+
fn test_multibyte_utf8_in_selector() {
3468+
// Selectors with multibyte UTF-8 characters (e.g. attribute selectors with
3469+
// non-ASCII values) must not panic from byte/char index mismatch.
3470+
let result = shim_css_text(r#"[data-label="ÄÖÜ"] .child { color: red; }"#, "contenta", "");
3471+
assert!(
3472+
result.contains("[contenta]"),
3473+
"Should scope selectors containing multibyte UTF-8. Got: {}",
3474+
result
3475+
);
3476+
}
3477+
3478+
#[test]
3479+
fn test_multibyte_utf8_pseudo_element() {
3480+
// Pseudo-elements on selectors with multibyte characters must not panic.
3481+
let result = shim_css_text(r#"[title="café"]::before { content: ""; }"#, "contenta", "");
3482+
assert!(
3483+
result.contains("[contenta]"),
3484+
"Should scope pseudo-elements with multibyte UTF-8. Got: {}",
3485+
result
3486+
);
3487+
}
3488+
3489+
#[test]
3490+
fn test_multibyte_utf8_pseudo_class() {
3491+
// Pseudo-classes on selectors with multibyte characters must not panic.
3492+
let result = shim_css_text(r#".naïve:hover { color: blue; }"#, "contenta", "");
3493+
assert!(
3494+
result.contains("[contenta]"),
3495+
"Should scope pseudo-classes with multibyte UTF-8. Got: {}",
3496+
result
3497+
);
3498+
}
3499+
3500+
#[test]
3501+
fn test_multibyte_utf8_combinator_split() {
3502+
// Combinators between selectors with multibyte characters must not panic.
3503+
let result =
3504+
shim_css_text(r#".über > .straße + .café ~ .naïve { color: green; }"#, "contenta", "");
3505+
assert!(
3506+
result.contains("[contenta]"),
3507+
"Should handle combinators with multibyte UTF-8 selectors. Got: {}",
3508+
result
3509+
);
3510+
}
34583511
}

0 commit comments

Comments
 (0)