@@ -1148,15 +1148,15 @@ fn find_pseudo_function_before(before_hc: &str) -> (String, usize) {
11481148/// Returns the index of the closing paren (exclusive).
11491149fn find_matching_paren ( s : & str , start : usize ) -> Option < usize > {
11501150 let mut depth = 1 ;
1151- let chars : Vec < char > = s[ start.. ] . chars ( ) . collect ( ) ;
1151+ let bytes = s. as_bytes ( ) ;
11521152
1153- for ( i , c ) in chars . iter ( ) . enumerate ( ) {
1154- match c {
1155- '(' => depth += 1 ,
1156- ')' => {
1153+ for i in start..bytes . len ( ) {
1154+ match bytes [ i ] {
1155+ b '(' => depth += 1 ,
1156+ b ')' => {
11571157 depth -= 1 ;
11581158 if depth == 0 {
1159- return Some ( start + i) ;
1159+ return Some ( i) ;
11601160 }
11611161 }
11621162 _ => { }
@@ -2088,7 +2088,7 @@ fn try_scope_pseudo_function_with_context(
20882088 // Find all pseudo-function parts
20892089 let mut pseudo_parts: Vec < String > = Vec :: new ( ) ;
20902090 let mut last_end = 0 ;
2091- let chars : Vec < char > = trimmed. chars ( ) . collect ( ) ;
2091+ let bytes = trimmed. as_bytes ( ) ;
20922092
20932093 let mut search_from = 0 ;
20942094 while let Some ( mat) = find_where_or_is ( trimmed, search_from) {
@@ -2106,13 +2106,13 @@ fn try_scope_pseudo_function_with_context(
21062106
21072107 // Find the matching closing paren
21082108 let paren_start = mat. end ;
2109- let mut paren_depth = 1 ;
2109+ let mut paren_depth: u32 = 1 ;
21102110 let mut paren_end = paren_start;
21112111
2112- for i in paren_start..trimmed . len ( ) {
2113- match chars [ i] {
2114- '(' => paren_depth += 1 ,
2115- ')' => {
2112+ for i in paren_start..bytes . len ( ) {
2113+ match bytes [ i] {
2114+ b '(' => paren_depth += 1 ,
2115+ b ')' => {
21162116 paren_depth -= 1 ;
21172117 if paren_depth == 0 {
21182118 paren_end = i;
@@ -2258,14 +2258,15 @@ fn contains_host_attr_at_top_level(selector: &str, host_attr: &str) -> bool {
22582258/// Returns pairs of (selector_part, combinator_with_spaces).
22592259fn split_by_combinators ( selector : & str ) -> Vec < ( & str , & str ) > {
22602260 let mut result = Vec :: new ( ) ;
2261- let chars : Vec < char > = selector. chars ( ) . collect ( ) ;
2262- let mut start = 0 ;
2263- let mut i = 0 ;
2261+ let char_indices : Vec < ( usize , char ) > = selector. char_indices ( ) . collect ( ) ;
2262+ let mut start = 0_usize ; // byte index into selector
2263+ let mut i = 0_usize ; // index into char_indices
22642264 let mut paren_depth: u32 = 0 ;
22652265 let mut bracket_depth: u32 = 0 ;
22662266
2267- while i < chars. len ( ) {
2268- match chars[ i] {
2267+ while i < char_indices. len ( ) {
2268+ let ( byte_pos, ch) = char_indices[ i] ;
2269+ match ch {
22692270 '(' => paren_depth += 1 ,
22702271 ')' => paren_depth = paren_depth. saturating_sub ( 1 ) ,
22712272 '[' => bracket_depth += 1 ,
@@ -2276,9 +2277,10 @@ fn split_by_combinators(selector: &str) -> Vec<(&str, &str)> {
22762277 // A space following an escaped hex value and followed by another hex character
22772278 // (ie: ".\fc ber" for ".über") is not a separator between 2 selectors
22782279 // Check: if the part ends with an escape placeholder AND next char is hex
2279- let part = & selector[ start..i] ;
2280- let next_char_is_hex =
2281- i + 1 < chars. len ( ) && chars[ i] == ' ' && chars[ i + 1 ] . is_ascii_hexdigit ( ) ;
2280+ let part = & selector[ start..byte_pos] ;
2281+ let next_char_is_hex = i + 1 < char_indices. len ( )
2282+ && ch == ' '
2283+ && char_indices[ i + 1 ] . 1 . is_ascii_hexdigit ( ) ;
22822284 let part_ends_with_esc_placeholder = part. contains ( "__esc-ph-" ) ;
22832285
22842286 if next_char_is_hex && part_ends_with_esc_placeholder {
@@ -2288,25 +2290,25 @@ fn split_by_combinators(selector: &str) -> Vec<(&str, &str)> {
22882290 }
22892291
22902292 // Found a potential combinator
2291- let part_end = i ;
2293+ let part_end = byte_pos ;
22922294
22932295 // Collect the combinator (may include spaces around it)
2294- let combinator_start = i;
2295- while i < chars. len ( )
2296- && ( chars[ i] == ' '
2297- || chars[ i] == '\n'
2298- || chars[ i] == '\t'
2299- || chars[ i] == '\r'
2300- || chars[ i] == '>'
2301- || chars[ i] == '+'
2302- || chars[ i] == '~' )
2296+ let combinator_start = byte_pos;
2297+ while i < char_indices. len ( )
2298+ && matches ! ( char_indices[ i] . 1 , ' ' | '\n' | '\t' | '\r' | '>' | '+' | '~' )
23032299 {
23042300 i += 1 ;
23052301 }
23062302
2303+ let combinator_end =
2304+ if i < char_indices. len ( ) { char_indices[ i] . 0 } else { selector. len ( ) } ;
2305+
23072306 // Always push the part, even if empty (to preserve leading combinators)
2308- result. push ( ( & selector[ start..part_end] , & selector[ combinator_start..i] ) ) ;
2309- start = i;
2307+ result. push ( (
2308+ & selector[ start..part_end] ,
2309+ & selector[ combinator_start..combinator_end] ,
2310+ ) ) ;
2311+ start = combinator_end;
23102312 continue ;
23112313 }
23122314 _ => { }
@@ -2491,16 +2493,20 @@ fn scope_after_host_with_context(selector: &str, ctx: &mut ScopingContext) -> St
24912493
24922494/// Find the start position of a pseudo-element (::).
24932495fn find_pseudo_element_start ( s : & str ) -> Option < usize > {
2496+ let char_indices: Vec < ( usize , char ) > = s. char_indices ( ) . collect ( ) ;
24942497 let mut i = 0 ;
2495- let chars: Vec < char > = s. chars ( ) . collect ( ) ;
24962498 let mut in_brackets: u32 = 0 ;
24972499
2498- while i < chars. len ( ) {
2499- match chars[ i] {
2500+ while i < char_indices. len ( ) {
2501+ let ( byte_pos, ch) = char_indices[ i] ;
2502+ match ch {
25002503 '[' => in_brackets += 1 ,
25012504 ']' => in_brackets = in_brackets. saturating_sub ( 1 ) ,
2502- ':' if in_brackets == 0 && i + 1 < chars. len ( ) && chars[ i + 1 ] == ':' => {
2503- return Some ( i) ;
2505+ ':' if in_brackets == 0
2506+ && i + 1 < char_indices. len ( )
2507+ && char_indices[ i + 1 ] . 1 == ':' =>
2508+ {
2509+ return Some ( byte_pos) ;
25042510 }
25052511 _ => { }
25062512 }
@@ -2512,20 +2518,21 @@ fn find_pseudo_element_start(s: &str) -> Option<usize> {
25122518/// Find the start position of a pseudo-class (:), including pseudo-functions.
25132519/// The caller decides how to handle pseudo-functions vs regular pseudo-classes.
25142520fn find_pseudo_class_start ( s : & str ) -> Option < usize > {
2521+ let char_indices: Vec < ( usize , char ) > = s. char_indices ( ) . collect ( ) ;
25152522 let mut i = 0 ;
2516- let chars: Vec < char > = s. chars ( ) . collect ( ) ;
25172523 let mut in_brackets: u32 = 0 ;
25182524
2519- while i < chars. len ( ) {
2520- match chars[ i] {
2525+ while i < char_indices. len ( ) {
2526+ let ( byte_pos, ch) = char_indices[ i] ;
2527+ match ch {
25212528 '[' => in_brackets += 1 ,
25222529 ']' => in_brackets = in_brackets. saturating_sub ( 1 ) ,
25232530 ':' if in_brackets == 0 => {
25242531 // Check it's not :: (pseudo-element) - those are handled separately
2525- if i + 1 < chars . len ( ) && chars [ i + 1 ] == ':' {
2532+ if i + 1 < char_indices . len ( ) && char_indices [ i + 1 ] . 1 == ':' {
25262533 return None ;
25272534 }
2528- return Some ( i ) ;
2535+ return Some ( byte_pos ) ;
25292536 }
25302537 _ => { }
25312538 }
@@ -3455,4 +3462,50 @@ mod tests {
34553462 result. len( )
34563463 ) ;
34573464 }
3465+
3466+ #[ test]
3467+ fn test_multibyte_utf8_in_selector ( ) {
3468+ // Selectors with multibyte UTF-8 characters (e.g. attribute selectors with
3469+ // non-ASCII values) must not panic from byte/char index mismatch.
3470+ let result = shim_css_text ( r#"[data-label="ÄÖÜ"] .child { color: red; }"# , "contenta" , "" ) ;
3471+ assert ! (
3472+ result. contains( "[contenta]" ) ,
3473+ "Should scope selectors containing multibyte UTF-8. Got: {}" ,
3474+ result
3475+ ) ;
3476+ }
3477+
3478+ #[ test]
3479+ fn test_multibyte_utf8_pseudo_element ( ) {
3480+ // Pseudo-elements on selectors with multibyte characters must not panic.
3481+ let result = shim_css_text ( r#"[title="café"]::before { content: ""; }"# , "contenta" , "" ) ;
3482+ assert ! (
3483+ result. contains( "[contenta]" ) ,
3484+ "Should scope pseudo-elements with multibyte UTF-8. Got: {}" ,
3485+ result
3486+ ) ;
3487+ }
3488+
3489+ #[ test]
3490+ fn test_multibyte_utf8_pseudo_class ( ) {
3491+ // Pseudo-classes on selectors with multibyte characters must not panic.
3492+ let result = shim_css_text ( r#".naïve:hover { color: blue; }"# , "contenta" , "" ) ;
3493+ assert ! (
3494+ result. contains( "[contenta]" ) ,
3495+ "Should scope pseudo-classes with multibyte UTF-8. Got: {}" ,
3496+ result
3497+ ) ;
3498+ }
3499+
3500+ #[ test]
3501+ fn test_multibyte_utf8_combinator_split ( ) {
3502+ // Combinators between selectors with multibyte characters must not panic.
3503+ let result =
3504+ shim_css_text ( r#".über > .straße + .café ~ .naïve { color: green; }"# , "contenta" , "" ) ;
3505+ assert ! (
3506+ result. contains( "[contenta]" ) ,
3507+ "Should handle combinators with multibyte UTF-8 selectors. Got: {}" ,
3508+ result
3509+ ) ;
3510+ }
34583511}
0 commit comments