diff --git a/shared/yeast-macros/src/lib.rs b/shared/yeast-macros/src/lib.rs index 0c264ee13c80..1d7236b500a9 100644 --- a/shared/yeast-macros/src/lib.rs +++ b/shared/yeast-macros/src/lib.rs @@ -9,14 +9,21 @@ mod parse; /// /// ```text /// (_) - match any named node (skips unnamed tokens) +/// _ - match any node, named or unnamed /// (kind) - match a named node of the given kind /// ("literal") - match an unnamed token by its text +/// "literal" - shorthand for `("literal")` /// (kind field: (pattern)) - match with named field -/// (kind (pat) (pat)...) - match unnamed children (after all fields) +/// (kind field: _) - bare `_` and bare literals work in field position too +/// (kind (pat) (pat)...) - match unnamed children /// (pattern) @capture - capture the matched node +/// "literal" @capture - capture an unnamed token +/// _ @capture - capture any node /// (pattern)* @capture - capture each repeated match /// (pattern)? - zero or one /// ``` +/// +/// Named fields and bare child patterns may be intermixed in any order. #[proc_macro] pub fn query(input: TokenStream) -> TokenStream { let input2: TokenStream2 = input.into(); diff --git a/shared/yeast-macros/src/parse.rs b/shared/yeast-macros/src/parse.rs index f8554f3178ca..70bd46d5b6f6 100644 --- a/shared/yeast-macros/src/parse.rs +++ b/shared/yeast-macros/src/parse.rs @@ -38,7 +38,8 @@ fn parse_query_node(tokens: &mut Tokens) -> Result { } } -/// Parse a query atom: `(kind fields...)` or `(kind fields... bare_children...)`. +/// Parse a query atom: a parenthesized node, a bare `_` (any node), or a +/// bare string literal (unnamed token). /// Does not handle `@capture` — that's handled by the caller as a postfix. fn parse_query_atom(tokens: &mut Tokens) -> Result { match tokens.peek() { @@ -58,9 +59,17 @@ fn parse_query_atom(tokens: &mut Tokens) -> Result { } Ok(result) } + Some(TokenTree::Ident(id)) if *id == "_" => { + tokens.next(); + Ok(quote! 
{ yeast::query::QueryNode::Any { match_unnamed: true } }) + } + Some(TokenTree::Literal(_)) => { + let lit = expect_literal(tokens)?; + Ok(quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } }) + } Some(tok) => Err(syn::Error::new_spanned( tok.clone(), - "expected `(` in query; use `(_) @name` to capture a wildcard", + "expected `(`, `_`, or string literal in query", )), } } @@ -74,7 +83,7 @@ fn parse_query_node_inner(tokens: &mut Tokens) -> Result { )), Some(TokenTree::Ident(id)) if *id == "_" => { tokens.next(); - Ok(quote! { yeast::query::QueryNode::Any() }) + Ok(quote! { yeast::query::QueryNode::Any { match_unnamed: false } }) } Some(TokenTree::Literal(_)) => { let lit = expect_literal(tokens)?; @@ -98,11 +107,14 @@ fn parse_query_node_inner(tokens: &mut Tokens) -> Result { } } -/// Parse zero or more field specifications and trailing bare patterns. -/// Named fields: `name: pattern` or `name*: (list...)`. -/// Bare patterns (no field name) become implicit `child` field entries. +/// Parse zero or more field specifications and bare patterns. +/// Named fields: `name: pattern`. Bare patterns (no field name) become +/// implicit `child` field entries. Named fields and bare patterns may +/// appear in any order; bare patterns are accumulated and emitted as a +/// single `("child", ...)` entry. fn parse_query_fields(tokens: &mut Tokens) -> Result> { let mut fields = Vec::new(); + let mut bare_children: Vec = Vec::new(); while tokens.peek().is_some() { if peek_is_field(tokens) { let field_name = expect_ident(tokens, "expected field name")?; @@ -115,16 +127,21 @@ fn parse_query_fields(tokens: &mut Tokens) -> Result> { (#field_str, vec![yeast::query::QueryListElem::SingleNode(#child)]) }); } else { - // Bare patterns — collect as implicit `child` field + // Bare patterns — accumulate into the implicit `child` field. + // We don't break here, so we can interleave with named fields. 
let elems = parse_query_list(tokens)?; - if !elems.is_empty() { - fields.push(quote! { - ("child", vec![#(#elems),*]) - }); + if elems.is_empty() { + // Nothing more we can parse at this level. + break; } - break; + bare_children.extend(elems); } } + if !bare_children.is_empty() { + fields.push(quote! { + ("child", vec![#(#bare_children),*]) + }); + } Ok(fields) } @@ -178,10 +195,11 @@ fn parse_query_list(tokens: &mut Tokens) -> Result> { continue; } - // Check for string literal (unnamed node) + // Check for string literal (unnamed node), optionally followed by @capture if peek_is_literal(tokens) { let lit = expect_literal(tokens)?; let node = quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } }; + let node = maybe_wrap_capture(tokens, node)?; let elem = maybe_wrap_repetition( tokens, quote! { @@ -192,10 +210,12 @@ fn parse_query_list(tokens: &mut Tokens) -> Result> { continue; } - // Check for bare _ (wildcard), possibly followed by @capture + // Check for bare `_` (any node, named or unnamed), possibly followed by @capture. + // Distinct from `(_)` which only matches named nodes — this matches + // tree-sitter query semantics. if peek_is_underscore(tokens) { tokens.next(); - let node = quote! { yeast::query::QueryNode::Any() }; + let node = quote! { yeast::query::QueryNode::Any { match_unnamed: true } }; let node = maybe_wrap_capture(tokens, node)?; let elem = maybe_wrap_repetition( tokens, diff --git a/shared/yeast/doc/yeast.md b/shared/yeast/doc/yeast.md index d49ff96f11df..5a6267fba0f9 100644 --- a/shared/yeast/doc/yeast.md +++ b/shared/yeast/doc/yeast.md @@ -103,19 +103,40 @@ Captures bind matched nodes to names for use in the transform. 
A capture (identifier) @name // capture an identifier node (_) @value // capture any named node (identifier)* @items // capture each repeated match +("=") @op // capture an unnamed token by its text +"=" @op // shorthand for the line above +_ @anything // capture any node, named or unnamed ``` -### Unnamed children +### Named vs unnamed children -Patterns that appear after all named fields match unnamed (positional) -children. Named node patterns like `(_)` automatically skip unnamed tokens -(keywords, operators, punctuation), matching tree-sitter semantics: +The two wildcard forms `(_)` and bare `_` differ: + +- `(_)` matches only **named** nodes. When used as a positional pattern, + unnamed children (keywords, operators, punctuation) are skipped over. +- Bare `_` matches **any** node, named or unnamed, taking whatever is next + in the child list. + +Bare child patterns are matched with **forward-scan** semantics: each pattern advances +through the iterator until it finds a child that matches, skipping +non-matching children along the way. So `(foo ("baz"))` against a `foo` +whose children are `[bar, "baz"]` succeeds — the matcher scans past `bar` +and matches `"baz"`. The iterator advances as it goes, so subsequent +patterns can never match children that appear earlier in source order +than already-matched ones. + +For named-only patterns (`(_)`, `(some_kind ...)`), the scan additionally +skips past unnamed tokens without trying to match them, since they can +never match anyway. + +Anchors (`.`) for forcing immediate adjacency, like in tree-sitter +queries, are not supported. 
```rust (for - pattern: (_) @pat // named field - value: (in (_) @val) // "in" token is skipped automatically - body: (do (_)* @body) // "do" and "end" tokens skipped + pattern: (_) @pat // named field, captures any named node + value: (in (_) @val) // "in" wrapper is a named node here + body: (do (_)* @body) // "do" and "end" tokens skipped by (_) ) ``` diff --git a/shared/yeast/src/query.rs b/shared/yeast/src/query.rs index 223b34569190..01e5e22ad730 100644 --- a/shared/yeast/src/query.rs +++ b/shared/yeast/src/query.rs @@ -2,7 +2,13 @@ use crate::{captures::Captures, Ast, Id}; #[derive(Debug, Clone)] pub enum QueryNode { - Any(), + /// A wildcard. With `match_unnamed = false` (the default for `(_)`), + /// only matches named nodes when used positionally — unnamed children + /// are skipped over. With `match_unnamed = true` (for bare `_`), the + /// wildcard consumes whatever the next child is, named or unnamed. + Any { + match_unnamed: bool, + }, Node { kind: &'static str, children: Vec<(&'static str, Vec)>, @@ -24,7 +30,7 @@ impl QueryNode { QueryNode::Node { kind, .. } => Some(kind), QueryNode::UnnamedNode { kind } => Some(kind), QueryNode::Capture { node, .. } => node.root_kind(), - QueryNode::Any() => None, + QueryNode::Any { .. } => None, } } } @@ -51,7 +57,7 @@ impl QueryNode { /// semantics where `(_)` only matches named nodes. fn matches_named_only(&self) -> bool { match self { - QueryNode::Any() => true, + QueryNode::Any { match_unnamed } => !match_unnamed, QueryNode::Node { .. } => true, QueryNode::UnnamedNode { .. } => false, QueryNode::Capture { node, .. } => node.matches_named_only(), @@ -60,7 +66,7 @@ impl QueryNode { pub fn do_match(&self, ast: &Ast, node: Id, matches: &mut Captures) -> Result { match self { - QueryNode::Any() => Ok(true), + QueryNode::Any { .. 
} => Ok(true), QueryNode::Node { kind, children } => { let node = ast.get_node(node).unwrap(); let target_kind = ast @@ -161,25 +167,28 @@ impl QueryListElem { } } QueryListElem::SingleNode(sub_query) => { - if sub_query.matches_named_only() { - // Skip unnamed children, matching tree-sitter semantics - // where (_) only matches named nodes. - loop { - match remaining_children.next() { - Some(child) => { - let node = ast.get_node(child).unwrap(); - if node.is_named() { - return sub_query.do_match(ast, child, matches); - } - // Skip unnamed child, continue to next - } - None => return Ok(false), + // Forward-scan semantics: advance through the iterator until + // we find a child that matches `sub_query`. Skip ahead past + // unnamed children when the sub-query is named-only (so they + // can never match anyway). On a match attempt that fails, + // restore the captures so partial captures from a complex + // sub-query don't leak. + let skip_unnamed = sub_query.matches_named_only(); + loop { + let Some(child) = remaining_children.next() else { + return Ok(false); + }; + if skip_unnamed { + let node = ast.get_node(child).unwrap(); + if !node.is_named() { + continue; } } - } else if let Some(child) = remaining_children.next() { - sub_query.do_match(ast, child, matches) - } else { - Ok(false) + let snapshot = matches.clone(); + if sub_query.do_match(ast, child, matches)? { + return Ok(true); + } + *matches = snapshot; } } } diff --git a/shared/yeast/src/schema.rs b/shared/yeast/src/schema.rs index 0a33fd6e0ed4..12554d9c8692 100644 --- a/shared/yeast/src/schema.rs +++ b/shared/yeast/src/schema.rs @@ -61,9 +61,10 @@ impl Schema { } } // Import all node kind names, preserving tree-sitter's IDs. - // Track named and unnamed variants separately. - // For named kinds, use the canonical ID from id_for_node_kind(name, true) - // since some languages have multiple IDs for the same named kind. + // Track named and unnamed variants separately. 
For both named and + // unnamed kinds, use the canonical ID from id_for_node_kind, since + // some languages have multiple IDs for the same name (e.g., the + // reserved error token at ID 0 may share a name with a real token). for id in 0..language.node_kind_count() as u16 { if let Some(name) = language.node_kind_for_id(id) { if !name.is_empty() { @@ -75,12 +76,13 @@ impl Schema { schema.kind_names.insert(canonical_id, name); } } else { - // For unnamed kinds, only insert if we don't already have one - // (some languages have multiple unnamed IDs for the same text) - schema - .unnamed_kind_ids - .entry(name.to_string()) - .or_insert(id); + let canonical_id = language.id_for_node_kind(name, false); + if canonical_id != 0 && !schema.unnamed_kind_ids.contains_key(name) { + schema + .unnamed_kind_ids + .insert(name.to_string(), canonical_id); + schema.kind_names.insert(canonical_id, name); + } } // Always track the name for any ID we encounter schema.kind_names.entry(id).or_insert(name); diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index e4485857bff1..f7b363294bcc 100644 --- a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -170,6 +170,187 @@ fn test_query_repeated_capture() { assert_eq!(captures.get_all("names").len(), 3); } +#[test] +fn test_capture_unnamed_node_parenthesized() { + // `("=") @op` captures the unnamed `=` token between left and right. 
+ let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run("x = 1").unwrap(); + + let query = yeast::query!( + (assignment + left: (_) @lhs + ("=") @op + right: (_) @rhs + ) + ); + + let mut cursor = AstCursor::new(&ast); + cursor.goto_first_child(); + let assignment_id = cursor.node().id(); + + let mut captures = yeast::captures::Captures::new(); + let matched = query.do_match(&ast, assignment_id, &mut captures).unwrap(); + assert!(matched); + let op_id = captures.get_var("op").unwrap(); + let op_node = ast.get_node(op_id).unwrap(); + assert_eq!(op_node.kind(), "="); + assert!(!op_node.is_named()); +} + +#[test] +fn test_capture_unnamed_node_bare_literal() { + // `"=" @op` (without surrounding parens) is the same as `("=") @op`. + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run("x = 1").unwrap(); + + let query = yeast::query!( + (assignment + left: (_) @lhs + "=" @op + right: (_) @rhs + ) + ); + + let mut cursor = AstCursor::new(&ast); + cursor.goto_first_child(); + let assignment_id = cursor.node().id(); + + let mut captures = yeast::captures::Captures::new(); + let matched = query.do_match(&ast, assignment_id, &mut captures).unwrap(); + assert!(matched); + let op_id = captures.get_var("op").unwrap(); + let op_node = ast.get_node(op_id).unwrap(); + assert_eq!(op_node.kind(), "="); + assert!(!op_node.is_named()); +} + +#[test] +fn test_bare_underscore_matches_unnamed() { + // Bare `_` matches any node, including unnamed tokens, while `(_)` + // matches only named nodes. Demonstrate by matching the unnamed `=` + // token in the implicit `child` field of an `assignment`. 
+ let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run("x = 1").unwrap(); + + let mut cursor = AstCursor::new(&ast); + cursor.goto_first_child(); + let assignment_id = cursor.node().id(); + + // `(_)` skips unnamed children, so a query containing a single `(_)` + // bare pattern fails to match the assignment (whose only unfielded + // child is the unnamed `=`). + let query_named = yeast::query!((assignment (_) @any)); + let mut captures = yeast::captures::Captures::new(); + let matched = query_named + .do_match(&ast, assignment_id, &mut captures) + .unwrap(); + assert!( + !matched, + "(_) should skip the unnamed `=` and fail to match" + ); + + // Bare `_` accepts the next child whatever it is, so it matches the + // unnamed `=` token. + let query_any = yeast::query!((assignment _ @any)); + let mut captures = yeast::captures::Captures::new(); + let matched = query_any + .do_match(&ast, assignment_id, &mut captures) + .unwrap(); + assert!(matched, "_ should match the unnamed `=`"); + let any_node = ast.get_node(captures.get_var("any").unwrap()).unwrap(); + assert_eq!(any_node.kind(), "="); + assert!(!any_node.is_named()); +} + +#[test] +fn test_bare_forms_in_field_position() { + // The bare `_` and bare-literal forms should be accepted as a + // field's value, not just in the bare-children position. They keep + // their meaning there: `"…"` is shorthand for `("…")`, while bare `_` + // still matches any node (unlike the named-only `(_)`). + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run("x = 1").unwrap(); + + let mut cursor = AstCursor::new(&ast); + cursor.goto_first_child(); + let assignment_id = cursor.node().id(); + + // Bare `_` in field position. Captures the named `identifier "x"` + // child of the `left` field — bare `_` admits unnamed too, but the + // first child of `left` happens to be named. 
+ let query = yeast::query!((assignment left: _ @lhs)); + let mut captures = yeast::captures::Captures::new(); + let matched = query.do_match(&ast, assignment_id, &mut captures).unwrap(); + assert!(matched); + assert_eq!( + ast.get_node(captures.get_var("lhs").unwrap()) + .unwrap() + .kind(), + "identifier" + ); + + // Bare literal in field position. Equivalent to `("=") @op`. + let query = yeast::query!((assignment child: "=" @op)); + let mut captures = yeast::captures::Captures::new(); + let matched = query.do_match(&ast, assignment_id, &mut captures).unwrap(); + assert!(matched); + let op = ast.get_node(captures.get_var("op").unwrap()).unwrap(); + assert_eq!(op.kind(), "="); + assert!(!op.is_named()); +} + +#[test] +fn test_forward_scan_finds_unnamed_token_late() { + // The `do` named-wrapper node has three children in its implicit + // `child` field, in source order: `do` (unnamed kw), the body + // identifier, and `end` (unnamed kw). Forward-scan semantics let a + // query for `("end")` skip past the first two and match the third. + // Without forward-scan, the matcher took the first child unconditionally + // and failed. + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run("for x in list do\n y\nend").unwrap(); + + // Navigate: program > for > do (the body wrapper). 
+ let mut cursor = AstCursor::new(&ast); + cursor.goto_first_child(); // for + cursor.goto_first_child(); // do (the body) + while cursor.node().kind() != "do" || !cursor.node().is_named() { + assert!(cursor.goto_next_sibling(), "expected to find named `do`"); + } + let do_id = cursor.node().id(); + + let query = yeast::query!((do ("end") @kw)); + let mut captures = yeast::captures::Captures::new(); + let matched = query.do_match(&ast, do_id, &mut captures).unwrap(); + assert!(matched, "forward-scan should find the `end` keyword"); + let kw = ast.get_node(captures.get_var("kw").unwrap()).unwrap(); + assert_eq!(kw.kind(), "end"); + assert!(!kw.is_named()); +} + +#[test] +fn test_forward_scan_preserves_order() { + // Bare patterns are scanned left-to-right and consume positions in + // order. A query for ("end") then ("do") should fail because `do` + // appears before `end` in the source order; once forward-scan has + // consumed `end`, the iterator is exhausted. + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run("for x in list do\n y\nend").unwrap(); + + let mut cursor = AstCursor::new(&ast); + cursor.goto_first_child(); + cursor.goto_first_child(); + while cursor.node().kind() != "do" || !cursor.node().is_named() { + assert!(cursor.goto_next_sibling(), "expected to find named `do`"); + } + let do_id = cursor.node().id(); + + let query = yeast::query!((do ("end") @first ("do") @second)); + let mut captures = yeast::captures::Captures::new(); + let matched = query.do_match(&ast, do_id, &mut captures).unwrap(); + assert!(!matched, "scan must not go backwards"); +} + // ---- Tree builder tests ---- #[test]