From f0b2043a3901f2db96f249cec4ac8c0bccdcce0f Mon Sep 17 00:00:00 2001 From: "claude@clouddev1" Date: Mon, 18 May 2026 22:36:19 +0000 Subject: [PATCH] walker: add Subgrammar node + recursion-depth cap (ADR-0026 step 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New `Node::Subgrammar(&'static Node)` variant lets a named static grammar fragment recurse through a reference — `Seq` / `Choice` embed children by value and cannot close a cycle, but a `&'static Node` can point back at an enclosing fragment. This is the mechanism the stratified WHERE-expression grammar (ADR-0026 §2) recurses through. The walker counts active Subgrammar frames in `WalkContext::subgrammar_depth` and refuses past `MAX_SUBGRAMMAR_DEPTH` (64), surfacing a friendly `parse.custom.expression_too_deep` error instead of a stack overflow. Depth is saved/restored per frame so a speculatively-walked-then-rolled-back Choice branch leaves no residue. No grammar references the node yet; covered by walker unit tests with a small recursive `( x )` test grammar. --- src/dsl/grammar/mod.rs | 22 ++++++ src/dsl/walker/context.rs | 8 ++ src/dsl/walker/driver.rs | 129 +++++++++++++++++++++++++++++++- src/friendly/keys.rs | 1 + src/friendly/strings/en-US.yaml | 5 ++ 5 files changed, 164 insertions(+), 1 deletion(-) diff --git a/src/dsl/grammar/mod.rs b/src/dsl/grammar/mod.rs index 28fe936..241ba99 100644 --- a/src/dsl/grammar/mod.rs +++ b/src/dsl/grammar/mod.rs @@ -292,6 +292,28 @@ pub enum Node { separator: Option<&'static Self>, min: usize, }, + /// Walks the referenced `&'static Node` once, mandatory + /// (ADR-0026 §2). The reference indirection is what lets a + /// named `static` grammar fragment appear inside its own + /// subtree: a `Seq` / `Choice` embeds its children by value + /// and so cannot close a cycle, but a `&'static Node` + /// reference can point back at an enclosing fragment. This + /// is the mechanism the stratified WHERE-expression grammar + /// recurses through — the `( or_expr )` branch and the + /// `not_expr` self-reference. + /// + /// The walker counts active `Subgrammar` frames in + /// `WalkContext::subgrammar_depth` and refuses past + /// `walker::driver::MAX_SUBGRAMMAR_DEPTH`, so pathologically + /// nested input (`((((…))))`) fails with a friendly error + /// rather than overflowing the parser stack. + /// + /// The static counterpart of `DynamicSubgrammar`: that one + /// builds a fresh node from the `WalkContext` at walk time; + /// this one references a fixed fragment already in the + /// grammar tree. + #[allow(dead_code)] + Subgrammar(&'static Self), /// Resolves at walk time using the active `WalkContext`. /// Phase D+ uses this for `column_value_list`. The factory /// is pure in `ctx`, so the walker memoizes the resolution diff --git a/src/dsl/walker/context.rs b/src/dsl/walker/context.rs index 70e12a3..0634c9f 100644 --- a/src/dsl/walker/context.rs +++ b/src/dsl/walker/context.rs @@ -75,6 +75,13 @@ pub struct WalkContext<'a> { /// skipped from the value list because the dispatch path /// auto-fills them). pub user_listed_columns: Option>, + /// Count of active `Node::Subgrammar` frames on the walk + /// stack (ADR-0026 §2). The walker increments on entry to a + /// `Subgrammar`, restores the saved value on exit, and + /// refuses past `driver::MAX_SUBGRAMMAR_DEPTH` so a + /// pathologically nested expression fails with a friendly + /// error instead of overflowing the process stack. + pub subgrammar_depth: usize, } impl<'a> WalkContext<'a> { @@ -100,6 +107,7 @@ impl<'a> WalkContext<'a> { pending_value_column: None, pending_hint_mode: None, user_listed_columns: None, + subgrammar_depth: 0, } } } diff --git a/src/dsl/walker/driver.rs b/src/dsl/walker/driver.rs index 6b615a9..6af3dff 100644 --- a/src/dsl/walker/driver.rs +++ b/src/dsl/walker/driver.rs @@ -35,6 +35,18 @@ use crate::dsl::walker::outcome::{ ByteClass, Expectation, MatchedItem, MatchedKind, MatchedPath, }; +/// Maximum nesting of `Node::Subgrammar` frames (ADR-0026 §1). +/// +/// The stratified WHERE-expression grammar descends one +/// `Subgrammar` hop per precedence tier, plus a tier-stack per +/// parenthesised group, so this bounds real expression nesting +/// many parentheses deep — far past any hand-written filter. +/// Its purpose is purely a stack-overflow guard: input nested +/// past the cap (`((((…))))`) fails with a friendly +/// `expression_too_deep` error instead of recursing until the +/// process stack is exhausted. +pub const MAX_SUBGRAMMAR_DEPTH: usize = 64; + /// Memo cache for `Node::DynamicSubgrammar` resolution. /// /// A factory builds a `Node` from the active `WalkContext`; the @@ -199,6 +211,9 @@ fn walk_node_inner( kind: FailureKind::Mismatch { expected: vec![] }, } } + Node::Subgrammar(inner) => { + walk_subgrammar(source, pos, inner, ctx, path, per_byte) + } Node::DynamicSubgrammar(factory) => { // ADR-0024 §sub-grammars: resolve the inner Node at // walk time from the active `WalkContext`, then walk @@ -884,6 +899,48 @@ fn walk_optional( } } +/// Walk a `&'static Node` reference once (ADR-0026 §2). +/// +/// The reference indirection is what lets a named `static` +/// grammar fragment recurse: `Seq` / `Choice` embed children by +/// value and so cannot close a cycle, but a `Subgrammar` node +/// holding a `&'static Node` can point back into an enclosing +/// fragment. The stratified WHERE-expression grammar's +/// `( or_expr )` branch and `not_expr` self-reference both +/// recurse this way. +/// +/// `WalkContext::subgrammar_depth` counts active frames. Past +/// `MAX_SUBGRAMMAR_DEPTH` the walk fails with a friendly +/// `expression_too_deep` validation error rather than +/// overflowing the process stack. The depth is saved on entry +/// and restored on exit unconditionally, so a speculatively- +/// walked branch that a `Choice` later rolls back leaves the +/// counter clean. +fn walk_subgrammar( + source: &str, + pos: usize, + inner: &'static Node, + ctx: &mut WalkContext, + path: &mut MatchedPath, + per_byte: &mut Vec, +) -> NodeWalkResult { + let saved_depth = ctx.subgrammar_depth; + ctx.subgrammar_depth += 1; + if ctx.subgrammar_depth > MAX_SUBGRAMMAR_DEPTH { + ctx.subgrammar_depth = saved_depth; + return NodeWalkResult::Failed { + position: pos, + kind: FailureKind::Validation(ValidationError { + message_key: "parse.custom.expression_too_deep", + args: Vec::new(), + }), + }; + } + let result = walk_node(source, pos, inner, ctx, path, per_byte); + ctx.subgrammar_depth = saved_depth; + result +} + fn merge_expected(dst: &mut Vec, src: Vec) { for e in src { if !dst.contains(&e) { @@ -894,9 +951,79 @@ fn merge_expected(dst: &mut Vec, src: Vec) { #[cfg(test)] mod tests { - use super::{DYNAMIC_CACHE, resolve_dynamic}; + use super::{ + DYNAMIC_CACHE, FailureKind, MAX_SUBGRAMMAR_DEPTH, NodeWalkResult, + resolve_dynamic, walk_node, + }; use crate::dsl::grammar::{Node, Word}; use crate::dsl::walker::context::WalkContext; + use crate::dsl::walker::outcome::MatchedPath; + + // Recursive test grammar for the `Subgrammar` node + // (ADR-0026 §2): `x` | `( )`. `NESTED_GROUP` reaches + // back to `NESTED` through `Subgrammar(&NESTED)` — the cycle + // a by-value `Seq` slice could not express. + static NESTED_GROUP: &[Node] = &[ + Node::Punct('('), + Node::Subgrammar(&NESTED), + Node::Punct(')'), + ]; + static NESTED_CHOICES: &[Node] = &[ + Node::Seq(NESTED_GROUP), + Node::Word(Word::keyword("x")), + ]; + static NESTED: Node = Node::Choice(NESTED_CHOICES); + + fn walk_nested(input: &str) -> NodeWalkResult { + let mut ctx = WalkContext::new(); + let mut path = MatchedPath::new(); + let mut per_byte = Vec::new(); + let result = + walk_node(input, 0, &NESTED, &mut ctx, &mut path, &mut per_byte); + assert_eq!( + ctx.subgrammar_depth, 0, + "subgrammar_depth must be restored to 0 after the walk", + ); + result + } + + #[test] + fn subgrammar_walks_a_recursive_grammar() { + for input in ["x", "(x)", "((x))", "(((x)))"] { + assert!( + matches!(walk_nested(input), NodeWalkResult::Matched { .. }), + "{input:?} should match the recursive Subgrammar grammar", + ); + } + } + + #[test] + fn subgrammar_depth_cap_allows_exactly_the_limit() { + let input = format!( + "{}x{}", + "(".repeat(MAX_SUBGRAMMAR_DEPTH), + ")".repeat(MAX_SUBGRAMMAR_DEPTH), + ); + assert!( + matches!(walk_nested(&input), NodeWalkResult::Matched { .. }), + "exactly MAX_SUBGRAMMAR_DEPTH nested groups should still walk", + ); + } + + #[test] + fn subgrammar_depth_cap_rejects_pathological_nesting() { + let over = MAX_SUBGRAMMAR_DEPTH + 1; + let input = format!("{}x{}", "(".repeat(over), ")".repeat(over)); + match walk_nested(&input) { + NodeWalkResult::Failed { + kind: FailureKind::Validation(err), + .. + } => assert_eq!(err.message_key, "parse.custom.expression_too_deep"), + other => { + panic!("expected an expression_too_deep failure, got {other:?}") + } + } + } /// Trivial factory — ignores the context. The memo behaviour /// is keyed on the context, not the factory's output, so a diff --git a/src/friendly/keys.rs b/src/friendly/keys.rs index 81ea222..439cb47 100644 --- a/src/friendly/keys.rs +++ b/src/friendly/keys.rs @@ -183,6 +183,7 @@ pub const KEYS_AND_PLACEHOLDERS: &[(&str, &[&str])] = &[ ("parse.custom.bind_type_mismatch", &["found", "expected"]), ("parse.custom.change_column_flags_exclusive", &[]), ("parse.custom.create_table_needs_pk", &[]), + ("parse.custom.expression_too_deep", &[]), ("parse.custom.insert_form_a_missing_values", &["columns"]), ("parse.custom.on_action_specified_twice", &["target"]), ("parse.custom.replay_path_expected", &[]), diff --git a/src/friendly/strings/en-US.yaml b/src/friendly/strings/en-US.yaml index 23ba2de..0c235f1 100644 --- a/src/friendly/strings/en-US.yaml +++ b/src/friendly/strings/en-US.yaml @@ -372,6 +372,11 @@ parse: replay_path_expected: "expected a path after `replay`" create_table_needs_pk: |- tables need at least one column. Add `with pk` for a default `id INTEGER PRIMARY KEY`, or `with pk ()` to choose. Use a comma-separated list for compound primary keys. + # ADR-0026 §1: the recursion-depth guard on the + # WHERE-expression grammar. Input nested past the cap + # (`((((…))))`) stops here with a friendly error instead + # of overflowing the parser stack. + expression_too_deep: "expression nested too deeply" on_action_specified_twice: "`on {target}` specified twice" change_column_flags_exclusive: "`--force-conversion` and `--dont-convert` are mutually exclusive — pick one." unknown_type: "unknown type '{found}' (expected one of: {expected})"