From c93f9394f5e0aa2e51e321e0efe01d1477ab9030 Mon Sep 17 00:00:00 2001 From: "claude@clouddev1" Date: Tue, 19 May 2026 21:39:49 +0000 Subject: [PATCH] grammar: SQL expression grammar fragment (ADR-0031) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new `src/dsl/grammar/sql_expr.rs` authored as a parallel fragment to `expr.rs` (the DSL `WHERE` grammar, ADR-0026). The ADR's stratified ladder lands as named `static` `Node`s, one per precedence tier: or_expr → and_expr → not_expr → predicate → additive → multiplicative → unary → primary Recursion through `Node::Subgrammar` reuses ADR-0026's `MAX_SUBGRAMMAR_DEPTH = 64` cap unchanged; no new walker capability is required. `predicate_tail` follows ADR-0026's factoring (shared operand prefix, infix `NOT` as an explicit branch, no `Optional`-first branch) so `Choice` discriminates cleanly. `name_or_call` factors the identifier-prefix shared between column refs and function calls into a single `Ident` followed by an `Optional` `( call_args )` tail — the same hazard-avoidance shape `predicate_tail` uses. The fragment exports `pub static SQL_OR_EXPR` (test entry) and `pub static SQL_EXPRESSION` (drop-in `Subgrammar(&SQL_OR_EXPR)` that SQL `CommandNode` shapes embed in their `Seq`). No AST builder — every Phase-1 consumer (SELECT projection, WHERE) runs validated SQL as text per ADR-0030 §4/§6. 13 unit tests cover every operator and precedence pair, the full predicate set, function calls including `count(*)` and `count(distinct …)`, `CASE` (searched + simple), parenthesised regrouping, case-insensitive keywords, the depth cap, and a representative set of malformed inputs that do *not* walk. Module registered via one new line in `grammar/mod.rs`.
--- src/dsl/grammar/mod.rs | 1 + src/dsl/grammar/sql_expr.rs | 599 ++++++++++++++++++++++++++++++++++++ 2 files changed, 600 insertions(+) create mode 100644 src/dsl/grammar/sql_expr.rs diff --git a/src/dsl/grammar/mod.rs b/src/dsl/grammar/mod.rs index 05574f0..d65fc76 100644 --- a/src/dsl/grammar/mod.rs +++ b/src/dsl/grammar/mod.rs @@ -27,6 +27,7 @@ pub mod data; pub mod ddl; pub mod expr; pub mod shared; +pub mod sql_expr; use crate::dsl::command::Command; use crate::dsl::walker::context::WalkContext; diff --git a/src/dsl/grammar/sql_expr.rs b/src/dsl/grammar/sql_expr.rs new file mode 100644 index 0000000..d38d040 --- /dev/null +++ b/src/dsl/grammar/sql_expr.rs @@ -0,0 +1,599 @@ +//! The SQL expression grammar fragment (ADR-0031). +//! +//! This is the advanced-mode counterpart of `grammar::expr` (the +//! DSL `WHERE` grammar, ADR-0026). It fills every expression slot +//! in advanced-mode SQL — `WHERE`, `HAVING`, `SELECT` projections, +//! `CHECK`, `DEFAULT` — and is the *superset* of ADR-0026's +//! grammar: it adds arithmetic, string concatenation, function +//! calls, and `CASE` on top of the comparison / `LIKE` / `IN` / +//! `BETWEEN` / `IS NULL` predicate set. +//! +//! # One unified ladder +//! +//! ADR-0026's DSL grammar stratifies a *boolean* layer above a +//! *predicate* layer because the DSL forbids a boolean +//! sub-expression as a comparison operand. Standard SQL draws no +//! such line — a boolean is a value — so this grammar is a single +//! precedence ladder, loosest tier first: +//! +//! ```text +//! or_expr := and_expr ( OR and_expr )* +//! and_expr := not_expr ( AND not_expr )* +//! not_expr := NOT not_expr | predicate +//! predicate := additive predicate_tail? +//! predicate_tail := cmp_op additive +//! | [ NOT ] LIKE additive +//! | [ NOT ] BETWEEN additive AND additive +//! | [ NOT ] IN ( additive ( , additive )* ) +//! | IS [ NOT ] NULL +//! additive := multiplicative ( ( + | - | || ) multiplicative )* +//! 
multiplicative := unary ( ( * | / | % ) unary )* +//! unary := ( - | + ) unary | primary +//! primary := literal | ( or_expr ) | case_expr | name_or_call +//! name_or_call := identifier [ '(' call_args? ')' ] +//! call_args := '*' | [ DISTINCT ] or_expr ( , or_expr )* +//! case_expr := CASE [ or_expr ] ( WHEN or_expr THEN or_expr )+ +//! [ ELSE or_expr ] END +//! ``` +//! +//! Stratification removes left recursion (every recursion is +//! guarded by a token) and encodes precedence in the layering. +//! Each tier transition, and each genuine recursion, goes through +//! [`Node::Subgrammar`] — a `Seq` / `Choice` embeds its children +//! by value and cannot close a cycle. The walker counts active +//! `Subgrammar` frames and refuses past `MAX_SUBGRAMMAR_DEPTH` +//! (ADR-0026 §1) — reused here unchanged. +//! +//! # No AST (ADR-0031 §2) +//! +//! Unlike `grammar::expr`, this fragment carries **no AST +//! builder**. Its consumers run or store SQL as text (ADR-0030 +//! §4/§6), so there is no `Expr` to build. The fragment's output +//! is the three other walker products — accept/reject, the flat +//! `MatchedPath` of terminals (which drives highlighting, +//! completion, the expected-set, and hints), and, via those +//! terminals' `span`s, the source range a consumer slices when it +//! needs the expression as text. The grammar tier owns +//! validation, highlight, completion, and the no-left-recursion +//! guarantee; it simply has no tree to hand back. + +use crate::dsl::grammar::{IdentSource, Node, Word}; + +// ================================================================= +// Shared leaf nodes +// ================================================================= + +/// `,` — the separator inside `IN ( … )` and a function-call +/// argument list. +static COMMA: Node = Node::Punct(','); + +/// A column reference inside an expression.
`IdentSource::Columns` +/// drives Tab completion against the statement's table(s) from the +/// same `SchemaCache` the DSL uses (ADR-0031 §5). The same slot +/// also begins a `name_or_call` function call — the grammar does +/// not decide which (ADR-0031 §1, §2), so the slot optimises for +/// the common case (a column) and a function name simply is not a +/// completion candidate. `const` so it can be embedded by value. +const EXPR_IDENT: Node = Node::Ident { + source: IdentSource::Columns, + role: "sql_expr_ident", + validator: None, + highlight_override: None, + writes_table: false, + writes_column: false, + writes_user_listed_column: false, +}; + +// ================================================================= +// or_expr := and_expr ( OR and_expr )* — the fragment entry point +// ================================================================= + +static OR_TAIL_NODES: &[Node] = &[ + Node::Word(Word::keyword("or")), + Node::Subgrammar(&AND_EXPR), +]; +static OR_TAIL: Node = Node::Seq(OR_TAIL_NODES); +static SQL_OR_EXPR_NODES: &[Node] = &[ + Node::Subgrammar(&AND_EXPR), + Node::Repeated { + inner: &OR_TAIL, + separator: None, + min: 0, + }, +]; +/// `or_expr` — the loosest tier and the grammar's entry point. +pub static SQL_OR_EXPR: Node = Node::Seq(SQL_OR_EXPR_NODES); + +/// The SQL expression fragment, ready to drop into a SQL command +/// `Seq` as one node — the advanced-mode counterpart of +/// [`crate::dsl::grammar::expr::EXPRESSION`]. Walking it consumes a +/// whole `or_expr`. 
+pub static SQL_EXPRESSION: Node = Node::Subgrammar(&SQL_OR_EXPR); + +// ================================================================= +// and_expr := not_expr ( AND not_expr )* +// ================================================================= + +static AND_TAIL_NODES: &[Node] = &[ + Node::Word(Word::keyword("and")), + Node::Subgrammar(&NOT_EXPR), +]; +static AND_TAIL: Node = Node::Seq(AND_TAIL_NODES); +static AND_EXPR_NODES: &[Node] = &[ + Node::Subgrammar(&NOT_EXPR), + Node::Repeated { + inner: &AND_TAIL, + separator: None, + min: 0, + }, +]; +static AND_EXPR: Node = Node::Seq(AND_EXPR_NODES); + +// ================================================================= +// not_expr := NOT not_expr | predicate +// ================================================================= + +static NOT_FORM_NODES: &[Node] = &[ + Node::Word(Word::keyword("not")), + Node::Subgrammar(&NOT_EXPR), +]; +static NOT_EXPR_CHOICES: &[Node] = &[ + Node::Seq(NOT_FORM_NODES), + Node::Subgrammar(&PREDICATE), +]; +static NOT_EXPR: Node = Node::Choice(NOT_EXPR_CHOICES); + +// ================================================================= +// predicate := additive predicate_tail? +// ================================================================= +// +// `predicate_tail` is optional: a bare `additive` with no +// comparison operator is itself a valid expression — that is what +// a `SELECT` projection item (`a + b`) or a literal `WHERE 1` +// needs. ADR-0026's DSL grammar made the tail mandatory because it +// forbade a bare column as a boolean; SQL does not. 
+ +static PREDICATE_NODES: &[Node] = &[ + Node::Subgrammar(&ADDITIVE), + Node::Optional(&PREDICATE_TAIL), +]; +static PREDICATE: Node = Node::Seq(PREDICATE_NODES); + +// ---- cmp_op := <= | <> | >= | != | < | > | = -------------------- +// +// Two-character operators come before their one-character +// prefixes: `walk_literal` matches `<` against the `<` of `<=`, +// so `<=` / `<>` must be tried first (ADR-0026's note, inherited). + +static CMP_OP_CHOICES: &[Node] = &[ + Node::Literal("<="), + Node::Literal("<>"), + Node::Literal(">="), + Node::Literal("!="), + Node::Literal("<"), + Node::Literal(">"), + Node::Literal("="), +]; + +// ---- predicate_tail branches ------------------------------------ + +/// `cmp_op additive`. +static COMPARE_FORM_NODES: &[Node] = &[ + Node::Choice(CMP_OP_CHOICES), + Node::Subgrammar(&ADDITIVE), +]; + +/// `IS [NOT] NULL`. +static IS_NULL_NODES: &[Node] = &[ + Node::Word(Word::keyword("is")), + Node::Optional(&Node::Word(Word::keyword("not"))), + Node::Word(Word::keyword("null")), +]; + +/// `LIKE additive`. +static LIKE_FORM_NODES: &[Node] = &[ + Node::Word(Word::keyword("like")), + Node::Subgrammar(&ADDITIVE), +]; + +/// `BETWEEN additive AND additive`. The inner `and` is consumed +/// here, so a stray `and` at the `and_expr` tier is always a +/// connective. +static BETWEEN_FORM_NODES: &[Node] = &[ + Node::Word(Word::keyword("between")), + Node::Subgrammar(&ADDITIVE), + Node::Word(Word::keyword("and")), + Node::Subgrammar(&ADDITIVE), +]; + +/// `IN ( additive [, additive]* )`. +static IN_ITEM: Node = Node::Subgrammar(&ADDITIVE); +static IN_FORM_NODES: &[Node] = &[ + Node::Word(Word::keyword("in")), + Node::Punct('('), + Node::Repeated { + inner: &IN_ITEM, + separator: Some(&COMMA), + min: 1, + }, + Node::Punct(')'), +]; + +/// The negatable predicate bodies — each starts with a distinct +/// keyword, so this `Choice` discriminates cleanly. 
+static NEGATABLE_CHOICES: &[Node] = &[ + Node::Seq(LIKE_FORM_NODES), + Node::Seq(BETWEEN_FORM_NODES), + Node::Seq(IN_FORM_NODES), +]; + +/// `NOT (LIKE … | BETWEEN … | IN …)` — the infix `NOT` is factored +/// in front of the negatable choice (ADR-0026's factoring). +static NOT_NEGATABLE_NODES: &[Node] = &[ + Node::Word(Word::keyword("not")), + Node::Choice(NEGATABLE_CHOICES), +]; + +/// `predicate_tail`. Branch discrimination relies on each branch's +/// *first* child reporting a clean `NoMatch` on a non-match — no +/// branch starts with an `Optional` (ADR-0026's hazard). The infix +/// `NOT` is its own explicit `NOT negatable` branch. +static PREDICATE_TAIL_CHOICES: &[Node] = &[ + Node::Seq(COMPARE_FORM_NODES), + Node::Seq(IS_NULL_NODES), + Node::Seq(NOT_NEGATABLE_NODES), + Node::Seq(LIKE_FORM_NODES), + Node::Seq(BETWEEN_FORM_NODES), + Node::Seq(IN_FORM_NODES), +]; +static PREDICATE_TAIL: Node = Node::Choice(PREDICATE_TAIL_CHOICES); + +// ================================================================= +// additive := multiplicative ( ( + | - | || ) multiplicative )* +// ================================================================= + +static ADD_OP_CHOICES: &[Node] = &[ + Node::Punct('+'), + Node::Punct('-'), + Node::Literal("||"), +]; +static ADD_TAIL_NODES: &[Node] = &[ + Node::Choice(ADD_OP_CHOICES), + Node::Subgrammar(&MULTIPLICATIVE), +]; +static ADD_TAIL: Node = Node::Seq(ADD_TAIL_NODES); +static ADDITIVE_NODES: &[Node] = &[ + Node::Subgrammar(&MULTIPLICATIVE), + Node::Repeated { + inner: &ADD_TAIL, + separator: None, + min: 0, + }, +]; +static ADDITIVE: Node = Node::Seq(ADDITIVE_NODES); + +// ================================================================= +// multiplicative := unary ( ( * | / | % ) unary )* +// ================================================================= + +static MUL_OP_CHOICES: &[Node] = &[ + Node::Punct('*'), + Node::Punct('/'), + Node::Punct('%'), +]; +static MUL_TAIL_NODES: &[Node] = &[ + Node::Choice(MUL_OP_CHOICES), + 
Node::Subgrammar(&UNARY), +]; +static MUL_TAIL: Node = Node::Seq(MUL_TAIL_NODES); +static MULTIPLICATIVE_NODES: &[Node] = &[ + Node::Subgrammar(&UNARY), + Node::Repeated { + inner: &MUL_TAIL, + separator: None, + min: 0, + }, +]; +static MULTIPLICATIVE: Node = Node::Seq(MULTIPLICATIVE_NODES); + +// ================================================================= +// unary := ( - | + ) unary | primary +// ================================================================= + +static SIGN_CHOICES: &[Node] = &[Node::Punct('-'), Node::Punct('+')]; +static UNARY_SIGN_NODES: &[Node] = &[ + Node::Choice(SIGN_CHOICES), + Node::Subgrammar(&UNARY), +]; +static UNARY_CHOICES: &[Node] = &[ + Node::Seq(UNARY_SIGN_NODES), + Node::Subgrammar(&PRIMARY), +]; +static UNARY: Node = Node::Choice(UNARY_CHOICES); + +// ================================================================= +// primary := literal | ( or_expr ) | case_expr | name_or_call +// ================================================================= + +/// `( or_expr )` — a parenthesised group is a whole expression. +static PAREN_GROUP_NODES: &[Node] = &[ + Node::Punct('('), + Node::Subgrammar(&SQL_OR_EXPR), + Node::Punct(')'), +]; + +// ---- case_expr -------------------------------------------------- +// +// `CASE [operand] (WHEN cond THEN result)+ [ELSE result] END`. +// Searched-CASE (no operand) and simple-CASE (with operand) are two +// `Choice` branches *after* the shared `CASE` keyword is factored +// out: the searched branch opens with `WHEN`, the simple branch +// with an operand expression. The searched branch's leading `Repeated(min 1)` +// reports a clean `NoMatch` when the next token is not `WHEN`, so +// the `Choice` falls through to the simple branch cleanly.
+ +static WHEN_CLAUSE_NODES: &[Node] = &[ + Node::Word(Word::keyword("when")), + Node::Subgrammar(&SQL_OR_EXPR), + Node::Word(Word::keyword("then")), + Node::Subgrammar(&SQL_OR_EXPR), +]; +static WHEN_CLAUSE: Node = Node::Seq(WHEN_CLAUSE_NODES); + +static ELSE_CLAUSE_NODES: &[Node] = &[ + Node::Word(Word::keyword("else")), + Node::Subgrammar(&SQL_OR_EXPR), +]; +static ELSE_CLAUSE: Node = Node::Seq(ELSE_CLAUSE_NODES); + +/// Searched-CASE body: `(WHEN … THEN …)+ [ELSE …] END`. +static SEARCHED_CASE_NODES: &[Node] = &[ + Node::Repeated { + inner: &WHEN_CLAUSE, + separator: None, + min: 1, + }, + Node::Optional(&ELSE_CLAUSE), + Node::Word(Word::keyword("end")), +]; +/// Simple-CASE body: `operand (WHEN … THEN …)+ [ELSE …] END`. +static SIMPLE_CASE_NODES: &[Node] = &[ + Node::Subgrammar(&SQL_OR_EXPR), + Node::Repeated { + inner: &WHEN_CLAUSE, + separator: None, + min: 1, + }, + Node::Optional(&ELSE_CLAUSE), + Node::Word(Word::keyword("end")), +]; +static CASE_BODY_CHOICES: &[Node] = &[ + Node::Seq(SEARCHED_CASE_NODES), + Node::Seq(SIMPLE_CASE_NODES), +]; +static CASE_NODES: &[Node] = &[ + Node::Word(Word::keyword("case")), + Node::Choice(CASE_BODY_CHOICES), +]; + +// ---- name_or_call ----------------------------------------------- +// +// `identifier [ '(' call_args? ')' ]`. The identifier is matched +// once; the `( call_args )` group is an `Optional` tail — present +// is a function call, absent is a column reference. Factoring this +// (rather than two `Choice` branches sharing the identifier +// prefix) avoids the function-call branch committing on the +// identifier and discarding the column-ref branch. + +/// One function-call argument — a whole expression. +static CALL_ARG: Node = Node::Subgrammar(&SQL_OR_EXPR); + +/// `call_args := '*' | DISTINCT arg_list | arg_list`. `count(*)` +/// is the one place `*` is an argument; `count(distinct col)` the +/// one place `DISTINCT` leads the list. 
The grammar admits the +/// call shape structurally — it does not know which names are +/// aggregates (ADR-0031 §1). +static DISTINCT_ARGS_NODES: &[Node] = &[ + Node::Word(Word::keyword("distinct")), + Node::Repeated { + inner: &CALL_ARG, + separator: Some(&COMMA), + min: 1, + }, +]; +static CALL_ARGS_CHOICES: &[Node] = &[ + Node::Punct('*'), + Node::Seq(DISTINCT_ARGS_NODES), + Node::Repeated { + inner: &CALL_ARG, + separator: Some(&COMMA), + min: 1, + }, +]; +static CALL_ARGS: Node = Node::Choice(CALL_ARGS_CHOICES); + +static CALL_TAIL_NODES: &[Node] = &[ + Node::Punct('('), + Node::Optional(&CALL_ARGS), + Node::Punct(')'), +]; +static CALL_TAIL: Node = Node::Seq(CALL_TAIL_NODES); + +static NAME_OR_CALL_NODES: &[Node] = &[EXPR_IDENT, Node::Optional(&CALL_TAIL)]; + +/// `primary`. Keyword literals (`null` / `true` / `false`) and the +/// `CASE` keyword come before `name_or_call`, so they parse as +/// what they are rather than as column references. +static PRIMARY_CHOICES: &[Node] = &[ + Node::Word(Word::keyword("null")), + Node::Word(Word::keyword("true")), + Node::Word(Word::keyword("false")), + Node::NumberLit { validator: None }, + Node::StringLit, + Node::Seq(PAREN_GROUP_NODES), + Node::Seq(CASE_NODES), + Node::Seq(NAME_OR_CALL_NODES), +]; +static PRIMARY: Node = Node::Choice(PRIMARY_CHOICES); + +// ================================================================= +// Tests +// ================================================================= + +#[cfg(test)] +mod tests { + use super::SQL_OR_EXPR; + use crate::dsl::walker::context::WalkContext; + use crate::dsl::walker::driver::{NodeWalkResult, walk_node}; + use crate::dsl::walker::outcome::MatchedPath; + + /// Walk `input` against the SQL expression fragment. Returns + /// `true` only when the walk matches *and* consumes all of + /// `input` (trailing whitespace allowed). 
+ fn walks(input: &str) -> bool { + let mut ctx = WalkContext::new(); + let mut path = MatchedPath::new(); + let mut per_byte = Vec::new(); + match walk_node(input, 0, &SQL_OR_EXPR, &mut ctx, &mut path, &mut per_byte) { + NodeWalkResult::Matched { end, .. } => input[end..].trim().is_empty(), + _ => false, + } + } + + /// Assert `input` is a complete SQL expression. + fn good(input: &str) { + assert!(walks(input), "{input:?} should be a valid SQL expression"); + } + + /// Assert `input` is *not* a complete SQL expression. + fn bad(input: &str) { + assert!(!walks(input), "{input:?} should NOT walk as a complete expression"); + } + + #[test] + fn bare_operands_are_expressions() { + // A projection item / `WHERE 1` — no predicate operator. + for input in ["1", "col", "'text'", "true", "false", "null", "-7"] { + good(input); + } + } + + #[test] + fn every_comparison_operator() { + for op in ["=", "!=", "<>", "<", "<=", ">", ">="] { + good(&format!("a {op} 1")); + } + } + + #[test] + fn arithmetic_and_precedence() { + for input in [ + "a + b", + "a - b", + "a * b", + "a / b", + "a % b", + "a + b * c", + "(a + b) * c", + "a + b - c + d", + "-a + b", + "a - -b", + "- -a", + ] { + good(input); + } + } + + #[test] + fn string_concatenation() { + good("first || ' ' || last"); + } + + #[test] + fn function_calls() { + for input in [ + "upper(name)", + "length(x)", + "f()", + "round(price, 2)", + "count(*)", + "count(distinct customer_id)", + "coalesce(a, b, c)", + "upper(lower(name))", + ] { + good(input); + } + } + + #[test] + fn case_searched_and_simple() { + good("case when a > 0 then 'pos' else 'neg' end"); + good("case when a > 0 then 1 when a < 0 then -1 else 0 end"); + good("case grade when 1 then 'A' when 2 then 'B' end"); + good("case status when 'open' then 1 else 0 end"); + } + + #[test] + fn the_predicate_set() { + good("name like 'A%'"); + good("name not like 'A%'"); + good("age between 18 and 65"); + good("age not between 0 and 17"); + good("status in (1, 2, 
3)"); + good("status not in ('a', 'b')"); + good("email is null"); + good("email is not null"); + } + + #[test] + fn boolean_connectives_and_precedence() { + good("a = 1 and b = 2"); + good("a = 1 or b = 2"); + good("a = 1 or b = 2 and c = 3"); + good("not a = 1"); + good("not (a = 1 or b = 2)"); + good("(a = 1 or b = 2) and c = 3"); + } + + #[test] + fn predicates_over_arithmetic_operands() { + // The operands of a comparison are full scalar + // expressions — the whole point of the superset. + good("price * 1.1 > budget"); + good("upper(name) = 'ADA'"); + good("a + b between c and d"); + } + + #[test] + fn keywords_are_case_insensitive() { + good("A = 1 AND B IS NOT NULL"); + good("CASE WHEN x > 0 THEN 1 ELSE 0 END"); + good("name LIKE 'x%' OR age BETWEEN 1 AND 9"); + } + + #[test] + fn nested_parentheses_walk() { + good("((a = 1 and b = 2) or (c = 3))"); + good("(((1)))"); + } + + #[test] + fn malformed_expressions_do_not_walk() { + bad("a +"); // dangling operator + bad("a in b"); // IN requires a parenthesised list + bad("= 1"); // no left operand + bad("a = "); // no right operand + bad("case a end"); // CASE with no WHEN clause + bad("and b"); // leading connective + bad("upper("); // unclosed call + } + + #[test] + fn deeply_nested_parentheses_are_capped() { + // Far past MAX_SUBGRAMMAR_DEPTH — must fail with a + // friendly error rather than overflowing the stack. + let depth = 400; + let input = format!("{}1{}", "(".repeat(depth), ")".repeat(depth)); + assert!(!walks(&input), "pathological nesting must be rejected"); + } +}