grammar: SQL expression grammar fragment (ADR-0031)

A new `src/dsl/grammar/sql_expr.rs` authored as a parallel fragment to `expr.rs` (the DSL `WHERE` grammar, ADR-0026). The ADR's stratified ladder lands as named `static` `Node`s, one per precedence tier: or_expr → and_expr → not_expr → predicate → additive → multiplicative → unary → primary. Recursion through `Node::Subgrammar` reuses ADR-0026's `MAX_SUBGRAMMAR_DEPTH = 64` cap unchanged; no new walker capability is required. `predicate_tail` follows ADR-0026's factoring (shared operand prefix, infix `NOT` as an explicit branch, no `Optional`-first branch) so `Choice` discriminates cleanly. `name_or_call` factors the identifier prefix shared between column refs and function calls into a single `Ident` followed by an `Optional` `( call_args )` tail — the same hazard-avoidance shape `predicate_tail` uses. The fragment exports `pub static SQL_OR_EXPR` (test entry) and `pub static SQL_EXPRESSION` (drop-in `Subgrammar(&SQL_OR_EXPR)` that SQL `CommandNode` shapes embed in their `Seq`). No AST builder — every Phase-1 consumer (SELECT projection, WHERE) runs validated SQL as text per ADR-0030 §4/§6. 13 unit tests cover every operator and precedence pair, the full predicate set, `CASE` (searched + simple), function calls including `count(*)` and `count(distinct …)`, parenthesised regrouping, case-insensitive keywords, the depth cap, and a representative set of malformed inputs that do *not* walk. Module registered via one new line in `grammar/mod.rs`.
2026-05-19 21:39:49 +00:00
parent 81793a3a85
commit c93f9394f5
2 changed files with 600 additions and 0 deletions
@@ -27,6 +27,7 @@ pub mod data;
 pub mod ddl;
 pub mod expr;
 pub mod shared;
 pub mod sql_expr;
 use crate::dsl::command::Command;
 use crate::dsl::walker::context::WalkContext;
@@ -0,0 +1,599 @@
 //! The SQL expression grammar fragment (ADR-0031).
 //!
 //! This is the advanced-mode counterpart of `grammar::expr` (the
 //! DSL `WHERE` grammar, ADR-0026). It fills every expression slot
 //! in advanced-mode SQL — `WHERE`, `HAVING`, `SELECT` projections,
 //! `CHECK`, `DEFAULT` — and is the *superset* of ADR-0026's
 //! grammar: it adds arithmetic, string concatenation, function
 //! calls, and `CASE` on top of the comparison / `LIKE` / `IN` /
 //! `BETWEEN` / `IS NULL` predicate set.
 //!
 //! # One unified ladder
 //!
 //! ADR-0026's DSL grammar stratifies a *boolean* layer above a
 //! *predicate* layer because the DSL forbids a boolean
 //! sub-expression as a comparison operand. Standard SQL draws no
 //! such line — a boolean is a value — so this grammar is a single
 //! precedence ladder, loosest tier first:
 //!
 //! ```text
 //! or_expr         := and_expr      ( OR  and_expr )*
 //! and_expr        := not_expr      ( AND not_expr )*
 //! not_expr        := NOT not_expr  |  predicate
 //! predicate       := additive predicate_tail?
 //! predicate_tail  := cmp_op additive
 //!                  | [ NOT ] LIKE additive
 //!                  | [ NOT ] BETWEEN additive AND additive
 //!                  | [ NOT ] IN ( additive ( , additive )* )
 //!                  | IS [ NOT ] NULL
 //! additive        := multiplicative ( ( + | - | || ) multiplicative )*
 //! multiplicative  := unary ( ( * | / | % ) unary )*
 //! unary           := ( - | + ) unary  |  primary
 //! primary         := literal | ( or_expr ) | case_expr | name_or_call
 //! name_or_call    := identifier  [ '(' call_args? ')' ]
 //! call_args       := '*'  |  DISTINCT arg_list  |  arg_list
 //! arg_list        := or_expr ( , or_expr )*
 //! case_expr       := CASE [ or_expr ] ( WHEN or_expr THEN or_expr )+
 //!                         [ ELSE or_expr ] END
 //! ```
 //!
 //! Stratification removes left recursion (every recursion is
 //! guarded by a token) and encodes precedence in the layering.
 //! Each tier transition, and each genuine recursion, goes through
 //! [`Node::Subgrammar`] — a `Seq` / `Choice` embeds its children
 //! by value and cannot close a cycle. The walker counts active
 //! `Subgrammar` frames and refuses past `MAX_SUBGRAMMAR_DEPTH`
 //! (ADR-0026 §1) — reused here unchanged.
 //!
 //! # No AST (ADR-0031 §2)
 //!
 //! Unlike `grammar::expr`, this fragment carries **no AST
 //! builder**. Its consumers run or store SQL as text (ADR-0030
 //! §4/§6), so there is no `Expr` to build. The fragment's output
 //! is the three other walker products — accept/reject, the flat
 //! `MatchedPath` of terminals (which drives highlighting,
 //! completion, the expected-set, and hints), and, via those
 //! terminals' `span`s, the source range a consumer slices when it
 //! needs the expression as text. The grammar tier owns
 //! validation, highlight, completion, and the no-left-recursion
 //! guarantee; it simply has no tree to hand back.
 use crate::dsl::grammar::{IdentSource, Node, Word};
 // =================================================================
 // Shared leaf nodes
 // =================================================================
 /// `,` — the list separator. It is the `separator` of every
 /// comma-delimited `Node::Repeated` in this file: the `IN ( … )`
 /// item list (`IN_FORM_NODES`) and both function-call argument
 /// lists (`DISTINCT_ARGS_NODES`, `CALL_ARGS_CHOICES`).
 static COMMA: Node = Node::Punct(',');
 /// The identifier slot of `name_or_call`: a column reference, or
 /// the name that opens a function call.
 ///
 /// `IdentSource::Columns` drives Tab completion against the
 /// statement's table(s) from the same `SchemaCache` the DSL uses
 /// (ADR-0031 §5). The grammar does not decide whether the
 /// identifier is a column or a function name (ADR-0031 §1, §2), so
 /// the slot optimises for the common case (a column) and a function
 /// name simply is not a completion candidate.
 ///
 /// Declared `const`, not `static`, so `NAME_OR_CALL_NODES` can
 /// embed it by value as a slice element.
 const EXPR_IDENT: Node = Node::Ident {
    source: IdentSource::Columns,
    role: "sql_expr_ident",
    // No validator and no highlight override are attached, and all
    // three `writes_*` flags are off.
    // NOTE(review): the meaning of these fields is defined on
    // `Node::Ident`, outside this file — confirm there before
    // changing any of them.
    validator: None,
    highlight_override: None,
    writes_table: false,
    writes_column: false,
    writes_user_listed_column: false,
 };
 // =================================================================
 // or_expr := and_expr ( OR and_expr )*  — the fragment entry point
 // =================================================================
 /// Children of one `OR and_expr` repetition: the `or` keyword, then
 /// a step down into the `and_expr` tier.
 static OR_TAIL_NODES: &[Node] = &[
    Node::Word(Word::keyword("or")),
    Node::Subgrammar(&AND_EXPR),
 ];
 /// `OR and_expr` as a single node — the `inner` of the repetition
 /// in `SQL_OR_EXPR_NODES`.
 static OR_TAIL: Node = Node::Seq(OR_TAIL_NODES);
 /// `and_expr ( OR and_expr )*`: one mandatory `and_expr`, followed
 /// by a repetition of `OR_TAIL`.
 static SQL_OR_EXPR_NODES: &[Node] = &[
    Node::Subgrammar(&AND_EXPR),
    Node::Repeated {
        inner: &OR_TAIL,
        // No separator node: the `or` keyword that opens each tail is
        // the only thing between repetitions.
        separator: None,
        // The tail is `( … )*` in the grammar — a lone `and_expr` is
        // already a complete `or_expr`.
        min: 0,
    },
 ];
 /// `or_expr` — the loosest tier and the grammar's entry point.
 ///
 /// Besides being the test entry, this is the node every nested
 /// whole-expression slot re-enters through `Node::Subgrammar`: the
 /// parenthesised group, the `CASE` operand / `WHEN` / `THEN` /
 /// `ELSE` slots, and each function-call argument.
 pub static SQL_OR_EXPR: Node = Node::Seq(SQL_OR_EXPR_NODES);
 /// The SQL expression fragment, ready to drop into a SQL command
 /// `Seq` as one node — the advanced-mode counterpart of
 /// [`crate::dsl::grammar::expr::EXPRESSION`]. Walking it consumes a
 /// whole `or_expr`. It is `SQL_OR_EXPR` wrapped in a
 /// `Node::Subgrammar`, so embedding it goes through the same
 /// `Subgrammar` mechanism as every other tier transition here.
 pub static SQL_EXPRESSION: Node = Node::Subgrammar(&SQL_OR_EXPR);
 // =================================================================
 // and_expr := not_expr ( AND not_expr )*
 // =================================================================
 /// Children of one `AND not_expr` repetition: the `and` keyword,
 /// then a step down into the `not_expr` tier.
 static AND_TAIL_NODES: &[Node] = &[
    Node::Word(Word::keyword("and")),
    Node::Subgrammar(&NOT_EXPR),
 ];
 /// `AND not_expr` as a single node — the `inner` of the repetition
 /// in `AND_EXPR_NODES`.
 static AND_TAIL: Node = Node::Seq(AND_TAIL_NODES);
 /// `not_expr ( AND not_expr )*` — same shape as the `or_expr` tier,
 /// one level tighter.
 static AND_EXPR_NODES: &[Node] = &[
    Node::Subgrammar(&NOT_EXPR),
    Node::Repeated {
        inner: &AND_TAIL,
        // The `and` keyword opening each tail is the only delimiter.
        separator: None,
        // `( … )*`: a lone `not_expr` is a complete `and_expr`.
        min: 0,
    },
 ];
 /// `and_expr` — binds tighter than `OR`, looser than prefix `NOT`.
 /// Reached from the `or_expr` tier (`SQL_OR_EXPR_NODES`,
 /// `OR_TAIL_NODES`) through `Node::Subgrammar`.
 static AND_EXPR: Node = Node::Seq(AND_EXPR_NODES);
 // =================================================================
 // not_expr := NOT not_expr | predicate
 // =================================================================
 /// `NOT not_expr` — the prefix form. The recursion back into
 /// `NOT_EXPR` is guarded by the `not` keyword, so each level
 /// consumes a token before recursing.
 static NOT_FORM_NODES: &[Node] = &[
    Node::Word(Word::keyword("not")),
    Node::Subgrammar(&NOT_EXPR),
 ];
 /// The two `not_expr` alternatives, in grammar order: the prefix
 /// `NOT` form first, then the fall-through to the `predicate` tier.
 static NOT_EXPR_CHOICES: &[Node] = &[
    Node::Seq(NOT_FORM_NODES),
    Node::Subgrammar(&PREDICATE),
 ];
 /// `not_expr := NOT not_expr | predicate`. This is the *prefix*
 /// `NOT`; the infix `NOT` of `NOT LIKE` / `NOT BETWEEN` / `NOT IN`
 /// is a separate branch of `predicate_tail`.
 static NOT_EXPR: Node = Node::Choice(NOT_EXPR_CHOICES);
 // =================================================================
 // predicate := additive predicate_tail?
 // =================================================================
 //
 // `predicate_tail` is optional: a bare `additive` with no
 // comparison operator is itself a valid expression — that is what
 // a `SELECT` projection item (`a + b`) or a literal `WHERE 1`
 // needs. ADR-0026's DSL grammar made the tail mandatory because it
 // forbade a bare column as a boolean; SQL does not.
 /// `additive predicate_tail?` — the left operand is consumed here,
 /// once, so every `predicate_tail` branch starts at its operator or
 /// keyword rather than re-parsing the operand.
 static PREDICATE_NODES: &[Node] = &[
    Node::Subgrammar(&ADDITIVE),
    Node::Optional(&PREDICATE_TAIL),
 ];
 /// `predicate` — an `additive`, optionally followed by exactly one
 /// comparison / `LIKE` / `BETWEEN` / `IN` / `IS NULL` tail. The tail
 /// is a single `Optional`, not a repetition.
 static PREDICATE: Node = Node::Seq(PREDICATE_NODES);
 // ---- cmp_op := <= | <> | >= | != | < | > | = --------------------
 //
 // Two-character operators come before their one-character
 // prefixes: `walk_literal` matches `<` against the `<` of `<=`,
 // so `<=` / `<>` must be tried first (ADR-0026's note, inherited).
 /// The `cmp_op` alternatives. Order is significant: every
 /// two-character operator is listed ahead of the one-character
 /// operator that is its prefix. Do not sort or regroup this list.
 static CMP_OP_CHOICES: &[Node] = &[
    // Two-character operators.
    Node::Literal("<="),
    Node::Literal("<>"),
    Node::Literal(">="),
    Node::Literal("!="),
    // One-character operators.
    Node::Literal("<"),
    Node::Literal(">"),
    Node::Literal("="),
 ];
 // ---- predicate_tail branches ------------------------------------
 /// `cmp_op additive` — one operator from `CMP_OP_CHOICES`, then the
 /// right-hand operand. The left-hand operand is the `additive`
 /// that `PREDICATE_NODES` has already consumed.
 static COMPARE_FORM_NODES: &[Node] = &[
    Node::Choice(CMP_OP_CHOICES),
    Node::Subgrammar(&ADDITIVE),
 ];
 /// `IS [NOT] NULL`. The optional `not` sits in *second* position,
 /// after the mandatory `is` keyword, so this branch still opens
 /// with a plain keyword and does not fall under the
 /// `Optional`-first hazard described on `PREDICATE_TAIL_CHOICES`.
 static IS_NULL_NODES: &[Node] = &[
    Node::Word(Word::keyword("is")),
    Node::Optional(&Node::Word(Word::keyword("not"))),
    Node::Word(Word::keyword("null")),
 ];
 /// `LIKE additive` — the pattern operand is any `additive`, not
 /// only a string literal. This slice is wrapped in `Node::Seq`
 /// twice: by `PREDICATE_TAIL_CHOICES` for the plain form and by
 /// `NEGATABLE_CHOICES` for `NOT LIKE`.
 static LIKE_FORM_NODES: &[Node] = &[
    Node::Word(Word::keyword("like")),
    Node::Subgrammar(&ADDITIVE),
 ];
 /// `BETWEEN additive AND additive`. The inner `and` is consumed
 /// here, so a stray `and` at the `and_expr` tier is always a
 /// connective. Both bounds are `additive`, a tier that sits below
 /// `and_expr`, so an unparenthesised bound never contains an `and`
 /// of its own. Shared by the plain form (`PREDICATE_TAIL_CHOICES`)
 /// and `NOT BETWEEN` (`NEGATABLE_CHOICES`).
 static BETWEEN_FORM_NODES: &[Node] = &[
    Node::Word(Word::keyword("between")),
    // Lower bound.
    Node::Subgrammar(&ADDITIVE),
    Node::Word(Word::keyword("and")),
    // Upper bound.
    Node::Subgrammar(&ADDITIVE),
 ];
 /// One item of an `IN ( … )` list: an `additive`, reached through
 /// `Node::Subgrammar`. A named `static` so the `Repeated` in
 /// `IN_FORM_NODES` can borrow it as its `inner`.
 static IN_ITEM: Node = Node::Subgrammar(&ADDITIVE);
 /// `IN ( additive [, additive]* )`. Only the parenthesised value
 /// list is modelled — there is no sub-select form. Shared by the
 /// plain form (`PREDICATE_TAIL_CHOICES`) and `NOT IN`
 /// (`NEGATABLE_CHOICES`).
 static IN_FORM_NODES: &[Node] = &[
    Node::Word(Word::keyword("in")),
    Node::Punct('('),
    Node::Repeated {
        inner: &IN_ITEM,
        separator: Some(&COMMA),
        // At least one item is required between the parentheses.
        min: 1,
    },
    Node::Punct(')'),
 ];
 /// The negatable predicate bodies — `LIKE`, `BETWEEN`, `IN`. Each
 /// starts with a distinct keyword, so this `Choice` discriminates
 /// cleanly. The same three slices also appear, un-negated, as
 /// direct branches of `PREDICATE_TAIL_CHOICES`.
 static NEGATABLE_CHOICES: &[Node] = &[
    Node::Seq(LIKE_FORM_NODES),
    Node::Seq(BETWEEN_FORM_NODES),
    Node::Seq(IN_FORM_NODES),
 ];
 /// `NOT (LIKE … | BETWEEN … | IN …)` — the infix `NOT` is factored
 /// in front of the negatable choice (ADR-0026's factoring). This
 /// is distinct from the prefix `NOT` of `not_expr`: it is only
 /// reachable after a left operand, as a `predicate_tail` branch.
 static NOT_NEGATABLE_NODES: &[Node] = &[
    Node::Word(Word::keyword("not")),
    Node::Choice(NEGATABLE_CHOICES),
 ];
 /// `predicate_tail`. Branch discrimination relies on each branch's
 /// *first* child reporting a clean `NoMatch` on a non-match — no
 /// branch starts with an `Optional` (ADR-0026's hazard). The infix
 /// `NOT` is its own explicit `NOT negatable` branch.
 ///
 /// Every branch opens with a different token: a comparison
 /// operator, `is`, `not`, `like`, `between`, `in`.
 static PREDICATE_TAIL_CHOICES: &[Node] = &[
    // cmp_op additive
    Node::Seq(COMPARE_FORM_NODES),
    // IS [NOT] NULL
    Node::Seq(IS_NULL_NODES),
    // NOT ( LIKE | BETWEEN | IN ) …
    Node::Seq(NOT_NEGATABLE_NODES),
    // The same three bodies, un-negated.
    Node::Seq(LIKE_FORM_NODES),
    Node::Seq(BETWEEN_FORM_NODES),
    Node::Seq(IN_FORM_NODES),
 ];
 /// `predicate_tail` as a single node — the target of the
 /// `Optional` in `PREDICATE_NODES`.
 static PREDICATE_TAIL: Node = Node::Choice(PREDICATE_TAIL_CHOICES);
 // =================================================================
 // additive := multiplicative ( ( + | - | || ) multiplicative )*
 // =================================================================
 /// The additive-tier operators: `+`, `-`, and string concatenation
 /// `||`, all at one precedence level. The single-character
 /// operators are `Node::Punct`; the two-character `||` is a
 /// `Node::Literal`.
 static ADD_OP_CHOICES: &[Node] = &[
    Node::Punct('+'),
    Node::Punct('-'),
    Node::Literal("||"),
 ];
 /// Children of one `( + | - | || ) multiplicative` repetition.
 static ADD_TAIL_NODES: &[Node] = &[
    Node::Choice(ADD_OP_CHOICES),
    Node::Subgrammar(&MULTIPLICATIVE),
 ];
 /// One operator-plus-operand tail as a single node — the `inner`
 /// of the repetition in `ADDITIVE_NODES`.
 static ADD_TAIL: Node = Node::Seq(ADD_TAIL_NODES);
 /// `multiplicative ( ( + | - | || ) multiplicative )*`.
 static ADDITIVE_NODES: &[Node] = &[
    Node::Subgrammar(&MULTIPLICATIVE),
    Node::Repeated {
        inner: &ADD_TAIL,
        // The operator that opens each tail is the only delimiter.
        separator: None,
        // `( … )*`: a lone `multiplicative` is a complete `additive`.
        min: 0,
    },
 ];
 /// `additive` — the operand tier of every predicate: both sides of
 /// a comparison, the `LIKE` pattern, the `BETWEEN` bounds and the
 /// `IN` list items all step into this node.
 static ADDITIVE: Node = Node::Seq(ADDITIVE_NODES);
 // =================================================================
 // multiplicative := unary ( ( * | / | % ) unary )*
 // =================================================================
 /// The multiplicative-tier operators: `*`, `/`, `%`.
 static MUL_OP_CHOICES: &[Node] = &[
    Node::Punct('*'),
    Node::Punct('/'),
    Node::Punct('%'),
 ];
 /// Children of one `( * | / | % ) unary` repetition.
 static MUL_TAIL_NODES: &[Node] = &[
    Node::Choice(MUL_OP_CHOICES),
    Node::Subgrammar(&UNARY),
 ];
 /// One operator-plus-operand tail as a single node — the `inner`
 /// of the repetition in `MULTIPLICATIVE_NODES`.
 static MUL_TAIL: Node = Node::Seq(MUL_TAIL_NODES);
 /// `unary ( ( * | / | % ) unary )*`.
 static MULTIPLICATIVE_NODES: &[Node] = &[
    Node::Subgrammar(&UNARY),
    Node::Repeated {
        inner: &MUL_TAIL,
        // The operator that opens each tail is the only delimiter.
        separator: None,
        // `( … )*`: a lone `unary` is a complete `multiplicative`.
        min: 0,
    },
 ];
 /// `multiplicative` — binds tighter than the additive operators,
 /// looser than a unary sign.
 static MULTIPLICATIVE: Node = Node::Seq(MULTIPLICATIVE_NODES);
 // =================================================================
 // unary := ( - | + ) unary | primary
 // =================================================================
 /// The unary sign operators: `-` and `+`.
 static SIGN_CHOICES: &[Node] = &[Node::Punct('-'), Node::Punct('+')];
 /// `( - | + ) unary` — a sign followed by another `unary`. The
 /// recursion is guarded by the sign token, and it is what lets
 /// signs stack (`- -a`).
 static UNARY_SIGN_NODES: &[Node] = &[
    Node::Choice(SIGN_CHOICES),
    Node::Subgrammar(&UNARY),
 ];
 /// The two `unary` alternatives, in grammar order: the signed form
 /// first, then the fall-through to `primary`.
 static UNARY_CHOICES: &[Node] = &[
    Node::Seq(UNARY_SIGN_NODES),
    Node::Subgrammar(&PRIMARY),
 ];
 /// `unary := ( - | + ) unary | primary` — the tightest operator
 /// tier, directly above `primary`.
 static UNARY: Node = Node::Choice(UNARY_CHOICES);
 // =================================================================
 // primary := literal | ( or_expr ) | case_expr | name_or_call
 // =================================================================
 /// `( or_expr )` — a parenthesised group is a whole expression.
 /// Stepping back into `SQL_OR_EXPR` from the tightest tier is what
 /// closes the ladder into a cycle; the `(` token guards it, so the
 /// recursion always consumes input first.
 static PAREN_GROUP_NODES: &[Node] = &[
    Node::Punct('('),
    Node::Subgrammar(&SQL_OR_EXPR),
    Node::Punct(')'),
 ];
 // ---- case_expr --------------------------------------------------
 //
 // `CASE [operand] (WHEN cond THEN result)+ [ELSE result] END`.
 // Searched-CASE (no operand) and simple-CASE (with operand) are two
 // `Choice` branches *after* the shared `CASE` keyword is factored
 // out: the searched branch opens with `WHEN`, the simple branch
 // with an operand expression. Branch 1's leading `Repeated(min 1)`
 // reports a clean `NoMatch` when the next token is not `WHEN`, so
 // the `Choice` falls through to the simple branch cleanly.
 /// `WHEN or_expr THEN or_expr` — both the condition (or, in a
 /// simple `CASE`, the comparand) and the result are whole
 /// expressions.
 static WHEN_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("when")),
    Node::Subgrammar(&SQL_OR_EXPR),
    Node::Word(Word::keyword("then")),
    Node::Subgrammar(&SQL_OR_EXPR),
 ];
 /// One `WHEN … THEN …` clause as a single node — the `inner` of
 /// the `Repeated` in both `CASE` bodies.
 static WHEN_CLAUSE: Node = Node::Seq(WHEN_CLAUSE_NODES);
 /// `ELSE or_expr`.
 static ELSE_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("else")),
    Node::Subgrammar(&SQL_OR_EXPR),
 ];
 /// The `ELSE` clause as a single node — wrapped in `Optional` by
 /// both `CASE` bodies.
 static ELSE_CLAUSE: Node = Node::Seq(ELSE_CLAUSE_NODES);
 /// Searched-CASE body: `(WHEN … THEN …)+ [ELSE …] END`. Everything
 /// after the `CASE` keyword, which `CASE_NODES` consumes.
 static SEARCHED_CASE_NODES: &[Node] = &[
    Node::Repeated {
        inner: &WHEN_CLAUSE,
        separator: None,
        // At least one `WHEN` clause is required.
        min: 1,
    },
    Node::Optional(&ELSE_CLAUSE),
    Node::Word(Word::keyword("end")),
 ];
 /// Simple-CASE body: `operand (WHEN … THEN …)+ [ELSE …] END`.
 /// Identical to the searched body apart from the leading operand
 /// expression.
 static SIMPLE_CASE_NODES: &[Node] = &[
    // The operand every `WHEN` value is compared against.
    Node::Subgrammar(&SQL_OR_EXPR),
    Node::Repeated {
        inner: &WHEN_CLAUSE,
        separator: None,
        // At least one `WHEN` clause is required.
        min: 1,
    },
    Node::Optional(&ELSE_CLAUSE),
    Node::Word(Word::keyword("end")),
 ];
 /// The two `CASE` bodies. The searched body comes first: it opens
 /// with the `WHEN` keyword, while the simple body opens with an
 /// arbitrary operand expression.
 static CASE_BODY_CHOICES: &[Node] = &[
    Node::Seq(SEARCHED_CASE_NODES),
    Node::Seq(SIMPLE_CASE_NODES),
 ];
 /// `case_expr` — the shared `CASE` keyword, then one of the two
 /// bodies. `PRIMARY_CHOICES` wraps this slice in a `Node::Seq`.
 static CASE_NODES: &[Node] = &[
    Node::Word(Word::keyword("case")),
    Node::Choice(CASE_BODY_CHOICES),
 ];
 // ---- name_or_call -----------------------------------------------
 //
 // `identifier [ '(' call_args? ')' ]`. The identifier is matched
 // once; the `( call_args )` group is an `Optional` tail — present
 // is a function call, absent is a column reference. Factoring this
 // (rather than two `Choice` branches sharing the identifier
 // prefix) avoids the function-call branch committing on the
 // identifier and discarding the column-ref branch.
 /// One function-call argument — a whole expression. A named
 /// `static` so both argument-list `Repeated` nodes can borrow it as
 /// their `inner`.
 static CALL_ARG: Node = Node::Subgrammar(&SQL_OR_EXPR);
 /// `DISTINCT arg_list` — the `distinct` keyword followed by a
 /// comma-separated list of at least one argument.
 static DISTINCT_ARGS_NODES: &[Node] = &[
    Node::Word(Word::keyword("distinct")),
    Node::Repeated {
        inner: &CALL_ARG,
        separator: Some(&COMMA),
        min: 1,
    },
 ];
 /// `call_args := '*' | DISTINCT arg_list | arg_list`. `count(*)`
 /// is the one place `*` is an argument; `count(distinct col)` the
 /// one place `DISTINCT` leads the list. The grammar admits the
 /// call shape structurally — it does not know which names are
 /// aggregates (ADR-0031 §1).
 static CALL_ARGS_CHOICES: &[Node] = &[
    // `*` as the sole argument.
    Node::Punct('*'),
    // `DISTINCT arg_list`.
    Node::Seq(DISTINCT_ARGS_NODES),
    // Plain `arg_list`: one or more comma-separated expressions.
    Node::Repeated {
        inner: &CALL_ARG,
        separator: Some(&COMMA),
        min: 1,
    },
 ];
 /// `call_args` as a single node — the target of the `Optional` in
 /// `CALL_TAIL_NODES`.
 static CALL_ARGS: Node = Node::Choice(CALL_ARGS_CHOICES);
 /// `'(' call_args? ')'`. The argument list is `Optional`, which is
 /// what admits an empty call such as `f()`; each list form itself
 /// requires at least one argument.
 static CALL_TAIL_NODES: &[Node] = &[
    Node::Punct('('),
    Node::Optional(&CALL_ARGS),
    Node::Punct(')'),
 ];
 /// The parenthesised call tail as a single node.
 static CALL_TAIL: Node = Node::Seq(CALL_TAIL_NODES);
 /// `name_or_call := identifier [ '(' call_args? ')' ]` — the
 /// identifier (`EXPR_IDENT`, embedded by value) followed by the
 /// optional call tail. Tail present: a function call; tail absent:
 /// a column reference.
 static NAME_OR_CALL_NODES: &[Node] = &[EXPR_IDENT, Node::Optional(&CALL_TAIL)];
 /// `primary`. Keyword literals (`null` / `true` / `false`) and the
 /// `CASE` keyword come before `name_or_call`, so they parse as
 /// what they are rather than as column references. `name_or_call`
 /// is deliberately the last alternative; keep it there when adding
 /// new branches.
 static PRIMARY_CHOICES: &[Node] = &[
    // Keyword literals.
    Node::Word(Word::keyword("null")),
    Node::Word(Word::keyword("true")),
    Node::Word(Word::keyword("false")),
    // Number and string literals; the number carries no validator.
    Node::NumberLit { validator: None },
    Node::StringLit,
    // `( or_expr )`
    Node::Seq(PAREN_GROUP_NODES),
    // `CASE … END`
    Node::Seq(CASE_NODES),
    // Column reference or function call — the identifier catch-all.
    Node::Seq(NAME_OR_CALL_NODES),
 ];
 /// `primary` — the tightest tier of the ladder; the fall-through
 /// alternative of `UNARY_CHOICES` steps into it.
 static PRIMARY: Node = Node::Choice(PRIMARY_CHOICES);
 // =================================================================
 // Tests
 // =================================================================
 #[cfg(test)]
 mod tests {
    use super::SQL_OR_EXPR;
    use crate::dsl::walker::context::WalkContext;
    use crate::dsl::walker::driver::{NodeWalkResult, walk_node};
    use crate::dsl::walker::outcome::MatchedPath;
    /// Run the walker over `input` starting at byte 0 with
    /// `SQL_OR_EXPR` as the root node. The result is `true` only for
    /// a `Matched` outcome whose unconsumed remainder is empty or
    /// whitespace; any other outcome is `false`.
    fn walks(input: &str) -> bool {
        let mut ctx = WalkContext::new();
        let mut path = MatchedPath::new();
        let mut per_byte = Vec::new();
        let outcome = walk_node(input, 0, &SQL_OR_EXPR, &mut ctx, &mut path, &mut per_byte);
        if let NodeWalkResult::Matched { end, .. } = outcome {
            input[end..].trim().is_empty()
        } else {
            false
        }
    }
    /// Expect `input` to walk as one complete SQL expression.
    fn good(input: &str) {
        assert!(walks(input), "{input:?} should be a valid SQL expression");
    }
    /// Expect `input` to be rejected, or to leave unconsumed text.
    fn bad(input: &str) {
        assert!(!walks(input), "{input:?} should NOT walk as a complete expression");
    }
    #[test]
    fn bare_operands_are_expressions() {
        // No predicate operator at all: what a projection item or
        // a `WHERE 1` looks like.
        let operands = ["1", "col", "'text'", "true", "false", "null", "-7"];
        operands.iter().copied().for_each(good);
    }
    #[test]
    fn every_comparison_operator() {
        let operators = ["=", "!=", "<>", "<", "<=", ">", ">="];
        for op in operators.iter() {
            let expr = format!("a {op} 1");
            good(&expr);
        }
    }
    #[test]
    fn arithmetic_and_precedence() {
        let cases = [
            "a + b",
            "a - b",
            "a * b",
            "a / b",
            "a % b",
            "a + b * c",
            "(a + b) * c",
            "a + b - c + d",
            "-a + b",
            "a - -b",
            "- -a",
        ];
        cases.iter().copied().for_each(good);
    }
    #[test]
    fn string_concatenation() {
        let expr = "first || ' ' || last";
        good(expr);
    }
    #[test]
    fn function_calls() {
        let cases = [
            "upper(name)",
            "length(x)",
            "f()",
            "round(price, 2)",
            "count(*)",
            "count(distinct customer_id)",
            "coalesce(a, b, c)",
            "upper(lower(name))",
        ];
        cases.iter().copied().for_each(good);
    }
    #[test]
    fn case_searched_and_simple() {
        // Searched form first, then the simple (operand) form.
        let cases = [
            "case when a > 0 then 'pos' else 'neg' end",
            "case when a > 0 then 1 when a < 0 then -1 else 0 end",
            "case grade when 1 then 'A' when 2 then 'B' end",
            "case status when 'open' then 1 else 0 end",
        ];
        cases.iter().copied().for_each(good);
    }
    #[test]
    fn the_predicate_set() {
        // Each predicate in its plain form, then its negated form.
        let cases = [
            "name like 'A%'",
            "name not like 'A%'",
            "age between 18 and 65",
            "age not between 0 and 17",
            "status in (1, 2, 3)",
            "status not in ('a', 'b')",
            "email is null",
            "email is not null",
        ];
        cases.iter().copied().for_each(good);
    }
    #[test]
    fn boolean_connectives_and_precedence() {
        let cases = [
            "a = 1 and b = 2",
            "a = 1 or b = 2",
            "a = 1 or b = 2 and c = 3",
            "not a = 1",
            "not (a = 1 or b = 2)",
            "(a = 1 or b = 2) and c = 3",
        ];
        cases.iter().copied().for_each(good);
    }
    #[test]
    fn predicates_over_arithmetic_operands() {
        // A comparison operand may be any scalar expression; that
        // is what makes this grammar a superset of the DSL one.
        let cases = [
            "price * 1.1 > budget",
            "upper(name) = 'ADA'",
            "a + b between c and d",
        ];
        cases.iter().copied().for_each(good);
    }
    #[test]
    fn keywords_are_case_insensitive() {
        let cases = [
            "A = 1 AND B IS NOT NULL",
            "CASE WHEN x > 0 THEN 1 ELSE 0 END",
            "name LIKE 'x%' OR age BETWEEN 1 AND 9",
        ];
        cases.iter().copied().for_each(good);
    }
    #[test]
    fn nested_parentheses_walk() {
        let cases = ["((a = 1 and b = 2) or (c = 3))", "(((1)))"];
        cases.iter().copied().for_each(good);
    }
    #[test]
    fn malformed_expressions_do_not_walk() {
        let cases = [
            "a +",        // dangling operator
            "a in b",     // IN requires a parenthesised list
            "= 1",        // no left operand
            "a = ",       // no right operand
            "case a end", // CASE with no WHEN clause
            "and b",      // leading connective
            "upper(",     // unclosed call
        ];
        cases.iter().copied().for_each(bad);
    }
    #[test]
    fn deeply_nested_parentheses_are_capped() {
        // 400 levels is far past MAX_SUBGRAMMAR_DEPTH: the walk has
        // to be refused cleanly instead of overflowing the stack.
        let depth = 400;
        let mut input = "(".repeat(depth);
        input.push('1');
        input.push_str(&")".repeat(depth));
        assert!(!walks(&input), "pathological nesting must be rejected");
    }
 }