//! The full SQL `SELECT` grammar fragment (ADR-0032). //! //! ADR-0030 Phase 2. This fragment is the standalone walkable //! shape for the full standard-SQL `SELECT`: `INNER` / `LEFT` / //! `RIGHT` / `FULL OUTER` / `CROSS` joins, `GROUP BY` / `HAVING`, //! the four set operators (`UNION` / `UNION ALL` / `INTERSECT` //! / `EXCEPT`), `WITH` and `WITH RECURSIVE` common table //! expressions, `LIMIT … OFFSET`, `DISTINCT`, `t.*` projection, //! and bare-alias projection (lifting ADR-0030 Phase 1 §4.2). //! //! Recursion into `SQL_SELECT_COMPOUND` is via `Node::Subgrammar` //! at sub-phase 2a; sub-phase 2b replaces those references with //! `Node::ScopedSubgrammar` for completion-scope discipline //! (ADR-0032 §10.2). The Phase-1 `data::SELECT` `CommandNode` //! continues to use its own grammar until sub-phase 2c's //! migration — this fragment is reachable only from its own //! tests in 2a. //! //! # BNF (ADR-0032 §1) //! //! ```text //! select_statement := [ with_clause ] compound_select [ ';' ] //! compound_select := select_core ( set_op select_core )* //! [ order_by_clause ] //! [ limit_clause ] //! set_op := UNION [ ALL ] | INTERSECT | EXCEPT //! select_core := SELECT [ DISTINCT | ALL ] //! projection_list //! [ from_clause ] //! [ where_clause ] //! [ group_by_clause ] //! [ having_clause ] //! with_clause := WITH [ RECURSIVE ] cte_def //! ( ',' cte_def )* //! cte_def := identifier [ '(' column_name_list ')' ] //! AS '(' compound_select ')' //! projection_list := projection_item ( ',' projection_item )* //! projection_item := '*' //! | identifier '.' '*' //! | sql_expr [ [ AS ] identifier ] //! from_clause := FROM table_source ( join_clause )* //! table_source := identifier [ [ AS ] identifier ] //! join_clause := [ INNER ] JOIN table_source ON sql_expr //! | LEFT [ OUTER ] JOIN table_source ON sql_expr //! | RIGHT [ OUTER ] JOIN table_source ON sql_expr //! | FULL [ OUTER ] JOIN table_source ON sql_expr //! | CROSS JOIN table_source //! 
where_clause := WHERE sql_expr //! group_by_clause := GROUP BY sql_expr ( ',' sql_expr )* //! having_clause := HAVING sql_expr //! order_by_clause := ORDER BY order_item ( ',' order_item )* //! order_item := sql_expr [ ASC | DESC ] //! limit_clause := LIMIT sql_expr [ OFFSET sql_expr ] //! ``` //! //! # Disambiguation via `Node::Lookahead` //! //! Two places need lookahead to dispatch cleanly: //! //! - **Projection item** (ADR-0032 §1 `projection_item`). The //! three alternatives all share a leading identifier shape //! (`*` and the `ident . *` qualified wildcard, plus `sql_expr` //! which also begins on an ident for the column-ref case). A //! factory peeks the first 3 tokens to pick `*`, `ident . *`, //! or `sql_expr [ alias ]`. //! //! - **Bare alias** (ADR-0032 §1 — lifts Phase-1 §4.2). The //! walker's `walk_ident` happily matches keyword-shaped tokens //! as identifiers, and `Choice`/`Optional` are first-match- //! wins (no backtracking on a successful match). To prevent //! bare-alias slots from swallowing continuation keywords, the //! alias slot is a `Lookahead` that returns an empty `Choice` //! (NoMatch) when the next ident-shaped token is a //! continuation keyword for that position. use crate::dsl::grammar::{IdentSource, Node, ValidationError, Word, sql_expr}; use crate::dsl::walker::context::WalkContext; use crate::dsl::walker::lex_helpers::{consume_ident, skip_whitespace}; // ================================================================= // Validators // ================================================================= /// Reject internal `__rdbms_*` metadata tables in any /// table-source slot (ADR-0030 §6 reused by ADR-0032 §4 — extends /// to every Phase-2 table-source slot: `FROM`, `JOIN` targets, /// CTE name, and the `FROM` inside any CTE body; ADR-0033 §1 /// reuses it on the SQL `INSERT` target slot). 
pub(crate) fn reject_internal_table(name: &str) -> Result<(), ValidationError> { if name.to_ascii_lowercase().starts_with("__rdbms_") { Err(ValidationError { message_key: "select.internal_table", args: vec![("table", name.to_string())], }) } else { Ok(()) } } // ================================================================= // Shared leaf nodes // ================================================================= const COMMA: Node = Node::Punct(','); const STAR: Node = Node::Punct('*'); const LPAREN: Node = Node::Punct('('); const RPAREN: Node = Node::Punct(')'); const SEMI: Node = Node::Punct(';'); // SQL expression slot — `Node::Subgrammar(&sql_expr::SQL_OR_EXPR)` // is inlined at each use site rather than aliased through a // named `const`. The `const SQL_EXPR: Node = …` form triggered // a Rust const-evaluation cycle through the sql_expr ⇄ // sql_select recursion (valid at link time, where statics // resolve lazily, but not at const-eval). Stays as a plain // `Subgrammar` — sql_expr recursion is part of the precedence // ladder, not a new lexical scope (ADR-0032 §10.2). /// A node that never matches. Used as the "no" branch of /// lookahead-driven disambiguation: an empty `Choice` walks to /// `NoMatch`, which `Optional` / `Choice` gracefully treat as /// "skip" or "fall through to the next branch". static EMPTY_NOMATCH: Node = Node::Choice(&[]); // ================================================================= // Bare-alias dispatch (ADR-0032 §1) // ================================================================= // // The walker's `walk_ident` accepts any identifier-shape token, // including keyword-shape ones. With `Optional` / `Choice` // being first-match-wins, an unrestricted bare-alias slot would // greedily consume `FROM` / `WHERE` / `JOIN` / etc. as if they // were aliases. 
`Node::Lookahead` peeks the next token; when it // matches a continuation keyword for this position, the factory // returns `EMPTY_NOMATCH` so `Optional` skips and the keyword // reaches the next clause. /// Continuation keywords that may legitimately follow a /// projection item's bare alias (or its absence). Includes the /// `select_core` follow keywords and the compound-query / outer /// suffix keywords. `as` is not listed — the AS-form alias is a /// separate `Choice` branch that fires before the lookahead. const PROJECTION_FOLLOW_SET: &[&str] = &[ "from", "where", "group", "order", "having", "limit", "union", "intersect", "except", ]; /// Continuation keywords that may legitimately follow a table /// source's bare alias (or its absence). Includes the join /// keywords (so `FROM a JOIN b` doesn't read `JOIN` as `a`'s /// alias) and the `select_core` / compound suffix keywords. /// `on` is included because `FROM a JOIN b ON …` reaches `on` /// only when `b` has no alias — `on` is not a base-table name a /// learner would type as an alias. 
const TABLE_SOURCE_FOLLOW_SET: &[&str] = &[ "where", "group", "order", "having", "limit", "union", "intersect", "except", "inner", "left", "right", "full", "cross", "join", "on", ]; fn peek_next_ident_lower(source: &str, pos: usize) -> Option { let p = skip_whitespace(source, pos); consume_ident(source, p).map(|(s, e)| source[s..e].to_ascii_lowercase()) } fn projection_bare_alias_factory( _: &WalkContext, source: &str, pos: usize, ) -> Node { match peek_next_ident_lower(source, pos) { Some(word) if PROJECTION_FOLLOW_SET.iter().any(|k| *k == word) => { Node::Subgrammar(&EMPTY_NOMATCH) } Some(_) => PROJECTION_BARE_ALIAS_IDENT, None => Node::Subgrammar(&EMPTY_NOMATCH), } } fn table_source_bare_alias_factory( _: &WalkContext, source: &str, pos: usize, ) -> Node { match peek_next_ident_lower(source, pos) { Some(word) if TABLE_SOURCE_FOLLOW_SET.iter().any(|k| *k == word) => { Node::Subgrammar(&EMPTY_NOMATCH) } Some(_) => TABLE_SOURCE_BARE_ALIAS_IDENT, None => Node::Subgrammar(&EMPTY_NOMATCH), } } // ================================================================= // Alias slot // ================================================================= /// Projection-list alias slot. `writes_projection_alias: true` /// pushes the matched name onto the top frame's /// `projection_aliases` so `ORDER BY` candidates can offer it /// (ADR-0032 §10.4). const PROJECTION_BARE_ALIAS_IDENT: Node = Node::Ident { source: IdentSource::NewName, role: "projection_alias", validator: None, highlight_override: None, writes_table: false, writes_column: false, writes_user_listed_column: false, writes_table_alias: false, writes_cte_name: false, writes_projection_alias: true, }; /// Table-source alias slot — `writes_table_alias: true` so the /// matched name lands on the most-recently-pushed /// `TableBinding`'s `alias` (ADR-0032 §10.1). 
const TABLE_SOURCE_BARE_ALIAS_IDENT: Node = Node::Ident { source: IdentSource::NewName, role: "table_alias", validator: None, highlight_override: None, writes_table: false, writes_column: false, writes_user_listed_column: false, writes_table_alias: true, writes_cte_name: false, writes_projection_alias: false, }; static PROJECTION_AS_ALIAS_NODES: &[Node] = &[ Node::Word(Word::keyword("as")), PROJECTION_BARE_ALIAS_IDENT, ]; static PROJECTION_AS_ALIAS: Node = Node::Seq(PROJECTION_AS_ALIAS_NODES); static TABLE_SOURCE_AS_ALIAS_NODES: &[Node] = &[ Node::Word(Word::keyword("as")), TABLE_SOURCE_BARE_ALIAS_IDENT, ]; static TABLE_SOURCE_AS_ALIAS: Node = Node::Seq(TABLE_SOURCE_AS_ALIAS_NODES); static PROJECTION_ALIAS_CHOICES: &[Node] = &[ Node::Subgrammar(&PROJECTION_AS_ALIAS), Node::Lookahead(projection_bare_alias_factory), ]; static PROJECTION_ALIAS_CHOICE: Node = Node::Choice(PROJECTION_ALIAS_CHOICES); static PROJECTION_ALIAS_OPTIONAL: Node = Node::Optional(&PROJECTION_ALIAS_CHOICE); static TABLE_SOURCE_ALIAS_CHOICES: &[Node] = &[ Node::Subgrammar(&TABLE_SOURCE_AS_ALIAS), Node::Lookahead(table_source_bare_alias_factory), ]; static TABLE_SOURCE_ALIAS_CHOICE: Node = Node::Choice(TABLE_SOURCE_ALIAS_CHOICES); static TABLE_SOURCE_ALIAS_OPTIONAL: Node = Node::Optional(&TABLE_SOURCE_ALIAS_CHOICE); // ================================================================= // Projection item // ================================================================= const QUALIFIED_STAR_QUALIFIER: Node = Node::Ident { source: IdentSource::Tables, role: "qualified_star_qualifier", validator: None, highlight_override: None, writes_table: false, writes_column: false, writes_user_listed_column: false, writes_table_alias: false, writes_cte_name: false, writes_projection_alias: false, }; static QUALIFIED_STAR_NODES: &[Node] = &[ QUALIFIED_STAR_QUALIFIER, Node::Punct('.'), Node::Punct('*'), ]; static QUALIFIED_STAR: Node = Node::Seq(QUALIFIED_STAR_NODES); static PROJECTION_EXPR_ITEM_NODES: &[Node] = 
&[ Node::Subgrammar(&sql_expr::SQL_OR_EXPR), Node::Subgrammar(&PROJECTION_ALIAS_OPTIONAL), ]; static PROJECTION_EXPR_ITEM: Node = Node::Seq(PROJECTION_EXPR_ITEM_NODES); /// Dispatch one projection item via a 3-token lookahead. /// /// - `*` (and only `*`) → bare wildcard. /// - `ident . *` → qualified wildcard. /// - anything else → `sql_expr [ alias ]`. /// /// The factory is the cleanest way to handle the shared-prefix /// ambiguity between `t.*` and `sql_expr` (which can match a /// bare `t`), since the walker's `Choice` doesn't backtrack on /// a committed match. fn projection_item_factory( _: &WalkContext, source: &str, pos: usize, ) -> Node { let p = skip_whitespace(source, pos); let bytes = source.as_bytes(); if bytes.get(p) == Some(&b'*') { return STAR; } if let Some((_, end1)) = consume_ident(source, p) { let after_ident = skip_whitespace(source, end1); if bytes.get(after_ident) == Some(&b'.') { let after_dot = skip_whitespace(source, after_ident + 1); if bytes.get(after_dot) == Some(&b'*') { return Node::Subgrammar(&QUALIFIED_STAR); } } } Node::Subgrammar(&PROJECTION_EXPR_ITEM) } static PROJECTION_ITEM: Node = Node::Lookahead(projection_item_factory); static PROJECTION_LIST: Node = Node::Repeated { inner: &PROJECTION_ITEM, separator: Some(&COMMA), min: 1, }; // ================================================================= // DISTINCT / ALL prefix // ================================================================= static DISTINCT_OR_ALL_CHOICES: &[Node] = &[ Node::Word(Word::keyword("distinct")), Node::Word(Word::keyword("all")), ]; static DISTINCT_OR_ALL_CHOICE: Node = Node::Choice(DISTINCT_OR_ALL_CHOICES); static DISTINCT_OR_ALL_OPTIONAL: Node = Node::Optional(&DISTINCT_OR_ALL_CHOICE); // ================================================================= // Table source (FROM / JOIN target) // ================================================================= const TABLE_NAME_IDENT: Node = Node::Ident { source: IdentSource::Tables, role: "table_name", 
validator: Some(reject_internal_table), highlight_override: None, writes_table: true, writes_column: false, writes_user_listed_column: false, writes_table_alias: false, writes_cte_name: false, writes_projection_alias: false, }; static TABLE_SOURCE_NODES: &[Node] = &[ TABLE_NAME_IDENT, Node::Subgrammar(&TABLE_SOURCE_ALIAS_OPTIONAL), ]; static TABLE_SOURCE: Node = Node::Seq(TABLE_SOURCE_NODES); // ================================================================= // JOIN flavours // ================================================================= const JOIN_WORD: Node = Node::Word(Word::keyword("join")); const ON_WORD: Node = Node::Word(Word::keyword("on")); static OUTER_OPTIONAL: Node = Node::Optional(&Node::Word(Word::keyword("outer"))); // `INNER JOIN` and bare `JOIN` are split into two Choice // branches so each branch has a distinct leading keyword // (`inner` vs `join`). Avoids the "optional leading child → // idx > 0 → EOF becomes Incomplete" hazard in walk_seq that a // shared `Optional(Word("inner"))` would otherwise create. 
/// `INNER JOIN table_source ON sql_expr` — the explicit-`INNER` form.
static INNER_JOIN_NODES: &[Node] = &[
    Node::Word(Word::keyword("inner")),
    JOIN_WORD,
    Node::Subgrammar(&TABLE_SOURCE),
    ON_WORD,
    Node::Subgrammar(&sql_expr::SQL_OR_EXPR),
];

/// `JOIN table_source ON sql_expr` — the bare form, leading on `join`.
static BARE_JOIN_NODES: &[Node] = &[
    JOIN_WORD,
    Node::Subgrammar(&TABLE_SOURCE),
    ON_WORD,
    Node::Subgrammar(&sql_expr::SQL_OR_EXPR),
];

/// `LEFT [ OUTER ] JOIN table_source ON sql_expr`. The optional
/// `OUTER` is the second child, after the mandatory `left` keyword.
static LEFT_JOIN_NODES: &[Node] = &[
    Node::Word(Word::keyword("left")),
    Node::Subgrammar(&OUTER_OPTIONAL),
    JOIN_WORD,
    Node::Subgrammar(&TABLE_SOURCE),
    ON_WORD,
    Node::Subgrammar(&sql_expr::SQL_OR_EXPR),
];

/// `RIGHT [ OUTER ] JOIN table_source ON sql_expr`.
static RIGHT_JOIN_NODES: &[Node] = &[
    Node::Word(Word::keyword("right")),
    Node::Subgrammar(&OUTER_OPTIONAL),
    JOIN_WORD,
    Node::Subgrammar(&TABLE_SOURCE),
    ON_WORD,
    Node::Subgrammar(&sql_expr::SQL_OR_EXPR),
];

/// `FULL [ OUTER ] JOIN table_source ON sql_expr`.
static FULL_JOIN_NODES: &[Node] = &[
    Node::Word(Word::keyword("full")),
    Node::Subgrammar(&OUTER_OPTIONAL),
    JOIN_WORD,
    Node::Subgrammar(&TABLE_SOURCE),
    ON_WORD,
    Node::Subgrammar(&sql_expr::SQL_OR_EXPR),
];

/// `CROSS JOIN table_source` — the only flavour with no `ON` part.
static CROSS_JOIN_NODES: &[Node] = &[
    Node::Word(Word::keyword("cross")),
    JOIN_WORD,
    Node::Subgrammar(&TABLE_SOURCE),
];

/// JOIN flavour dispatch. Each branch has a distinct leading
/// keyword so `Choice` first-match-wins discriminates cleanly
/// without invoking the walker's `Optional`-leading-child
/// hazard.
static JOIN_CLAUSE_CHOICES: &[Node] = &[
    Node::Seq(LEFT_JOIN_NODES),
    Node::Seq(RIGHT_JOIN_NODES),
    Node::Seq(FULL_JOIN_NODES),
    Node::Seq(CROSS_JOIN_NODES),
    Node::Seq(INNER_JOIN_NODES),
    Node::Seq(BARE_JOIN_NODES),
];
/// One join clause of any flavour; a `Choice` over the six branches
/// above.
static JOIN_CLAUSE: Node = Node::Choice(JOIN_CLAUSE_CHOICES);

// =================================================================
// FROM / WHERE / GROUP BY / HAVING
// =================================================================

/// `FROM table_source ( join_clause )*` — zero or more joins, with
/// no separator between them.
static FROM_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("from")),
    Node::Subgrammar(&TABLE_SOURCE),
    Node::Repeated {
        inner: &JOIN_CLAUSE,
        separator: None,
        min: 0,
    },
];
static FROM_CLAUSE: Node = Node::Seq(FROM_CLAUSE_NODES);

static WHERE_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("where")),
    Node::Subgrammar(&sql_expr::SQL_OR_EXPR),
];
/// `WHERE sql_expr`. `pub(crate)` so the SQL DML statements
/// (ADR-0033 — UPDATE / DELETE) reuse the exact same predicate
/// clause, keeping the Phase-2 predicate diagnostics identical.
pub(crate) static WHERE_CLAUSE: Node = Node::Seq(WHERE_CLAUSE_NODES);

/// `GROUP BY sql_expr ( ',' sql_expr )*` — at least one grouping
/// expression.
static GROUP_BY_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("group")),
    Node::Word(Word::keyword("by")),
    Node::Repeated {
        inner: &Node::Subgrammar(&sql_expr::SQL_OR_EXPR),
        separator: Some(&COMMA),
        min: 1,
    },
];
static GROUP_BY_CLAUSE: Node = Node::Seq(GROUP_BY_CLAUSE_NODES);

/// `HAVING sql_expr`.
static HAVING_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("having")),
    Node::Subgrammar(&sql_expr::SQL_OR_EXPR),
];
static HAVING_CLAUSE: Node = Node::Seq(HAVING_CLAUSE_NODES);

// =================================================================
// ORDER BY / LIMIT / OFFSET
// =================================================================

static ASC_DESC_CHOICES: &[Node] = &[
    Node::Word(Word::keyword("asc")),
    Node::Word(Word::keyword("desc")),
];
static ASC_DESC_CHOICE: Node = Node::Choice(ASC_DESC_CHOICES);

/// `sql_expr [ ASC | DESC ]`.
static ORDER_ITEM_NODES: &[Node] = &[
    Node::Subgrammar(&sql_expr::SQL_OR_EXPR),
    Node::Optional(&ASC_DESC_CHOICE),
];
static ORDER_ITEM: Node = Node::Seq(ORDER_ITEM_NODES);

/// `ORDER BY order_item ( ',' order_item )*` — at least one item.
static ORDER_BY_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("order")),
    Node::Word(Word::keyword("by")),
    Node::Repeated {
        inner: &ORDER_ITEM,
        separator: Some(&COMMA),
        min: 1,
    },
];
static ORDER_BY_CLAUSE: Node = Node::Seq(ORDER_BY_CLAUSE_NODES);

/// `OFFSET sql_expr`, only reachable as the optional tail of `LIMIT`.
static OFFSET_NODES: &[Node] = &[
    Node::Word(Word::keyword("offset")),
    Node::Subgrammar(&sql_expr::SQL_OR_EXPR),
];
static OFFSET_SEQ: Node = Node::Seq(OFFSET_NODES);
static OFFSET_OPTIONAL: Node = Node::Optional(&OFFSET_SEQ);

/// `LIMIT sql_expr [ OFFSET sql_expr ]`.
static LIMIT_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("limit")),
    Node::Subgrammar(&sql_expr::SQL_OR_EXPR),
    Node::Subgrammar(&OFFSET_OPTIONAL),
];
static LIMIT_CLAUSE: Node = Node::Seq(LIMIT_CLAUSE_NODES);

// =================================================================
// select_core (per-leg of a compound)
// =================================================================

/// `SELECT [ DISTINCT | ALL ] projection_list` followed by the
/// optional `FROM`, `WHERE`, `GROUP BY` and `HAVING` clauses, in
/// that fixed order.
static SELECT_CORE_NODES: &[Node] = &[
    Node::Word(Word::keyword("select")),
    Node::Subgrammar(&DISTINCT_OR_ALL_OPTIONAL),
    Node::Subgrammar(&PROJECTION_LIST),
    Node::Optional(&FROM_CLAUSE),
    Node::Optional(&WHERE_CLAUSE),
    Node::Optional(&GROUP_BY_CLAUSE),
    Node::Optional(&HAVING_CLAUSE),
];
static SELECT_CORE: Node = Node::Seq(SELECT_CORE_NODES);

// =================================================================
// compound_select
// =================================================================
//
// `UNION` and `UNION ALL` are factored as one `Seq[union,
// Optional(all)]` branch, rather than a dedicated `UNION ALL`
// branch tried before bare `UNION`, so the Choice doesn't commit
// on `union` inside a multi-token branch and then fail when `all`
// is missing. The trailing `Optional(all)` is the last child of
// the Seq, so a skip there doesn't trigger the
// optional-leading-then-EOF-becomes-Incomplete hazard.
/// `UNION [ ALL ]` as a single two-child sequence.
static UNION_OR_UNION_ALL_NODES: &[Node] = &[
    Node::Word(Word::keyword("union")),
    Node::Optional(&Node::Word(Word::keyword("all"))),
];

/// `set_op := UNION [ ALL ] | INTERSECT | EXCEPT`.
static SET_OP_CHOICES: &[Node] = &[
    Node::Seq(UNION_OR_UNION_ALL_NODES),
    Node::Word(Word::keyword("intersect")),
    Node::Word(Word::keyword("except")),
];
static SET_OP: Node = Node::Choice(SET_OP_CHOICES);

/// One additional leg of a compound query: `set_op select_core`.
static SET_OP_TAIL_NODES: &[Node] = &[Node::Subgrammar(&SET_OP), Node::Subgrammar(&SELECT_CORE)];
static SET_OP_TAIL: Node = Node::Seq(SET_OP_TAIL_NODES);

/// `select_core ( set_op select_core )* [ order_by ] [ limit ]`.
static PLAIN_COMPOUND_NODES: &[Node] = &[
    Node::Subgrammar(&SELECT_CORE),
    Node::Repeated {
        inner: &SET_OP_TAIL,
        separator: None,
        min: 0,
    },
    Node::Optional(&ORDER_BY_CLAUSE),
    Node::Optional(&LIMIT_CLAUSE),
];
/// The compound select shape **without** a leading `WITH_CLAUSE`.
/// The Choice-fronted `SQL_SELECT_COMPOUND` references this twice:
/// once after a `WITH_CLAUSE` prefix and once on its own.
static PLAIN_COMPOUND: Node = Node::Seq(PLAIN_COMPOUND_NODES);

// ADR-0032 §10.3: a subquery / CTE body may declare its own
// CTEs (shadowing outer ones of the same name). The compound
// recursion point therefore accepts an optional leading WITH.
//
// `Optional(WITH_CLAUSE)` would soft-commit the Seq on the
// 0-byte skip and turn downstream `NoMatch` into `Failed` for
// the Choice that decides "is `(...)` a paren expression or a
// scalar subquery?" (see sql_expr.rs PAREN_INSIDE_CHOICES). A
// Choice between WITH-prefixed and plain forms preserves the
// fast NoMatch on non-`with` non-`select` first tokens, so the
// expression branch of that Choice still wins on `(a + b)`.
static WITH_PREFIXED_COMPOUND_NODES: &[Node] = &[
    Node::Subgrammar(&WITH_CLAUSE),
    Node::Subgrammar(&PLAIN_COMPOUND),
];
static WITH_PREFIXED_COMPOUND: Node = Node::Seq(WITH_PREFIXED_COMPOUND_NODES);

// The WITH-prefixed branch is listed first; the plain branch second.
static COMPOUND_CHOICES: &[Node] = &[
    Node::Subgrammar(&WITH_PREFIXED_COMPOUND),
    Node::Subgrammar(&PLAIN_COMPOUND),
];
/// The compound-select fragment that subqueries / CTE bodies
/// recurse into.
///
/// Referenced via `Subgrammar` (2a) / `ScopedSubgrammar` (2b).
/// A Choice between the `WITH …` prefix form and the plain
/// `SELECT …` form (ADR-0032 §10.3).
pub static SQL_SELECT_COMPOUND: Node = Node::Choice(COMPOUND_CHOICES);

// =================================================================
// CTE definitions
// =================================================================

// CTE name: a new name, checked by `reject_internal_table`, recorded
// via `writes_cte_name`.
const CTE_NAME_IDENT: Node = Node::Ident {
    source: IdentSource::NewName,
    role: "cte_name",
    validator: Some(reject_internal_table),
    highlight_override: None,
    writes_table: false,
    writes_column: false,
    writes_user_listed_column: false,
    writes_table_alias: false,
    writes_cte_name: true,
    writes_projection_alias: false,
};

// One entry of a CTE's explicit column list: a new name with no
// validator and no context writes.
const CTE_COLUMN_IDENT: Node = Node::Ident {
    source: IdentSource::NewName,
    role: "cte_column",
    validator: None,
    highlight_override: None,
    writes_table: false,
    writes_column: false,
    writes_user_listed_column: false,
    writes_table_alias: false,
    writes_cte_name: false,
    writes_projection_alias: false,
};

/// `'(' column_name ( ',' column_name )* ')'` — at least one name.
static CTE_COLUMN_LIST_NODES: &[Node] = &[
    LPAREN,
    Node::Repeated {
        inner: &CTE_COLUMN_IDENT,
        separator: Some(&COMMA),
        min: 1,
    },
    RPAREN,
];
static CTE_COLUMN_LIST_SEQ: Node = Node::Seq(CTE_COLUMN_LIST_NODES);
static CTE_COLUMN_LIST_OPTIONAL: Node = Node::Optional(&CTE_COLUMN_LIST_SEQ);

// CTE body recursion pushes a fresh lexical scope frame
// (ADR-0032 §4 / §10.2). Subqueries in `sql_expr.rs` do the same;
// the top-level statement's own COMPOUND embedding does not
// (it shares the implicit bottom frame).
/// `'(' compound_select ')'` — the only place in this file that uses
/// `ScopedSubgrammar` for the recursion.
static CTE_BODY_NODES: &[Node] = &[
    LPAREN,
    Node::ScopedSubgrammar(&SQL_SELECT_COMPOUND),
    RPAREN,
];
static CTE_BODY: Node = Node::Seq(CTE_BODY_NODES);

/// `identifier [ '(' column_name_list ')' ] AS '(' compound_select ')'`.
static CTE_DEF_NODES: &[Node] = &[
    CTE_NAME_IDENT,
    Node::Subgrammar(&CTE_COLUMN_LIST_OPTIONAL),
    Node::Word(Word::keyword("as")),
    Node::Subgrammar(&CTE_BODY),
];
static CTE_DEF: Node = Node::Seq(CTE_DEF_NODES);

/// `WITH [ RECURSIVE ] cte_def ( ',' cte_def )*` — at least one
/// definition.
static WITH_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("with")),
    Node::Optional(&Node::Word(Word::keyword("recursive"))),
    Node::Repeated {
        inner: &CTE_DEF,
        separator: Some(&COMMA),
        min: 1,
    },
];
static WITH_CLAUSE: Node = Node::Seq(WITH_CLAUSE_NODES);

// =================================================================
// select_statement — the fragment entry point
// =================================================================

// NOTE(review): `SQL_SELECT_COMPOUND` already has a WITH-prefixed
// alternative (see `COMPOUND_CHOICES`), so the leading
// `Optional(&WITH_CLAUSE)` here looks redundant and appears to admit
// two consecutive WITH clauses — confirm whether that is intended.
static SELECT_STATEMENT_NODES: &[Node] = &[
    Node::Optional(&WITH_CLAUSE),
    Node::Subgrammar(&SQL_SELECT_COMPOUND),
    Node::Optional(&SEMI),
];
/// The full statement, including the optional `WITH` prefix
/// and a tolerated trailing `;`.
///
/// Used by the fragment's own tests and by any future
/// `data::WITH` `CommandNode` that dispatches `WITH …`
/// statements. Top-level `SELECT` statements (entry word
/// `select`) reference `SQL_SELECT_TAIL` instead, which omits
/// the leading `SELECT` keyword that the registry's
/// entry-word dispatch already consumed.
pub static SQL_SELECT_STATEMENT: Node = Node::Seq(SELECT_STATEMENT_NODES);

// =================================================================
// select_statement — entry-consumed form (ADR-0030 §6, 2c)
// =================================================================

/// The post-`SELECT` portion of a top-level statement.
/// `data::SELECT`'s `CommandNode` has `entry: Word::keyword
/// ("select")`, so the registry's dispatch consumes the leading
/// `SELECT` keyword before the shape walks. The shape is then
/// the rest of `select_core` (`DISTINCT/ALL`, projection,
/// FROM, WHERE, GROUP BY, HAVING), followed by the compound
/// set-op chain (each subsequent leg's `SELECT` keyword is
/// part of `SET_OP_TAIL`), the outer `ORDER BY` / `LIMIT`, and
/// a tolerated trailing `;`.
///
/// WITH-prefixed statements (`WITH x AS (…) SELECT …`) are
/// dispatched separately by entry word `with`. Adding a
/// `data::WITH` `CommandNode` is a future sub-phase; for now
/// top-level WITH falls back to the chumsky parser route, the
/// same as in Phase 1.
///
/// NOTE(review): the paragraph above may be stale — this file also
/// defines `SQL_WITH_TAIL`, whose docs describe a `data::WITH`
/// `CommandNode`. Confirm which description is current.
static SQL_SELECT_TAIL_NODES: &[Node] = &[
    Node::Subgrammar(&DISTINCT_OR_ALL_OPTIONAL),
    Node::Subgrammar(&PROJECTION_LIST),
    Node::Optional(&FROM_CLAUSE),
    Node::Optional(&WHERE_CLAUSE),
    Node::Optional(&GROUP_BY_CLAUSE),
    Node::Optional(&HAVING_CLAUSE),
    Node::Repeated {
        inner: &SET_OP_TAIL,
        separator: None,
        min: 0,
    },
    Node::Optional(&ORDER_BY_CLAUSE),
    Node::Optional(&LIMIT_CLAUSE),
    Node::Optional(&SEMI),
];
/// Entry-consumed `SELECT` shape: the sequence of
/// `SQL_SELECT_TAIL_NODES`, i.e. everything after the leading
/// `SELECT` keyword.
pub static SQL_SELECT_TAIL: Node = Node::Seq(SQL_SELECT_TAIL_NODES);

// =================================================================
// with_clause — entry-consumed form (ADR-0032 §4, 2c)
// =================================================================

/// The post-`WITH` portion of a top-level statement.
/// `data::WITH`'s `CommandNode` has `entry: Word::keyword
/// ("with")`, so the registry's dispatch consumes the leading
/// `WITH` keyword before the shape walks. The shape is then
/// the optional `RECURSIVE` modifier, the `cte_def` list, and
/// the trailing `compound_select` (with optional outer ORDER
/// BY / LIMIT and a tolerated `;`).
static SQL_WITH_TAIL_NODES: &[Node] = &[ Node::Optional(&Node::Word(Word::keyword("recursive"))), Node::Repeated { inner: &CTE_DEF, separator: Some(&COMMA), min: 1, }, Node::Subgrammar(&SQL_SELECT_COMPOUND), Node::Optional(&SEMI), ]; pub static SQL_WITH_TAIL: Node = Node::Seq(SQL_WITH_TAIL_NODES); // ================================================================= // Tests // ================================================================= #[cfg(test)] mod tests { use super::{SQL_SELECT_COMPOUND, SQL_SELECT_STATEMENT}; use crate::dsl::grammar::Node; use crate::dsl::walker::context::WalkContext; use crate::dsl::walker::driver::{NodeWalkResult, walk_node}; use crate::dsl::walker::outcome::MatchedPath; /// Walk `input` against `fragment`. Returns `true` only when /// the walk matches *and* consumes all of `input` (trailing /// whitespace allowed). fn walks_via(fragment: &'static Node, input: &str) -> bool { let mut ctx = WalkContext::new(); let mut path = MatchedPath::new(); let mut per_byte = Vec::new(); match walk_node(input, 0, fragment, &mut ctx, &mut path, &mut per_byte) { NodeWalkResult::Matched { end, .. 
} => { input[end..].trim().is_empty() } _ => false, } } fn walks(input: &str) -> bool { walks_via(&SQL_SELECT_STATEMENT, input) } fn good(input: &str) { assert!( walks(input), "{input:?} should be a valid SELECT statement" ); } fn bad(input: &str) { assert!( !walks(input), "{input:?} should NOT walk as a complete SELECT statement" ); } // ----- minimal forms ----- #[test] fn bare_constant_select_with_no_from() { good("select 1"); good("select 'hello'"); good("select null"); good("select true"); good("select false"); } #[test] fn single_table_select_star() { good("select * from users"); good("select * from users;"); } #[test] fn single_column_projection() { good("select name from users"); good("select name, age from users"); good("select name, age, email from users"); } // ----- DISTINCT / ALL ----- #[test] fn distinct_modifier() { good("select distinct name from users"); good("select distinct a, b from t"); } #[test] fn all_modifier() { good("select all name from users"); } // Note: `select distinct all name from users` and the like // are admitted structurally — the second keyword parses as // a column reference (the walker doesn't reject keyword-shape // idents as columns). Engine semantics deals with it. This // matches ADR-0030's "grammar admits, engine rejects" posture. 
// ----- projection wildcard / qualified-star / alias ----- #[test] fn qualified_star_projection() { good("select users.* from users"); good("select u.* from users u"); good("select a.*, b.* from a join b on x = y"); } #[test] fn mixed_projection_with_qualified_star() { good("select users.*, age from users"); } #[test] fn projection_with_as_alias() { good("select name as n from users"); good("select name as n, age as a from users"); } #[test] fn projection_with_bare_alias() { good("select name n from users"); good("select name n, age a from users"); } #[test] fn projection_alias_mixed_forms() { good("select name as n, age a, email from users"); } #[test] fn projection_bare_alias_does_not_swallow_from() { // The bare-alias lookahead must skip when next ident // is `from`; otherwise this would fail with "alias `from` // followed by nothing". good("select name from users"); } #[test] fn projection_bare_alias_does_not_swallow_where_or_group_etc() { good("select name from users where id > 0"); good("select name from users group by name"); good("select name from users order by name"); good("select name from users limit 5"); good("select name from users group by name having count(*) > 1"); } #[test] fn projection_expression_with_arithmetic() { good("select a + b from t"); good("select a + b as total from t"); good("select a * 2 from t"); } #[test] fn projection_function_calls() { good("select upper(name) from users"); good("select count(*) from users"); good("select count(distinct customer_id) from orders"); } // ----- FROM / JOIN flavours ----- #[test] fn from_with_table_alias() { good("select * from users u"); good("select * from users as u"); } #[test] fn inner_join_explicit() { good("select * from a inner join b on x = y"); } #[test] fn inner_join_bare() { good("select * from a join b on x = y"); } #[test] fn left_outer_join() { good("select * from a left join b on x = y"); good("select * from a left outer join b on x = y"); } #[test] fn right_outer_join() { 
good("select * from a right join b on x = y"); good("select * from a right outer join b on x = y"); } #[test] fn full_outer_join() { good("select * from a full join b on x = y"); good("select * from a full outer join b on x = y"); } #[test] fn cross_join() { good("select * from a cross join b"); } #[test] fn cross_join_with_no_on() { // CROSS JOIN takes no ON; an ON clause is a parse error. bad("select * from a cross join b on x = y"); } #[test] fn chained_joins() { good("select * from a join b on x = y join c on y = z"); good("select * from a left join b on x = y inner join c on y = z"); } #[test] fn join_with_table_aliases() { good("select * from a u join b v on x = y"); good("select * from a as u join b as v on x = y"); } // ----- WHERE / GROUP BY / HAVING ----- #[test] fn where_clause() { good("select * from t where id = 1"); good("select * from t where a > 0 and b < 10"); } #[test] fn group_by_single_column() { good("select name from t group by name"); } #[test] fn group_by_multiple_columns() { good("select a, b from t group by a, b"); } #[test] fn group_by_expression() { good("select count(*) from t group by upper(name)"); } #[test] fn having_clause() { good("select name from t group by name having count(*) > 1"); // HAVING without GROUP BY is admitted structurally; // engine may reject. The grammar admits it. 
good("select count(*) from t having count(*) > 0"); } // ----- set operators ----- #[test] fn union_two_selects() { good("select a from t union select b from u"); } #[test] fn union_all_two_selects() { good("select a from t union all select b from u"); } #[test] fn intersect_two_selects() { good("select a from t intersect select b from u"); } #[test] fn except_two_selects() { good("select a from t except select b from u"); } #[test] fn set_op_chain() { good( "select a from t union select b from u intersect select c from v", ); } #[test] fn set_op_with_outer_order_by_and_limit() { good( "select a from t union select b from u order by a limit 10", ); } // ----- ORDER BY / LIMIT / OFFSET ----- #[test] fn order_by_single_column() { good("select * from t order by name"); } #[test] fn order_by_with_direction() { good("select * from t order by name asc"); good("select * from t order by name desc"); } #[test] fn order_by_multiple_items() { good("select * from t order by name asc, age desc"); } #[test] fn order_by_column_position() { // A column-position reference falls out of `sql_expr` // (an integer literal is a valid expression). good("select a, b from t order by 1"); good("select a, b from t order by 1, 2 desc"); } #[test] fn limit_only() { good("select * from t limit 10"); } #[test] fn limit_with_offset() { good("select * from t limit 10 offset 5"); } #[test] fn legacy_limit_comma_form_rejected() { // `LIMIT m, n` (offset-first MySQL/SQLite legacy) is // OOS per ADR-0032 §13 OOS-4. 
bad("select * from t limit 5, 10"); } // ----- CTEs ----- #[test] fn non_recursive_cte() { good("with x as (select 1) select * from x"); } #[test] fn non_recursive_cte_select_star() { good("with x as (select * from users) select * from x"); } #[test] fn cte_with_column_list_rename() { good("with x(n) as (select name from users) select n from x"); good("with x(a, b) as (select a, b from t) select * from x"); } #[test] fn recursive_cte() { good( "with recursive r as (select 1 union all select 2) select * from r", ); } #[test] fn multiple_ctes() { good( "with a as (select 1), b as (select 2) select * from a union select * from b", ); } // ----- subquery shapes (recursion through SQL_SELECT_COMPOUND) ----- // // True subquery expressions inside `sql_expr` arrive in 2b // (additive `Choice` branches in `sql_expr.rs`). 2a verifies // that the compound fragment recurses cleanly from CTE // bodies and that the deepest depth check still fires. #[test] fn nested_cte_body_with_union() { good( "with x as (select 1 union select 2) select * from x", ); } // ----- case insensitivity / spacing ----- #[test] fn keywords_are_case_insensitive() { good("SELECT * FROM users"); good("Select Distinct A From T Where Id = 1 Order By A Desc Limit 5 Offset 2"); good("WITH RECURSIVE r AS (SELECT 1 UNION ALL SELECT 2) SELECT * FROM r"); } #[test] fn trailing_semicolon_tolerated() { good("select 1;"); good("select * from users;"); good("with x as (select 1) select * from x;"); } // ----- malformed input ----- #[test] fn empty_projection_rejected() { // Note: `select from t` is structurally admitted as // ` AS ` — the walker does not // reject keyword-shape idents as column refs. This // matches ADR-0030's posture (grammar admits, engine // rejects). The genuinely-malformed `select` alone is // still rejected because there is no expression to // match. 
bad("select"); } #[test] fn missing_join_target() { bad("select * from a join"); bad("select * from a join b"); bad("select * from a join b on"); } #[test] fn dangling_set_op() { bad("select a from t union"); bad("select a from t union select"); } #[test] fn dangling_clauses() { bad("select a from t where"); bad("select a from t order by"); bad("select a from t group by"); bad("select a from t having"); bad("select a from t limit"); bad("select a from t limit 5 offset"); } #[test] fn cte_missing_body() { bad("with x as select 1"); bad("with x as ("); bad("with x as ()"); } #[test] fn cte_missing_as() { bad("with x (select 1) select * from x"); } #[test] fn bare_recursive_without_with_is_invalid() { bad("recursive r as (select 1) select * from r"); } // ----- OOS shapes (ADR-0032 §13) ----- #[test] fn comma_from_is_rejected() { // OOS-3: implicit cross join via comma list. bad("select * from a, b"); } #[test] fn natural_join_rejected() { // OOS-2. bad("select * from a natural join b"); } #[test] fn using_clause_rejected() { // OOS-2. bad("select * from a join b using (id)"); } #[test] fn values_row_source_rejected() { // OOS-7. bad("select * from (values (1), (2))"); } #[test] fn lateral_join_rejected() { // OOS-6. The bare comma-FROM form is rejected because // we do not admit comma-separated FROM lists (OOS-3), // so `from a, lateral …` cannot parse as a join. The // single-token `LATERAL JOIN` form is admitted // structurally — `lateral` parses as a table-source // bare alias for `a` and the JOIN that follows is just // a normal join. This matches the rest of the grammar's // posture: keyword-shape identifiers are admitted as // names; non-admitted syntactic forms (comma-FROM) are // what makes a query reject. bad("select * from a, lateral (select 1)"); } #[test] fn window_function_rejected() { // OOS-5: `OVER (…)` window clauses are not part of the // Phase-2 grammar. 
bad("select row_number() over () from t"); bad("select sum(x) over (partition by y) from t"); } #[test] fn derived_table_in_from_rejected() { // OOS-1: `FROM (SELECT …) alias` is OOS. // CTEs cover the same use case. bad("select * from (select * from users) sub"); bad("select * from (select * from users) as sub"); } // ----- internal-table rejection (ADR-0030 §6) ----- #[test] fn internal_table_in_from_rejected() { bad("select * from __rdbms_columns"); bad("select * from __rdbms_playground_columns"); } #[test] fn internal_table_as_cte_name_rejected() { bad("with __rdbms_x as (select 1) select * from __rdbms_x"); } #[test] fn internal_table_in_cte_body_rejected() { bad("with x as (select * from __rdbms_columns) select * from x"); } #[test] fn internal_table_in_join_rejected() { bad("select * from users join __rdbms_columns on x = y"); } // ----- depth cap (ADR-0026 §1 / ADR-0032 §9) ----- #[test] fn pathological_nesting_capped() { // Deep parenthesised CTE-body chain is rejected by the // shared `MAX_SUBGRAMMAR_DEPTH = 64` cap, not by stack // overflow. let depth = 200; let mut input = String::new(); for _ in 0..depth { input.push_str("with x as ("); } input.push_str("select 1"); for _ in 0..depth { input.push_str(") select * from x"); } assert!(!walks(&input)); } // ----- compound-select fragment entry point ----- #[test] fn compound_fragment_walks_with_or_without_with_clause() { // SQL_SELECT_COMPOUND is what subqueries / CTE bodies // recurse into. It admits a select_core + optional // set-op chain + outer ORDER/LIMIT, optionally // prefixed by a WITH clause (ADR-0032 §10.3 — a // subquery body may declare its own CTEs that shadow // outer ones). 
assert!(walks_via(&SQL_SELECT_COMPOUND, "select 1")); assert!(walks_via( &SQL_SELECT_COMPOUND, "select a from t union select b from u", )); assert!(walks_via( &SQL_SELECT_COMPOUND, "with x as (select 1) select * from x", )); } // ---- ADR-0032 §5/§6 — subqueries and qualified refs in // ---- statement-level positions (sql_expr extensions // ---- recurse through SQL_SELECT_COMPOUND via // ---- ScopedSubgrammar). #[test] fn qualified_ref_in_where_clause() { good("select * from t where t.id = 1"); good("select * from a join b on a.id = b.id"); good("select t.name from t where t.age > 18"); } #[test] fn scalar_subquery_in_where_clause() { good("select * from t where x = (select y from u)"); good("select * from t where x > (select count(*) from u)"); } #[test] fn in_subquery_in_where_clause() { good("select * from t where id in (select user_id from orders)"); good( "select * from customers where id not in (select customer_id from blocklist)", ); } #[test] fn exists_subquery_in_where_clause() { good( "select * from customers c where exists (select 1 from orders o where o.customer_id = c.id)", ); good("select * from t where not exists (select 1 from u)"); } #[test] fn nested_subqueries() { good( "select * from t where x in (select y from u where y in (select z from v))", ); } #[test] fn subquery_in_projection() { good("select (select max(price) from products) from t"); good( "select name, (select count(*) from orders where customer_id = c.id) from customers c", ); } #[test] fn cte_body_references_qualified_columns() { good( "with x as (select t.name, t.age from t) select x.name from x", ); } }