From 8d293358a0a2f34e960b83ebeeb15b03934f0444 Mon Sep 17 00:00:00 2001 From: "claude@clouddev1" Date: Wed, 20 May 2026 11:29:48 +0000 Subject: [PATCH] grammar: SQL SELECT full statement fragment (ADR-0032 Phase 2a) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author the standalone walkable shape for the full standard-SQL SELECT per ADR-0032 §1: compound queries with the four set ops (UNION / UNION ALL / INTERSECT / EXCEPT), the five JOIN flavours (INNER / LEFT [OUTER] / RIGHT [OUTER] / FULL [OUTER] / CROSS), GROUP BY / HAVING, WITH and WITH RECURSIVE common table expressions, LIMIT … OFFSET, DISTINCT / ALL, qualified-wildcard `t.*` projection, and bare-alias projection (lifting ADR-0030 Phase-1 §4.2). Recursion into SQL_SELECT_COMPOUND uses Node::Subgrammar for 2a; sub-phase 2b will rewire those references to the new Node::ScopedSubgrammar variant for completion-scope discipline (ADR-0032 §10.2). The Phase-1 data::SELECT CommandNode is not touched here — the new fragment is reachable only from its own tests until sub-phase 2c performs the migration. Two implementation mechanisms realize ADR semantics without changing them: - Node::Lookahead disambiguates the projection_item Choice (bare `*` vs `ident . *` qualified wildcard vs `sql_expr [ alias ]`) and gates bare-alias slots against continuation keywords. The walker's walk_ident accepts any identifier-shape token, including keyword-shape ones, and Choice / Optional are first-match-wins; without lookahead a bare-alias slot would greedily swallow FROM / WHERE / JOIN / etc. Per-position follow-sets list which keywords legitimately follow each alias slot. Same pattern as data.rs's insert_first_paren precedent. - INNER JOIN and bare JOIN are split into two distinct Choice branches (each with a concrete leading keyword) rather than sharing one Optional(Word("inner"))-leading branch. 
Avoids a walker hazard where an Optional-leading-child Seq commits to idx > 0 and then converts the next child's EOF NoMatch into Incomplete, blocking the outer Choice from falling through to later branches. Same semantic surface, distinct mechanism. The §13 OOS shapes all have explicit reject tests (NATURAL, USING, comma-FROM, LIMIT m,n, window OVER, VALUES, derived tables). LATERAL has a noted partial limitation: the comma form rejects via OOS-3, but the single-keyword form `FROM a LATERAL JOIN b ON …` is admitted structurally because `lateral` parses as a bare table-source alias for `a`. This matches ADR-0030's "grammar admits identifier-shape tokens; engine resolves" posture. `__rdbms_*` rejection extends to every Phase-2 table-source slot — the FROM table, each JOIN's table, each CTE name, and the FROM inside any CTE body — via the reusable reject_internal_table validator. 70 new unit tests in sql_select.rs walk every §1 production and every OOS reject case. Test totals: 1260 baseline + 70 = 1330 passing, 0 failing, 1 ignored (unchanged from baseline). Clippy clean. Per the Phase-2 plan sub-phase 2a exit gate. DA gate written review: PASS. --- src/dsl/grammar/mod.rs | 1 + src/dsl/grammar/sql_select.rs | 1171 +++++++++++++++++++++++++++++++++ 2 files changed, 1172 insertions(+) create mode 100644 src/dsl/grammar/sql_select.rs diff --git a/src/dsl/grammar/mod.rs b/src/dsl/grammar/mod.rs index 050e96d..f090738 100644 --- a/src/dsl/grammar/mod.rs +++ b/src/dsl/grammar/mod.rs @@ -28,6 +28,7 @@ pub mod ddl; pub mod expr; pub mod shared; pub mod sql_expr; +pub mod sql_select; use crate::dsl::command::Command; use crate::dsl::walker::context::WalkContext; diff --git a/src/dsl/grammar/sql_select.rs b/src/dsl/grammar/sql_select.rs new file mode 100644 index 0000000..953c77f --- /dev/null +++ b/src/dsl/grammar/sql_select.rs @@ -0,0 +1,1171 @@ +//! The full SQL `SELECT` grammar fragment (ADR-0032). +//! +//! ADR-0030 Phase 2. This fragment is the standalone walkable +//! 
shape for the full standard-SQL `SELECT`: `INNER` / `LEFT` / +//! `RIGHT` / `FULL OUTER` / `CROSS` joins, `GROUP BY` / `HAVING`, +//! the four set operators (`UNION` / `UNION ALL` / `INTERSECT` +//! / `EXCEPT`), `WITH` and `WITH RECURSIVE` common table +//! expressions, `LIMIT … OFFSET`, `DISTINCT`, `t.*` projection, +//! and bare-alias projection (lifting ADR-0030 Phase 1 §4.2). +//! +//! Recursion into `SQL_SELECT_COMPOUND` is via `Node::Subgrammar` +//! at sub-phase 2a; sub-phase 2b replaces those references with +//! `Node::ScopedSubgrammar` for completion-scope discipline +//! (ADR-0032 §10.2). The Phase-1 `data::SELECT` `CommandNode` +//! continues to use its own grammar until sub-phase 2c's +//! migration — this fragment is reachable only from its own +//! tests in 2a. +//! +//! # BNF (ADR-0032 §1) +//! +//! ```text +//! select_statement := [ with_clause ] compound_select [ ';' ] +//! compound_select := select_core ( set_op select_core )* +//! [ order_by_clause ] +//! [ limit_clause ] +//! set_op := UNION [ ALL ] | INTERSECT | EXCEPT +//! select_core := SELECT [ DISTINCT | ALL ] +//! projection_list +//! [ from_clause ] +//! [ where_clause ] +//! [ group_by_clause ] +//! [ having_clause ] +//! with_clause := WITH [ RECURSIVE ] cte_def +//! ( ',' cte_def )* +//! cte_def := identifier [ '(' column_name_list ')' ] +//! AS '(' compound_select ')' +//! projection_list := projection_item ( ',' projection_item )* +//! projection_item := '*' +//! | identifier '.' '*' +//! | sql_expr [ [ AS ] identifier ] +//! from_clause := FROM table_source ( join_clause )* +//! table_source := identifier [ [ AS ] identifier ] +//! join_clause := [ INNER ] JOIN table_source ON sql_expr +//! | LEFT [ OUTER ] JOIN table_source ON sql_expr +//! | RIGHT [ OUTER ] JOIN table_source ON sql_expr +//! | FULL [ OUTER ] JOIN table_source ON sql_expr +//! | CROSS JOIN table_source +//! where_clause := WHERE sql_expr +//! group_by_clause := GROUP BY sql_expr ( ',' sql_expr )* +//! 
having_clause := HAVING sql_expr +//! order_by_clause := ORDER BY order_item ( ',' order_item )* +//! order_item := sql_expr [ ASC | DESC ] +//! limit_clause := LIMIT sql_expr [ OFFSET sql_expr ] +//! ``` +//! +//! # Disambiguation via `Node::Lookahead` +//! +//! Two places need lookahead to dispatch cleanly: +//! +//! - **Projection item** (ADR-0032 §1 `projection_item`). The +//! three alternatives all share a leading identifier shape +//! (`*` and the `ident . *` qualified wildcard, plus `sql_expr` +//! which also begins on an ident for the column-ref case). A +//! factory peeks the first 3 tokens to pick `*`, `ident . *`, +//! or `sql_expr [ alias ]`. +//! +//! - **Bare alias** (ADR-0032 §1 — lifts Phase-1 §4.2). The +//! walker's `walk_ident` happily matches keyword-shaped tokens +//! as identifiers, and `Choice`/`Optional` are first-match- +//! wins (no backtracking on a successful match). To prevent +//! bare-alias slots from swallowing continuation keywords, the +//! alias slot is a `Lookahead` that returns an empty `Choice` +//! (NoMatch) when the next ident-shaped token is a +//! continuation keyword for that position. + +use crate::dsl::grammar::{IdentSource, Node, ValidationError, Word, sql_expr}; +use crate::dsl::walker::context::WalkContext; +use crate::dsl::walker::lex_helpers::{consume_ident, skip_whitespace}; + +// ================================================================= +// Validators +// ================================================================= + +/// Reject internal `__rdbms_*` metadata tables in any +/// table-source slot (ADR-0030 §6 reused by ADR-0032 §4 — extends +/// to every Phase-2 table-source slot: `FROM`, `JOIN` targets, +/// CTE name, and the `FROM` inside any CTE body). 
+fn reject_internal_table(name: &str) -> Result<(), ValidationError> {
+    // Case-insensitive ASCII prefix test on the raw bytes: no
+    // lowercased copy of `name` is allocated just to inspect its
+    // first eight bytes. A name shorter than the prefix yields
+    // `None` from `get` and is therefore accepted.
+    const INTERNAL_PREFIX: &[u8] = b"__rdbms_";
+    let is_internal = matches!(
+        name.as_bytes().get(..INTERNAL_PREFIX.len()),
+        Some(head) if head.eq_ignore_ascii_case(INTERNAL_PREFIX)
+    );
+    if is_internal {
+        Err(ValidationError {
+            message_key: "select.internal_table",
+            args: vec![("table", name.to_string())],
+        })
+    } else {
+        Ok(())
+    }
+}
+
+// =================================================================
+// Shared leaf nodes
+// =================================================================
+
+const COMMA: Node = Node::Punct(',');
+const STAR: Node = Node::Punct('*');
+const LPAREN: Node = Node::Punct('(');
+const RPAREN: Node = Node::Punct(')');
+const SEMI: Node = Node::Punct(';');
+
+/// SQL expression slot — recursion into ADR-0031's fragment
+/// through `Node::Subgrammar`. Stays `Subgrammar` (not
+/// `ScopedSubgrammar`) — `sql_expr` recursion is part of the
+/// precedence ladder, not a new lexical scope (ADR-0032 §10.2).
+const SQL_EXPR: Node = Node::Subgrammar(&sql_expr::SQL_OR_EXPR);
+
+/// A node that never matches. Used as the "no" branch of
+/// lookahead-driven disambiguation: an empty `Choice` walks to
+/// `NoMatch`, which `Optional` / `Choice` gracefully treat as
+/// "skip" or "fall through to the next branch".
+const EMPTY_NOMATCH: Node = Node::Choice(&[]);
+
+// =================================================================
+// Bare-alias dispatch (ADR-0032 §1)
+// =================================================================
+//
+// The walker's `walk_ident` accepts any identifier-shape token,
+// including keyword-shape ones. With `Optional` / `Choice`
+// being first-match-wins, an unrestricted bare-alias slot would
+// greedily consume `FROM` / `WHERE` / `JOIN` / etc. as if they
+// were aliases. `Node::Lookahead` peeks the next token; when it
+// matches a continuation keyword for this position, the factory
+// returns `EMPTY_NOMATCH` so `Optional` skips and the keyword
+// reaches the next clause.
+
+/// Continuation keywords that may legitimately follow a
+/// projection item's bare alias (or its absence). Includes the
+/// `select_core` follow keywords and the compound-query / outer
+/// suffix keywords. `as` is not listed — the AS-form alias is a
+/// separate `Choice` branch that fires before the lookahead.
+const PROJECTION_FOLLOW_SET: &[&str] = &[
+    "from", "where", "group", "order", "having", "limit",
+    "union", "intersect", "except",
+];
+
+/// Continuation keywords that may legitimately follow a table
+/// source's bare alias (or its absence). Includes the join
+/// keywords (so `FROM a JOIN b` doesn't read `JOIN` as `a`'s
+/// alias) and the `select_core` / compound suffix keywords.
+/// `on` is included because `FROM a JOIN b ON …` reaches `on`
+/// only when `b` has no alias — `on` is not a base-table name a
+/// learner would type as an alias.
+const TABLE_SOURCE_FOLLOW_SET: &[&str] = &[
+    "where", "group", "order", "having", "limit",
+    "union", "intersect", "except",
+    "inner", "left", "right", "full", "cross", "join", "on",
+];
+
+/// Peek the identifier-shaped token at or after `pos`.
+///
+/// Calls `skip_whitespace(source, pos)`, then asks `consume_ident`
+/// for a span at the resulting position. Returns the spanned text
+/// ASCII-lowercased, or `None` when `consume_ident` yields no
+/// span. Only inspects `source`; the caller's position is
+/// unchanged.
+fn peek_next_ident_lower(source: &str, pos: usize) -> Option<String> {
+    let start = skip_whitespace(source, pos);
+    consume_ident(source, start)
+        .map(|(from, to)| source[from..to].to_ascii_lowercase())
+}
+
+/// Shared body of the two bare-alias lookahead factories.
+///
+/// Yields `BARE_ALIAS_IDENT` when the upcoming token is
+/// identifier-shaped and is not listed in `follow_set`; yields
+/// `EMPTY_NOMATCH` when it is a listed continuation keyword or
+/// when there is no identifier-shaped token to peek.
+fn bare_alias_unless_followed_by(
+    source: &str,
+    pos: usize,
+    follow_set: &[&str],
+) -> Node {
+    match peek_next_ident_lower(source, pos) {
+        Some(word) if !follow_set.contains(&word.as_str()) => {
+            BARE_ALIAS_IDENT
+        }
+        _ => EMPTY_NOMATCH,
+    }
+}
+
+/// Lookahead factory for the bare alias after a projection
+/// expression, gated by `PROJECTION_FOLLOW_SET`.
+fn projection_bare_alias_factory(
+    _: &WalkContext,
+    source: &str,
+    pos: usize,
+) -> Node {
+    bare_alias_unless_followed_by(source, pos, PROJECTION_FOLLOW_SET)
+}
+
+/// Lookahead factory for the bare alias after a table name,
+/// gated by `TABLE_SOURCE_FOLLOW_SET`.
+fn table_source_bare_alias_factory(
+    _: &WalkContext,
+    source: &str,
+    pos: usize,
+) -> Node {
+    bare_alias_unless_followed_by(source, pos, TABLE_SOURCE_FOLLOW_SET)
+}
+
+// =================================================================
+// Alias slot
+// 
=================================================================
+
+/// Alias-name slot: a `NewName` identifier with role
+/// `select_alias` and no validator. Second element of the
+/// explicit `AS alias` sequence below.
+const BARE_ALIAS_IDENT: Node = Node::Ident {
+    source: IdentSource::NewName,
+    role: "select_alias",
+    validator: None,
+    highlight_override: None,
+    writes_table: false,
+    writes_column: false,
+    writes_user_listed_column: false,
+};
+
+/// Explicit alias form: the `as` keyword followed by the alias
+/// name.
+static AS_ALIAS_NODES: &[Node] = &[
+    Node::Word(Word::keyword("as")),
+    BARE_ALIAS_IDENT,
+];
+const AS_ALIAS_EXPLICIT: Node = Node::Seq(AS_ALIAS_NODES);
+
+/// Alias forms for a projection item. Order is significant: the
+/// explicit `AS` form is listed before the lookahead-gated bare
+/// alias.
+static PROJECTION_ALIAS_CHOICES: &[Node] = &[
+    AS_ALIAS_EXPLICIT,
+    Node::Lookahead(projection_bare_alias_factory),
+];
+const PROJECTION_ALIAS_CHOICE: Node = Node::Choice(PROJECTION_ALIAS_CHOICES);
+const PROJECTION_ALIAS_OPTIONAL: Node =
+    Node::Optional(&PROJECTION_ALIAS_CHOICE);
+
+/// Alias forms for a table source; same shape as the projection
+/// alias, but the bare form is gated by the table-source factory.
+static TABLE_SOURCE_ALIAS_CHOICES: &[Node] = &[
+    AS_ALIAS_EXPLICIT,
+    Node::Lookahead(table_source_bare_alias_factory),
+];
+const TABLE_SOURCE_ALIAS_CHOICE: Node =
+    Node::Choice(TABLE_SOURCE_ALIAS_CHOICES);
+const TABLE_SOURCE_ALIAS_OPTIONAL: Node =
+    Node::Optional(&TABLE_SOURCE_ALIAS_CHOICE);
+
+// =================================================================
+// Projection item
+// =================================================================
+
+/// The qualifier in a qualified wildcard (`t` in `t.*`): an
+/// identifier drawn from `IdentSource::Tables`.
+// NOTE(review): unlike `TABLE_NAME_IDENT`, this slot carries no
+// `reject_internal_table` validator — confirm that is intended.
+const QUALIFIED_STAR_QUALIFIER: Node = Node::Ident {
+    source: IdentSource::Tables,
+    role: "qualified_star_qualifier",
+    validator: None,
+    highlight_override: None,
+    writes_table: false,
+    writes_column: false,
+    writes_user_listed_column: false,
+};
+
+/// `qualifier . *` as three consecutive nodes.
+static QUALIFIED_STAR_NODES: &[Node] = &[
+    QUALIFIED_STAR_QUALIFIER,
+    Node::Punct('.'),
+    Node::Punct('*'),
+];
+const QUALIFIED_STAR: Node = Node::Seq(QUALIFIED_STAR_NODES);
+
+/// An expression followed by the optional projection alias.
+static PROJECTION_EXPR_ITEM_NODES: &[Node] = &[
+    SQL_EXPR,
+    PROJECTION_ALIAS_OPTIONAL,
+];
+const PROJECTION_EXPR_ITEM: Node = Node::Seq(PROJECTION_EXPR_ITEM_NODES);
+
+/// Dispatch one projection item via a 3-token lookahead.
+///
+/// - `*` (and only `*`) → bare wildcard.
+/// - `ident . *` → qualified wildcard.
+/// - anything else → `sql_expr [ alias ]`.
+///
+/// The factory is the cleanest way to handle the shared-prefix
+/// ambiguity between `t.*` and `sql_expr` (which can match a
+/// bare `t`), since the walker's `Choice` doesn't backtrack on
+/// a committed match.
+fn projection_item_factory(
+    _: &WalkContext,
+    source: &str,
+    pos: usize,
+) -> Node {
+    let p = skip_whitespace(source, pos);
+    let bytes = source.as_bytes();
+    // A leading `*` byte selects the bare wildcard.
+    if bytes.get(p) == Some(&b'*') {
+        return STAR;
+    }
+    // Otherwise look for `ident`, then `.`, then `*`, calling
+    // `skip_whitespace` between each piece.
+    if let Some((_, end1)) = consume_ident(source, p) {
+        let after_ident = skip_whitespace(source, end1);
+        if bytes.get(after_ident) == Some(&b'.') {
+            let after_dot = skip_whitespace(source, after_ident + 1);
+            if bytes.get(after_dot) == Some(&b'*') {
+                return QUALIFIED_STAR;
+            }
+        }
+    }
+    // Everything else is handed to the expression-plus-alias form.
+    PROJECTION_EXPR_ITEM
+}
+
+const PROJECTION_ITEM: Node = Node::Lookahead(projection_item_factory);
+
+/// One or more projection items separated by commas.
+const PROJECTION_LIST: Node = Node::Repeated {
+    inner: &PROJECTION_ITEM,
+    separator: Some(&COMMA),
+    min: 1,
+};
+
+// =================================================================
+// DISTINCT / ALL prefix
+// =================================================================
+
+/// The optional `DISTINCT` / `ALL` modifier after `SELECT`.
+static DISTINCT_OR_ALL_CHOICES: &[Node] = &[
+    Node::Word(Word::keyword("distinct")),
+    Node::Word(Word::keyword("all")),
+];
+const DISTINCT_OR_ALL_CHOICE: Node = Node::Choice(DISTINCT_OR_ALL_CHOICES);
+const DISTINCT_OR_ALL_OPTIONAL: Node =
+    Node::Optional(&DISTINCT_OR_ALL_CHOICE);
+
+// =================================================================
+// Table source (FROM / JOIN target)
+// =================================================================
+
+/// Table-name slot used by `FROM` and every `JOIN` target; the
+/// name is checked by `reject_internal_table`.
+const TABLE_NAME_IDENT: Node = Node::Ident {
+    source: IdentSource::Tables,
+    role: "table_name",
+    validator: Some(reject_internal_table),
+    highlight_override: None,
+    writes_table: false,
+    writes_column: false,
+    writes_user_listed_column: false,
+};
+
+/// A table name followed by the optional table-source alias.
+static TABLE_SOURCE_NODES: &[Node] = &[
+    TABLE_NAME_IDENT,
+    TABLE_SOURCE_ALIAS_OPTIONAL,
+];
+const TABLE_SOURCE: Node = Node::Seq(TABLE_SOURCE_NODES);
+
+// =================================================================
+// JOIN flavours
+// =================================================================
+
+const JOIN_WORD: Node = Node::Word(Word::keyword("join"));
+const ON_WORD: Node = Node::Word(Word::keyword("on"));
+const OUTER_OPTIONAL: Node =
+    Node::Optional(&Node::Word(Word::keyword("outer")));
+
+// `INNER JOIN` and bare `JOIN` are split into two Choice
+// branches so each branch has a distinct leading keyword
+// (`inner` vs `join`). Avoids the "optional leading child →
+// idx > 0 → EOF becomes Incomplete" hazard in walk_seq that a
+// shared `Optional(Word("inner"))` would otherwise create.
+static INNER_JOIN_NODES: &[Node] = &[
+    Node::Word(Word::keyword("inner")),
+    JOIN_WORD,
+    TABLE_SOURCE,
+    ON_WORD,
+    SQL_EXPR,
+];
+
+static BARE_JOIN_NODES: &[Node] = &[
+    JOIN_WORD,
+    TABLE_SOURCE,
+    ON_WORD,
+    SQL_EXPR,
+];
+
+// LEFT / RIGHT / FULL share one shape: leading keyword, optional
+// `outer`, `join`, table source, `on`, expression.
+static LEFT_JOIN_NODES: &[Node] = &[
+    Node::Word(Word::keyword("left")),
+    OUTER_OPTIONAL,
+    JOIN_WORD,
+    TABLE_SOURCE,
+    ON_WORD,
+    SQL_EXPR,
+];
+
+static RIGHT_JOIN_NODES: &[Node] = &[
+    Node::Word(Word::keyword("right")),
+    OUTER_OPTIONAL,
+    JOIN_WORD,
+    TABLE_SOURCE,
+    ON_WORD,
+    SQL_EXPR,
+];
+
+static FULL_JOIN_NODES: &[Node] = &[
+    Node::Word(Word::keyword("full")),
+    OUTER_OPTIONAL,
+    JOIN_WORD,
+    TABLE_SOURCE,
+    ON_WORD,
+    SQL_EXPR,
+];
+
+// `CROSS JOIN` has no `on` / expression tail.
+static CROSS_JOIN_NODES: &[Node] = &[
+    Node::Word(Word::keyword("cross")),
+    JOIN_WORD,
+    TABLE_SOURCE,
+];
+
+/// JOIN flavour dispatch. Each branch has a distinct leading
+/// keyword so `Choice` first-match-wins discriminates cleanly
+/// without invoking the walker's `Optional`-leading-child
+/// hazard.
+static JOIN_CLAUSE_CHOICES: &[Node] = &[
+    Node::Seq(LEFT_JOIN_NODES),
+    Node::Seq(RIGHT_JOIN_NODES),
+    Node::Seq(FULL_JOIN_NODES),
+    Node::Seq(CROSS_JOIN_NODES),
+    Node::Seq(INNER_JOIN_NODES),
+    Node::Seq(BARE_JOIN_NODES),
+];
+const JOIN_CLAUSE: Node = Node::Choice(JOIN_CLAUSE_CHOICES);
+
+// =================================================================
+// FROM / WHERE / GROUP BY / HAVING
+// =================================================================
+
+/// `FROM table_source` followed by zero or more join clauses
+/// (no separator between them).
+static FROM_CLAUSE_NODES: &[Node] = &[
+    Node::Word(Word::keyword("from")),
+    TABLE_SOURCE,
+    Node::Repeated {
+        inner: &JOIN_CLAUSE,
+        separator: None,
+        min: 0,
+    },
+];
+const FROM_CLAUSE: Node = Node::Seq(FROM_CLAUSE_NODES);
+
+/// `WHERE` followed by one expression.
+static WHERE_CLAUSE_NODES: &[Node] = &[
+    Node::Word(Word::keyword("where")),
+    SQL_EXPR,
+];
+const WHERE_CLAUSE: Node = Node::Seq(WHERE_CLAUSE_NODES);
+
+/// `GROUP BY` followed by one or more comma-separated
+/// expressions.
+static GROUP_BY_CLAUSE_NODES: &[Node] = &[
+    Node::Word(Word::keyword("group")),
+    Node::Word(Word::keyword("by")),
+    Node::Repeated {
+        inner: &SQL_EXPR,
+        separator: Some(&COMMA),
+        min: 1,
+    },
+];
+const GROUP_BY_CLAUSE: Node = Node::Seq(GROUP_BY_CLAUSE_NODES);
+
+/// `HAVING` followed by one expression.
+static HAVING_CLAUSE_NODES: &[Node] = &[
+    Node::Word(Word::keyword("having")),
+    SQL_EXPR,
+];
+const HAVING_CLAUSE: Node = Node::Seq(HAVING_CLAUSE_NODES);
+
+// =================================================================
+// ORDER BY / LIMIT / OFFSET
+// =================================================================
+
+static ASC_DESC_CHOICES: &[Node] = &[
+    Node::Word(Word::keyword("asc")),
+    Node::Word(Word::keyword("desc")),
+];
+const ASC_DESC_CHOICE: Node = Node::Choice(ASC_DESC_CHOICES);
+/// One `ORDER BY` item: an expression with an optional
+/// `ASC` / `DESC` direction.
+static ORDER_ITEM_NODES: &[Node] = &[
+    SQL_EXPR,
+    Node::Optional(&ASC_DESC_CHOICE),
+];
+const ORDER_ITEM: Node = Node::Seq(ORDER_ITEM_NODES);
+
+/// `ORDER BY` followed by one or more comma-separated order
+/// items.
+static ORDER_BY_CLAUSE_NODES: &[Node] = &[
+    Node::Word(Word::keyword("order")),
+    Node::Word(Word::keyword("by")),
+    Node::Repeated {
+        inner: &ORDER_ITEM,
+        separator: Some(&COMMA),
+        min: 1,
+    },
+];
+const ORDER_BY_CLAUSE: Node = Node::Seq(ORDER_BY_CLAUSE_NODES);
+
+/// `OFFSET` followed by one expression; optional tail of `LIMIT`.
+static OFFSET_NODES: &[Node] = &[
+    Node::Word(Word::keyword("offset")),
+    SQL_EXPR,
+];
+const OFFSET_SEQ: Node = Node::Seq(OFFSET_NODES);
+const OFFSET_OPTIONAL: Node = Node::Optional(&OFFSET_SEQ);
+
+/// `LIMIT expr` with the optional `OFFSET expr` tail.
+static LIMIT_CLAUSE_NODES: &[Node] = &[
+    Node::Word(Word::keyword("limit")),
+    SQL_EXPR,
+    OFFSET_OPTIONAL,
+];
+const LIMIT_CLAUSE: Node = Node::Seq(LIMIT_CLAUSE_NODES);
+
+// =================================================================
+// select_core (per-leg of a compound)
+// =================================================================
+
+/// One `SELECT` leg: the keyword, the optional `DISTINCT` / `ALL`
+/// modifier, the projection list, then the optional FROM, WHERE,
+/// GROUP BY and HAVING clauses in that fixed order.
+static SELECT_CORE_NODES: &[Node] = &[
+    Node::Word(Word::keyword("select")),
+    DISTINCT_OR_ALL_OPTIONAL,
+    PROJECTION_LIST,
+    Node::Optional(&FROM_CLAUSE),
+    Node::Optional(&WHERE_CLAUSE),
+    Node::Optional(&GROUP_BY_CLAUSE),
+    Node::Optional(&HAVING_CLAUSE),
+];
+const SELECT_CORE: Node = Node::Seq(SELECT_CORE_NODES);
+
+// =================================================================
+// compound_select
+// =================================================================
+//
+// `UNION` and `UNION ALL` are factored as one `Seq[union,
+// Optional(all)]` branch so the Choice doesn't commit on `union`
+// inside a multi-token branch and then fail when `all` is
+// missing. The trailing `Optional(all)` is the last child of
+// the Seq, so a skip there doesn't trigger the
+// optional-leading-then-EOF-becomes-Incomplete hazard.
+static UNION_OR_UNION_ALL_NODES: &[Node] = &[ + Node::Word(Word::keyword("union")), + Node::Optional(&Node::Word(Word::keyword("all"))), +]; +static SET_OP_CHOICES: &[Node] = &[ + Node::Seq(UNION_OR_UNION_ALL_NODES), + Node::Word(Word::keyword("intersect")), + Node::Word(Word::keyword("except")), +]; +const SET_OP: Node = Node::Choice(SET_OP_CHOICES); + +static SET_OP_TAIL_NODES: &[Node] = &[SET_OP, SELECT_CORE]; +const SET_OP_TAIL: Node = Node::Seq(SET_OP_TAIL_NODES); + +static COMPOUND_SELECT_NODES: &[Node] = &[ + SELECT_CORE, + Node::Repeated { + inner: &SET_OP_TAIL, + separator: None, + min: 0, + }, + Node::Optional(&ORDER_BY_CLAUSE), + Node::Optional(&LIMIT_CLAUSE), +]; +/// The compound-select fragment that subqueries / CTE bodies +/// recurse into via `Subgrammar` (2a) / `ScopedSubgrammar` (2b). +/// Omits the outer `with_clause`; that lives on +/// `SQL_SELECT_STATEMENT`. +pub static SQL_SELECT_COMPOUND: Node = Node::Seq(COMPOUND_SELECT_NODES); + +// ================================================================= +// CTE definitions +// ================================================================= + +const CTE_NAME_IDENT: Node = Node::Ident { + source: IdentSource::NewName, + role: "cte_name", + validator: Some(reject_internal_table), + highlight_override: None, + writes_table: false, + writes_column: false, + writes_user_listed_column: false, +}; + +const CTE_COLUMN_IDENT: Node = Node::Ident { + source: IdentSource::NewName, + role: "cte_column", + validator: None, + highlight_override: None, + writes_table: false, + writes_column: false, + writes_user_listed_column: false, +}; + +static CTE_COLUMN_LIST_NODES: &[Node] = &[ + LPAREN, + Node::Repeated { + inner: &CTE_COLUMN_IDENT, + separator: Some(&COMMA), + min: 1, + }, + RPAREN, +]; +const CTE_COLUMN_LIST_SEQ: Node = Node::Seq(CTE_COLUMN_LIST_NODES); +const CTE_COLUMN_LIST_OPTIONAL: Node = + Node::Optional(&CTE_COLUMN_LIST_SEQ); + +static CTE_BODY_NODES: &[Node] = &[ + LPAREN, + 
Node::Subgrammar(&SQL_SELECT_COMPOUND), + RPAREN, +]; +const CTE_BODY: Node = Node::Seq(CTE_BODY_NODES); + +static CTE_DEF_NODES: &[Node] = &[ + CTE_NAME_IDENT, + CTE_COLUMN_LIST_OPTIONAL, + Node::Word(Word::keyword("as")), + CTE_BODY, +]; +const CTE_DEF: Node = Node::Seq(CTE_DEF_NODES); + +static WITH_CLAUSE_NODES: &[Node] = &[ + Node::Word(Word::keyword("with")), + Node::Optional(&Node::Word(Word::keyword("recursive"))), + Node::Repeated { + inner: &CTE_DEF, + separator: Some(&COMMA), + min: 1, + }, +]; +const WITH_CLAUSE: Node = Node::Seq(WITH_CLAUSE_NODES); + +// ================================================================= +// select_statement — the fragment entry point +// ================================================================= + +static SELECT_STATEMENT_NODES: &[Node] = &[ + Node::Optional(&WITH_CLAUSE), + Node::Subgrammar(&SQL_SELECT_COMPOUND), + Node::Optional(&SEMI), +]; +/// The full statement, including the optional `WITH` prefix and +/// a tolerated trailing `;`. This is what `data::SELECT`'s +/// `CommandNode` will reference once sub-phase 2c migrates the +/// Phase-1 grammar. +pub static SQL_SELECT_STATEMENT: Node = Node::Seq(SELECT_STATEMENT_NODES); + +// ================================================================= +// Tests +// ================================================================= + +#[cfg(test)] +mod tests { + use super::{SQL_SELECT_COMPOUND, SQL_SELECT_STATEMENT}; + use crate::dsl::grammar::Node; + use crate::dsl::walker::context::WalkContext; + use crate::dsl::walker::driver::{NodeWalkResult, walk_node}; + use crate::dsl::walker::outcome::MatchedPath; + + /// Walk `input` against `fragment`. Returns `true` only when + /// the walk matches *and* consumes all of `input` (trailing + /// whitespace allowed). 
+ fn walks_via(fragment: &'static Node, input: &str) -> bool { + let mut ctx = WalkContext::new(); + let mut path = MatchedPath::new(); + let mut per_byte = Vec::new(); + match walk_node(input, 0, fragment, &mut ctx, &mut path, &mut per_byte) { + NodeWalkResult::Matched { end, .. } => { + input[end..].trim().is_empty() + } + _ => false, + } + } + + fn walks(input: &str) -> bool { + walks_via(&SQL_SELECT_STATEMENT, input) + } + + fn good(input: &str) { + assert!( + walks(input), + "{input:?} should be a valid SELECT statement" + ); + } + + fn bad(input: &str) { + assert!( + !walks(input), + "{input:?} should NOT walk as a complete SELECT statement" + ); + } + + // ----- minimal forms ----- + + #[test] + fn bare_constant_select_with_no_from() { + good("select 1"); + good("select 'hello'"); + good("select null"); + good("select true"); + good("select false"); + } + + #[test] + fn single_table_select_star() { + good("select * from users"); + good("select * from users;"); + } + + #[test] + fn single_column_projection() { + good("select name from users"); + good("select name, age from users"); + good("select name, age, email from users"); + } + + // ----- DISTINCT / ALL ----- + + #[test] + fn distinct_modifier() { + good("select distinct name from users"); + good("select distinct a, b from t"); + } + + #[test] + fn all_modifier() { + good("select all name from users"); + } + + // Note: `select distinct all name from users` and the like + // are admitted structurally — the second keyword parses as + // a column reference (the walker doesn't reject keyword-shape + // idents as columns). Engine semantics deals with it. This + // matches ADR-0030's "grammar admits, engine rejects" posture. 
+ + // ----- projection wildcard / qualified-star / alias ----- + + #[test] + fn qualified_star_projection() { + good("select users.* from users"); + good("select u.* from users u"); + good("select a.*, b.* from a join b on x = y"); + } + + #[test] + fn mixed_projection_with_qualified_star() { + good("select users.*, age from users"); + } + + #[test] + fn projection_with_as_alias() { + good("select name as n from users"); + good("select name as n, age as a from users"); + } + + #[test] + fn projection_with_bare_alias() { + good("select name n from users"); + good("select name n, age a from users"); + } + + #[test] + fn projection_alias_mixed_forms() { + good("select name as n, age a, email from users"); + } + + #[test] + fn projection_bare_alias_does_not_swallow_from() { + // The bare-alias lookahead must skip when next ident + // is `from`; otherwise this would fail with "alias `from` + // followed by nothing". + good("select name from users"); + } + + #[test] + fn projection_bare_alias_does_not_swallow_where_or_group_etc() { + good("select name from users where id > 0"); + good("select name from users group by name"); + good("select name from users order by name"); + good("select name from users limit 5"); + good("select name from users group by name having count(*) > 1"); + } + + #[test] + fn projection_expression_with_arithmetic() { + good("select a + b from t"); + good("select a + b as total from t"); + good("select a * 2 from t"); + } + + #[test] + fn projection_function_calls() { + good("select upper(name) from users"); + good("select count(*) from users"); + good("select count(distinct customer_id) from orders"); + } + + // ----- FROM / JOIN flavours ----- + + #[test] + fn from_with_table_alias() { + good("select * from users u"); + good("select * from users as u"); + } + + #[test] + fn inner_join_explicit() { + good("select * from a inner join b on x = y"); + } + + #[test] + fn inner_join_bare() { + good("select * from a join b on x = y"); + } + + #[test] 
+ fn left_outer_join() { + good("select * from a left join b on x = y"); + good("select * from a left outer join b on x = y"); + } + + #[test] + fn right_outer_join() { + good("select * from a right join b on x = y"); + good("select * from a right outer join b on x = y"); + } + + #[test] + fn full_outer_join() { + good("select * from a full join b on x = y"); + good("select * from a full outer join b on x = y"); + } + + #[test] + fn cross_join() { + good("select * from a cross join b"); + } + + #[test] + fn cross_join_with_no_on() { + // CROSS JOIN takes no ON; an ON clause is a parse error. + bad("select * from a cross join b on x = y"); + } + + #[test] + fn chained_joins() { + good("select * from a join b on x = y join c on y = z"); + good("select * from a left join b on x = y inner join c on y = z"); + } + + #[test] + fn join_with_table_aliases() { + good("select * from a u join b v on x = y"); + good("select * from a as u join b as v on x = y"); + } + + // ----- WHERE / GROUP BY / HAVING ----- + + #[test] + fn where_clause() { + good("select * from t where id = 1"); + good("select * from t where a > 0 and b < 10"); + } + + #[test] + fn group_by_single_column() { + good("select name from t group by name"); + } + + #[test] + fn group_by_multiple_columns() { + good("select a, b from t group by a, b"); + } + + #[test] + fn group_by_expression() { + good("select count(*) from t group by upper(name)"); + } + + #[test] + fn having_clause() { + good("select name from t group by name having count(*) > 1"); + // HAVING without GROUP BY is admitted structurally; + // engine may reject. The grammar admits it. 
+        good("select count(*) from t having count(*) > 0");
+    }
+
+    // ----- set operators -----
+
+    #[test]
+    fn union_two_selects() { // `good` is defined earlier; presumably asserts the input walks
+        good("select a from t union select b from u");
+    }
+
+    #[test]
+    fn union_all_two_selects() {
+        good("select a from t union all select b from u");
+    }
+
+    #[test]
+    fn intersect_two_selects() {
+        good("select a from t intersect select b from u");
+    }
+
+    #[test]
+    fn except_two_selects() {
+        good("select a from t except select b from u");
+    }
+
+    #[test]
+    fn set_op_chain() { // UNION followed by INTERSECT in one statement
+        good(
+            "select a from t union select b from u intersect select c from v",
+        );
+    }
+
+    #[test]
+    fn set_op_with_outer_order_by_and_limit() { // ORDER BY + LIMIT after the last operand
+        good(
+            "select a from t union select b from u order by a limit 10",
+        );
+    }
+
+    // ----- ORDER BY / LIMIT / OFFSET -----
+
+    #[test]
+    fn order_by_single_column() {
+        good("select * from t order by name");
+    }
+
+    #[test]
+    fn order_by_with_direction() {
+        good("select * from t order by name asc");
+        good("select * from t order by name desc");
+    }
+
+    #[test]
+    fn order_by_multiple_items() {
+        good("select * from t order by name asc, age desc");
+    }
+
+    #[test]
+    fn order_by_column_position() {
+        // A column-position reference falls out of `sql_expr`
+        // (an integer literal is a valid expression).
+        good("select a, b from t order by 1");
+        good("select a, b from t order by 1, 2 desc");
+    }
+
+    #[test]
+    fn limit_only() {
+        good("select * from t limit 10");
+    }
+
+    #[test]
+    fn limit_with_offset() {
+        good("select * from t limit 10 offset 5");
+    }
+
+    #[test]
+    fn legacy_limit_comma_form_rejected() { // `bad` is defined earlier; presumably asserts rejection
+        // `LIMIT m, n` (offset-first MySQL/SQLite legacy) is
+        // OOS per ADR-0032 §13 OOS-4.
+        bad("select * from t limit 5, 10");
+    }
+
+    // ----- CTEs -----
+
+    #[test]
+    fn non_recursive_cte() {
+        good("with x as (select 1) select * from x");
+    }
+
+    #[test]
+    fn non_recursive_cte_select_star() {
+        good("with x as (select * from users) select * from x");
+    }
+
+    #[test]
+    fn cte_with_column_list_rename() {
+        good("with x(n) as (select name from users) select n from x"); // one listed column
+        good("with x(a, b) as (select a, b from t) select * from x"); // two listed columns
+    }
+
+    #[test]
+    fn recursive_cte() {
+        good(
+            "with recursive r as (select 1 union all select 2) select * from r",
+        );
+    }
+
+    #[test]
+    fn multiple_ctes() { // two comma-separated CTEs, then a UNION reading from both
+        good(
+            "with a as (select 1), b as (select 2) select * from a union select * from b",
+        );
+    }
+
+    // ----- subquery shapes (recursion through SQL_SELECT_COMPOUND) -----
+    //
+    // True subquery expressions inside `sql_expr` arrive in 2b
+    // (additive `Choice` branches in `sql_expr.rs`). 2a verifies
+    // that the compound fragment recurses cleanly from CTE
+    // bodies and that the nesting-depth cap still fires.
+
+    #[test]
+    fn nested_cte_body_with_union() { // the CTE body is itself a compound (UNION) select
+        good(
+            "with x as (select 1 union select 2) select * from x",
+        );
+    }
+
+    // ----- case insensitivity / spacing -----
+
+    #[test]
+    fn keywords_are_case_insensitive() {
+        good("SELECT * FROM users");
+        good("Select Distinct A From T Where Id = 1 Order By A Desc Limit 5 Offset 2");
+        good("WITH RECURSIVE r AS (SELECT 1 UNION ALL SELECT 2) SELECT * FROM r");
+    }
+
+    #[test]
+    fn trailing_semicolon_tolerated() {
+        good("select 1;");
+        good("select * from users;");
+        good("with x as (select 1) select * from x;");
+    }
+
+    // ----- malformed input -----
+
+    #[test]
+    fn empty_projection_rejected() {
+        // Note: `select from t` is structurally admitted as
+        // `expr AS alias` (presumably `from` is the column ref,
+        // `t` its alias — TODO confirm): the walker does not
+        // reject keyword-shape idents as column refs. This
+        // matches ADR-0030's posture (grammar admits, engine
+        // rejects). Bare `select` alone is still rejected:
+        // there is no expression to match.
+        bad("select");
+    }
+
+    #[test]
+    fn missing_join_target() {
+        bad("select * from a join"); // nothing after JOIN
+        bad("select * from a join b"); // join table present, no ON
+        bad("select * from a join b on"); // ON with no condition
+    }
+
+    #[test]
+    fn dangling_set_op() {
+        bad("select a from t union"); // nothing after UNION
+        bad("select a from t union select"); // right-hand select has no projection
+    }
+
+    #[test]
+    fn dangling_clauses() { // each input ends right after a clause keyword
+        bad("select a from t where");
+        bad("select a from t order by");
+        bad("select a from t group by");
+        bad("select a from t having");
+        bad("select a from t limit");
+        bad("select a from t limit 5 offset");
+    }
+
+    #[test]
+    fn cte_missing_body() {
+        bad("with x as select 1"); // body not parenthesised
+        bad("with x as ("); // paren opened, never closed
+        bad("with x as ()"); // empty parens
+    }
+
+    #[test]
+    fn cte_missing_as() { // no AS between the CTE name and its body
+        bad("with x (select 1) select * from x");
+    }
+
+    #[test]
+    fn bare_recursive_without_with_is_invalid() {
+        bad("recursive r as (select 1) select * from r");
+    }
+
+    // ----- OOS shapes (ADR-0032 §13) -----
+
+    #[test]
+    fn comma_from_is_rejected() {
+        // OOS-3: implicit cross join via comma list.
+        bad("select * from a, b");
+    }
+
+    #[test]
+    fn natural_join_rejected() {
+        // OOS-2: `NATURAL JOIN` is out of scope.
+        bad("select * from a natural join b");
+    }
+
+    #[test]
+    fn using_clause_rejected() {
+        // OOS-2: `JOIN … USING (…)` is out of scope.
+        bad("select * from a join b using (id)");
+    }
+
+    #[test]
+    fn values_row_source_rejected() {
+        // OOS-7: a parenthesised `VALUES` row source in FROM.
+        bad("select * from (values (1), (2))");
+    }
+
+    #[test]
+    fn lateral_join_rejected() {
+        // OOS-6. The bare comma-FROM form is rejected because
+        // we do not admit comma-separated FROM lists (OOS-3),
+        // so `from a, lateral …` cannot parse as a join. The
+        // single-token `LATERAL JOIN` form is admitted
+        // structurally — `lateral` parses as a table-source
+        // bare alias for `a` and the JOIN that follows is just
+        // a normal join. This matches the rest of the grammar's
+        // posture: keyword-shape identifiers are admitted as
+        // names; non-admitted syntactic forms (comma-FROM) are
+        // what make a query reject.
+        bad("select * from a, lateral (select 1)"); // only the comma form is asserted here
+    }
+
+    #[test]
+    fn window_function_rejected() {
+        // OOS-5: `OVER (…)` window clauses are not part of the
+        // Phase-2 grammar.
+        bad("select row_number() over () from t");
+        bad("select sum(x) over (partition by y) from t");
+    }
+
+    #[test]
+    fn derived_table_in_from_rejected() {
+        // OOS-1: `FROM (SELECT …) alias` is OOS.
+        // CTEs cover the same use case.
+        bad("select * from (select * from users) sub"); // bare alias
+        bad("select * from (select * from users) as sub"); // AS alias
+    }
+
+    // ----- internal-table rejection (ADR-0030 §6) -----
+
+    #[test]
+    fn internal_table_in_from_rejected() { // both names carry the `__rdbms_` prefix
+        bad("select * from __rdbms_columns");
+        bad("select * from __rdbms_playground_columns");
+    }
+
+    #[test]
+    fn internal_table_as_cte_name_rejected() {
+        bad("with __rdbms_x as (select 1) select * from __rdbms_x");
+    }
+
+    #[test]
+    fn internal_table_in_cte_body_rejected() {
+        bad("with x as (select * from __rdbms_columns) select * from x");
+    }
+
+    #[test]
+    fn internal_table_in_join_rejected() {
+        bad("select * from users join __rdbms_columns on x = y");
+    }
+
+    // ----- depth cap (ADR-0026 §1 / ADR-0032 §9) -----
+
+    #[test]
+    fn pathological_nesting_capped() {
+        // Deep parenthesised CTE-body chain is rejected by the
+        // shared `MAX_SUBGRAMMAR_DEPTH = 64` cap, not by stack
+        // overflow.
+        let depth = 200; // well above the 64-level cap named above
+        let mut input = String::new();
+        for _ in 0..depth { // open `depth` nested CTE bodies
+            input.push_str("with x as (");
+        }
+        input.push_str("select 1"); // innermost body
+        for _ in 0..depth { // close each body and follow it with its outer select
+            input.push_str(") select * from x");
+        }
+        assert!(!walks(&input)); // checks rejection only, not which rule rejected it
+    }
+
+    // ----- compound-select fragment entry point -----
+
+    #[test]
+    fn compound_fragment_walks_without_with_clause() {
+        // SQL_SELECT_COMPOUND is what subqueries / CTE bodies
+        // recurse into. It admits a select_core + optional
+        // set-op chain + outer ORDER/LIMIT.
+        assert!(walks_via(&SQL_SELECT_COMPOUND, "select 1")); // bare select core
+        assert!(walks_via(
+            &SQL_SELECT_COMPOUND,
+            "select a from t union select b from u",
+        )); // select core plus one set-op
+        assert!(!walks_via(
+            &SQL_SELECT_COMPOUND,
+            "with x as (select 1) select * from x",
+        )); // a leading WITH is expected not to walk via this fragment
+    }
+}