diff --git a/src/dsl/grammar/mod.rs b/src/dsl/grammar/mod.rs index 050e96d..f090738 100644 --- a/src/dsl/grammar/mod.rs +++ b/src/dsl/grammar/mod.rs @@ -28,6 +28,7 @@ pub mod ddl; pub mod expr; pub mod shared; pub mod sql_expr; +pub mod sql_select; use crate::dsl::command::Command; use crate::dsl::walker::context::WalkContext; diff --git a/src/dsl/grammar/sql_select.rs b/src/dsl/grammar/sql_select.rs new file mode 100644 index 0000000..953c77f --- /dev/null +++ b/src/dsl/grammar/sql_select.rs @@ -0,0 +1,1171 @@ +//! The full SQL `SELECT` grammar fragment (ADR-0032). +//! +//! ADR-0030 Phase 2. This fragment is the standalone walkable +//! shape for the full standard-SQL `SELECT`: `INNER` / `LEFT` / +//! `RIGHT` / `FULL OUTER` / `CROSS` joins, `GROUP BY` / `HAVING`, +//! the four set operators (`UNION` / `UNION ALL` / `INTERSECT` +//! / `EXCEPT`), `WITH` and `WITH RECURSIVE` common table +//! expressions, `LIMIT … OFFSET`, `DISTINCT`, `t.*` projection, +//! and bare-alias projection (lifting ADR-0030 Phase 1 §4.2). +//! +//! Recursion into `SQL_SELECT_COMPOUND` is via `Node::Subgrammar` +//! at sub-phase 2a; sub-phase 2b replaces those references with +//! `Node::ScopedSubgrammar` for completion-scope discipline +//! (ADR-0032 §10.2). The Phase-1 `data::SELECT` `CommandNode` +//! continues to use its own grammar until sub-phase 2c's +//! migration — this fragment is reachable only from its own +//! tests in 2a. +//! +//! # BNF (ADR-0032 §1) +//! +//! ```text +//! select_statement := [ with_clause ] compound_select [ ';' ] +//! compound_select := select_core ( set_op select_core )* +//! [ order_by_clause ] +//! [ limit_clause ] +//! set_op := UNION [ ALL ] | INTERSECT | EXCEPT +//! select_core := SELECT [ DISTINCT | ALL ] +//! projection_list +//! [ from_clause ] +//! [ where_clause ] +//! [ group_by_clause ] +//! [ having_clause ] +//! with_clause := WITH [ RECURSIVE ] cte_def +//! ( ',' cte_def )* +//! cte_def := identifier [ '(' column_name_list ')' ] +//! 
AS '(' compound_select ')' +//! projection_list := projection_item ( ',' projection_item )* +//! projection_item := '*' +//! | identifier '.' '*' +//! | sql_expr [ [ AS ] identifier ] +//! from_clause := FROM table_source ( join_clause )* +//! table_source := identifier [ [ AS ] identifier ] +//! join_clause := [ INNER ] JOIN table_source ON sql_expr +//! | LEFT [ OUTER ] JOIN table_source ON sql_expr +//! | RIGHT [ OUTER ] JOIN table_source ON sql_expr +//! | FULL [ OUTER ] JOIN table_source ON sql_expr +//! | CROSS JOIN table_source +//! where_clause := WHERE sql_expr +//! group_by_clause := GROUP BY sql_expr ( ',' sql_expr )* +//! having_clause := HAVING sql_expr +//! order_by_clause := ORDER BY order_item ( ',' order_item )* +//! order_item := sql_expr [ ASC | DESC ] +//! limit_clause := LIMIT sql_expr [ OFFSET sql_expr ] +//! ``` +//! +//! # Disambiguation via `Node::Lookahead` +//! +//! Two places need lookahead to dispatch cleanly: +//! +//! - **Projection item** (ADR-0032 §1 `projection_item`). The +//! three alternatives all share a leading identifier shape +//! (`*` and the `ident . *` qualified wildcard, plus `sql_expr` +//! which also begins on an ident for the column-ref case). A +//! factory peeks the first 3 tokens to pick `*`, `ident . *`, +//! or `sql_expr [ alias ]`. +//! +//! - **Bare alias** (ADR-0032 §1 — lifts Phase-1 §4.2). The +//! walker's `walk_ident` happily matches keyword-shaped tokens +//! as identifiers, and `Choice`/`Optional` are first-match- +//! wins (no backtracking on a successful match). To prevent +//! bare-alias slots from swallowing continuation keywords, the +//! alias slot is a `Lookahead` that returns an empty `Choice` +//! (NoMatch) when the next ident-shaped token is a +//! continuation keyword for that position. 
+ +use crate::dsl::grammar::{IdentSource, Node, ValidationError, Word, sql_expr}; +use crate::dsl::walker::context::WalkContext; +use crate::dsl::walker::lex_helpers::{consume_ident, skip_whitespace}; + +// ================================================================= +// Validators +// ================================================================= + +/// Reject internal `__rdbms_*` metadata tables in any +/// table-source slot (ADR-0030 §6 reused by ADR-0032 §4 — extends +/// to every Phase-2 table-source slot: `FROM`, `JOIN` targets, +/// CTE name, and the `FROM` inside any CTE body). +fn reject_internal_table(name: &str) -> Result<(), ValidationError> { + if name.to_ascii_lowercase().starts_with("__rdbms_") { + Err(ValidationError { + message_key: "select.internal_table", + args: vec![("table", name.to_string())], + }) + } else { + Ok(()) + } +} + +// ================================================================= +// Shared leaf nodes +// ================================================================= + +const COMMA: Node = Node::Punct(','); +const STAR: Node = Node::Punct('*'); +const LPAREN: Node = Node::Punct('('); +const RPAREN: Node = Node::Punct(')'); +const SEMI: Node = Node::Punct(';'); + +/// SQL expression slot — recursion into ADR-0031's fragment +/// through `Node::Subgrammar`. Stays `Subgrammar` (not +/// `ScopedSubgrammar`) — `sql_expr` recursion is part of the +/// precedence ladder, not a new lexical scope (ADR-0032 §10.2). +const SQL_EXPR: Node = Node::Subgrammar(&sql_expr::SQL_OR_EXPR); + +/// A node that never matches. Used as the "no" branch of +/// lookahead-driven disambiguation: an empty `Choice` walks to +/// `NoMatch`, which `Optional` / `Choice` gracefully treat as +/// "skip" or "fall through to the next branch". 
const EMPTY_NOMATCH: Node = Node::Choice(&[]);

// =================================================================
// Bare-alias dispatch (ADR-0032 §1)
// =================================================================
//
// The walker's `walk_ident` accepts any identifier-shape token,
// including keyword-shape ones. With `Optional` / `Choice`
// being first-match-wins, an unrestricted bare-alias slot would
// greedily consume `FROM` / `WHERE` / `JOIN` / etc. as if they
// were aliases. `Node::Lookahead` peeks the next token; when it
// matches a continuation keyword for this position, the factory
// returns `EMPTY_NOMATCH` so `Optional` skips and the keyword
// reaches the next clause.

/// Continuation keywords that may legitimately follow a
/// projection item's bare alias (or its absence). Includes the
/// `select_core` follow keywords and the compound-query / outer
/// suffix keywords. `as` is not listed — the AS-form alias is a
/// separate `Choice` branch that fires before the lookahead.
///
/// Entries are lowercase because the peeked token is lowercased
/// before it is compared against this list.
const PROJECTION_FOLLOW_SET: &[&str] = &[
    "from", "where", "group", "order", "having", "limit",
    "union", "intersect", "except",
];

/// Continuation keywords that may legitimately follow a table
/// source's bare alias (or its absence). Includes the join
/// keywords (so `FROM a JOIN b` doesn't read `JOIN` as `a`'s
/// alias) and the `select_core` / compound suffix keywords.
/// `on` is included because `FROM a JOIN b ON …` reaches `on`
/// only when `b` has no alias — `on` is not a base-table name a
/// learner would type as an alias.
+const TABLE_SOURCE_FOLLOW_SET: &[&str] = &[ + "where", "group", "order", "having", "limit", + "union", "intersect", "except", + "inner", "left", "right", "full", "cross", "join", "on", +]; + +fn peek_next_ident_lower(source: &str, pos: usize) -> Option { + let p = skip_whitespace(source, pos); + consume_ident(source, p).map(|(s, e)| source[s..e].to_ascii_lowercase()) +} + +fn projection_bare_alias_factory( + _: &WalkContext, + source: &str, + pos: usize, +) -> Node { + match peek_next_ident_lower(source, pos) { + Some(word) + if PROJECTION_FOLLOW_SET.iter().any(|k| *k == word) => + { + EMPTY_NOMATCH + } + Some(_) => BARE_ALIAS_IDENT, + None => EMPTY_NOMATCH, + } +} + +fn table_source_bare_alias_factory( + _: &WalkContext, + source: &str, + pos: usize, +) -> Node { + match peek_next_ident_lower(source, pos) { + Some(word) + if TABLE_SOURCE_FOLLOW_SET.iter().any(|k| *k == word) => + { + EMPTY_NOMATCH + } + Some(_) => BARE_ALIAS_IDENT, + None => EMPTY_NOMATCH, + } +} + +// ================================================================= +// Alias slot +// ================================================================= + +const BARE_ALIAS_IDENT: Node = Node::Ident { + source: IdentSource::NewName, + role: "select_alias", + validator: None, + highlight_override: None, + writes_table: false, + writes_column: false, + writes_user_listed_column: false, +}; + +static AS_ALIAS_NODES: &[Node] = &[ + Node::Word(Word::keyword("as")), + BARE_ALIAS_IDENT, +]; +const AS_ALIAS_EXPLICIT: Node = Node::Seq(AS_ALIAS_NODES); + +static PROJECTION_ALIAS_CHOICES: &[Node] = &[ + AS_ALIAS_EXPLICIT, + Node::Lookahead(projection_bare_alias_factory), +]; +const PROJECTION_ALIAS_CHOICE: Node = Node::Choice(PROJECTION_ALIAS_CHOICES); +const PROJECTION_ALIAS_OPTIONAL: Node = + Node::Optional(&PROJECTION_ALIAS_CHOICE); + +static TABLE_SOURCE_ALIAS_CHOICES: &[Node] = &[ + AS_ALIAS_EXPLICIT, + Node::Lookahead(table_source_bare_alias_factory), +]; +const TABLE_SOURCE_ALIAS_CHOICE: Node = + 
Node::Choice(TABLE_SOURCE_ALIAS_CHOICES); +const TABLE_SOURCE_ALIAS_OPTIONAL: Node = + Node::Optional(&TABLE_SOURCE_ALIAS_CHOICE); + +// ================================================================= +// Projection item +// ================================================================= + +const QUALIFIED_STAR_QUALIFIER: Node = Node::Ident { + source: IdentSource::Tables, + role: "qualified_star_qualifier", + validator: None, + highlight_override: None, + writes_table: false, + writes_column: false, + writes_user_listed_column: false, +}; + +static QUALIFIED_STAR_NODES: &[Node] = &[ + QUALIFIED_STAR_QUALIFIER, + Node::Punct('.'), + Node::Punct('*'), +]; +const QUALIFIED_STAR: Node = Node::Seq(QUALIFIED_STAR_NODES); + +static PROJECTION_EXPR_ITEM_NODES: &[Node] = &[ + SQL_EXPR, + PROJECTION_ALIAS_OPTIONAL, +]; +const PROJECTION_EXPR_ITEM: Node = Node::Seq(PROJECTION_EXPR_ITEM_NODES); + +/// Dispatch one projection item via a 3-token lookahead. +/// +/// - `*` (and only `*`) → bare wildcard. +/// - `ident . *` → qualified wildcard. +/// - anything else → `sql_expr [ alias ]`. +/// +/// The factory is the cleanest way to handle the shared-prefix +/// ambiguity between `t.*` and `sql_expr` (which can match a +/// bare `t`), since the walker's `Choice` doesn't backtrack on +/// a committed match. 
+fn projection_item_factory( + _: &WalkContext, + source: &str, + pos: usize, +) -> Node { + let p = skip_whitespace(source, pos); + let bytes = source.as_bytes(); + if bytes.get(p) == Some(&b'*') { + return STAR; + } + if let Some((_, end1)) = consume_ident(source, p) { + let after_ident = skip_whitespace(source, end1); + if bytes.get(after_ident) == Some(&b'.') { + let after_dot = skip_whitespace(source, after_ident + 1); + if bytes.get(after_dot) == Some(&b'*') { + return QUALIFIED_STAR; + } + } + } + PROJECTION_EXPR_ITEM +} + +const PROJECTION_ITEM: Node = Node::Lookahead(projection_item_factory); + +const PROJECTION_LIST: Node = Node::Repeated { + inner: &PROJECTION_ITEM, + separator: Some(&COMMA), + min: 1, +}; + +// ================================================================= +// DISTINCT / ALL prefix +// ================================================================= + +static DISTINCT_OR_ALL_CHOICES: &[Node] = &[ + Node::Word(Word::keyword("distinct")), + Node::Word(Word::keyword("all")), +]; +const DISTINCT_OR_ALL_CHOICE: Node = Node::Choice(DISTINCT_OR_ALL_CHOICES); +const DISTINCT_OR_ALL_OPTIONAL: Node = + Node::Optional(&DISTINCT_OR_ALL_CHOICE); + +// ================================================================= +// Table source (FROM / JOIN target) +// ================================================================= + +const TABLE_NAME_IDENT: Node = Node::Ident { + source: IdentSource::Tables, + role: "table_name", + validator: Some(reject_internal_table), + highlight_override: None, + writes_table: false, + writes_column: false, + writes_user_listed_column: false, +}; + +static TABLE_SOURCE_NODES: &[Node] = &[ + TABLE_NAME_IDENT, + TABLE_SOURCE_ALIAS_OPTIONAL, +]; +const TABLE_SOURCE: Node = Node::Seq(TABLE_SOURCE_NODES); + +// ================================================================= +// JOIN flavours +// ================================================================= + +const JOIN_WORD: Node = 
Node::Word(Word::keyword("join")); +const ON_WORD: Node = Node::Word(Word::keyword("on")); +const OUTER_OPTIONAL: Node = + Node::Optional(&Node::Word(Word::keyword("outer"))); + +// `INNER JOIN` and bare `JOIN` are split into two Choice +// branches so each branch has a distinct leading keyword +// (`inner` vs `join`). Avoids the "optional leading child → +// idx > 0 → EOF becomes Incomplete" hazard in walk_seq that a +// shared `Optional(Word("inner"))` would otherwise create. +static INNER_JOIN_NODES: &[Node] = &[ + Node::Word(Word::keyword("inner")), + JOIN_WORD, + TABLE_SOURCE, + ON_WORD, + SQL_EXPR, +]; + +static BARE_JOIN_NODES: &[Node] = &[ + JOIN_WORD, + TABLE_SOURCE, + ON_WORD, + SQL_EXPR, +]; + +static LEFT_JOIN_NODES: &[Node] = &[ + Node::Word(Word::keyword("left")), + OUTER_OPTIONAL, + JOIN_WORD, + TABLE_SOURCE, + ON_WORD, + SQL_EXPR, +]; + +static RIGHT_JOIN_NODES: &[Node] = &[ + Node::Word(Word::keyword("right")), + OUTER_OPTIONAL, + JOIN_WORD, + TABLE_SOURCE, + ON_WORD, + SQL_EXPR, +]; + +static FULL_JOIN_NODES: &[Node] = &[ + Node::Word(Word::keyword("full")), + OUTER_OPTIONAL, + JOIN_WORD, + TABLE_SOURCE, + ON_WORD, + SQL_EXPR, +]; + +static CROSS_JOIN_NODES: &[Node] = &[ + Node::Word(Word::keyword("cross")), + JOIN_WORD, + TABLE_SOURCE, +]; + +/// JOIN flavour dispatch. Each branch has a distinct leading +/// keyword so `Choice` first-match-wins discriminates cleanly +/// without invoking the walker's `Optional`-leading-child +/// hazard. 
static JOIN_CLAUSE_CHOICES: &[Node] = &[
    Node::Seq(LEFT_JOIN_NODES),
    Node::Seq(RIGHT_JOIN_NODES),
    Node::Seq(FULL_JOIN_NODES),
    Node::Seq(CROSS_JOIN_NODES),
    Node::Seq(INNER_JOIN_NODES),
    Node::Seq(BARE_JOIN_NODES),
];
/// One `join_clause` of any supported flavour.
const JOIN_CLAUSE: Node = Node::Choice(JOIN_CLAUSE_CHOICES);

// =================================================================
// FROM / WHERE / GROUP BY / HAVING
// =================================================================

// from_clause := FROM table_source ( join_clause )*
static FROM_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("from")),
    TABLE_SOURCE,
    Node::Repeated {
        inner: &JOIN_CLAUSE,
        separator: None,
        min: 0,
    },
];
const FROM_CLAUSE: Node = Node::Seq(FROM_CLAUSE_NODES);

// where_clause := WHERE sql_expr
static WHERE_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("where")),
    SQL_EXPR,
];
const WHERE_CLAUSE: Node = Node::Seq(WHERE_CLAUSE_NODES);

// group_by_clause := GROUP BY sql_expr ( ',' sql_expr )*
static GROUP_BY_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("group")),
    Node::Word(Word::keyword("by")),
    Node::Repeated {
        inner: &SQL_EXPR,
        separator: Some(&COMMA),
        min: 1,
    },
];
const GROUP_BY_CLAUSE: Node = Node::Seq(GROUP_BY_CLAUSE_NODES);

// having_clause := HAVING sql_expr
static HAVING_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("having")),
    SQL_EXPR,
];
const HAVING_CLAUSE: Node = Node::Seq(HAVING_CLAUSE_NODES);

// =================================================================
// ORDER BY / LIMIT / OFFSET
// =================================================================

static ASC_DESC_CHOICES: &[Node] = &[
    Node::Word(Word::keyword("asc")),
    Node::Word(Word::keyword("desc")),
];
const ASC_DESC_CHOICE: Node = Node::Choice(ASC_DESC_CHOICES);
// order_item := sql_expr [ ASC | DESC ]
static ORDER_ITEM_NODES: &[Node] = &[
    SQL_EXPR,
    Node::Optional(&ASC_DESC_CHOICE),
];
const ORDER_ITEM: Node = Node::Seq(ORDER_ITEM_NODES);

// order_by_clause := ORDER BY order_item ( ',' order_item )*
static ORDER_BY_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("order")),
    Node::Word(Word::keyword("by")),
    Node::Repeated {
        inner: &ORDER_ITEM,
        separator: Some(&COMMA),
        min: 1,
    },
];
const ORDER_BY_CLAUSE: Node = Node::Seq(ORDER_BY_CLAUSE_NODES);

static OFFSET_NODES: &[Node] = &[
    Node::Word(Word::keyword("offset")),
    SQL_EXPR,
];
const OFFSET_SEQ: Node = Node::Seq(OFFSET_NODES);
const OFFSET_OPTIONAL: Node = Node::Optional(&OFFSET_SEQ);

// limit_clause := LIMIT sql_expr [ OFFSET sql_expr ]
static LIMIT_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("limit")),
    SQL_EXPR,
    OFFSET_OPTIONAL,
];
const LIMIT_CLAUSE: Node = Node::Seq(LIMIT_CLAUSE_NODES);

// =================================================================
// select_core (per-leg of a compound)
// =================================================================

// select_core := SELECT [ DISTINCT | ALL ] projection_list
//                [ from_clause ] [ where_clause ]
//                [ group_by_clause ] [ having_clause ]
//
// ORDER BY and LIMIT are not part of a leg; they attach to the
// enclosing compound select.
static SELECT_CORE_NODES: &[Node] = &[
    Node::Word(Word::keyword("select")),
    DISTINCT_OR_ALL_OPTIONAL,
    PROJECTION_LIST,
    Node::Optional(&FROM_CLAUSE),
    Node::Optional(&WHERE_CLAUSE),
    Node::Optional(&GROUP_BY_CLAUSE),
    Node::Optional(&HAVING_CLAUSE),
];
const SELECT_CORE: Node = Node::Seq(SELECT_CORE_NODES);

// =================================================================
// compound_select
// =================================================================
//
// `UNION` and `UNION ALL` are factored as one `Seq[union,
// Optional(all)]` branch so the Choice doesn't commit on `union`
// inside a multi-token branch and then fail when `all` is
// missing. The trailing `Optional(all)` is the last child of
// the Seq, so a skip there doesn't trigger the
// optional-leading-then-EOF-becomes-Incomplete hazard.
static UNION_OR_UNION_ALL_NODES: &[Node] = &[
    Node::Word(Word::keyword("union")),
    Node::Optional(&Node::Word(Word::keyword("all"))),
];
// set_op := UNION [ ALL ] | INTERSECT | EXCEPT
static SET_OP_CHOICES: &[Node] = &[
    Node::Seq(UNION_OR_UNION_ALL_NODES),
    Node::Word(Word::keyword("intersect")),
    Node::Word(Word::keyword("except")),
];
const SET_OP: Node = Node::Choice(SET_OP_CHOICES);

// One further leg of a compound: `set_op select_core`.
static SET_OP_TAIL_NODES: &[Node] = &[SET_OP, SELECT_CORE];
const SET_OP_TAIL: Node = Node::Seq(SET_OP_TAIL_NODES);

// compound_select := select_core ( set_op select_core )*
//                    [ order_by_clause ] [ limit_clause ]
static COMPOUND_SELECT_NODES: &[Node] = &[
    SELECT_CORE,
    Node::Repeated {
        inner: &SET_OP_TAIL,
        separator: None,
        min: 0,
    },
    Node::Optional(&ORDER_BY_CLAUSE),
    Node::Optional(&LIMIT_CLAUSE),
];
/// The compound-select fragment that subqueries / CTE bodies
/// recurse into via `Subgrammar` (2a) / `ScopedSubgrammar` (2b).
/// Omits the outer `with_clause`; that lives on
/// `SQL_SELECT_STATEMENT`.
pub static SQL_SELECT_COMPOUND: Node = Node::Seq(COMPOUND_SELECT_NODES);

// =================================================================
// CTE definitions
// =================================================================

/// The name a CTE introduces. It is a new name (not completed
/// from existing tables) and is still checked by
/// `reject_internal_table`, so `__rdbms_*` CTE names are refused.
const CTE_NAME_IDENT: Node = Node::Ident {
    source: IdentSource::NewName,
    role: "cte_name",
    validator: Some(reject_internal_table),
    highlight_override: None,
    writes_table: false,
    writes_column: false,
    writes_user_listed_column: false,
};

/// One entry of a CTE's optional column-rename list.
const CTE_COLUMN_IDENT: Node = Node::Ident {
    source: IdentSource::NewName,
    role: "cte_column",
    validator: None,
    highlight_override: None,
    writes_table: false,
    writes_column: false,
    writes_user_listed_column: false,
};

// '(' column_name ( ',' column_name )* ')'
static CTE_COLUMN_LIST_NODES: &[Node] = &[
    LPAREN,
    Node::Repeated {
        inner: &CTE_COLUMN_IDENT,
        separator: Some(&COMMA),
        min: 1,
    },
    RPAREN,
];
const CTE_COLUMN_LIST_SEQ: Node = Node::Seq(CTE_COLUMN_LIST_NODES);
const CTE_COLUMN_LIST_OPTIONAL: Node =
    Node::Optional(&CTE_COLUMN_LIST_SEQ);

// '(' compound_select ')' — the body recurses into
// `SQL_SELECT_COMPOUND`, so it cannot itself begin with `WITH`.
static CTE_BODY_NODES: &[Node] = &[
    LPAREN,
    Node::Subgrammar(&SQL_SELECT_COMPOUND),
    RPAREN,
];
const CTE_BODY: Node = Node::Seq(CTE_BODY_NODES);

// cte_def := identifier [ '(' column_name_list ')' ]
//            AS '(' compound_select ')'
static CTE_DEF_NODES: &[Node] = &[
    CTE_NAME_IDENT,
    CTE_COLUMN_LIST_OPTIONAL,
    Node::Word(Word::keyword("as")),
    CTE_BODY,
];
const CTE_DEF: Node = Node::Seq(CTE_DEF_NODES);

// with_clause := WITH [ RECURSIVE ] cte_def ( ',' cte_def )*
static WITH_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("with")),
    Node::Optional(&Node::Word(Word::keyword("recursive"))),
    Node::Repeated {
        inner: &CTE_DEF,
        separator: Some(&COMMA),
        min: 1,
    },
];
const WITH_CLAUSE: Node = Node::Seq(WITH_CLAUSE_NODES);

// =================================================================
// select_statement — the fragment entry point
// =================================================================

// select_statement := [ with_clause ] compound_select [ ';' ]
static SELECT_STATEMENT_NODES: &[Node] = &[
    Node::Optional(&WITH_CLAUSE),
    Node::Subgrammar(&SQL_SELECT_COMPOUND),
    Node::Optional(&SEMI),
];
/// The full statement, including the optional `WITH` prefix and
/// a tolerated trailing `;`. This is what `data::SELECT`'s
/// `CommandNode` will reference once sub-phase 2c migrates the
/// Phase-1 grammar.
pub static SQL_SELECT_STATEMENT: Node = Node::Seq(SELECT_STATEMENT_NODES);

// =================================================================
// Tests
// =================================================================

#[cfg(test)]
mod tests {
    use super::{SQL_SELECT_COMPOUND, SQL_SELECT_STATEMENT};
    use crate::dsl::grammar::Node;
    use crate::dsl::walker::context::WalkContext;
    use crate::dsl::walker::driver::{NodeWalkResult, walk_node};
    use crate::dsl::walker::outcome::MatchedPath;

    /// Walk `input` against `fragment`. Returns `true` only when
    /// the walk matches *and* consumes all of `input` (trailing
    /// whitespace allowed).
+ fn walks_via(fragment: &'static Node, input: &str) -> bool { + let mut ctx = WalkContext::new(); + let mut path = MatchedPath::new(); + let mut per_byte = Vec::new(); + match walk_node(input, 0, fragment, &mut ctx, &mut path, &mut per_byte) { + NodeWalkResult::Matched { end, .. } => { + input[end..].trim().is_empty() + } + _ => false, + } + } + + fn walks(input: &str) -> bool { + walks_via(&SQL_SELECT_STATEMENT, input) + } + + fn good(input: &str) { + assert!( + walks(input), + "{input:?} should be a valid SELECT statement" + ); + } + + fn bad(input: &str) { + assert!( + !walks(input), + "{input:?} should NOT walk as a complete SELECT statement" + ); + } + + // ----- minimal forms ----- + + #[test] + fn bare_constant_select_with_no_from() { + good("select 1"); + good("select 'hello'"); + good("select null"); + good("select true"); + good("select false"); + } + + #[test] + fn single_table_select_star() { + good("select * from users"); + good("select * from users;"); + } + + #[test] + fn single_column_projection() { + good("select name from users"); + good("select name, age from users"); + good("select name, age, email from users"); + } + + // ----- DISTINCT / ALL ----- + + #[test] + fn distinct_modifier() { + good("select distinct name from users"); + good("select distinct a, b from t"); + } + + #[test] + fn all_modifier() { + good("select all name from users"); + } + + // Note: `select distinct all name from users` and the like + // are admitted structurally — the second keyword parses as + // a column reference (the walker doesn't reject keyword-shape + // idents as columns). Engine semantics deals with it. This + // matches ADR-0030's "grammar admits, engine rejects" posture. 

    // ----- projection wildcard / qualified-star / alias -----

    #[test]
    fn qualified_star_projection() {
        good("select users.* from users");
        good("select u.* from users u");
        good("select a.*, b.* from a join b on x = y");
    }

    #[test]
    fn mixed_projection_with_qualified_star() {
        good("select users.*, age from users");
    }

    #[test]
    fn projection_with_as_alias() {
        good("select name as n from users");
        good("select name as n, age as a from users");
    }

    #[test]
    fn projection_with_bare_alias() {
        good("select name n from users");
        good("select name n, age a from users");
    }

    #[test]
    fn projection_alias_mixed_forms() {
        good("select name as n, age a, email from users");
    }

    #[test]
    fn projection_bare_alias_does_not_swallow_from() {
        // The bare-alias lookahead must skip when next ident
        // is `from`; otherwise this would fail with "alias `from`
        // followed by nothing".
        good("select name from users");
    }

    #[test]
    fn projection_bare_alias_does_not_swallow_where_or_group_etc() {
        good("select name from users where id > 0");
        good("select name from users group by name");
        good("select name from users order by name");
        good("select name from users limit 5");
        good("select name from users group by name having count(*) > 1");
    }

    #[test]
    fn projection_expression_with_arithmetic() {
        good("select a + b from t");
        good("select a + b as total from t");
        good("select a * 2 from t");
    }

    #[test]
    fn projection_function_calls() {
        good("select upper(name) from users");
        good("select count(*) from users");
        good("select count(distinct customer_id) from orders");
    }

    // ----- FROM / JOIN flavours -----

    #[test]
    fn from_with_table_alias() {
        good("select * from users u");
        good("select * from users as u");
    }

    #[test]
    fn inner_join_explicit() {
        good("select * from a inner join b on x = y");
    }

    #[test]
    fn inner_join_bare() {
        good("select * from a join b on x = y");
    }

    #[test]
    fn left_outer_join() {
        good("select * from a left join b on x = y");
        good("select * from a left outer join b on x = y");
    }

    #[test]
    fn right_outer_join() {
        good("select * from a right join b on x = y");
        good("select * from a right outer join b on x = y");
    }

    #[test]
    fn full_outer_join() {
        good("select * from a full join b on x = y");
        good("select * from a full outer join b on x = y");
    }

    #[test]
    fn cross_join() {
        good("select * from a cross join b");
    }

    #[test]
    fn cross_join_with_no_on() {
        // CROSS JOIN takes no ON; an ON clause is a parse error.
        // (`bad` also fails inputs that match only a prefix, so the
        // unconsumed `on x = y` tail is what rejects this one.)
        bad("select * from a cross join b on x = y");
    }

    #[test]
    fn chained_joins() {
        good("select * from a join b on x = y join c on y = z");
        good("select * from a left join b on x = y inner join c on y = z");
    }

    #[test]
    fn join_with_table_aliases() {
        good("select * from a u join b v on x = y");
        good("select * from a as u join b as v on x = y");
    }

    // ----- WHERE / GROUP BY / HAVING -----

    #[test]
    fn where_clause() {
        good("select * from t where id = 1");
        good("select * from t where a > 0 and b < 10");
    }

    #[test]
    fn group_by_single_column() {
        good("select name from t group by name");
    }

    #[test]
    fn group_by_multiple_columns() {
        good("select a, b from t group by a, b");
    }

    #[test]
    fn group_by_expression() {
        good("select count(*) from t group by upper(name)");
    }

    #[test]
    fn having_clause() {
        good("select name from t group by name having count(*) > 1");
        // HAVING without GROUP BY is admitted structurally;
        // engine may reject. The grammar admits it.
        good("select count(*) from t having count(*) > 0");
    }

    // ----- set operators -----

    #[test]
    fn union_two_selects() {
        good("select a from t union select b from u");
    }

    #[test]
    fn union_all_two_selects() {
        good("select a from t union all select b from u");
    }

    #[test]
    fn intersect_two_selects() {
        good("select a from t intersect select b from u");
    }

    #[test]
    fn except_two_selects() {
        good("select a from t except select b from u");
    }

    #[test]
    fn set_op_chain() {
        good(
            "select a from t union select b from u intersect select c from v",
        );
    }

    #[test]
    fn set_op_with_outer_order_by_and_limit() {
        good(
            "select a from t union select b from u order by a limit 10",
        );
    }

    // ----- ORDER BY / LIMIT / OFFSET -----

    #[test]
    fn order_by_single_column() {
        good("select * from t order by name");
    }

    #[test]
    fn order_by_with_direction() {
        good("select * from t order by name asc");
        good("select * from t order by name desc");
    }

    #[test]
    fn order_by_multiple_items() {
        good("select * from t order by name asc, age desc");
    }

    #[test]
    fn order_by_column_position() {
        // A column-position reference falls out of `sql_expr`
        // (an integer literal is a valid expression).
        good("select a, b from t order by 1");
        good("select a, b from t order by 1, 2 desc");
    }

    #[test]
    fn limit_only() {
        good("select * from t limit 10");
    }

    #[test]
    fn limit_with_offset() {
        good("select * from t limit 10 offset 5");
    }

    #[test]
    fn legacy_limit_comma_form_rejected() {
        // `LIMIT m, n` (offset-first MySQL/SQLite legacy) is
        // OOS per ADR-0032 §13 OOS-4.
        bad("select * from t limit 5, 10");
    }

    // ----- CTEs -----

    #[test]
    fn non_recursive_cte() {
        good("with x as (select 1) select * from x");
    }

    #[test]
    fn non_recursive_cte_select_star() {
        good("with x as (select * from users) select * from x");
    }

    #[test]
    fn cte_with_column_list_rename() {
        good("with x(n) as (select name from users) select n from x");
        good("with x(a, b) as (select a, b from t) select * from x");
    }

    #[test]
    fn recursive_cte() {
        good(
            "with recursive r as (select 1 union all select 2) select * from r",
        );
    }

    #[test]
    fn multiple_ctes() {
        good(
            "with a as (select 1), b as (select 2) select * from a union select * from b",
        );
    }

    // ----- subquery shapes (recursion through SQL_SELECT_COMPOUND) -----
    //
    // True subquery expressions inside `sql_expr` arrive in 2b
    // (additive `Choice` branches in `sql_expr.rs`). 2a verifies
    // that the compound fragment recurses cleanly from CTE
    // bodies and that the deepest depth check still fires.

    #[test]
    fn nested_cte_body_with_union() {
        good(
            "with x as (select 1 union select 2) select * from x",
        );
    }

    // ----- case insensitivity / spacing -----

    #[test]
    fn keywords_are_case_insensitive() {
        good("SELECT * FROM users");
        good("Select Distinct A From T Where Id = 1 Order By A Desc Limit 5 Offset 2");
        good("WITH RECURSIVE r AS (SELECT 1 UNION ALL SELECT 2) SELECT * FROM r");
    }

    #[test]
    fn trailing_semicolon_tolerated() {
        good("select 1;");
        good("select * from users;");
        good("with x as (select 1) select * from x;");
    }

    // ----- malformed input -----

    #[test]
    fn empty_projection_rejected() {
        // Note: `select from t` is structurally admitted as an
        // `<expr> <alias>` projection (`from` read as a column
        // reference, `t` as its bare alias) — the walker does not
        // reject keyword-shape idents as column refs. This
        // matches ADR-0030's posture (grammar admits, engine
        // rejects). The genuinely-malformed `select` alone is
        // still rejected because there is no expression to
        // match.
        bad("select");
    }

    #[test]
    fn missing_join_target() {
        bad("select * from a join");
        bad("select * from a join b");
        bad("select * from a join b on");
    }

    #[test]
    fn dangling_set_op() {
        bad("select a from t union");
        bad("select a from t union select");
    }

    #[test]
    fn dangling_clauses() {
        bad("select a from t where");
        bad("select a from t order by");
        bad("select a from t group by");
        bad("select a from t having");
        bad("select a from t limit");
        bad("select a from t limit 5 offset");
    }

    #[test]
    fn cte_missing_body() {
        bad("with x as select 1");
        bad("with x as (");
        bad("with x as ()");
    }

    #[test]
    fn cte_missing_as() {
        bad("with x (select 1) select * from x");
    }

    #[test]
    fn bare_recursive_without_with_is_invalid() {
        bad("recursive r as (select 1) select * from r");
    }

    // ----- OOS shapes (ADR-0032 §13) -----

    #[test]
    fn comma_from_is_rejected() {
        // OOS-3: implicit cross join via comma list.
        bad("select * from a, b");
    }

    #[test]
    fn natural_join_rejected() {
        // OOS-2.
        bad("select * from a natural join b");
    }

    #[test]
    fn using_clause_rejected() {
        // OOS-2.
        bad("select * from a join b using (id)");
    }

    #[test]
    fn values_row_source_rejected() {
        // OOS-7.
        bad("select * from (values (1), (2))");
    }

    #[test]
    fn lateral_join_rejected() {
        // OOS-6. The bare comma-FROM form is rejected because
        // we do not admit comma-separated FROM lists (OOS-3),
        // so `from a, lateral …` cannot parse as a join. The
        // single-token `LATERAL JOIN` form is admitted
        // structurally — `lateral` parses as a table-source
        // bare alias for `a` and the JOIN that follows is just
        // a normal join. This matches the rest of the grammar's
        // posture: keyword-shape identifiers are admitted as
        // names; non-admitted syntactic forms (comma-FROM) are
        // what makes a query reject.
        bad("select * from a, lateral (select 1)");
    }

    #[test]
    fn window_function_rejected() {
        // OOS-5: `OVER (…)` window clauses are not part of the
        // Phase-2 grammar.
        bad("select row_number() over () from t");
        bad("select sum(x) over (partition by y) from t");
    }

    #[test]
    fn derived_table_in_from_rejected() {
        // OOS-1: `FROM (SELECT …) alias` is OOS.
        // CTEs cover the same use case.
        bad("select * from (select * from users) sub");
        bad("select * from (select * from users) as sub");
    }

    // ----- internal-table rejection (ADR-0030 §6) -----

    #[test]
    fn internal_table_in_from_rejected() {
        bad("select * from __rdbms_columns");
        bad("select * from __rdbms_playground_columns");
    }

    #[test]
    fn internal_table_as_cte_name_rejected() {
        bad("with __rdbms_x as (select 1) select * from __rdbms_x");
    }

    #[test]
    fn internal_table_in_cte_body_rejected() {
        bad("with x as (select * from __rdbms_columns) select * from x");
    }

    #[test]
    fn internal_table_in_join_rejected() {
        bad("select * from users join __rdbms_columns on x = y");
    }

    // ----- depth cap (ADR-0026 §1 / ADR-0032 §9) -----

    #[test]
    fn pathological_nesting_capped() {
        // Deep parenthesised CTE-body chain is rejected by the
        // shared `MAX_SUBGRAMMAR_DEPTH = 64` cap, not by stack
        // overflow.
        //
        // NOTE(review): a CTE body recurses into
        // `SQL_SELECT_COMPOUND`, which has no `WITH` prefix (see
        // `compound_fragment_walks_without_with_clause` below), so
        // this input appears to be rejected at the first nested
        // `with` rather than by the depth cap. Confirm, and
        // consider an input that really nests 200 levels deep.
        let depth = 200;
        let mut input = String::new();
        for _ in 0..depth {
            input.push_str("with x as (");
        }
        input.push_str("select 1");
        for _ in 0..depth {
            input.push_str(") select * from x");
        }
        assert!(!walks(&input));
    }

    // ----- compound-select fragment entry point -----

    #[test]
    fn compound_fragment_walks_without_with_clause() {
        // SQL_SELECT_COMPOUND is what subqueries / CTE bodies
        // recurse into. It admits a select_core + optional
        // set-op chain + outer ORDER/LIMIT.
        assert!(walks_via(&SQL_SELECT_COMPOUND, "select 1"));
        assert!(walks_via(
            &SQL_SELECT_COMPOUND,
            "select a from t union select b from u",
        ));
        // The `WITH` prefix belongs to the statement, not the
        // compound fragment.
        assert!(!walks_via(
            &SQL_SELECT_COMPOUND,
            "with x as (select 1) select * from x",
        ));
    }
}