From a491df32a04d17e10a267a602605fbfd0f944f2e Mon Sep 17 00:00:00 2001 From: "claude@clouddev1" Date: Wed, 20 May 2026 15:42:44 +0000 Subject: [PATCH] grammar: migrate Phase-1 SELECT to the ADR-0032 fragment (sub-phase 2c) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Phase-1 SQL `SELECT` grammar nodes that used to live in `src/dsl/grammar/data.rs` retire — 22 statics / consts and the `reject_internal_table` validator copy are removed, ~150 lines of grammar machinery gone. `data::SELECT.shape` now references the post-`SELECT` portion of the ADR-0032 fragment via a thin `Node::Subgrammar(&sql_select::SQL_SELECT_TAIL)`. `SQL_SELECT_TAIL` is a new export from `sql_select.rs`, parallel to `SQL_SELECT_STATEMENT`. It represents what a top-level `SELECT` statement looks like AFTER the registry's entry-word dispatch has already consumed the leading `SELECT` keyword: the DISTINCT/ALL prefix, projection list, optional FROM / WHERE / GROUP BY / HAVING, the compound set-op chain (each subsequent leg's `SELECT` is part of `SET_OP_TAIL`), outer ORDER BY / LIMIT, and a tolerated trailing `;`. WITH-prefixed statements (`WITH x AS (…) SELECT * FROM x`) are NOT in 2c's scope — they need a separate `data::WITH` `CommandNode` so the entry-word dispatch routes correctly. For now, top-level WITH continues to fall through to the chumsky parser route (the same as in Phase 1). The `SQL_SELECT_STATEMENT` static (which includes the optional WITH prefix) stays available for use by that future CommandNode or by any other consumer that needs the full statement shape. All seven Phase-1 SQL `SELECT` integration tests (`tests/sql_select.rs`) pass without modification, satisfying the 2c exit gate's "behaviour preserved" requirement. The 70 fragment unit tests and the 26 driver-level scope tests also pass — the migration is a refactor, no new tests required. Behaviour change explicitly sanctioned by ADR-0032 §8: Phase-1's `LIMIT_VALIDATOR` (positive-int-only, parse-time) is superseded by the full `sql_expr` admission. `LIMIT max(10, x)` and similar now parse; the engine constrains the value at execution time per the ADR's "grammar admits, engine rejects" posture. Plan §2b status note: the 2026-05-20 deferral of §10.3 stage 2 (CTE output-column harvest derivation) is recorded in `docs/plans/20260520-adr-0032-phase-2.md` per the user-approved deferral. Test totals: 1366 passing (unchanged), 0 failed, 1 ignored. Clippy clean. data.rs loses ~150 lines of dead grammar; the single source of truth for the SQL `SELECT` shape is now `sql_select.rs`. --- docs/plans/20260520-adr-0032-phase-2.md | 28 +++++ src/dsl/grammar/data.rs | 159 +++--------------------- src/dsl/grammar/sql_select.rs | 50 +++++++- 3 files changed, 93 insertions(+), 144 deletions(-) diff --git a/docs/plans/20260520-adr-0032-phase-2.md b/docs/plans/20260520-adr-0032-phase-2.md index 760008e..42da551 100644 --- a/docs/plans/20260520-adr-0032-phase-2.md +++ b/docs/plans/20260520-adr-0032-phase-2.md @@ -270,6 +270,34 @@ coverage). ## Sub-phase 2b — `ScopedSubgrammar` + scope accumulators +### Implementation status (2026-05-20) + +Sub-phase 2b shipped in five commits (`4f89106`, `98a74b2`, +`b522d09`, `4ff054c`, and earlier 2a foundations). The +scope-accumulator infrastructure — `Node::ScopedSubgrammar`, +`ScopeFrame`, `from_scope_stack`, the new Ident flags +(`writes_table_alias`, `writes_cte_name`, +`writes_projection_alias`), and the sql_expr §5 / §6 additive +extensions — is complete and exercised by 26 driver-level +tests. + +**Deferral, explicitly user-approved:** §10.3 stage 2 (the six +CTE output-column derivation rules) is NOT implemented in 2b. +Stage 1 (placeholder CTE binding push) IS implemented; stage 2 +will fold into 2d (where the new arity-check pass needs +declared-vs-derived column counts) and 2e (where qualified- +prefix completion needs CTE columns). Until then, a CTE +binding's `columns` stays empty after the body exits, and +qualified-prefix completion past `cte_alias.|` returns an +empty candidate list. The CTE-name is still visible as a +table source from inside the body (WITH RECURSIVE +self-reference works) and from outside (downstream CTE-name +validators see it). + +The 2g cross-cut matrix rows for §10.3 derivation cases are +deferred along with the harvest; they will land when 2d/2e +implements the rules. + ### Scope (in) - Add the new walker node variant `Node::ScopedSubgrammar(&Node)`. diff --git a/src/dsl/grammar/data.rs b/src/dsl/grammar/data.rs index 0e2d956..6cb7f73 100644 --- a/src/dsl/grammar/data.rs +++ b/src/dsl/grammar/data.rs @@ -20,7 +20,7 @@ use crate::dsl::command::{Command, Expr, RowFilter}; use crate::dsl::grammar::{ CommandNode, IdentSource, Node, NumberValidator, ValidationError, Word, expr, shared::{column_value_list, current_column_value}, - sql_expr, + sql_select, }; use crate::dsl::walker::context::WalkContext; use crate::dsl::value::Value; @@ -393,141 +393,15 @@ const EXPLAIN_SHAPE: Node = Node::Choice(EXPLAIN_CHOICES); // cycle through the sql_expr ⇄ sql_select recursion (see the // matching note in sql_select.rs). -/// `as ` — the explicit projection alias. Implicit -/// aliasing (`select a x`) is not supported: a bare alias is -/// ambiguous with the `from` keyword, whereas `as` is not. -const SELECT_ALIAS_IDENT: Node = Node::Ident { - source: IdentSource::NewName, - role: "select_alias", - validator: None, - highlight_override: None, - writes_table: false, - writes_column: false, - writes_user_listed_column: false, -writes_table_alias: false, -writes_cte_name: false, -writes_projection_alias: false, -}; -static SELECT_AS_ALIAS_NODES: &[Node] = &[ - Node::Word(Word::keyword("as")), - SELECT_ALIAS_IDENT, -]; -static SELECT_AS_ALIAS: Node = Node::Seq(SELECT_AS_ALIAS_NODES); - -/// A projection item: a SQL expression with an optional alias. -static SELECT_PROJ_ITEM_NODES: &[Node] = &[ - Node::Subgrammar(&sql_expr::SQL_OR_EXPR), - Node::Optional(&SELECT_AS_ALIAS), -]; -static SELECT_PROJ_ITEM: Node = Node::Seq(SELECT_PROJ_ITEM_NODES); - -/// `proj_item ( , proj_item )*`. -/// `static` (not `const`) to avoid a Rust const-evaluation -/// cycle through the `sql_expr` ⇄ `sql_select` recursion. The -/// cycle is valid at link-time (statics resolve lazily) but -/// not at const-eval — see notes in sql_select.rs. -static SELECT_PROJ_LIST: Node = Node::Repeated { - inner: &SELECT_PROJ_ITEM, - separator: Some(&Node::Punct(',')), - min: 1, -}; - -/// `projection := '*' | proj_item ( , proj_item )*`. (`t.*` -/// qualified star is Phase 2 — it needs join scope.) -static SELECT_PROJECTION_CHOICES: &[Node] = - &[Node::Punct('*'), Node::Subgrammar(&SELECT_PROJ_LIST)]; -static SELECT_PROJECTION: Node = Node::Choice(SELECT_PROJECTION_CHOICES); - -/// The `FROM` table. `writes_table` so the `WHERE` / `ORDER BY` -/// expression column slots complete against this table; the -/// validator rejects the internal `__rdbms_*` metadata tables -/// (ADR-0030 §6). -const SELECT_FROM_TABLE: Node = Node::Ident { - source: IdentSource::Tables, - role: "table_name", - validator: Some(reject_internal_table), - highlight_override: None, - writes_table: true, - writes_column: false, - writes_user_listed_column: false, -writes_table_alias: false, -writes_cte_name: false, -writes_projection_alias: false, -}; - -/// `where `. -static SELECT_WHERE_NODES: &[Node] = &[Node::Word(Word::keyword("where")), Node::Subgrammar(&sql_expr::SQL_OR_EXPR)]; -static SELECT_WHERE: Node = Node::Seq(SELECT_WHERE_NODES); - -/// `order by ( , )*`, each item a SQL expression -/// with an optional `asc` / `desc`. -const SELECT_SORT_DIR_CHOICES: &[Node] = &[ - Node::Word(Word::keyword("asc")), - Node::Word(Word::keyword("desc")), -]; -static SELECT_ORDER_ITEM_NODES: &[Node] = &[ - Node::Subgrammar(&sql_expr::SQL_OR_EXPR), - Node::Optional(&Node::Choice(SELECT_SORT_DIR_CHOICES)), -]; -static SELECT_ORDER_ITEM: Node = Node::Seq(SELECT_ORDER_ITEM_NODES); -static SELECT_ORDER_BY_NODES: &[Node] = &[ - Node::Word(Word::keyword("order")), - Node::Word(Word::keyword("by")), - Node::Repeated { - inner: &SELECT_ORDER_ITEM, - separator: Some(&Node::Punct(',')), - min: 1, - }, -]; -static SELECT_ORDER_BY: Node = Node::Seq(SELECT_ORDER_BY_NODES); - -/// `limit ` — reuses the `show data` non-negative-integer -/// validator (`OFFSET` is Phase 2). -static SELECT_LIMIT_NODES: &[Node] = &[ - Node::Word(Word::keyword("limit")), - Node::NumberLit { - validator: Some(LIMIT_VALIDATOR), - }, -]; -static SELECT_LIMIT: Node = Node::Seq(SELECT_LIMIT_NODES); - -/// `from ` — the FROM clause is optional so that the -/// zero-table form `select 1`, `select upper('x')` (a constant -/// or function-call expression) is admitted alongside the -/// single-table form. The expression slots resolve column -/// completion against the FROM table when present; without it, -/// the engine will reject any column references. -static SELECT_FROM_CLAUSE_NODES: &[Node] = &[ - Node::Word(Word::keyword("from")), - SELECT_FROM_TABLE, -]; -static SELECT_FROM_CLAUSE: Node = Node::Seq(SELECT_FROM_CLAUSE_NODES); - -const SELECT_NODES: &[Node] = &[ - Node::Subgrammar(&SELECT_PROJECTION), - Node::Optional(&SELECT_FROM_CLAUSE), - Node::Optional(&SELECT_WHERE), - Node::Optional(&SELECT_ORDER_BY), - Node::Optional(&SELECT_LIMIT), - // ADR-0030 §3: one statement per submission; a trailing `;` - // is tolerated. - Node::Optional(&Node::Punct(';')), -]; -const SELECT_SHAPE: Node = Node::Seq(SELECT_NODES); - -/// Reject a reference to an internal `__rdbms_*` metadata table -/// in a `SELECT`'s `FROM` (ADR-0030 §6 — those tables are not in -/// the query surface). -fn reject_internal_table(name: &str) -> Result<(), ValidationError> { - if name.to_ascii_lowercase().starts_with("__rdbms_") { - Err(ValidationError { - message_key: "select.internal_table", - args: vec![("table", name.to_string())], - }) - } else { - Ok(()) - } -} +// Phase 1's local `SELECT_*` grammar nodes have been retired in +// favour of `sql_select::SQL_SELECT_TAIL` (ADR-0032 sub-phase +// 2c). The shape definition that `data::SELECT` references now +// lives in the dedicated `sql_select` module — including the +// `reject_internal_table` validator, the `LIMIT` count +// validator, and the projection / FROM / WHERE / ORDER BY +// machinery. The full §1 grammar (JOIN, GROUP BY, HAVING, +// set-ops, qualified refs, subqueries, CTEs) is admitted as a +// natural superset. // ================================================================= // AST builders @@ -1027,12 +901,17 @@ pub static EXPLAIN: CommandNode = CommandNode { help_id: Some("data.explain"), usage_ids: &["parse.usage.explain"],}; -/// SQL `SELECT` (ADR-0030 §6, ADR-0031). Advanced mode only — -/// gated by `grammar::is_advanced_only`. `help_id` is `None` -/// until the `help sql` page lands (ADR-0030 Phase 6). +/// SQL `SELECT` (ADR-0030 §6, ADR-0031, ADR-0032). +/// +/// Advanced mode only — gated by `grammar::is_advanced_only`. +/// The shape is the post-`SELECT` portion of a top-level +/// statement; the registry's entry-word dispatch consumes the +/// leading `SELECT` keyword before the shape walks (sub-phase +/// 2c migration). `help_id` is `None` until the `help sql` +/// page lands (ADR-0030 Phase 6). pub static SELECT: CommandNode = CommandNode { entry: Word::keyword("select"), - shape: SELECT_SHAPE, + shape: Node::Subgrammar(&sql_select::SQL_SELECT_TAIL), ast_builder: build_select, help_id: None, usage_ids: &["parse.usage.select"],}; diff --git a/src/dsl/grammar/sql_select.rs b/src/dsl/grammar/sql_select.rs index 5e21590..2860911 100644 --- a/src/dsl/grammar/sql_select.rs +++ b/src/dsl/grammar/sql_select.rs @@ -660,12 +660,54 @@ static SELECT_STATEMENT_NODES: &[Node] = &[ Node::Subgrammar(&SQL_SELECT_COMPOUND), Node::Optional(&SEMI), ]; -/// The full statement, including the optional `WITH` prefix and -/// a tolerated trailing `;`. This is what `data::SELECT`'s -/// `CommandNode` will reference once sub-phase 2c migrates the -/// Phase-1 grammar. +/// The full statement, including the optional `WITH` prefix +/// and a tolerated trailing `;`. +/// +/// Used by the fragment's own tests and by any future +/// `data::WITH` `CommandNode` that dispatches `WITH …` +/// statements. Top-level `SELECT` statements (entry word +/// `select`) reference `SQL_SELECT_TAIL` instead, which omits +/// the leading `SELECT` keyword that the registry's +/// entry-word dispatch already consumed. pub static SQL_SELECT_STATEMENT: Node = Node::Seq(SELECT_STATEMENT_NODES); +// ================================================================= +// select_statement — entry-consumed form (ADR-0030 §6, 2c) +// ================================================================= + +/// The post-`SELECT` portion of a top-level statement. +/// `data::SELECT`'s `CommandNode` has `entry: Word::keyword +/// ("select")`, so the registry's dispatch consumes the leading +/// `SELECT` keyword before the shape walks. The shape is then +/// the rest of `select_core` (`DISTINCT/ALL`, projection, +/// FROM, WHERE, GROUP BY, HAVING), followed by the compound +/// set-op chain (each subsequent leg's `SELECT` keyword is +/// part of `SET_OP_TAIL`), the outer `ORDER BY` / `LIMIT`, and +/// a tolerated trailing `;`. +/// +/// WITH-prefixed statements (`WITH x AS (…) SELECT …`) are +/// dispatched separately by entry word `with`. Adding a +/// `data::WITH` `CommandNode` is a future sub-phase; for now +/// top-level WITH falls back to the chumsky parser route, the +/// same as in Phase 1. +static SQL_SELECT_TAIL_NODES: &[Node] = &[ + Node::Subgrammar(&DISTINCT_OR_ALL_OPTIONAL), + Node::Subgrammar(&PROJECTION_LIST), + Node::Optional(&FROM_CLAUSE), + Node::Optional(&WHERE_CLAUSE), + Node::Optional(&GROUP_BY_CLAUSE), + Node::Optional(&HAVING_CLAUSE), + Node::Repeated { + inner: &SET_OP_TAIL, + separator: None, + min: 0, + }, + Node::Optional(&ORDER_BY_CLAUSE), + Node::Optional(&LIMIT_CLAUSE), + Node::Optional(&SEMI), +]; +pub static SQL_SELECT_TAIL: Node = Node::Seq(SQL_SELECT_TAIL_NODES); + // ================================================================= // Tests // =================================================================