rdbms-playground/src/dsl/grammar/sql_insert.rs

//! SQL `INSERT` grammar (ADR-0033 §1, sub-phase 3b).
//!
//! Grammar-as-text (ADR-0030 §4): the walker validates that the
//! `INSERT` is in the supported subset; the worker executes the
//! validated SQL text and re-persists the target table's CSV
//! (ADR-0030 §11). The shape here is the post-`INSERT` portion —
//! the entry-word dispatch consumes the leading `INSERT` keyword
//! before this shape walks (mirroring `sql_select::SQL_SELECT_TAIL`).
//!
//! Scope: single- and multi-row `VALUES` with an optional
//! `(column_name_list)` and the `__rdbms_*` target rejection (3b),
//! an `INSERT … SELECT` row source (3c), an optional `ON CONFLICT … `
//! UPSERT clause (3h), and an optional `RETURNING` tail (3g).
//! `shortid` auto-fill (3d) is not a grammar concern: an omitted
//! `shortid` is filled at execution time.

use crate::completion::TableColumn;
use crate::dsl::grammar::shared::{SET_VALUE, count_tuple_values};
use crate::dsl::grammar::sql_expr;
use crate::dsl::grammar::sql_select::{
    RETURNING_CLAUSE, SQL_SELECT_COMPOUND, WHERE_CLAUSE, reject_internal_table,
};
use crate::dsl::grammar::{IdentSource, Node, Word};
use crate::dsl::walker::context::WalkContext;

/// The `,` punctuation node. Shared (by reference) as the separator of
/// the comma-separated `Repeated` lists defined in this file.
static COMMA: Node = Node::Punct(',');

/// The `INSERT` target table. `__rdbms_*` rejected (ADR-0030 §6 /
/// ADR-0033 §1). `writes_table` populates `current_table` /
/// `current_table_columns` so the optional column list and the
/// `VALUES` expressions get column completion against the target.
///
/// Declared `const` (not `static`) because it is placed by value in
/// `SQL_INSERT_TAIL_NODES`.
const TARGET_TABLE: Node = Node::Ident {
    source: IdentSource::Tables,
    role: "insert_target_table",
    // Rejects internal `__rdbms_*` tables as an INSERT target.
    validator: Some(reject_internal_table),
    highlight_override: None,
    // The only context write this ident performs: the target table.
    writes_table: true,
    writes_column: false,
    writes_user_listed_column: false,
    writes_table_alias: false,
    writes_cte_name: false,
    writes_projection_alias: false,
};

/// One column name inside the optional `(col1, col2, …)` list.
///
/// `writes_user_listed_column: true` records the listed columns into
/// `WalkContext::user_listed_columns` so the `VALUES` factory
/// (`sql_value_list`, ADR-0036 Phase 3b) maps each value position to
/// the listed column in the user's order (Form A). `build_sql_insert`
/// still collects `listed_columns` independently from the matched
/// `insert_column` idents, so this flag only adds the live typed-slot
/// mapping — nothing else reads `user_listed_columns` on the SQL path.
///
/// A `static` suffices here: the node is only referenced via
/// `&COLUMN_NAME` from the column list's `Repeated` node.
static COLUMN_NAME: Node = Node::Ident {
    source: IdentSource::Columns,
    role: "insert_column",
    validator: None,
    highlight_override: None,
    writes_table: false,
    writes_column: false,
    // Feeds `target_value_columns` (Form A positional mapping).
    writes_user_listed_column: true,
    writes_table_alias: false,
    writes_cte_name: false,
    writes_projection_alias: false,
};

/// `'(' column (',' column)* ')'` — the parenthesised insert column
/// list; at least one column is required between the parentheses.
static COLUMN_LIST_NODES: &[Node] = &[
    Node::Punct('('),
    Node::Repeated {
        inner: &COLUMN_NAME,
        separator: Some(&COMMA),
        min: 1,
    },
    Node::Punct(')'),
];
/// The column list wrapped in `Optional`, so `INSERT INTO t VALUES …`
/// (no list) is admitted too. `const` because it is placed by value in
/// `SQL_INSERT_TAIL_NODES`.
const OPTIONAL_COLUMN_LIST: Node = Node::Optional(&Node::Seq(COLUMN_LIST_NODES));

/// One value expression inside a `VALUES` tuple. Consumes the
/// shared `sql_expr` grammar (ADR-0031), so literals, operators,
/// `CASE`, function calls, etc. are all admitted; the engine
/// evaluates them at execution time. Used as the schemaless / fallback
/// value (see `sql_value_list`).
///
/// Referenced only through `&VALUE_EXPR` by `fallback_value_list`.
static VALUE_EXPR: Node = Node::Subgrammar(&sql_expr::SQL_OR_EXPR);

/// Type-blind value list: one or more comma-separated `sql_expr`
/// values (the pre-Phase-3b `Repeated(sql_expr)` shape).
///
/// Chosen for schemaless walks and — importantly — for every tuple
/// whose value count does NOT equal the target column count. Keeping
/// such tuples on the repeat means all their values land in the matched
/// path, so the post-walk per-tuple arity diagnostic (ADR-0033 §8.1)
/// can still report its friendly message; a fixed-length typed `Seq`
/// would reject the tuple outright and hide that diagnostic.
fn fallback_value_list() -> Node {
    let inner = &VALUE_EXPR;
    let separator = Some(&COMMA);
    Node::Repeated {
        inner,
        separator,
        min: 1,
    }
}

/// The target columns a `VALUES` tuple's positions map onto (ADR-0036
/// Phase 3b). Mirrors `db::do_sql_insert`'s positional rule — NOT the
/// DSL's `column_value_list`:
///   - **Form A** (`user_listed_columns` set, from the `(col, …)`
///     list): the listed columns, in the user's order. An *omitted*
///     `shortid` is auto-filled at execution (the X4 note) and has no
///     `VALUES` position, so it is correctly absent here.
///   - **Form B** (no column list): ALL columns in declaration order,
///     including `serial` / `shortid` — advanced-mode Form B auto-fills
///     *nothing* (`plan_autogen_autofill` returns early on an empty
///     column list), so the user supplies a value for every column.
///
/// Listed names are matched against the table's columns ignoring ASCII
/// case.
///
/// Empty when schemaless, the table is unknown, or **any** Form A name
/// fails to resolve (callers fall back to the type-blind list). The
/// result is positional: entry *i* types `VALUES` position *i*. Dropping
/// only the unresolved names would shift every later column one slot to
/// the left and attach the wrong column's typed slot to the user's
/// values, so a partially resolvable list yields nothing at all.
fn target_value_columns(ctx: &WalkContext) -> Vec<TableColumn> {
    let Some(table_cols) = ctx.current_table_columns.as_ref() else {
        return Vec::new();
    };
    // Form B: no explicit list, so every declared column has a position.
    let Some(listed) = ctx.user_listed_columns.as_ref() else {
        return table_cols.clone();
    };
    // Form A: resolve each listed name in the user's order. Collecting
    // into `Option<Vec<_>>` short-circuits to `None` on the first name
    // that does not resolve, keeping position and column aligned.
    listed
        .iter()
        .map(|name| {
            table_cols
                .iter()
                .find(|c| c.name.eq_ignore_ascii_case(name))
                .cloned()
        })
        .collect::<Option<Vec<_>>>()
        .unwrap_or_default()
}

// `count_tuple_values` moved to `grammar::shared` (issue #17) so the
// simple-mode DSL insert arity gate can share it; the advanced grammar
// imports it above.

/// Lookahead that picks the value-list grammar for one `VALUES` tuple
/// (ADR-0036 Phase 3b).
///
/// The typed per-column `Seq` has a fixed length, so it is selected
/// only where it can actually match; every other tuple stays on the
/// type-blind repeat, which keeps the per-tuple arity diagnostic
/// (ADR-0033 §8.1) alive — a fixed-length `Seq` would reject a
/// wrong-arity tuple and suppress it. The arity gate:
///   - **closed** tuple → typed slots only on an *exact* match
///     (`count == columns`);
///   - **open** (still being typed) tuple → typed slots while there is
///     room left (`count <= columns`), so the per-column hint appears
///     as soon as `(` is opened and follows each position.
///
/// Only a small node is returned — the heavy typed `Seq` is built and
/// memoized by the `DynamicSubgrammar` — matching
/// `insert_first_paren`'s leak discipline. With no known target columns
/// (schemaless / unknown table) the type-blind fallback is used.
fn tuple_value_list(ctx: &WalkContext, source: &str, pos: usize) -> Node {
    let width = target_value_columns(ctx).len();
    let fits = match count_tuple_values(source, pos) {
        // Closed tuple: the value count must match exactly.
        (given, true) => given == width,
        // Open tuple: still room for the value being typed.
        (given, false) => given <= width,
    };
    if width == 0 || !fits {
        return fallback_value_list();
    }
    Node::DynamicSubgrammar(sql_value_list)
}

/// Builds the schema-aware typed value list for one correct-arity
/// `VALUES` tuple (ADR-0036 Phase 3b).
///
/// For each target column the sequence holds a zero-width
/// `SetColumn(col)` marker (which establishes the active column) and
/// then the shared boundary-aware [`SET_VALUE`] slot, with a `,`
/// between consecutive columns. A lone literal therefore routes to the
/// column's typed slot (live hint + numeric-shape highlight), while any
/// other expression falls through to `sql_expr`.
///
/// Reached only through [`tuple_value_list`], i.e. when the arity
/// matches and the schema is known; the empty-columns check is purely
/// defensive. Each column and the node slice are intentionally leaked
/// to obtain the `'static` references the `Node` tree requires.
fn sql_value_list(ctx: &WalkContext) -> Node {
    let columns = target_value_columns(ctx);
    if columns.is_empty() {
        return fallback_value_list();
    }
    let mut slots: Vec<Node> = Vec::with_capacity(columns.len() * 3);
    for (index, column) in columns.into_iter().enumerate() {
        // Separator before every column except the first.
        if index != 0 {
            slots.push(Node::Punct(','));
        }
        let column_ref: &'static TableColumn = Box::leak(Box::new(column));
        slots.extend([Node::SetColumn(column_ref), SET_VALUE]);
    }
    let sequence: &'static [Node] = Box::leak(slots.into_boxed_slice());
    Node::Seq(sequence)
}

/// Children of one value tuple: `(`, the value list chosen per tuple by
/// the `tuple_value_list` lookahead, then `)`.
static VALUE_TUPLE_NODES: &[Node] = &[
    Node::Punct('('),
    Node::Lookahead(tuple_value_list),
    Node::Punct(')'),
];
/// `'(' <value-list> ')'` — one row of values. The value list is the
/// arity-gated `tuple_value_list` (ADR-0036 Phase 3b): a correct-arity
/// tuple gets per-column typed slots; a wrong-arity tuple keeps the
/// type-blind `sql_expr` repeat so the §8.1 arity diagnostic fires.
static VALUE_TUPLE: Node = Node::Seq(VALUE_TUPLE_NODES);

/// Children of the `VALUES` clause: the `values` keyword followed by
/// one or more comma-separated value tuples.
static VALUES_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("values")),
    Node::Repeated {
        inner: &VALUE_TUPLE,
        separator: Some(&COMMA),
        min: 1,
    },
];
/// `VALUES tuple (',' tuple)*` — single- or multi-row.
///
/// `const` because it is placed by value in `ROW_SOURCE_CHOICES`.
const VALUES_CLAUSE: Node = Node::Seq(VALUES_CLAUSE_NODES);

/// The row source: either a `VALUES` clause or a `SELECT`
/// compound (ADR-0033 §4, sub-phase 3c). `SQL_SELECT_COMPOUND`
/// is itself a Choice that admits a leading `WITH` (ADR-0032
/// §10.3), so `INSERT INTO t WITH x AS (…) SELECT …` parses
/// through this slot for free (R4). The two branches start on
/// disjoint keywords (`values` vs `select`/`with`), so the
/// Choice never ambiguously commits.
static ROW_SOURCE_CHOICES: &[Node] = &[VALUES_CLAUSE, Node::Subgrammar(&SQL_SELECT_COMPOUND)];
/// The `Choice` over `ROW_SOURCE_CHOICES`. `const` because it is placed
/// by value in `SQL_INSERT_TAIL_NODES`.
const ROW_SOURCE: Node = Node::Choice(ROW_SOURCE_CHOICES);

// =================================================================
// ON CONFLICT … DO NOTHING / DO UPDATE  (ADR-0033 §9, sub-phase 3h)
// =================================================================

/// One column in the optional `ON CONFLICT (col, …)` conflict
/// target. A DISTINCT role from `insert_column` — the conflict
/// target names existing unique-constraint columns, not the
/// inserted column list, and `build_sql_insert` collects only
/// `insert_column` into `listed_columns` (which drives `shortid`
/// auto-fill). Sharing the role would corrupt that set.
///
/// All `writes_*` flags are off: matching a conflict-target column
/// records nothing into the walk context.
static CONFLICT_TARGET_COLUMN: Node = Node::Ident {
    source: IdentSource::Columns,
    role: "conflict_target_column",
    validator: None,
    highlight_override: None,
    writes_table: false,
    writes_column: false,
    // Deliberately false so these names never join the insert column
    // list used for the `VALUES` positional mapping.
    writes_user_listed_column: false,
    writes_table_alias: false,
    writes_cte_name: false,
    writes_projection_alias: false,
};

/// `'(' column (',' column)* ')'` — the parenthesised conflict target;
/// at least one column is required between the parentheses.
static CONFLICT_TARGET_NODES: &[Node] = &[
    Node::Punct('('),
    Node::Repeated {
        inner: &CONFLICT_TARGET_COLUMN,
        separator: Some(&COMMA),
        min: 1,
    },
    Node::Punct(')'),
];
/// Optional `(col, …)` conflict target — which unique constraint
/// to react to. Standard SQL allows omitting it (any conflict).
///
/// `const` because it is placed by value in `ON_CONFLICT_CLAUSE_NODES`.
const OPTIONAL_CONFLICT_TARGET: Node = Node::Optional(&Node::Seq(CONFLICT_TARGET_NODES));

/// The column on the left of one `DO UPDATE SET col = expr`
/// assignment. Mirrors `sql_update`'s `ASSIGN_COLUMN` shape (same
/// `update_set_column` role so it gets the same column completion /
/// diagnostics against the target table). `writes_column: true`
/// resolves the column type into `current_column` so the RHS
/// `SET_VALUE` lookahead can dispatch the typed slot for a lone
/// literal (ADR-0036 Phase 3a).
///
/// `const` because it is placed by value in `UPSERT_ASSIGNMENT_NODES`.
const UPSERT_SET_COLUMN: Node = Node::Ident {
    source: IdentSource::Columns,
    role: "update_set_column",
    validator: None,
    highlight_override: None,
    writes_table: false,
    // Makes this column the active one for the assignment's RHS slot.
    writes_column: true,
    writes_user_listed_column: false,
    writes_table_alias: false,
    writes_cte_name: false,
    writes_projection_alias: false,
};

/// `column '=' <value>` — the RHS is the boundary-aware `SET_VALUE`
/// slot (ADR-0036 Phase 3a), shared with `sql_update`: a lone literal
/// routes to the column-typed slot (live hint + highlight) while an
/// expression — `excluded.col`, operators, `CASE`, function calls —
/// falls through to the full `sql_expr` grammar (ADR-0031). `excluded`
/// is the would-have-been-inserted row (ADR-0033 §9); it parses as a
/// qualified ref via `sql_expr` and the engine resolves it.
static UPSERT_ASSIGNMENT_NODES: &[Node] = &[UPSERT_SET_COLUMN, Node::Punct('='), SET_VALUE];
/// One assignment as a `Seq` node, so the list below can reference it
/// as its repeated `inner`.
static UPSERT_ASSIGNMENT: Node = Node::Seq(UPSERT_ASSIGNMENT_NODES);
// `const` — used by value in `DO_UPDATE_NODES` (static-vs-const
// rule: a `Node` referenced by value in a `static [...]` must be
// `const`; `inner: &UPSERT_ASSIGNMENT` is fine since that one is
// referenced via `&`).
/// `assignment (',' assignment)*` — one or more comma-separated
/// `DO UPDATE SET` assignments.
const UPSERT_ASSIGNMENT_LIST: Node = Node::Repeated {
    inner: &UPSERT_ASSIGNMENT,
    separator: Some(&COMMA),
    min: 1,
};

/// `UPDATE SET <assignment-list> [ <where-clause> ]` — the body of the
/// `DO UPDATE` action. The leading `do` keyword is matched by the
/// enclosing `ON CONFLICT` clause, not here.
static DO_UPDATE_NODES: &[Node] = &[
    Node::Word(Word::keyword("update")),
    Node::Word(Word::keyword("set")),
    UPSERT_ASSIGNMENT_LIST,
    Node::Optional(&WHERE_CLAUSE),
];
/// The action after the shared `do`: `NOTHING | UPDATE SET … [ WHERE
/// … ]`. The `do` keyword is factored OUT of this Choice
/// deliberately. A Choice whose branches *shared* a `do` prefix
/// would break on the walker's `walk_seq`/`walk_choice` interaction
/// (ADR-0033 Amendment 1): a branch matching `do` then failing its
/// *second* token returns a hard `Failed` past idx 0, which stops
/// `walk_choice` from trying the next branch. With `do` hoisted into
/// the enclosing Seq, each branch's FIRST token (`nothing` vs
/// `update`) disambiguates, so a non-match of branch 0 is a clean
/// `NoMatch` that falls through to branch 1.
static DO_ACTION_CHOICES: &[Node] = &[
    Node::Word(Word::keyword("nothing")),
    Node::Seq(DO_UPDATE_NODES),
];
// `const` — used by value in `ON_CONFLICT_CLAUSE_NODES`.
/// The `Choice` over `DO_ACTION_CHOICES` (`NOTHING` or `UPDATE SET …`).
const DO_ACTION: Node = Node::Choice(DO_ACTION_CHOICES);

/// Children of the `ON CONFLICT` clause, in order: `on`, `conflict`,
/// the optional conflict target, the shared `do` keyword, then the
/// action (`DO_ACTION`).
static ON_CONFLICT_CLAUSE_NODES: &[Node] = &[
    Node::Word(Word::keyword("on")),
    Node::Word(Word::keyword("conflict")),
    OPTIONAL_CONFLICT_TARGET,
    Node::Word(Word::keyword("do")),
    DO_ACTION,
];
/// `ON CONFLICT [ (col, …) ] DO ( NOTHING | UPDATE SET … )`
/// (ADR-0033 §9). Sits between the row source and `RETURNING` in
/// `SQL_INSERT_SHAPE`.
static ON_CONFLICT_CLAUSE: Node = Node::Seq(ON_CONFLICT_CLAUSE_NODES);

/// Children of the INSERT tail, in order: `into`, the target table, the
/// optional column list, the row source, then the optional
/// `ON CONFLICT` clause, optional `RETURNING` clause and optional `;`.
static SQL_INSERT_TAIL_NODES: &[Node] = &[
    Node::Word(Word::keyword("into")),
    TARGET_TABLE,
    OPTIONAL_COLUMN_LIST,
    ROW_SOURCE,
    Node::Optional(&ON_CONFLICT_CLAUSE),
    Node::Optional(&RETURNING_CLAUSE),
    Node::Optional(&Node::Punct(';')),
];

/// The post-`INSERT` portion of a SQL `INSERT` statement
/// (ADR-0033 §1): `INTO <table> [ '(' col_list ')' ]
/// ( VALUES <tuple> (',' <tuple>)* | <select-compound> )
/// [ ON CONFLICT … ] [ RETURNING … ] [ ';' ]`.
///
/// The entry-word dispatch consumes the leading `INSERT` keyword
/// before this shape walks, so a `CommandNode` references it as
/// its `shape` (sub-phase 3b registers a development entry word;
/// sub-phase 3j wires the shared `insert` entry word).
pub static SQL_INSERT_SHAPE: Node = Node::Seq(SQL_INSERT_TAIL_NODES);

// =================================================================
// Tests — grammar accept/reject for the post-`INSERT` tail.
// =================================================================

#[cfg(test)]
mod tests {
    use super::SQL_INSERT_SHAPE;
    use crate::dsl::walker::context::WalkContext;
    use crate::dsl::walker::driver::{NodeWalkResult, walk_node};
    use crate::dsl::walker::outcome::MatchedPath;

    /// Walk `input` against the INSERT tail. Returns `true` only
    /// when the walk matches *and* consumes all of `input`
    /// (trailing whitespace allowed). Schemaless context: the
    /// shape is structural, so table/column idents match by shape
    /// and `reject_internal_table` still fires on `__rdbms_*`.
    fn walks(input: &str) -> bool {
        let mut ctx = WalkContext::new();
        let mut path = MatchedPath::new();
        let mut per_byte = Vec::new();
        match walk_node(
            input,
            0,
            &SQL_INSERT_SHAPE,
            &mut ctx,
            &mut path,
            &mut per_byte,
        ) {
            // A match only counts when nothing but whitespace is left.
            NodeWalkResult::Matched { end, .. } => input[end..].trim().is_empty(),
            // Every non-`Matched` outcome is treated as "does not walk".
            _ => false,
        }
    }

    /// Asserts that `input` walks as a complete INSERT tail (see
    /// `walks`).
    fn good(input: &str) {
        assert!(walks(input), "{input:?} should be a valid INSERT tail");
    }

    /// Asserts that `input` does NOT walk as a complete INSERT tail:
    /// either the walk does not match or it leaves non-whitespace input
    /// unconsumed.
    fn bad(input: &str) {
        assert!(
            !walks(input),
            "{input:?} should NOT walk as a complete INSERT tail"
        );
    }

    #[test]
    fn single_row_values() {
        // One tuple, mixed literal kinds, with and without a trailing `;`.
        good("into orders values (1, 2.0)");
        good("into orders values (1, 'text', true, null)");
        good("into orders values (1);");
    }

    #[test]
    fn multi_row_values() {
        // Several comma-separated tuples.
        good("into orders values (1, 'a'), (2, 'b')");
        good("into orders values (1), (2), (3)");
        good("into orders values (1, 'a'), (2, 'b');");
    }

    #[test]
    fn explicit_column_list() {
        // Optional `(col, …)` list before VALUES.
        good("into orders (id, total) values (1, 2.0)");
        good("into orders (id) values (1)");
        good("into orders (a, b, c) values (1, 2, 3), (4, 5, 6)");
    }

    #[test]
    fn value_expressions_admit_sql_expr() {
        // Values are full expressions, not just literals.
        good("into t values (1 + 2)");
        good("into t values (case when 1 > 0 then 'y' else 'n' end)");
    }

    #[test]
    fn returning_tail_admitted() {
        // 3g: optional RETURNING projection_list tail, on both row
        // sources.
        good("into orders values (1, 2.0) returning *");
        good("into orders (id, total) values (1, 2.0) returning id");
        good("into orders values (1, 'a'), (2, 'b') returning id, total");
        good("into archive select * from orders returning *");
        good("into orders values (1) returning id as new_id;");
    }

    #[test]
    fn on_conflict_clause_admitted() {
        // 3h: ON CONFLICT … DO NOTHING / DO UPDATE (ADR-0033 §9).
        good("into t (id, name) values (1, 'x') on conflict (id) do nothing");
        good("into t (id, name) values (1, 'x') on conflict do nothing");
        good(
            "into t (id, name) values (1, 'x') on conflict (id) do update set name = excluded.name",
        );
        good(
            "into t (id, name) values (1, 'x') on conflict (id) do update set name = 'y' where id > 0",
        );
        // Multi-column conflict target + multi-assignment DO UPDATE.
        good("into t (a, b) values (1, 2) on conflict (a, b) do update set b = excluded.b, a = 9");
        // ON CONFLICT composes with RETURNING (order: row source,
        // ON CONFLICT, RETURNING).
        good("into t (id) values (1) on conflict (id) do nothing returning *");
        good("into t (id) values (1) on conflict (id) do update set id = excluded.id returning id");
    }

    #[test]
    fn on_conflict_structurally_incomplete_rejected() {
        // `do` with no action.
        bad("into t (id) values (1) on conflict (id) do");
        // DO UPDATE with no SET.
        bad("into t (id) values (1) on conflict (id) do update");
        // DO UPDATE SET with no assignment.
        bad("into t (id) values (1) on conflict (id) do update set");
        // Bare ON with no CONFLICT.
        bad("into t (id) values (1) on do nothing");
    }

    #[test]
    fn internal_target_table_rejected() {
        // `__rdbms_*` targets are rejected, with or without a column list.
        bad("into __rdbms_playground_columns values (1)");
        bad("into __rdbms_playground_relationships (a) values (1)");
    }

    #[test]
    fn select_row_source() {
        // 3c: the row source is a Choice between VALUES and a
        // SELECT compound (which itself admits a leading WITH).
        good("into archive select * from orders");
        good("into archive select * from orders where created < '2025-01-01'");
        good("into archive select * from orders;");
    }

    #[test]
    fn select_row_source_with_column_list() {
        // The optional column list also precedes a SELECT row source.
        good("into target (a, b) select x, y from source");
        good("into target (id) select id from source");
    }

    #[test]
    fn with_prefixed_select_row_source() {
        // R4 invariant: a WITH-prefixed SELECT row source parses
        // through SQL_SELECT_COMPOUND's WITH-prefixed branch.
        good("into archive with t as (select * from orders) select * from t");
        good(
            "into summary (id, total) with t as (select * from orders) \
             select id, total from t",
        );
    }

    #[test]
    fn select_row_source_rejects_internal_from_table() {
        // DA gate: the SELECT's FROM slot must still reject
        // `__rdbms_*` tables (Phase-2 gate, not silently dropped on
        // the DML path).
        bad("into archive select * from __rdbms_playground_columns");
    }

    #[test]
    fn incomplete_select_row_source_rejected() {
        // A bare `select` with no projection is not a complete row
        // source.
        bad("into archive select");
        bad("into archive select * from");
    }

    #[test]
    fn structurally_incomplete_or_wrong_rejected() {
        // Missing VALUES.
        bad("into orders");
        bad("into orders (id, total)");
        // Empty value tuple — at least one expression required.
        bad("into orders values ()");
        // Missing INTO.
        bad("orders values (1)");
        // Trailing comma with no following tuple.
        bad("into orders values (1),");
        // Unclosed tuple.
        bad("into orders values (1, 2");
    }
}