Files
rdbms-playground/src/dsl/walker/lex_helpers.rs
T
claude@clouddev1 7e79ca865a ADR-0024 Phase B: DDL commands without value literals
Migrate the five DDL commands at four entry words: drop (drop
table / drop column / drop relationship), add (add column /
add 1:n relationship), rename (rename column), change (change
column). The walker route now owns these end-to-end; chumsky
declarations remain unreachable for these inputs but stay
until Phase F.

Walker extensions:
- New node kinds: NumberLit (with optional content validator)
  and Literal(&str) (verbatim byte sequence with word-boundary
  lookahead — used for the `1` in `add 1:n …` so it surfaces
  as `\`1\`` in the expected-set, matching the existing
  parse_error_pedagogy contract).
- Flag (--name) terminal — Phase A stubbed; now wired to the
  walker driver with consume_flag() in lex_helpers.
- Repeated combinator with optional separator and `min` floor.
  Used by referential clauses (0..2 `on <delete|update>` runs)
  and change-column flags (0..N --force-conversion /
  --dont-convert; AST builder enforces mutual exclusion).
- Optional now propagates its inner's expectations as a
  `skipped` field on the Matched result. Seq accumulates these
  across children so the next failure's expected-set surfaces
  the full union — closes the keyword-completion regression
  (`add column ` must offer `to`, `table`, plus the table-name
  identifier slot).
- Expectation::Ident gained a `source: IdentSource` field; the
  parser-side bridge maps Tables/Columns/Relationships/Types
  to the IdentSlot::expected_label strings ("table name",
  "column name", …) so the existing completion engine's
  schema-cache lookup still resolves.
- Walker error wording now includes "after `<consumed>`,
  expected …" framing — matches the chumsky-side test
  contract for structural errors mid-shape.
- AST-builder validation errors now propagate as
  WalkOutcome::ValidationFailed (not the generic "AST builder
  failed" fallback), so `change column … --force-conversion
  --dont-convert` and repeated `on delete` clauses surface
  their friendly catalog wording verbatim.

Grammar additions:
- src/dsl/grammar/shared.rs: type-name validator (TYPE_VALIDATOR
  uses Type::from_str via parse.custom.unknown_type catalog),
  qualified_column sub-grammar, referential action keyword
  (`cascade`/`restrict`/`set null`/`no action`), repeated
  on-clauses.
- src/dsl/grammar/ddl.rs: drop/add/rename/change CommandNodes
  with inline shapes (per-use-site `role` annotations let the
  AST builder discriminate parent vs child columns, etc.).
  The four entry words each have one CommandNode whose `shape`
  is a Choice across sub-forms.

Tests:
- 14 new walker-specific tests covering all DDL forms (bare
  drop table, drop column with optional connectives, drop
  relationship by name and by endpoints, add column with type
  validator, rename column, change column with each flag form
  + mutual-exclusion check, add 1:n relationship minimal /
  full, repeated-clause-twice rejection).
- Total: 819 passed, 0 failed, 1 ignored (was 805 / 1).
- cargo clippy --all-targets -- -D warnings clean.
2026-05-15 06:59:27 +00:00

189 lines
5.8 KiB
Rust

//! Byte-level helpers for the scannerless walker (ADR-0024
//! §scannerless).
//!
//! Each helper takes the source string and a byte position.
//! Matching helpers return `Some(..)` carrying either the
//! post-token end position or the full `(start, end)` span, and
//! `None` when nothing matches at that position;
//! `skip_whitespace` always returns a position. Helpers are pure
//! and span-exact; multi-byte UTF-8 within identifiers and string
//! literals is handled byte-correctly.
//!
//! These helpers internally mirror the logic of the legacy
//! `dsl::lexer` module but are invoked per-position by the
//! walker rather than as a pre-pass.
/// Advance past any run of ASCII whitespace beginning at `start`.
///
/// Yields the byte offset of the first byte at or after `start`
/// that is not ASCII whitespace, or `source.len()` when only
/// whitespace remains. A `start` beyond the end of `source` is
/// returned unchanged.
pub fn skip_whitespace(source: &str, start: usize) -> usize {
    let blank_run = source
        .as_bytes()
        .iter()
        .skip(start)
        .take_while(|b| b.is_ascii_whitespace())
        .count();
    start + blank_run
}
/// Consume an identifier beginning exactly at `start`.
///
/// An identifier opens with an ASCII letter or `_` and continues
/// with any number of ASCII alphanumerics or `_`. On success the
/// result is `Some((start, end))`, with `end` one past the last
/// identifier byte; `None` means no identifier starts here
/// (including when `start` is at or past end-of-input).
pub fn consume_ident(source: &str, start: usize) -> Option<(usize, usize)> {
    let tail = source.as_bytes().get(start..)?;
    let (&head, rest) = tail.split_first()?;
    if !head.is_ascii_alphabetic() && head != b'_' {
        return None;
    }
    // Count the continuation bytes that follow the leading byte.
    let continuation = rest
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'_')
        .count();
    Some((start, start + 1 + continuation))
}
/// Case-insensitively match `keyword` at byte offset `position`.
///
/// A match only counts when the byte right after the keyword is
/// not an identifier byte (ASCII alphanumeric or `_`), or when the
/// keyword runs up to end-of-input; this keeps `save` from matching
/// the front of `saved`. Returns the byte offset just past the
/// keyword on success and `None` otherwise.
pub fn match_keyword(source: &str, position: usize, keyword: &str) -> Option<usize> {
    let haystack = source.as_bytes();
    let end = position + keyword.len();
    // `get` yields `None` when the keyword would run past the input.
    let candidate = haystack.get(position..end)?;
    if !candidate.eq_ignore_ascii_case(keyword.as_bytes()) {
        return None;
    }
    match haystack.get(end) {
        // An identifier byte directly after means we only matched a prefix.
        Some(&next) if next.is_ascii_alphanumeric() || next == b'_' => None,
        _ => Some(end),
    }
}
/// Consume a bare path: the maximal run of non-whitespace bytes
/// beginning at `start`.
///
/// Per ADR-0024 the path-bearing UX dropped the "spaces don't
/// need quoting" feature, so paths containing spaces go through
/// `StringLit` instead. Phase A's `import` / `export` slots use
/// this helper.
///
/// Returns `Some((start, end))` for a non-empty run, and `None`
/// when `start` is at or past end-of-input or sits on ASCII
/// whitespace.
pub fn consume_bare_path(source: &str, start: usize) -> Option<(usize, usize)> {
    let run = source
        .as_bytes()
        .get(start..)?
        .iter()
        .take_while(|b| !b.is_ascii_whitespace())
        .count();
    if run == 0 {
        None
    } else {
        Some((start, start + run))
    }
}
/// Match a single punctuation character at `position`.
///
/// The comparison is made against the UTF-8 encoding of `ch`, so a
/// non-ASCII `ch` is matched as a whole character and can never
/// collide with an unrelated ASCII byte (a plain `ch as u8` cast
/// would truncate the code point to its low byte).
///
/// Returns the byte offset just past the matched character, or
/// `None` when `ch` does not occur at `position` (including when
/// `position` is at or past end-of-input).
#[allow(dead_code)]
pub fn match_punct(source: &str, position: usize, ch: char) -> Option<usize> {
    // Four bytes is the maximum length of one UTF-8 encoded char.
    let mut buf = [0u8; 4];
    let encoded: &str = ch.encode_utf8(&mut buf);
    let needle = encoded.as_bytes();
    let rest = source.as_bytes().get(position..)?;
    if rest.starts_with(needle) {
        Some(position + needle.len())
    } else {
        None
    }
}
/// Consume a number literal beginning at `start`.
///
/// Shape: an optional leading `-` (only when a digit follows
/// immediately), one or more digits, then optionally a `.` followed
/// by one or more digits. A trailing `.` with no digit after it is
/// left unconsumed.
///
/// Mirrors `dsl::lexer::lex_number`. Used by Phase B's `add 1:n
/// relationship` form (where the literal `1` lexes as a Number)
/// and by Phase D's value-literal slots.
///
/// Returns `Some((start, end))` on a match, `None` otherwise.
pub fn consume_number_literal(source: &str, start: usize) -> Option<(usize, usize)> {
    let bytes = source.as_bytes();
    // Length of the ASCII-digit run beginning at `from` (0 if out of range).
    let digit_run = |from: usize| -> usize {
        bytes.get(from..).map_or(0, |tail| {
            tail.iter().take_while(|b| b.is_ascii_digit()).count()
        })
    };

    let first = *bytes.get(start)?;
    // A `-` not followed by a digit yields an empty digit run below,
    // which rejects the input just like a non-digit first byte does.
    let int_start = if first == b'-' { start + 1 } else { start };
    let int_len = digit_run(int_start);
    if int_len == 0 {
        return None;
    }

    let mut end = int_start + int_len;
    if bytes.get(end) == Some(&b'.') {
        let frac_len = digit_run(end + 1);
        if frac_len > 0 {
            end += 1 + frac_len;
        }
    }
    Some((start, end))
}
/// Consume a flag token of the form `--name` beginning at `start`.
///
/// The name is one or more bytes drawn from ASCII alphanumerics,
/// `-` and `_`. The returned span covers the leading `--` as well
/// as the name; comparing the name against an expected value is
/// left to the caller. `None` is returned when the input does not
/// begin with `--` here or when the name would be empty.
pub fn consume_flag(source: &str, start: usize) -> Option<(usize, usize)> {
    const PREFIX: &[u8] = b"--";
    let tail = source.as_bytes().get(start..)?;
    if !tail.starts_with(PREFIX) {
        return None;
    }
    let name_len = tail[PREFIX.len()..]
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_'))
        .count();
    if name_len == 0 {
        None
    } else {
        Some((start, start + PREFIX.len() + name_len))
    }
}
/// Consume a single-quoted string literal beginning at `start`,
/// where a doubled quote (`''`) inside the literal stands for one
/// literal `'` (mirrors `dsl::lexer::lex_string`).
///
/// On success returns `((start, end), content)`: `end` is the byte
/// offset just past the closing quote and `content` is the text
/// between the quotes with `''` escapes collapsed. Returns `None`
/// when `start` is not on a `'` or the literal never closes.
#[allow(dead_code)]
pub fn consume_string_literal(source: &str, start: usize) -> Option<((usize, usize), String)> {
    if source.as_bytes().get(start) != Some(&b'\'') {
        return None;
    }
    // The opening quote is a one-byte ASCII char, so `start + 1` is a
    // valid char boundary to slice from.
    let body_start = start + 1;
    let mut content = String::new();
    let mut chars = source[body_start..].char_indices().peekable();
    while let Some((offset, ch)) = chars.next() {
        if ch != '\'' {
            content.push(ch);
            continue;
        }
        if matches!(chars.peek(), Some(&(_, '\''))) {
            // Doubled quote: an escaped `'`; swallow the second one.
            chars.next();
            content.push('\'');
        } else {
            // Lone quote: closes the literal.
            return Some(((start, body_start + offset + 1), content));
        }
    }
    None
}