Files
rdbms-playground/src/dsl/walker/lex_helpers.rs
T
claude@clouddev1 41b7e9a049 style: format the whole tree with cargo fmt (stock defaults, #35)
One-time, mechanical reformat — no functional changes. The tree was not
rustfmt-clean (~1800 hunks across ~100 files); this brings it to stock
`cargo fmt` defaults so a `cargo fmt --check` CI gate can follow.
Behaviour-preserving: 2509 pass / 0 fail / 1 ignored (unchanged baseline),
clippy clean. A .git-blame-ignore-revs entry follows so `git blame`
skips this commit.
2026-06-17 21:39:19 +00:00

187 lines
5.8 KiB
Rust

//! Byte-level helpers for the scannerless walker (ADR-0024
//! §scannerless).
//!
//! Each helper takes the source string and a byte position.
//! Matchers return `Some(..)` on success (either the post-token
//! end position or a `(start, end)` byte span, depending on the
//! helper) or `None` when nothing matches at that position;
//! `skip_whitespace` always returns an index. Helpers are pure
//! and span-exact; multi-byte UTF-8 within identifiers and
//! string literals is handled byte-correctly.
//!
//! These helpers internally mirror the logic of the legacy
//! `dsl::lexer` module but are invoked per-position by the
//! walker rather than as a pre-pass.
/// Advance past the run of ASCII whitespace that begins at `start`.
///
/// Returns the byte index of the first non-whitespace byte at or
/// after `start`. When everything from `start` onwards is
/// whitespace the result is `source.len()`; a `start` already past
/// the end of `source` is returned unchanged.
pub fn skip_whitespace(source: &str, start: usize) -> usize {
    let blank_run = source
        .as_bytes()
        .iter()
        .skip(start)
        .take_while(|b| b.is_ascii_whitespace())
        .count();
    start + blank_run
}
/// Match an identifier beginning at byte offset `start`.
///
/// An identifier starts with an ASCII letter or `_` and continues
/// with any number of ASCII alphanumerics or `_`. Returns
/// `Some((start, end))`, where `end` is the byte just past the
/// identifier, or `None` when `start` is out of range or the byte
/// there cannot begin an identifier.
pub fn consume_ident(source: &str, start: usize) -> Option<(usize, usize)> {
    let is_ident_byte = |b: u8| b.is_ascii_alphanumeric() || b == b'_';

    let tail = source.as_bytes().get(start..)?;
    let (&head, rest) = tail.split_first()?;
    // Digits may continue an identifier but never start one.
    if head.is_ascii_digit() || !is_ident_byte(head) {
        return None;
    }

    let continuation = rest.iter().take_while(|&&b| is_ident_byte(b)).count();
    Some((start, start + 1 + continuation))
}
/// Case-insensitively match `keyword` at byte offset `position`.
///
/// The comparison ignores ASCII case only. A match is rejected when
/// the byte immediately after it is an ASCII alphanumeric or `_`,
/// so `save` does not match the front of `saved`; end-of-input
/// after the keyword is accepted. Returns the byte index just past
/// the keyword on success.
pub fn match_keyword(source: &str, position: usize, keyword: &str) -> Option<usize> {
    let haystack = source.as_bytes();
    let needle = keyword.as_bytes();
    let end = position + needle.len();

    // `get` yields `None` when the keyword would run past the input.
    let candidate = haystack.get(position..end)?;
    if !candidate.eq_ignore_ascii_case(needle) {
        return None;
    }

    // Require a word boundary after the keyword.
    match haystack.get(end) {
        Some(&next) if next.is_ascii_alphanumeric() || next == b'_' => None,
        _ => Some(end),
    }
}
/// Match a bare-path token: the maximal run of bytes starting at
/// `start` that contains no ASCII whitespace.
///
/// Per ADR-0024 the path-bearing UX dropped the "spaces don't
/// need quoting" feature; paths with spaces use `StringLit`.
/// Phase A's `import` / `export` slots use this.
///
/// Returns `Some((start, end))` with `end` just past the run, or
/// `None` when `start` is at/after end-of-input or sits on
/// whitespace.
pub fn consume_bare_path(source: &str, start: usize) -> Option<(usize, usize)> {
    let run_len = source
        .as_bytes()
        .get(start..)?
        .iter()
        .take_while(|b| !b.is_ascii_whitespace())
        .count();

    // An empty run means end-of-input or leading whitespace.
    if run_len == 0 {
        None
    } else {
        Some((start, start + run_len))
    }
}
/// Match a single punctuation character at `position`.
///
/// `ch` is compared using its full UTF-8 encoding rather than a
/// truncating `as u8` cast, so a non-ASCII `ch` can never be
/// mistaken for an unrelated ASCII byte (e.g. U+0128 for `(`) or
/// for the lead byte of a different multi-byte character.
///
/// Returns the byte index just past the matched character
/// (`position + ch.len_utf8()`), or `None` when `position` is out
/// of range or the bytes there are not `ch`.
#[allow(dead_code)]
pub fn match_punct(source: &str, position: usize, ch: char) -> Option<usize> {
    let mut encoded = [0u8; 4];
    let needle = ch.encode_utf8(&mut encoded).as_bytes();

    let tail = source.as_bytes().get(position..)?;
    if tail.starts_with(needle) {
        Some(position + needle.len())
    } else {
        None
    }
}
/// Match a number literal at byte offset `start`.
///
/// Shape: an optional leading `-` (only when a digit follows it
/// directly), one or more ASCII digits, then optionally a `.`
/// followed by one or more ASCII digits. A `.` that is not
/// followed by a digit is left unconsumed.
///
/// Mirrors `dsl::lexer::lex_number`. Used by Phase B's `add 1:n
/// relationship` form (where the literal `1` lexes as a Number)
/// and by Phase D's value-literal slots.
///
/// Returns `Some((start, end))` with `end` just past the literal,
/// or `None` when no number begins at `start`.
pub fn consume_number_literal(source: &str, start: usize) -> Option<(usize, usize)> {
    let bytes = source.as_bytes();
    // Length of the ASCII-digit run beginning at `from` (0 if out of range).
    let digit_run = |from: usize| -> usize {
        bytes.get(from..).map_or(0, |rest| {
            rest.iter().take_while(|b| b.is_ascii_digit()).count()
        })
    };

    // Tentatively accept a `-`; if no digit follows, the integer run
    // below is empty and the whole match is rejected.
    let sign_len = usize::from(bytes.get(start) == Some(&b'-'));
    let int_start = start + sign_len;
    let int_len = digit_run(int_start);
    if int_len == 0 {
        return None;
    }
    let int_end = int_start + int_len;

    // The fractional part only counts when `.` is followed by a digit.
    let frac_len = if bytes.get(int_end) == Some(&b'.') {
        digit_run(int_end + 1)
    } else {
        0
    };
    let end = if frac_len > 0 {
        int_end + 1 + frac_len
    } else {
        int_end
    };
    Some((start, end))
}
/// Match a flag token of the form `--name` at byte offset `start`.
///
/// `name` is one or more bytes drawn from ASCII alphanumerics, `-`
/// and `_`. Returns the span `(start, end)` covering the leading
/// `--` and the name, or `None` when the `--` prefix is missing or
/// the name is empty. The caller checks `name` against an expected
/// value.
pub fn consume_flag(source: &str, start: usize) -> Option<(usize, usize)> {
    const DASHES: &[u8; 2] = b"--";

    let after_dashes = source.as_bytes().get(start..)?.strip_prefix(DASHES)?;
    let name_len = after_dashes
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_')
        .count();

    // A bare `--` with no name is not a flag.
    if name_len == 0 {
        return None;
    }
    Some((start, start + DASHES.len() + name_len))
}
/// Match a single-quoted string literal at byte offset `start`,
/// where a doubled quote (`''`) inside the literal stands for one
/// literal `'` (mirrors `dsl::lexer::lex_string`).
///
/// On success returns the span `(start, end)`, with `end` just past
/// the closing quote, together with the unescaped content. Returns
/// `None` when `start` is not on a `'` or the literal is never
/// closed.
#[allow(dead_code)]
pub fn consume_string_literal(source: &str, start: usize) -> Option<((usize, usize), String)> {
    if source.as_bytes().get(start) != Some(&b'\'') {
        return None;
    }

    // The opening quote is a one-byte char, so `start + 1` is a
    // valid char boundary to slice from.
    let body_start = start + 1;
    let mut unescaped = String::new();
    let mut chars = source[body_start..].char_indices().peekable();

    while let Some((offset, ch)) = chars.next() {
        if ch != '\'' {
            unescaped.push(ch);
            continue;
        }
        // `''` is an escaped quote; consume the second one too.
        if chars.next_if(|&(_, following)| following == '\'').is_some() {
            unescaped.push('\'');
            continue;
        }
        // A lone quote closes the literal.
        return Some(((start, body_start + offset + 1), unescaped));
    }

    // Ran out of input before a closing quote.
    None
}