41b7e9a049
One-time, mechanical reformat — no functional changes. The tree was not rustfmt-clean (~1800 hunks across ~100 files); this brings it to stock `cargo fmt` defaults so a `cargo fmt --check` CI gate can follow. Behaviour-preserving: 2509 pass / 0 fail / 1 ignored (unchanged baseline), clippy clean. A .git-blame-ignore-revs entry follows so `git blame` skips this commit.
187 lines
5.8 KiB
Rust
187 lines
5.8 KiB
Rust
//! Byte-level helpers for the scannerless walker (ADR-0024
|
|
//! §scannerless).
|
|
//!
|
|
//! Each helper takes the source string and a byte position,
|
|
//! returns either `Some(end_position)` (matched, post-token end)
|
|
//! or `None` (didn't match here). Helpers are pure and span-
|
|
//! exact; multi-byte UTF-8 within identifiers and string
|
|
//! literals is handled byte-correctly.
|
|
//!
|
|
//! These helpers internally mirror the logic of the legacy
|
|
//! `dsl::lexer` module but are invoked per-position by the
|
|
//! walker rather than as a pre-pass.
|
|
|
|
/// Advance past ASCII whitespace beginning at byte offset `start`.
///
/// Yields the offset of the first byte at or after `start` that is
/// not ASCII whitespace, or `source.len()` when nothing but
/// whitespace remains. A `start` beyond the end of `source` is
/// handed back unchanged.
pub fn skip_whitespace(source: &str, start: usize) -> usize {
    // Count the whitespace run instead of stepping an index by hand.
    let run = source
        .as_bytes()
        .iter()
        .skip(start)
        .take_while(|b| b.is_ascii_whitespace())
        .count();
    start + run
}
|
|
|
|
/// Consume an identifier beginning at byte offset `start`.
///
/// An identifier opens with an ASCII letter or `_` and continues
/// with any run of ASCII alphanumerics or `_`. On success the
/// result is `Some((start, end))`, `end` being the offset just past
/// the last identifier byte. `None` means `start` is out of range
/// or the byte there cannot open an identifier.
pub fn consume_ident(source: &str, start: usize) -> Option<(usize, usize)> {
    let (&head, tail) = source.as_bytes().get(start..)?.split_first()?;
    if !head.is_ascii_alphabetic() && head != b'_' {
        return None;
    }
    // Length of the continuation run that follows the opening byte.
    let tail_len = tail
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'_')
        .count();
    Some((start, start + 1 + tail_len))
}
|
|
|
|
/// Match `keyword` at byte offset `position`, ignoring ASCII case.
///
/// A hit additionally requires that the byte right after the
/// keyword is not an identifier byte (ASCII alphanumeric or `_`),
/// or that the keyword runs to end-of-input; this keeps `save` from
/// matching the front of `saved`.
///
/// Returns the byte offset just past the keyword on success and
/// `None` otherwise.
pub fn match_keyword(source: &str, position: usize, keyword: &str) -> Option<usize> {
    let haystack = source.as_bytes();
    let end = position + keyword.len();
    // `get` rejects a window that would run past the end of input.
    let candidate = haystack.get(position..end)?;
    if !candidate.eq_ignore_ascii_case(keyword.as_bytes()) {
        return None;
    }
    match haystack.get(end) {
        // An identifier byte follows: this is only a prefix of a longer word.
        Some(&next) if next.is_ascii_alphanumeric() || next == b'_' => None,
        _ => Some(end),
    }
}
|
|
|
|
/// Consume a bare-path token: the maximal run of bytes starting at
/// `start` that contains no ASCII whitespace.
///
/// Per ADR-0024 the path-bearing UX dropped the "spaces don't need
/// quoting" feature, so paths containing spaces go through
/// `StringLit` instead. Phase A's `import` / `export` slots use
/// this helper.
///
/// Returns `Some((start, end))` for a non-empty run; `None` when
/// `start` is at or past end-of-input or sits on whitespace.
pub fn consume_bare_path(source: &str, start: usize) -> Option<(usize, usize)> {
    let rest = source.as_bytes().get(start..)?;
    // The run ends at the first whitespace byte, or at end-of-input.
    let run = rest
        .iter()
        .position(|b| b.is_ascii_whitespace())
        .unwrap_or(rest.len());
    match run {
        0 => None,
        len => Some((start, start + len)),
    }
}
|
|
|
|
/// Match the single punctuation character `ch` at byte offset
/// `position`.
///
/// The comparison is made against `ch`'s complete UTF-8 encoding
/// rather than a truncating `ch as u8` cast, so a non-ASCII `ch`
/// cannot spuriously match a source byte that merely equals the low
/// eight bits of its code point, and a multi-byte `ch` matches its
/// real encoding.
///
/// Returns the byte offset just past the matched character, or
/// `None` when `position` is out of range or the bytes there are
/// not `ch`.
// NOTE(review): `dead_code` is allowed presumably because no caller
// exists in every build configuration yet; confirm it is still needed.
#[allow(dead_code)]
pub fn match_punct(source: &str, position: usize, ch: char) -> Option<usize> {
    let mut buf = [0u8; 4];
    let needle = ch.encode_utf8(&mut buf).as_bytes();
    let rest = source.as_bytes().get(position..)?;
    if rest.starts_with(needle) {
        Some(position + needle.len())
    } else {
        None
    }
}
|
|
|
|
/// Consume a number literal at byte offset `start`.
///
/// Shape: an optional leading `-` (only when a digit follows it
/// directly), one or more ASCII digits, then optionally a `.` that
/// is followed by one or more ASCII digits. A `.` with no digit
/// after it is left unconsumed.
///
/// Mirrors `dsl::lexer::lex_number`. Used by Phase B's `add 1:n
/// relationship` form (where the literal `1` lexes as a Number)
/// and by Phase D's value-literal slots.
///
/// Returns `Some((start, end))` on a match, `None` otherwise.
pub fn consume_number_literal(source: &str, start: usize) -> Option<(usize, usize)> {
    let bytes = source.as_bytes();
    // Length of the ASCII-digit run beginning at `from` (0 if out of range).
    let digit_run = |from: usize| -> usize {
        bytes
            .get(from..)
            .map_or(0, |tail| tail.iter().take_while(|b| b.is_ascii_digit()).count())
    };

    // A sign is only meaningful if digits follow; the empty-run check
    // below rejects a lone or misplaced `-`.
    let int_start = match bytes.get(start)? {
        b'-' => start + 1,
        _ => start,
    };
    let int_len = digit_run(int_start);
    if int_len == 0 {
        return None;
    }

    let mut end = int_start + int_len;
    if bytes.get(end) == Some(&b'.') {
        let frac_len = digit_run(end + 1);
        if frac_len > 0 {
            end += 1 + frac_len;
        }
    }
    Some((start, end))
}
|
|
|
|
/// Consume a flag token of the form `--name` at byte offset `start`.
///
/// `name` is a non-empty run of ASCII alphanumerics, `-`, or `_`.
/// The returned span `(start, end)` includes the leading `--`;
/// comparing `name` against an expected value is left to the caller.
/// `None` when the `--` prefix is absent or no name byte follows it.
pub fn consume_flag(source: &str, start: usize) -> Option<(usize, usize)> {
    const PREFIX: &[u8] = b"--";

    let rest = source.as_bytes().get(start..)?;
    if !rest.starts_with(PREFIX) {
        return None;
    }
    let name_len = rest[PREFIX.len()..]
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_')
        .count();
    match name_len {
        0 => None,
        len => Some((start, start + PREFIX.len() + len)),
    }
}
|
|
|
|
/// Consume a single-quoted string literal at byte offset `start`,
/// treating a doubled quote (`''`) inside the literal as one
/// literal `'` (mirrors `dsl::lexer::lex_string`).
///
/// On success returns `((start, end), content)`, where `end` is the
/// offset just past the closing quote and `content` is the text
/// between the quotes with `''` escapes collapsed. Returns `None`
/// when `start` is not at a `'` or the literal is never closed.
#[allow(dead_code)]
pub fn consume_string_literal(source: &str, start: usize) -> Option<((usize, usize), String)> {
    // Everything after the opening quote; offsets from `char_indices`
    // are relative to this slice, hence `body_offset`.
    let body = source.get(start..)?.strip_prefix('\'')?;
    let body_offset = start + 1;

    let mut unescaped = String::new();
    let mut chars = body.char_indices().peekable();
    while let Some((rel, ch)) = chars.next() {
        if ch != '\'' {
            unescaped.push(ch);
            continue;
        }
        if matches!(chars.peek(), Some(&(_, '\''))) {
            // Doubled quote: swallow the second one and emit a single `'`.
            chars.next();
            unescaped.push('\'');
        } else {
            // Lone quote closes the literal.
            return Some(((start, body_offset + rel + 1), unescaped));
        }
    }
    // Ran out of input before a closing quote.
    None
}
|