rdbms-playground/src/dsl/walker/lex_helpers.rs

//! Byte-level helpers for the scannerless walker (ADR-0024
//! §scannerless).
//!
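//! Each helper takes the source string and a byte position. The
//! `match_*` helpers return `Some(end)` and the `consume_*`
//! helpers return `Some((start, end))`, where `end` is the byte
//! index just past the token; both return `None` when nothing
//! matches at that position. `skip_whitespace` always returns a
//! position. Helpers are pure and span-exact; multi-byte UTF-8
//! within identifiers and string literals is handled
//! byte-correctly.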
//!
//! These helpers internally mirror the logic of the legacy
//! `dsl::lexer` module but are invoked per-position by the
//! walker rather than as a pre-pass.

/// Advance past any run of ASCII whitespace beginning at `start`.
///
/// Returns the byte index of the first byte at or after `start`
/// that is not ASCII whitespace, or `source.len()` when only
/// whitespace remains. A `start` beyond the end of `source` is
/// handed back unchanged.
pub fn skip_whitespace(source: &str, start: usize) -> usize {
    match source.as_bytes().get(start..) {
        Some(tail) => {
            let skipped = tail
                .iter()
                .take_while(|byte| byte.is_ascii_whitespace())
                .count();
            start + skipped
        }
        // `start` is past the end: there is nothing to skip.
        None => start,
    }
}

/// Consume an identifier beginning exactly at `start`.
///
/// An identifier opens with an ASCII letter or `_` and continues
/// through any run of ASCII alphanumerics or `_`. On success the
/// result is `Some((start, end))` with `end` one past the last
/// identifier byte; `None` means no identifier starts here
/// (including when `start` is at or beyond the end of `source`).
pub fn consume_ident(source: &str, start: usize) -> Option<(usize, usize)> {
    /// Bytes permitted after the first identifier byte.
    fn is_ident_continue(byte: u8) -> bool {
        byte == b'_' || byte.is_ascii_alphanumeric()
    }

    let tail = source.as_bytes().get(start..)?;
    let (&head, rest) = tail.split_first()?;
    if head != b'_' && !head.is_ascii_alphabetic() {
        return None;
    }
    let continuation = rest
        .iter()
        .take_while(|&&byte| is_ident_continue(byte))
        .count();
    // `+ 1` accounts for the leading byte already accepted above.
    Some((start, start + 1 + continuation))
}

/// Try to match `keyword` at `position`, ignoring ASCII case.
///
/// The match must be followed by a non-identifier byte (anything
/// other than an ASCII alphanumeric or `_`) or by end-of-input,
/// so that `save` doesn't half-match the prefix of `saved`.
///
/// Returns `Some(end)` — the byte index just past the keyword —
/// on a match, and `None` otherwise. A `position` outside
/// `source` (even one so large that adding the keyword length
/// would overflow) yields `None` rather than panicking.
pub fn match_keyword(source: &str, position: usize, keyword: &str) -> Option<usize> {
    let bytes = source.as_bytes();
    // `checked_add` guards against overflow for absurd positions;
    // the checked slice below rejects anything out of range.
    let end = position.checked_add(keyword.len())?;
    let candidate = bytes.get(position..end)?;
    if !candidate.eq_ignore_ascii_case(keyword.as_bytes()) {
        return None;
    }
    // Word-boundary check: reject when the keyword is merely the
    // prefix of a longer identifier.
    match bytes.get(end) {
        Some(&next) if next.is_ascii_alphanumeric() || next == b'_' => None,
        _ => Some(end),
    }
}

/// Consume a bare-path token: the maximal run of bytes starting
/// at `start` that contains no ASCII whitespace.
///
/// Per ADR-0024 the path-bearing UX dropped the "spaces don't
/// need quoting" feature; paths with spaces use `StringLit`.
/// Phase A's `import` / `export` slots use this.
///
/// Returns `Some((start, end))` for a non-empty run, or `None`
/// when `start` is at/after the end of `source` or sits on a
/// whitespace byte.
pub fn consume_bare_path(source: &str, start: usize) -> Option<(usize, usize)> {
    let tail = source.as_bytes().get(start..)?;
    // Length of the non-whitespace run; the whole tail if no
    // whitespace follows.
    let run = tail
        .iter()
        .position(|byte| byte.is_ascii_whitespace())
        .unwrap_or(tail.len());
    if run == 0 {
        None
    } else {
        Some((start, start + run))
    }
}

/// Match a single punctuation character at `position`.
///
/// `ch` is compared against the bytes of `source` using its full
/// UTF-8 encoding, so a non-ASCII `ch` can never be confused with
/// an unrelated ASCII byte that happens to share its low eight
/// bits. Returns `Some(end)` — the byte index just past the
/// matched character — or `None` when `ch` is not present at
/// `position` (including when `position` is out of range).
#[allow(dead_code)]
pub fn match_punct(source: &str, position: usize, ch: char) -> Option<usize> {
    // Encode into a stack buffer; 4 bytes is the maximum UTF-8
    // length of any `char`.
    let mut buf = [0u8; 4];
    let encoded = ch.encode_utf8(&mut buf).as_bytes();
    let tail = source.as_bytes().get(position..)?;
    if tail.starts_with(encoded) {
        Some(position + encoded.len())
    } else {
        None
    }
}