rdbms-playground/src/dsl/walker/lex_helpers.rs

//! Byte-level helpers for the scannerless walker (ADR-0024
//! §scannerless).
//!
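//! Each matching helper takes the source string and a byte
//! position and returns `Some(..)` on a match, carrying either
//! the post-token end position or a `(start, end)` byte span,
//! or `None` when nothing matches there. `skip_whitespace` is
//! the exception: it always returns a byte position. Helpers are
//! pure and span-exact; identifiers are ASCII-only, and
//! multi-byte UTF-8 inside string literals is handled
//! byte-correctly.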
//!
//! These helpers internally mirror the logic of the legacy
//! `dsl::lexer` module but are invoked per-position by the
//! walker rather than as a pre-pass.

/// Advance past ASCII whitespace beginning at byte offset `start`.
///
/// Returns the offset of the first byte at or after `start` that is
/// not ASCII whitespace, or `source.len()` when only whitespace
/// remains. A `start` at or beyond the end of `source` is returned
/// unchanged.
pub fn skip_whitespace(source: &str, start: usize) -> usize {
    match source.as_bytes().get(start..) {
        Some(rest) => {
            let blanks = rest
                .iter()
                .take_while(|b| b.is_ascii_whitespace())
                .count();
            start + blanks
        }
        // `start` lies past the end: nothing to skip.
        None => start,
    }
}

/// Recognise an identifier beginning at byte offset `start`.
///
/// An identifier opens with an ASCII letter or `_` and continues
/// through any run of ASCII alphanumerics or `_`. On success the
/// result is `Some((start, end))`, with `end` one past the last
/// identifier byte; otherwise (including when `start` is at or past
/// the end of `source`) the result is `None`.
pub fn consume_ident(source: &str, start: usize) -> Option<(usize, usize)> {
    let rest = source.as_bytes().get(start..)?;
    let (&head, tail) = rest.split_first()?;
    if !head.is_ascii_alphabetic() && head != b'_' {
        return None;
    }
    let tail_len = tail
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'_')
        .count();
    Some((start, start + 1 + tail_len))
}

/// Case-insensitively (ASCII) match `keyword` at byte offset
/// `position`.
///
/// A match additionally requires a word boundary after the keyword:
/// the following byte must not be an ASCII alphanumeric or `_`
/// (end-of-input also counts as a boundary), so `save` is not
/// recognised inside `saved`. Returns the byte offset just past the
/// keyword on success, `None` otherwise.
pub fn match_keyword(source: &str, position: usize, keyword: &str) -> Option<usize> {
    let bytes = source.as_bytes();
    let end = position + keyword.len();
    // `get` yields `None` when the keyword would run past the input.
    let candidate = bytes.get(position..end)?;
    if !candidate.eq_ignore_ascii_case(keyword.as_bytes()) {
        return None;
    }
    match bytes.get(end) {
        // An identifier byte right after the keyword means we only
        // matched a prefix of a longer word.
        Some(&next) if next.is_ascii_alphanumeric() || next == b'_' => None,
        _ => Some(end),
    }
}

/// Bare-path token: the maximal run of non-whitespace bytes
/// beginning at `start`.
///
/// Per ADR-0024 the path-bearing UX no longer supports unquoted
/// spaces; paths containing spaces are written as `StringLit`.
/// Phase A's `import` / `export` slots use this helper.
///
/// Returns `Some((start, end))` for a non-empty run, or `None` when
/// `start` is at/past the end of `source` or sits on ASCII
/// whitespace.
pub fn consume_bare_path(source: &str, start: usize) -> Option<(usize, usize)> {
    let rest = source.as_bytes().get(start..)?;
    let run = rest
        .iter()
        .take_while(|b| !b.is_ascii_whitespace())
        .count();
    if run == 0 {
        None
    } else {
        Some((start, start + run))
    }
}

/// Match a single punctuation character at `position`.
///
/// Returns the byte offset just past `ch` when the character
/// starting at `position` is exactly `ch`, and `None` otherwise
/// (including when `position` is past the end of `source` or does
/// not fall on a UTF-8 character boundary).
///
/// The comparison is done on whole characters rather than on
/// `ch as u8`: a truncating cast would let a non-ASCII `ch` alias an
/// unrelated byte (e.g. U+0141 truncates to `b'A'`) and could return
/// an offset inside a multi-byte character.
#[allow(dead_code)]
pub fn match_punct(source: &str, position: usize, ch: char) -> Option<usize> {
    // `str::get` rejects out-of-range and non-boundary offsets.
    let rest = source.get(position..)?;
    if rest.starts_with(ch) {
        Some(position + ch.len_utf8())
    } else {
        None
    }
}

/// Number literal: an optional `-` sign (only when a digit follows
/// immediately), one or more ASCII digits, and an optional
/// fractional part made of `.` plus one or more digits.
///
/// A `.` that is not followed by a digit is left unconsumed, so
/// `1.` yields the span of `1` alone.
///
/// Mirrors `dsl::lexer::lex_number`. Used by Phase B's `add 1:n
/// relationship` form (where the literal `1` lexes as a Number)
/// and by Phase D's value-literal slots.
///
/// Returns `Some((start, end))` on a match, `None` otherwise.
pub fn consume_number_literal(source: &str, start: usize) -> Option<(usize, usize)> {
    let bytes = source.as_bytes();
    // Length of the ASCII-digit run starting at `from` (0 if none
    // or if `from` is out of range).
    let digit_run = |from: usize| -> usize {
        bytes.get(from..).map_or(0, |rest| {
            rest.iter().take_while(|b| b.is_ascii_digit()).count()
        })
    };

    let digits_at = if bytes.get(start) == Some(&b'-') {
        start + 1
    } else {
        start
    };
    let int_len = digit_run(digits_at);
    if int_len == 0 {
        // Covers empty input, a lone `-`, and non-numeric text.
        return None;
    }

    let mut end = digits_at + int_len;
    if bytes.get(end) == Some(&b'.') {
        let frac_len = digit_run(end + 1);
        if frac_len > 0 {
            end += 1 + frac_len;
        }
    }
    Some((start, end))
}

/// Flag token of the form `--name`, where `name` is a non-empty run
/// of ASCII alphanumerics, `-`, or `_`.
///
/// Returns the byte span of the whole flag (the leading `--`
/// included) on a match, and `None` when the `--` prefix is absent
/// or no name byte follows it. Checking `name` against an expected
/// value is left to the caller.
pub fn consume_flag(source: &str, start: usize) -> Option<(usize, usize)> {
    const PREFIX: &[u8] = b"--";

    let rest = source.as_bytes().get(start..)?;
    if !rest.starts_with(PREFIX) {
        return None;
    }
    let name_len = rest[PREFIX.len()..]
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_')
        .count();
    if name_len == 0 {
        None
    } else {
        Some((start, start + PREFIX.len() + name_len))
    }
}

/// Single-quoted string literal in which a doubled quote (`''`)
/// stands for one literal `'` (mirrors `dsl::lexer::lex_string`).
///
/// On success returns the `(start, end)` byte span, where `end` is
/// just past the closing quote, together with the unescaped
/// content. Returns `None` when `start` does not point at a `'` or
/// the literal has no closing quote.
#[allow(dead_code)]
pub fn consume_string_literal(source: &str, start: usize) -> Option<((usize, usize), String)> {
    if source.as_bytes().get(start) != Some(&b'\'') {
        return None;
    }
    // The opening quote is a single ASCII byte, so `start + 1` is a
    // valid char boundary to slice from.
    let body_start = start + 1;
    let mut unescaped = String::new();
    let mut chars = source[body_start..].char_indices().peekable();

    while let Some((offset, ch)) = chars.next() {
        if ch != '\'' {
            unescaped.push(ch);
            continue;
        }
        if matches!(chars.peek(), Some(&(_, '\''))) {
            // `''` escape: consume the second quote, emit one.
            chars.next();
            unescaped.push('\'');
        } else {
            // Lone quote closes the literal.
            return Some(((start, body_start + offset + 1), unescaped));
        }
    }
    // Ran out of input before a closing quote.
    None
}