ADR-0020 implementation: lexer + parser refactor over &[Token]
New `dsl::keyword` module: macro-driven Keyword and Punct enums (single source of truth — enum, lex-side mapping, catalog-key derivation generated from one declaration). New `dsl::lexer` module: tokenizer producing a span-tagged Vec<Token>. Always succeeds; lex-shape errors (unterminated string, unrecognised character, malformed flag) embed as TokenKind::Error tokens so I4 can highlight invalid input uniformly. Parser refactored from `Parser<'a, &'a str, ...>` to `Parser<'a, &'a [Token], ...>`. All 50+ existing parser unit tests ported and passing; aggregation across `choice` now works as designed (e.g. `add` → "expected `1` or `column`", `drop` → "expected `column`, `relationship`, or `table`", `frobulate Customers` lists all ten command-entry keywords). Custom `try_map` content errors (unknown type, mutually-exclusive flags, "with pk needs at least one column", "specified twice") preserved. `replay` bare-path UX kept via the source-slice special case from ADR-0020 §6 (~10 lines, documented inline). Tests: 650 passing, 0 failing, 1 ignored (610 baseline + 40 new lexer/keyword tests). Clippy clean.
This commit is contained in:
@@ -0,0 +1,287 @@
|
||||
//! Keyword and punctuation tables for the DSL lexer (ADR-0020 §2a).
//!
//! `define_keywords!` and `define_punct!` are the single source
//! of truth from which the enums, the lex-side string→variant
//! mappings, and the `parse.token.*` catalog-key derivations
//! all come. Adding a new keyword is one line in the
//! `define_keywords!` invocation plus one line in
//! `src/friendly/strings/en-US.yaml` under
//! `parse.token.keyword.<lit>` (the catalog validator catches a
//! missing entry at test time per ADR-0021 §7). Adding a new
//! punctuation kind is symmetric.
|
||||
|
||||
// Generates the `Keyword` enum and everything derived from a
// `Variant => "literal"` table: the `ALL` lookup table, the
// case-insensitive lex-side mapping, the canonical spelling, the
// catalog key, and the `Display` impl. Entries are comma-separated;
// a trailing comma is accepted.
macro_rules! define_keywords {
    ( $( $variant:ident => $literal:literal ),+ $(,)? ) => {
        /// Reserved word of the DSL; one variant per table entry.
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
        pub enum Keyword {
            $( $variant ),+
        }

        impl Keyword {
            /// Every variant paired with its canonical lowercase
            /// literal. Iteration order is the macro
            /// declaration order.
            pub const ALL: &'static [(Keyword, &'static str)] = &[
                $( (Keyword::$variant, $literal) ),+
            ];

            /// Lex-side mapping. Case-insensitive (ASCII) per
            /// ADR-0009. `None` for any input that isn't a
            /// reserved word — the lexer then keeps the input as
            /// `TokenKind::Identifier`.
            #[must_use]
            pub fn from_word(s: &str) -> Option<Self> {
                Self::ALL
                    .iter()
                    .find(|(_, lit)| s.eq_ignore_ascii_case(lit))
                    .map(|&(kw, _)| kw)
            }

            /// Canonical lowercase literal for this variant.
            ///
            /// Generated as an exhaustive `match`, so a variant
            /// without a literal is a compile error rather than a
            /// runtime panic.
            #[must_use]
            pub const fn as_str(self) -> &'static str {
                match self {
                    $( Self::$variant => $literal ),+
                }
            }

            /// Catalog key under `parse.token.keyword.*`
            /// (ADR-0021 §4). The renderer looks this up to get
            /// the user-facing wording for the keyword.
            #[must_use]
            pub fn catalog_token_key(self) -> String {
                format!("parse.token.keyword.{}", self.as_str())
            }
        }

        impl std::fmt::Display for Keyword {
            /// Writes the canonical literal (same text as `as_str`).
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                f.write_str(self.as_str())
            }
        }
    };
}
|
||||
|
||||
// The closed keyword set of the DSL. Declaration order is the
// iteration order of `Keyword::ALL`.
define_keywords! {
    // Commands (entry keywords).
    Create => "create",
    Drop => "drop",
    Add => "add",
    Rename => "rename",
    Change => "change",
    Show => "show",
    Insert => "insert",
    Update => "update",
    Delete => "delete",
    Replay => "replay",
    // Object words.
    Table => "table",
    Column => "column",
    Data => "data",
    Relationship => "relationship",
    Pk => "pk",
    // Connectives.
    With => "with",
    From => "from",
    To => "to",
    Into => "into",
    As => "as",
    In => "in",
    On => "on",
    Set => "set",
    Where => "where",
    Values => "values",
    // Value literals.
    Null => "null",
    True => "true",
    False => "false",
    // Referential-action vocabulary (ADR-0013). `set` and `null`
    // re-use the connective and value-literal keywords above —
    // `set null` is the parser's job to recognise as a sequence,
    // not the lexer's.
    Cascade => "cascade",
    Restrict => "restrict",
    Action => "action",
    No => "no",
}
|
||||
|
||||
// Generates the `Punct` enum and everything derived from a
// `Variant => ('c', "name")` table: the `ALL` lookup table, the
// char→variant lex-side mapping, the variant→char mapping, the
// catalog key, and the `Display` impl. Entries are comma-separated;
// a trailing comma is accepted.
macro_rules! define_punct {
    ( $( $variant:ident => ($literal:literal, $name:literal) ),+ $(,)? ) => {
        /// One-character punctuation of the DSL; one variant per
        /// table entry.
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
        pub enum Punct {
            $( $variant ),+
        }

        impl Punct {
            /// Every variant paired with its character and
            /// snake-case name suffix.
            pub const ALL: &'static [(Punct, char, &'static str)] = &[
                $( (Punct::$variant, $literal, $name) ),+
            ];

            /// Lex-side mapping. `None` for any character that
            /// isn't punctuation — the lexer then either
            /// classifies it as part of another token or
            /// emits an `Error(LexError::UnknownChar)`.
            #[must_use]
            pub fn from_char(c: char) -> Option<Self> {
                Self::ALL
                    .iter()
                    .find(|&&(_, lit, _)| lit == c)
                    .map(|&(p, _, _)| p)
            }

            /// The character this variant stands for.
            ///
            /// Generated as an exhaustive `match`, so there is no
            /// runtime lookup that could fail.
            #[must_use]
            pub const fn as_char(self) -> char {
                match self {
                    $( Self::$variant => $literal ),+
                }
            }

            /// Catalog key under `parse.token.punct.*`
            /// (ADR-0021 §4), built from the variant's snake-case
            /// name suffix.
            #[must_use]
            pub fn catalog_token_key(self) -> String {
                let suffix: &'static str = match self {
                    $( Self::$variant => $name ),+
                };
                format!("parse.token.punct.{suffix}")
            }
        }

        impl std::fmt::Display for Punct {
            /// Writes the punctuation character itself.
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                // `write_char` comes from the `fmt::Write` trait.
                use std::fmt::Write;
                f.write_char(self.as_char())
            }
        }
    };
}
|
||||
|
||||
// The closed punctuation set of the DSL: variant, character, and
// the snake-case suffix used for its `parse.token.punct.*` key.
define_punct! {
    Colon => (':', "colon"),
    OpenParen => ('(', "open_paren"),
    CloseParen => (')', "close_paren"),
    Comma => (',', "comma"),
    Equals => ('=', "equals"),
    Dot => ('.', "dot"),
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    #[test]
    fn keyword_from_word_round_trips_every_variant() {
        for &(kw, lit) in Keyword::ALL {
            assert_eq!(Keyword::from_word(lit), Some(kw));
            assert_eq!(kw.as_str(), lit);
        }
    }

    #[test]
    fn keyword_from_word_is_case_insensitive() {
        assert_eq!(Keyword::from_word("CREATE"), Some(Keyword::Create));
        assert_eq!(Keyword::from_word("Create"), Some(Keyword::Create));
        assert_eq!(Keyword::from_word("cReAtE"), Some(Keyword::Create));
    }

    #[test]
    fn keyword_from_word_returns_none_for_non_keyword() {
        assert_eq!(Keyword::from_word("Customers"), None);
        assert_eq!(Keyword::from_word("frobulate"), None);
        // Type-name candidates explicitly stay non-keyword
        // (ADR-0020 §2): they remain identifiers that the
        // parser validates via `Type::from_str`.
        assert_eq!(Keyword::from_word("text"), None);
        assert_eq!(Keyword::from_word("int"), None);
        assert_eq!(Keyword::from_word("varchar"), None);
    }

    #[test]
    fn keyword_literals_are_unique() {
        let mut lits: Vec<&str> = Keyword::ALL.iter().map(|(_, lit)| *lit).collect();
        lits.sort_unstable();
        let count_before = lits.len();
        lits.dedup();
        assert_eq!(lits.len(), count_before, "keyword literals must be unique");
    }

    #[test]
    fn keyword_literals_are_lowercase() {
        // `as_str` and the catalog keys promise the canonical
        // lowercase spelling; the table itself must honour that.
        for &(_, lit) in Keyword::ALL {
            assert_eq!(
                lit.to_ascii_lowercase(),
                lit,
                "keyword literal `{lit}` must be lowercase",
            );
        }
    }

    #[test]
    fn keyword_catalog_token_key_format() {
        assert_eq!(
            Keyword::Create.catalog_token_key(),
            "parse.token.keyword.create"
        );
        assert_eq!(
            Keyword::Pk.catalog_token_key(),
            "parse.token.keyword.pk"
        );
    }

    #[test]
    fn keyword_display_uses_canonical_lowercase() {
        assert_eq!(format!("{}", Keyword::Create), "create");
        assert_eq!(format!("{}", Keyword::Relationship), "relationship");
    }

    #[test]
    fn punct_round_trips_every_variant() {
        for &(p, c, _) in Punct::ALL {
            assert_eq!(Punct::from_char(c), Some(p));
            assert_eq!(p.as_char(), c);
        }
    }

    #[test]
    fn punct_from_char_returns_none_for_non_punct() {
        assert_eq!(Punct::from_char('a'), None);
        assert_eq!(Punct::from_char(' '), None);
        assert_eq!(Punct::from_char('-'), None);
        assert_eq!(Punct::from_char('\''), None);
    }

    #[test]
    fn punct_chars_are_unique() {
        let mut chars: Vec<char> = Punct::ALL.iter().map(|(_, c, _)| *c).collect();
        chars.sort_unstable();
        let count_before = chars.len();
        chars.dedup();
        assert_eq!(chars.len(), count_before, "punct chars must be unique");
    }

    #[test]
    fn punct_names_are_unique() {
        // Two variants sharing a suffix would collide on the same
        // `parse.token.punct.*` catalog key.
        let mut names: Vec<&str> = Punct::ALL.iter().map(|(_, _, n)| *n).collect();
        names.sort_unstable();
        let count_before = names.len();
        names.dedup();
        assert_eq!(names.len(), count_before, "punct names must be unique");
    }

    #[test]
    fn punct_catalog_token_key_format() {
        assert_eq!(
            Punct::Colon.catalog_token_key(),
            "parse.token.punct.colon"
        );
        assert_eq!(
            Punct::OpenParen.catalog_token_key(),
            "parse.token.punct.open_paren"
        );
    }

    #[test]
    fn every_command_entry_keyword_is_declared() {
        // Sanity: the ten command entry keywords from
        // ADR-0009/0014/0006 must all be reachable. If a future
        // ADR adds a command, this list grows alongside it.
        for cmd in [
            "create", "drop", "add", "rename", "change", "show",
            "insert", "update", "delete", "replay",
        ] {
            assert!(
                Keyword::from_word(cmd).is_some(),
                "command entry keyword `{cmd}` must be declared",
            );
        }
    }
}
|
||||
@@ -0,0 +1,598 @@
|
||||
//! DSL lexer (ADR-0020).
//!
//! Pure tokenizer: takes the source `&str` and produces a
//! `Vec<Token>` with byte-offset spans. Lex-shape errors
//! (unterminated string, unrecognised character, malformed
//! `--` flag) surface as `TokenKind::Error(_)` tokens — not a
//! `Result` variant. The parser sees `Error` tokens and raises
//! a structural error at that point; I4 (syntax highlighting,
//! future) walks the same token stream and renders Error tokens
//! with an error glyph. ADR-0020 §2 explains the rationale for
//! the in-stream error model.
|
||||
|
||||
use crate::dsl::keyword::{Keyword, Punct};
|
||||
|
||||
/// Byte-offset range `(start, end)` of a token within the lexed
/// input; `end` is exclusive (a one-byte token at `pos` has span
/// `(pos, pos + 1)`).
pub type Span = (usize, usize);
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Token {
|
||||
pub kind: TokenKind,
|
||||
pub span: Span,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum TokenKind {
|
||||
/// Reserved word recognised against the closed `Keyword`
|
||||
/// set. Case-insensitive at lex time per ADR-0009.
|
||||
Keyword(Keyword),
|
||||
/// Anything alphabetic-or-underscore-then-alphanumeric that
|
||||
/// did not match a keyword. Case is preserved per ADR-0009.
|
||||
Identifier(String),
|
||||
/// Numeric literal, raw text. The parser is responsible for
|
||||
/// any further validation (e.g. `Value::Number` storage). A
|
||||
/// leading `-` is included when present and immediately
|
||||
/// adjacent to a digit (no whitespace).
|
||||
Number(String),
|
||||
/// Single-quoted string literal, with the `''` escape
|
||||
/// processed (so `'don''t'` produces `"don't"`). The span
|
||||
/// covers the surrounding quotes; the payload does not.
|
||||
StringLiteral(String),
|
||||
/// One-character punctuation per the closed `Punct` set.
|
||||
Punct(Punct),
|
||||
/// `--name` flag. The payload is the part after `--`.
|
||||
Flag(String),
|
||||
/// Lex-time shape error. The parser surfaces this with a
|
||||
/// catalog-driven message (ADR-0021 §4
|
||||
/// `parse.token.error.*`).
|
||||
Error(LexError),
|
||||
}
|
||||
|
||||
/// Shape errors the lexer can detect. They travel in the token
/// stream as `TokenKind::Error` rather than aborting the lex.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexError {
    /// `'` opened a string literal that ran to end of input
    /// without a closing `'`. Span covers the opening quote
    /// through end-of-input.
    UnterminatedString,
    /// Character not recognised at this position. Span covers
    /// the single character (UTF-8 width respected).
    UnknownChar(char),
    /// `--` not followed by at least one flag character (ASCII
    /// letter, digit, `-`, or `_`) — e.g. a trailing `--` or
    /// `-- x`. Span covers just the two dashes. Kept as a
    /// distinct kind so the renderer can produce a sharper
    /// hint than "unknown character".
    BadFlag,
}
|
||||
|
||||
/// Tokenize an input string.
|
||||
///
|
||||
/// Always succeeds in producing a `Vec<Token>` — lex-shape
|
||||
/// errors are embedded as `TokenKind::Error` tokens. Whitespace
|
||||
/// between tokens is silently skipped (ADR-0009: liberal
|
||||
/// whitespace).
|
||||
#[must_use]
|
||||
pub fn lex(input: &str) -> Vec<Token> {
|
||||
let mut tokens = Vec::new();
|
||||
let bytes = input.as_bytes();
|
||||
let mut pos = 0;
|
||||
while pos < bytes.len() {
|
||||
let b = bytes[pos];
|
||||
if b.is_ascii_whitespace() {
|
||||
pos += 1;
|
||||
continue;
|
||||
}
|
||||
if b.is_ascii_alphabetic() || b == b'_' {
|
||||
let (tok, next) = lex_identifier(input, pos);
|
||||
tokens.push(tok);
|
||||
pos = next;
|
||||
continue;
|
||||
}
|
||||
if b.is_ascii_digit() {
|
||||
let (tok, next) = lex_number(input, pos, false);
|
||||
tokens.push(tok);
|
||||
pos = next;
|
||||
continue;
|
||||
}
|
||||
if b == b'-' {
|
||||
// `--name` flag, `-<digit>` negative-number literal,
|
||||
// or a bare `-` (UnknownChar — no Minus variant in
|
||||
// the current grammar).
|
||||
let next_b = bytes.get(pos + 1).copied();
|
||||
if next_b == Some(b'-') {
|
||||
let (tok, next) = lex_flag(input, pos);
|
||||
tokens.push(tok);
|
||||
pos = next;
|
||||
continue;
|
||||
}
|
||||
if next_b.is_some_and(|c| c.is_ascii_digit()) {
|
||||
let (tok, next) = lex_number(input, pos, true);
|
||||
tokens.push(tok);
|
||||
pos = next;
|
||||
continue;
|
||||
}
|
||||
tokens.push(Token {
|
||||
kind: TokenKind::Error(LexError::UnknownChar('-')),
|
||||
span: (pos, pos + 1),
|
||||
});
|
||||
pos += 1;
|
||||
continue;
|
||||
}
|
||||
if b == b'\'' {
|
||||
let (tok, next) = lex_string(input, pos);
|
||||
tokens.push(tok);
|
||||
pos = next;
|
||||
continue;
|
||||
}
|
||||
if let Some(p) = Punct::from_char(b as char) {
|
||||
tokens.push(Token {
|
||||
kind: TokenKind::Punct(p),
|
||||
span: (pos, pos + 1),
|
||||
});
|
||||
pos += 1;
|
||||
continue;
|
||||
}
|
||||
// Anything else: read one whole char (UTF-8 safe) and
|
||||
// emit an UnknownChar error token covering its bytes.
|
||||
let ch = input[pos..]
|
||||
.chars()
|
||||
.next()
|
||||
.expect("pos < bytes.len() ⇒ at least one char");
|
||||
let len = ch.len_utf8();
|
||||
tokens.push(Token {
|
||||
kind: TokenKind::Error(LexError::UnknownChar(ch)),
|
||||
span: (pos, pos + len),
|
||||
});
|
||||
pos += len;
|
||||
}
|
||||
tokens
|
||||
}
|
||||
|
||||
fn lex_identifier(input: &str, start: usize) -> (Token, usize) {
|
||||
let bytes = input.as_bytes();
|
||||
let mut end = start + 1; // first byte already validated by caller
|
||||
while end < bytes.len() {
|
||||
let b = bytes[end];
|
||||
if b.is_ascii_alphanumeric() || b == b'_' {
|
||||
end += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let word = &input[start..end];
|
||||
let kind = Keyword::from_word(word).map_or_else(
|
||||
|| TokenKind::Identifier(word.to_string()),
|
||||
TokenKind::Keyword,
|
||||
);
|
||||
(
|
||||
Token {
|
||||
kind,
|
||||
span: (start, end),
|
||||
},
|
||||
end,
|
||||
)
|
||||
}
|
||||
|
||||
fn lex_number(input: &str, start: usize, leading_minus: bool) -> (Token, usize) {
|
||||
let bytes = input.as_bytes();
|
||||
let mut end = start;
|
||||
if leading_minus {
|
||||
end += 1; // consume the leading '-'
|
||||
}
|
||||
while end < bytes.len() && bytes[end].is_ascii_digit() {
|
||||
end += 1;
|
||||
}
|
||||
// Optional fractional part: `.` followed by ≥1 digit. A
|
||||
// trailing `.` with no digits behind it is left alone (it
|
||||
// lexes as a separate Punct(Dot) — useful for `Customers.id`
|
||||
// when an identifier is misread as a number, though that
|
||||
// path is not currently reachable).
|
||||
if end < bytes.len() && bytes[end] == b'.' {
|
||||
let after_dot = end + 1;
|
||||
if after_dot < bytes.len() && bytes[after_dot].is_ascii_digit() {
|
||||
end = after_dot;
|
||||
while end < bytes.len() && bytes[end].is_ascii_digit() {
|
||||
end += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
(
|
||||
Token {
|
||||
kind: TokenKind::Number(input[start..end].to_string()),
|
||||
span: (start, end),
|
||||
},
|
||||
end,
|
||||
)
|
||||
}
|
||||
|
||||
fn lex_string(input: &str, start: usize) -> (Token, usize) {
|
||||
let bytes = input.as_bytes();
|
||||
debug_assert_eq!(bytes[start], b'\'');
|
||||
let mut content = String::new();
|
||||
let mut i = start + 1;
|
||||
while i < bytes.len() {
|
||||
if bytes[i] == b'\'' {
|
||||
// `''` escape: append one literal `'` and continue.
|
||||
if bytes.get(i + 1) == Some(&b'\'') {
|
||||
content.push('\'');
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
// Closing quote.
|
||||
return (
|
||||
Token {
|
||||
kind: TokenKind::StringLiteral(content),
|
||||
span: (start, i + 1),
|
||||
},
|
||||
i + 1,
|
||||
);
|
||||
}
|
||||
let ch = input[i..]
|
||||
.chars()
|
||||
.next()
|
||||
.expect("i < bytes.len() ⇒ at least one char");
|
||||
content.push(ch);
|
||||
i += ch.len_utf8();
|
||||
}
|
||||
(
|
||||
Token {
|
||||
kind: TokenKind::Error(LexError::UnterminatedString),
|
||||
span: (start, bytes.len()),
|
||||
},
|
||||
bytes.len(),
|
||||
)
|
||||
}
|
||||
|
||||
fn lex_flag(input: &str, start: usize) -> (Token, usize) {
|
||||
let bytes = input.as_bytes();
|
||||
debug_assert!(bytes[start..].starts_with(b"--"));
|
||||
let mut end = start + 2;
|
||||
while end < bytes.len() {
|
||||
let b = bytes[end];
|
||||
if b.is_ascii_alphanumeric() || b == b'-' || b == b'_' {
|
||||
end += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if end == start + 2 {
|
||||
return (
|
||||
Token {
|
||||
kind: TokenKind::Error(LexError::BadFlag),
|
||||
span: (start, end),
|
||||
},
|
||||
end,
|
||||
);
|
||||
}
|
||||
(
|
||||
Token {
|
||||
kind: TokenKind::Flag(input[start + 2..end].to_string()),
|
||||
span: (start, end),
|
||||
},
|
||||
end,
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Lexes `input` and keeps only the token kinds, dropping spans.
    fn kinds(input: &str) -> Vec<TokenKind> {
        lex(input).into_iter().map(|t| t.kind).collect()
    }

    #[test]
    fn empty_input_produces_no_tokens() {
        assert_eq!(lex(""), Vec::<Token>::new());
    }

    #[test]
    fn whitespace_only_produces_no_tokens() {
        assert_eq!(lex(" "), Vec::<Token>::new());
        assert_eq!(lex("\t\n \r"), Vec::<Token>::new());
    }

    #[test]
    fn single_keyword_lexes_to_keyword_variant() {
        assert_eq!(
            kinds("create"),
            vec![TokenKind::Keyword(Keyword::Create)],
        );
    }

    #[test]
    fn keyword_match_is_case_insensitive() {
        assert_eq!(
            kinds("CREATE"),
            vec![TokenKind::Keyword(Keyword::Create)],
        );
        assert_eq!(
            kinds("CrEaTe"),
            vec![TokenKind::Keyword(Keyword::Create)],
        );
    }

    #[test]
    fn non_keyword_word_lexes_to_identifier_preserving_case() {
        assert_eq!(
            kinds("Customers"),
            vec![TokenKind::Identifier("Customers".to_string())],
        );
        assert_eq!(
            kinds("customer_v2"),
            vec![TokenKind::Identifier("customer_v2".to_string())],
        );
        // Type names stay as identifiers (ADR-0020 §2).
        assert_eq!(
            kinds("text"),
            vec![TokenKind::Identifier("text".to_string())],
        );
        assert_eq!(
            kinds("varchar"),
            vec![TokenKind::Identifier("varchar".to_string())],
        );
    }

    #[test]
    fn identifier_starts_with_letter_or_underscore_only() {
        // A bare digit lexes as a number, not the start of an
        // identifier. The parser then rejects it where an
        // identifier was expected — this behaviour matches the
        // pre-lexer parser.
        assert_eq!(
            kinds("1Customers"),
            vec![
                TokenKind::Number("1".to_string()),
                TokenKind::Identifier("Customers".to_string()),
            ],
        );
    }

    #[test]
    fn positive_integer_lexes_as_number() {
        assert_eq!(kinds("42"), vec![TokenKind::Number("42".to_string())]);
    }

    #[test]
    fn negative_integer_lexes_with_sign_attached() {
        assert_eq!(kinds("-5"), vec![TokenKind::Number("-5".to_string())]);
    }

    #[test]
    fn fractional_number_lexes_as_one_token() {
        assert_eq!(
            kinds("3.14"),
            vec![TokenKind::Number("3.14".to_string())],
        );
        assert_eq!(
            kinds("-3.14"),
            vec![TokenKind::Number("-3.14".to_string())],
        );
    }

    #[test]
    fn trailing_dot_without_digits_does_not_attach() {
        // `1.` lexes as Number("1") then Punct(Dot). The parser
        // can decide what (if anything) that combination means.
        assert_eq!(
            kinds("1."),
            vec![
                TokenKind::Number("1".to_string()),
                TokenKind::Punct(Punct::Dot),
            ],
        );
    }

    #[test]
    fn dot_inside_qualified_name_lexes_as_punct() {
        // `Customers.id` is identifier, dot, identifier — the
        // parser composes these for `<Table>.<Col>` references.
        assert_eq!(
            kinds("Customers.id"),
            vec![
                TokenKind::Identifier("Customers".to_string()),
                TokenKind::Punct(Punct::Dot),
                TokenKind::Identifier("id".to_string()),
            ],
        );
    }

    #[test]
    fn bare_minus_lexes_as_unknown_char() {
        assert_eq!(
            kinds("-"),
            vec![TokenKind::Error(LexError::UnknownChar('-'))],
        );
    }

    #[test]
    fn string_literal_lexes_with_escape_processed() {
        assert_eq!(
            kinds("'hello'"),
            vec![TokenKind::StringLiteral("hello".to_string())],
        );
        assert_eq!(
            kinds("'don''t'"),
            vec![TokenKind::StringLiteral("don't".to_string())],
        );
    }

    #[test]
    fn empty_string_literal_lexes_to_empty_payload() {
        assert_eq!(
            kinds("''"),
            vec![TokenKind::StringLiteral(String::new())],
        );
    }

    #[test]
    fn string_literal_preserves_internal_whitespace() {
        assert_eq!(
            kinds("'a b\tc'"),
            vec![TokenKind::StringLiteral("a b\tc".to_string())],
        );
    }

    #[test]
    fn unterminated_string_emits_error_token() {
        assert_eq!(
            kinds("'oops"),
            vec![TokenKind::Error(LexError::UnterminatedString)],
        );
    }

    #[test]
    fn string_literal_with_multi_byte_unicode_is_safe() {
        let toks = lex("'café'");
        assert_eq!(toks.len(), 1);
        assert_eq!(
            toks[0].kind,
            TokenKind::StringLiteral("café".to_string()),
        );
        // Span covers all bytes including the multi-byte é.
        assert_eq!(toks[0].span, (0, "'café'".len()));
    }

    #[test]
    fn each_punct_lexes_to_its_variant() {
        for &(p, c, _) in Punct::ALL {
            assert_eq!(
                kinds(&c.to_string()),
                vec![TokenKind::Punct(p)],
                "lexing `{c}`",
            );
        }
    }

    #[test]
    fn flag_lexes_with_payload_minus_dashes() {
        assert_eq!(
            kinds("--all-rows"),
            vec![TokenKind::Flag("all-rows".to_string())],
        );
        assert_eq!(
            kinds("--create-fk"),
            vec![TokenKind::Flag("create-fk".to_string())],
        );
        assert_eq!(
            kinds("--force-conversion"),
            vec![TokenKind::Flag("force-conversion".to_string())],
        );
    }

    #[test]
    fn bare_double_dash_emits_bad_flag_error() {
        assert_eq!(kinds("--"), vec![TokenKind::Error(LexError::BadFlag)]);
    }

    #[test]
    fn unknown_character_emits_error_token() {
        assert_eq!(
            kinds("$"),
            vec![TokenKind::Error(LexError::UnknownChar('$'))],
        );
    }

    #[test]
    fn unknown_character_with_multi_byte_does_not_panic() {
        // Multi-byte character as an unknown char — span must
        // respect UTF-8 width.
        let toks = lex("✓");
        assert_eq!(toks.len(), 1);
        assert!(matches!(
            toks[0].kind,
            TokenKind::Error(LexError::UnknownChar('✓'))
        ));
        assert_eq!(toks[0].span, (0, "✓".len()));
    }

    #[test]
    fn whitespace_separates_otherwise_adjacent_tokens() {
        assert_eq!(
            kinds("create table"),
            vec![
                TokenKind::Keyword(Keyword::Create),
                TokenKind::Keyword(Keyword::Table),
            ],
        );
    }

    #[test]
    fn create_table_full_command_lexes_to_expected_sequence() {
        assert_eq!(
            kinds("create table Customers with pk id:int"),
            vec![
                TokenKind::Keyword(Keyword::Create),
                TokenKind::Keyword(Keyword::Table),
                TokenKind::Identifier("Customers".to_string()),
                TokenKind::Keyword(Keyword::With),
                TokenKind::Keyword(Keyword::Pk),
                TokenKind::Identifier("id".to_string()),
                TokenKind::Punct(Punct::Colon),
                TokenKind::Identifier("int".to_string()),
            ],
        );
    }

    #[test]
    fn one_to_n_cardinality_lexes_as_number_colon_identifier() {
        assert_eq!(
            kinds("1:n"),
            vec![
                TokenKind::Number("1".to_string()),
                TokenKind::Punct(Punct::Colon),
                TokenKind::Identifier("n".to_string()),
            ],
        );
    }

    #[test]
    fn insert_with_value_list_lexes_correctly() {
        assert_eq!(
            kinds("insert into T values (1, 'hi', null)"),
            vec![
                TokenKind::Keyword(Keyword::Insert),
                TokenKind::Keyword(Keyword::Into),
                TokenKind::Identifier("T".to_string()),
                TokenKind::Keyword(Keyword::Values),
                TokenKind::Punct(Punct::OpenParen),
                TokenKind::Number("1".to_string()),
                TokenKind::Punct(Punct::Comma),
                TokenKind::StringLiteral("hi".to_string()),
                TokenKind::Punct(Punct::Comma),
                TokenKind::Keyword(Keyword::Null),
                TokenKind::Punct(Punct::CloseParen),
            ],
        );
    }

    #[test]
    fn spans_are_byte_exact_for_simple_input() {
        let toks = lex("create table");
        assert_eq!(toks.len(), 2);
        assert_eq!(toks[0].span, (0, "create".len()));
        assert_eq!(toks[1].span, ("create ".len(), "create table".len()));
    }

    #[test]
    fn trailing_whitespace_is_stripped() {
        assert_eq!(
            kinds("create "),
            vec![TokenKind::Keyword(Keyword::Create)],
        );
    }

    #[test]
    fn error_tokens_appear_in_stream_alongside_valid_tokens() {
        // The lexer keeps producing tokens after an error; the
        // parser will reject the Error token at whatever point
        // it tries to consume it.
        assert_eq!(
            kinds("create $ table"),
            vec![
                TokenKind::Keyword(Keyword::Create),
                TokenKind::Error(LexError::UnknownChar('$')),
                TokenKind::Keyword(Keyword::Table),
            ],
        );
    }
}
|
||||
@@ -11,6 +11,8 @@
|
||||
|
||||
pub mod action;
|
||||
pub mod command;
|
||||
pub mod keyword;
|
||||
pub mod lexer;
|
||||
pub mod parser;
|
||||
pub mod shortid;
|
||||
pub mod types;
|
||||
|
||||
+466
-527
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user