ADR-0020 implementation: lexer + parser refactor over &[Token]

New `dsl::keyword` module: macro-driven Keyword and Punct enums (single source of truth — enum, lex-side mapping, catalog-key derivation generated from one declaration). New `dsl::lexer` module: tokenizer producing a span-tagged Vec<Token>. Always succeeds; lex-shape errors (unterminated string, unrecognised character, malformed flag) embed as TokenKind::Error tokens so I4 can highlight invalid input uniformly. Parser refactored from `Parser<'a, &'a str, ...>` to `Parser<'a, &'a [Token], ...>`. All 50+ existing parser unit tests ported and passing; aggregation across `choice` now works as designed (e.g. `add` → "expected `1` or `column`", `drop` → "expected `column`, `relationship`, or `table`", `frobulate Customers` lists all ten command-entry keywords). Custom `try_map` content errors (unknown type, mutually-exclusive flags, "with pk needs at least one column", "specified twice") preserved. `replay` bare-path UX kept via the source-slice special case from ADR-0020 §6 (~10 lines, documented inline). Tests: 650 passing, 0 failing, 1 ignored (610 baseline + 40 new lexer/keyword tests). Clippy clean.
2026-05-10 09:22:13 +00:00
parent 857ee753f2
commit fdaf7e3e0e
4 changed files with 1353 additions and 527 deletions
@@ -0,0 +1,287 @@
+//! Keyword and punctuation tables for the DSL lexer (ADR-0020 §2a).
+//!
+//! `define_keywords!` and `define_punct!` are the single source
+//! of truth from which the enums, the lex-side string→variant
+//! mappings, and the `parse.token.*` catalog-key derivations
+//! all come. Adding a new keyword is one line in the
+//! `define_keywords!` invocation plus one line in
+//! `src/friendly/strings/en-US.yaml` under
+//! `parse.token.keyword.<lit>` (the catalog validator catches a
+//! missing entry at test time per ADR-0021 §7). Adding a new
+//! punctuation kind is symmetric.
+
+/// Generates the `Keyword` enum and everything derived from it
+/// (the `ALL` table, the lex-side lookup, the canonical literal,
+/// the catalog key and the `Display` impl) from a single
+/// `Variant => "literal"` list.
+macro_rules! define_keywords {
+    ( $( $variant:ident => $literal:literal ),+ $(,)? ) => {
+        /// Reserved words of the DSL, one variant per entry in
+        /// the `define_keywords!` invocation.
+        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+        pub enum Keyword {
+            $( $variant ),+
+        }
+
+        impl Keyword {
+            /// Every variant paired with its canonical lowercase
+            /// literal. Iteration order is the macro
+            /// declaration order.
+            pub const ALL: &'static [(Keyword, &'static str)] = &[
+                $( (Keyword::$variant, $literal) ),+
+            ];
+
+            /// Lex-side mapping. Case-insensitive per ADR-0009.
+            /// `None` for any input that isn't a reserved word —
+            /// the lexer then keeps the input as
+            /// `TokenKind::Identifier`.
+            #[must_use]
+            pub fn from_word(s: &str) -> Option<Self> {
+                Self::ALL
+                    .iter()
+                    .find(|(_, lit)| s.eq_ignore_ascii_case(lit))
+                    .map(|(kw, _)| *kw)
+            }
+
+            /// Canonical lowercase literal for this variant.
+            ///
+            /// Generated as an exhaustive `match` from the same
+            /// table as `ALL`, so the lookup cannot miss and
+            /// needs no panic path.
+            #[must_use]
+            pub fn as_str(self) -> &'static str {
+                match self {
+                    $( Keyword::$variant => $literal ),+
+                }
+            }
+
+            /// Catalog key under `parse.token.keyword.*`
+            /// (ADR-0021 §4). The renderer looks this up to get
+            /// the user-facing wording for the keyword.
+            #[must_use]
+            pub fn catalog_token_key(self) -> String {
+                format!("parse.token.keyword.{}", self.as_str())
+            }
+        }
+
+        impl std::fmt::Display for Keyword {
+            /// Writes the canonical lowercase literal.
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                f.write_str(self.as_str())
+            }
+        }
+    };
+}
+
+define_keywords! {
+    // Commands (entry keywords).
+    Create => "create",
+    Drop => "drop",
+    Add => "add",
+    Rename => "rename",
+    Change => "change",
+    Show => "show",
+    Insert => "insert",
+    Update => "update",
+    Delete => "delete",
+    Replay => "replay",
+    // Object words.
+    Table => "table",
+    Column => "column",
+    Data => "data",
+    Relationship => "relationship",
+    Pk => "pk",
+    // Connectives.
+    With => "with",
+    From => "from",
+    To => "to",
+    Into => "into",
+    As => "as",
+    In => "in",
+    On => "on",
+    Set => "set",
+    Where => "where",
+    Values => "values",
+    // Value literals.
+    Null => "null",
+    True => "true",
+    False => "false",
+    // Referential-action vocabulary (ADR-0013). `set` and `null`
+    // re-use the connective and value-literal keywords above —
+    // `set null` is the parser's job to recognise as a sequence,
+    // not the lexer's.
+    Cascade => "cascade",
+    Restrict => "restrict",
+    Action => "action",
+    No => "no",
+}
+
+/// Generates the `Punct` enum and everything derived from it
+/// (the `ALL` table, the lex-side lookup, the character, the
+/// catalog key and the `Display` impl) from a single
+/// `Variant => ('c', "name")` list.
+macro_rules! define_punct {
+    ( $( $variant:ident => ($literal:literal, $name:literal) ),+ $(,)? ) => {
+        /// Single-character punctuation of the DSL, one variant
+        /// per entry in the `define_punct!` invocation.
+        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+        pub enum Punct {
+            $( $variant ),+
+        }
+
+        impl Punct {
+            /// Every variant paired with its character and
+            /// snake-case name suffix.
+            pub const ALL: &'static [(Punct, char, &'static str)] = &[
+                $( (Punct::$variant, $literal, $name) ),+
+            ];
+
+            /// Lex-side mapping. `None` for any character that
+            /// isn't punctuation — the lexer then either
+            /// classifies it as part of another token or
+            /// emits an `Error(LexError::UnknownChar)`.
+            #[must_use]
+            pub fn from_char(c: char) -> Option<Self> {
+                Self::ALL
+                    .iter()
+                    .find(|(_, lit, _)| *lit == c)
+                    .map(|(p, _, _)| *p)
+            }
+
+            /// The character this variant lexes from.
+            ///
+            /// Generated as an exhaustive `match` from the same
+            /// table as `ALL`, so the lookup cannot miss and
+            /// needs no panic path.
+            #[must_use]
+            pub fn as_char(self) -> char {
+                match self {
+                    $( Punct::$variant => $literal ),+
+                }
+            }
+
+            /// Catalog key under `parse.token.punct.*`
+            /// (ADR-0021 §4).
+            #[must_use]
+            pub fn catalog_token_key(self) -> String {
+                let suffix: &'static str = match self {
+                    $( Punct::$variant => $name ),+
+                };
+                format!("parse.token.punct.{suffix}")
+            }
+        }
+
+        impl std::fmt::Display for Punct {
+            /// Writes the single punctuation character.
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                use std::fmt::Write;
+                f.write_char(self.as_char())
+            }
+        }
+    };
+}
+
+define_punct! {
+    Colon => (':', "colon"),
+    OpenParen => ('(', "open_paren"),
+    CloseParen => (')', "close_paren"),
+    Comma => (',', "comma"),
+    Equals => ('=', "equals"),
+    Dot => ('.', "dot"),
+}
+
+/// Unit tests for the generated `Keyword` and `Punct` tables.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use pretty_assertions::assert_eq;
+
+    #[test]
+    fn keyword_from_word_round_trips_every_variant() {
+        for &(kw, lit) in Keyword::ALL {
+            assert_eq!(Keyword::from_word(lit), Some(kw));
+            assert_eq!(kw.as_str(), lit);
+        }
+    }
+
+    #[test]
+    fn keyword_from_word_is_case_insensitive() {
+        assert_eq!(Keyword::from_word("CREATE"), Some(Keyword::Create));
+        assert_eq!(Keyword::from_word("Create"), Some(Keyword::Create));
+        assert_eq!(Keyword::from_word("cReAtE"), Some(Keyword::Create));
+    }
+
+    #[test]
+    fn keyword_from_word_returns_none_for_non_keyword() {
+        assert_eq!(Keyword::from_word("Customers"), None);
+        assert_eq!(Keyword::from_word("frobulate"), None);
+        // Type-name candidates explicitly stay non-keyword
+        // (ADR-0020 §2): they remain identifiers that the
+        // parser validates via `Type::from_str`.
+        assert_eq!(Keyword::from_word("text"), None);
+        assert_eq!(Keyword::from_word("int"), None);
+        assert_eq!(Keyword::from_word("varchar"), None);
+    }
+
+    #[test]
+    fn keyword_literals_are_unique() {
+        // `from_word` matches case-insensitively, so two literals
+        // that differ only by case would shadow each other.
+        // Compare on the ASCII-lowercased form to catch that too.
+        let mut lits: Vec<String> = Keyword::ALL
+            .iter()
+            .map(|(_, lit)| lit.to_ascii_lowercase())
+            .collect();
+        lits.sort_unstable();
+        let count_before = lits.len();
+        lits.dedup();
+        assert_eq!(lits.len(), count_before, "keyword literals must be unique");
+    }
+
+    #[test]
+    fn keyword_literals_are_canonical_lowercase() {
+        // `as_str`, `Display` and `catalog_token_key` all promise
+        // the lowercase form; enforce it on the table itself.
+        for &(kw, lit) in Keyword::ALL {
+            assert!(!lit.is_empty(), "literal for {kw:?} must not be empty");
+            assert!(
+                !lit.bytes().any(|b| b.is_ascii_uppercase()),
+                "literal for {kw:?} must be lowercase, got `{lit}`",
+            );
+        }
+    }
+
+    #[test]
+    fn keyword_catalog_token_key_format() {
+        assert_eq!(
+            Keyword::Create.catalog_token_key(),
+            "parse.token.keyword.create"
+        );
+        assert_eq!(
+            Keyword::Pk.catalog_token_key(),
+            "parse.token.keyword.pk"
+        );
+    }
+
+    #[test]
+    fn keyword_display_uses_canonical_lowercase() {
+        assert_eq!(format!("{}", Keyword::Create), "create");
+        assert_eq!(format!("{}", Keyword::Relationship), "relationship");
+    }
+
+    #[test]
+    fn punct_round_trips_every_variant() {
+        for &(p, c, _) in Punct::ALL {
+            assert_eq!(Punct::from_char(c), Some(p));
+            assert_eq!(p.as_char(), c);
+        }
+    }
+
+    #[test]
+    fn punct_from_char_returns_none_for_non_punct() {
+        assert_eq!(Punct::from_char('a'), None);
+        assert_eq!(Punct::from_char(' '), None);
+        assert_eq!(Punct::from_char('-'), None);
+        assert_eq!(Punct::from_char('\''), None);
+    }
+
+    #[test]
+    fn punct_chars_are_unique() {
+        let mut chars: Vec<char> = Punct::ALL.iter().map(|(_, c, _)| *c).collect();
+        chars.sort_unstable();
+        let count_before = chars.len();
+        chars.dedup();
+        assert_eq!(chars.len(), count_before, "punct chars must be unique");
+    }
+
+    #[test]
+    fn punct_names_are_unique() {
+        // The name suffix becomes the catalog key, so a duplicate
+        // would make two punctuation kinds share one catalog entry.
+        let mut names: Vec<&str> = Punct::ALL.iter().map(|(_, _, n)| *n).collect();
+        names.sort_unstable();
+        let count_before = names.len();
+        names.dedup();
+        assert_eq!(names.len(), count_before, "punct names must be unique");
+    }
+
+    #[test]
+    fn punct_catalog_token_key_format() {
+        assert_eq!(
+            Punct::Colon.catalog_token_key(),
+            "parse.token.punct.colon"
+        );
+        assert_eq!(
+            Punct::OpenParen.catalog_token_key(),
+            "parse.token.punct.open_paren"
+        );
+    }
+
+    #[test]
+    fn every_command_entry_keyword_is_declared() {
+        // Sanity: the ten command entry keywords from
+        // ADR-0009/0014/0006 must all be reachable. If a future
+        // ADR adds a command, this list grows alongside it.
+        for cmd in [
+            "create", "drop", "add", "rename", "change", "show",
+            "insert", "update", "delete", "replay",
+        ] {
+            assert!(
+                Keyword::from_word(cmd).is_some(),
+                "command entry keyword `{cmd}` must be declared",
+            );
+        }
+    }
+}
@@ -0,0 +1,598 @@
+//! DSL lexer (ADR-0020).
+//!
+//! Pure tokenizer: takes the source `&str` and produces a
+//! `Vec<Token>` with byte-offset spans. Lex-shape errors
+//! (unterminated string, unrecognised character, malformed
+//! `--` flag) surface as `TokenKind::Error(_)` tokens — not a
+//! `Result` variant. The parser sees `Error` tokens and raises
+//! a structural error at that point; I4 (syntax highlighting,
+//! future) walks the same token stream and renders Error tokens
+//! with an error glyph. ADR-0020 §2 explains the rationale for
+//! the in-stream error model.
+
+use crate::dsl::keyword::{Keyword, Punct};
+
+pub type Span = (usize, usize);
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub span: Span,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum TokenKind {
+    /// Reserved word recognised against the closed `Keyword`
+    /// set. Case-insensitive at lex time per ADR-0009.
+    Keyword(Keyword),
+    /// An ASCII letter or `_`, then ASCII letters/digits/`_`, that
+    /// did not match a keyword. Case is preserved per ADR-0009.
+    Identifier(String),
+    /// Numeric literal, raw text. The parser is responsible for
+    /// any further validation (e.g. `Value::Number` storage). A
+    /// leading `-` is included when present and immediately
+    /// adjacent to a digit (no whitespace).
+    Number(String),
+    /// Single-quoted string literal, with the `''` escape
+    /// processed (so `'don''t'` produces `"don't"`). The span
+    /// covers the surrounding quotes; the payload does not.
+    StringLiteral(String),
+    /// One-character punctuation per the closed `Punct` set.
+    Punct(Punct),
+    /// `--name` flag. The payload is the part after `--`.
+    Flag(String),
+    /// Lex-time shape error. The parser surfaces this with a
+    /// catalog-driven message (ADR-0021 §4
+    /// `parse.token.error.*`).
+    Error(LexError),
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum LexError {
+    /// `'` opened a string literal that ran to end of input
+    /// without a closing `'`. Span covers the opening quote
+    /// through end-of-input.
+    UnterminatedString,
+    /// Character not recognised at this position. Span covers
+    /// the single character (UTF-8 width respected).
+    UnknownChar(char),
+    /// `--` with no flag name directly after it (for example
+    /// `--` at end of input or followed by whitespace). Kept as
+    /// a distinct kind so the renderer can produce a sharper
+    /// hint than "unknown character".
+    BadFlag,
+}
+
+/// Tokenize an input string.
+///
+/// Always succeeds in producing a `Vec<Token>` — lex-shape
+/// errors are embedded as `TokenKind::Error` tokens. ASCII
+/// whitespace between tokens is silently skipped (ADR-0009:
+/// liberal whitespace).
+///
+/// Dispatch is on the first byte of each token. Every branch
+/// advances `pos` by whole characters, so `pos` always sits on a
+/// UTF-8 boundary and every span can be used to slice `input`.
+#[must_use]
+pub fn lex(input: &str) -> Vec<Token> {
+    let bytes = input.as_bytes();
+    let mut tokens = Vec::new();
+    let mut pos = 0;
+    while let Some(&b) = bytes.get(pos) {
+        if b.is_ascii_whitespace() {
+            pos += 1;
+            continue;
+        }
+        let (tok, next) = match b {
+            b'a'..=b'z' | b'A'..=b'Z' | b'_' => lex_identifier(input, pos),
+            b'0'..=b'9' => lex_number(input, pos, false),
+            // `--name` flag, `-<digit>` negative-number literal,
+            // or a bare `-` (UnknownChar — no Minus variant in
+            // the current grammar).
+            b'-' => match bytes.get(pos + 1).copied() {
+                Some(b'-') => lex_flag(input, pos),
+                Some(d) if d.is_ascii_digit() => lex_number(input, pos, true),
+                _ => (
+                    Token {
+                        kind: TokenKind::Error(LexError::UnknownChar('-')),
+                        span: (pos, pos + 1),
+                    },
+                    pos + 1,
+                ),
+            },
+            b'\'' => lex_string(input, pos),
+            _ => {
+                // Decode the whole character before classifying
+                // it. Casting the raw byte to `char` would treat
+                // a UTF-8 lead byte as a Latin-1 character, which
+                // is only harmless while every `Punct` is ASCII.
+                let ch = input[pos..]
+                    .chars()
+                    .next()
+                    .expect("pos < bytes.len() ⇒ at least one char");
+                let end = pos + ch.len_utf8();
+                let kind = Punct::from_char(ch).map_or(
+                    TokenKind::Error(LexError::UnknownChar(ch)),
+                    TokenKind::Punct,
+                );
+                (
+                    Token {
+                        kind,
+                        span: (pos, end),
+                    },
+                    end,
+                )
+            }
+        };
+        tokens.push(tok);
+        pos = next;
+    }
+    tokens
+}
+
+/// Lexes the word starting at byte `start` and classifies it as
+/// a keyword or an identifier. Returns the token and the byte
+/// offset just past it.
+fn lex_identifier(input: &str, start: usize) -> (Token, usize) {
+    // The caller has already validated the byte at `start`, so
+    // only the tail after it needs scanning.
+    let tail_len = input.as_bytes()[start + 1..]
+        .iter()
+        .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'_')
+        .count();
+    let end = start + 1 + tail_len;
+    let word = &input[start..end];
+    let kind = match Keyword::from_word(word) {
+        Some(keyword) => TokenKind::Keyword(keyword),
+        None => TokenKind::Identifier(word.to_string()),
+    };
+    (
+        Token {
+            kind,
+            span: (start, end),
+        },
+        end,
+    )
+}
+
+/// Lexes a numeric literal starting at byte `start`: an optional
+/// leading `-` (when `leading_minus` is set), a run of ASCII
+/// digits, and an optional `.digits` fraction. Returns the token
+/// (payload is the raw source text) and the byte offset just
+/// past it.
+fn lex_number(input: &str, start: usize, leading_minus: bool) -> (Token, usize) {
+    let bytes = input.as_bytes();
+    // Offset of the first non-digit byte at or after `from`.
+    let skip_digits = |from: usize| {
+        from + bytes[from..]
+            .iter()
+            .take_while(|b| b.is_ascii_digit())
+            .count()
+    };
+
+    let digits_start = if leading_minus { start + 1 } else { start };
+    let mut end = skip_digits(digits_start);
+
+    // A `.` joins the number only when a digit follows it.
+    // Otherwise the number stops before the dot, which then
+    // lexes on its own as `Punct(Dot)`.
+    let has_fraction =
+        bytes.get(end) == Some(&b'.') && bytes.get(end + 1).is_some_and(u8::is_ascii_digit);
+    if has_fraction {
+        end = skip_digits(end + 1);
+    }
+
+    (
+        Token {
+            kind: TokenKind::Number(input[start..end].to_string()),
+            span: (start, end),
+        },
+        end,
+    )
+}
+
+/// Lexes a single-quoted string literal whose opening quote is
+/// at byte `start`. A doubled quote (`''`) inside the literal
+/// stands for one `'`. Returns a `StringLiteral` spanning both
+/// quotes, or an `UnterminatedString` error spanning to end of
+/// input when no closing quote is found, plus the byte offset
+/// just past the token.
+fn lex_string(input: &str, start: usize) -> (Token, usize) {
+    debug_assert_eq!(input.as_bytes()[start], b'\'');
+    let body_start = start + 1;
+    let mut content = String::new();
+    let mut chars = input[body_start..].char_indices().peekable();
+    while let Some((offset, ch)) = chars.next() {
+        if ch != '\'' {
+            content.push(ch);
+            continue;
+        }
+        // A quote directly followed by another quote is the `''`
+        // escape: consume both and keep one.
+        if chars.next_if(|&(_, following)| following == '\'').is_some() {
+            content.push('\'');
+            continue;
+        }
+        // Lone quote: the literal ends here.
+        let end = body_start + offset + 1;
+        return (
+            Token {
+                kind: TokenKind::StringLiteral(content),
+                span: (start, end),
+            },
+            end,
+        );
+    }
+    let end = input.len();
+    (
+        Token {
+            kind: TokenKind::Error(LexError::UnterminatedString),
+            span: (start, end),
+        },
+        end,
+    )
+}
+
+/// Lexes a `--name` flag whose first dash is at byte `start`.
+///
+/// The token covers `--` plus the following run of ASCII
+/// letters, digits, `-` and `_`. It is a `Flag` (payload: the
+/// text after `--`) only when that run starts with a letter or
+/// `_`. An empty run (`--` alone) or one starting with a digit
+/// or dash (`--5`, `---`) is an `Error(LexError::BadFlag)` over
+/// the same span, so malformed flags never reach the parser as
+/// a flag named `-` or `5`. Returns the token and the byte
+/// offset just past it.
+fn lex_flag(input: &str, start: usize) -> (Token, usize) {
+    let bytes = input.as_bytes();
+    debug_assert!(bytes[start..].starts_with(b"--"));
+    let name_start = start + 2;
+    let name_len = bytes[name_start..]
+        .iter()
+        .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_')
+        .count();
+    let end = name_start + name_len;
+    // Identifier-shaped: the first byte after `--` must be able
+    // to start an identifier.
+    let well_formed = bytes
+        .get(name_start)
+        .is_some_and(|&b| b.is_ascii_alphabetic() || b == b'_');
+    let kind = if well_formed {
+        TokenKind::Flag(input[name_start..end].to_string())
+    } else {
+        TokenKind::Error(LexError::BadFlag)
+    };
+    (
+        Token {
+            kind,
+            span: (start, end),
+        },
+        end,
+    )
+}
+
+/// Unit tests for the tokenizer: token kinds, byte spans and
+/// in-stream error tokens.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use pretty_assertions::assert_eq;
+
+    /// Lexes `input` and keeps only the token kinds.
+    fn kinds(input: &str) -> Vec<TokenKind> {
+        lex(input).into_iter().map(|t| t.kind).collect()
+    }
+
+    #[test]
+    fn empty_input_produces_no_tokens() {
+        assert_eq!(lex(""), Vec::<Token>::new());
+    }
+
+    #[test]
+    fn whitespace_only_produces_no_tokens() {
+        assert_eq!(lex("   "), Vec::<Token>::new());
+        assert_eq!(lex("\t\n  \r"), Vec::<Token>::new());
+    }
+
+    #[test]
+    fn single_keyword_lexes_to_keyword_variant() {
+        assert_eq!(
+            kinds("create"),
+            vec![TokenKind::Keyword(Keyword::Create)],
+        );
+    }
+
+    #[test]
+    fn keyword_match_is_case_insensitive() {
+        assert_eq!(
+            kinds("CREATE"),
+            vec![TokenKind::Keyword(Keyword::Create)],
+        );
+        assert_eq!(
+            kinds("CrEaTe"),
+            vec![TokenKind::Keyword(Keyword::Create)],
+        );
+    }
+
+    #[test]
+    fn non_keyword_word_lexes_to_identifier_preserving_case() {
+        assert_eq!(
+            kinds("Customers"),
+            vec![TokenKind::Identifier("Customers".to_string())],
+        );
+        assert_eq!(
+            kinds("customer_v2"),
+            vec![TokenKind::Identifier("customer_v2".to_string())],
+        );
+        // Type names stay as identifiers (ADR-0020 §2).
+        assert_eq!(
+            kinds("text"),
+            vec![TokenKind::Identifier("text".to_string())],
+        );
+        assert_eq!(
+            kinds("varchar"),
+            vec![TokenKind::Identifier("varchar".to_string())],
+        );
+    }
+
+    #[test]
+    fn identifier_starts_with_letter_or_underscore_only() {
+        // A bare digit lexes as a number, not the start of an
+        // identifier. The parser then rejects it where an
+        // identifier was expected — this behaviour matches the
+        // pre-lexer parser.
+        assert_eq!(
+            kinds("1Customers"),
+            vec![
+                TokenKind::Number("1".to_string()),
+                TokenKind::Identifier("Customers".to_string()),
+            ],
+        );
+    }
+
+    #[test]
+    fn positive_integer_lexes_as_number() {
+        assert_eq!(kinds("42"), vec![TokenKind::Number("42".to_string())]);
+    }
+
+    #[test]
+    fn negative_integer_lexes_with_sign_attached() {
+        assert_eq!(kinds("-5"), vec![TokenKind::Number("-5".to_string())]);
+    }
+
+    #[test]
+    fn fractional_number_lexes_as_one_token() {
+        assert_eq!(
+            kinds("3.14"),
+            vec![TokenKind::Number("3.14".to_string())],
+        );
+        assert_eq!(
+            kinds("-3.14"),
+            vec![TokenKind::Number("-3.14".to_string())],
+        );
+    }
+
+    #[test]
+    fn trailing_dot_without_digits_does_not_attach() {
+        // `1.` lexes as Number("1") then Punct(Dot). The parser
+        // can decide what (if anything) that combination means.
+        assert_eq!(
+            kinds("1."),
+            vec![
+                TokenKind::Number("1".to_string()),
+                TokenKind::Punct(Punct::Dot),
+            ],
+        );
+    }
+
+    #[test]
+    fn dot_inside_qualified_name_lexes_as_punct() {
+        // `Customers.id` is identifier, dot, identifier — the
+        // parser composes these for `<Table>.<Col>` references.
+        assert_eq!(
+            kinds("Customers.id"),
+            vec![
+                TokenKind::Identifier("Customers".to_string()),
+                TokenKind::Punct(Punct::Dot),
+                TokenKind::Identifier("id".to_string()),
+            ],
+        );
+    }
+
+    #[test]
+    fn bare_minus_lexes_as_unknown_char() {
+        assert_eq!(
+            kinds("-"),
+            vec![TokenKind::Error(LexError::UnknownChar('-'))],
+        );
+    }
+
+    #[test]
+    fn string_literal_lexes_with_escape_processed() {
+        assert_eq!(
+            kinds("'hello'"),
+            vec![TokenKind::StringLiteral("hello".to_string())],
+        );
+        assert_eq!(
+            kinds("'don''t'"),
+            vec![TokenKind::StringLiteral("don't".to_string())],
+        );
+    }
+
+    #[test]
+    fn empty_string_literal_lexes_to_empty_payload() {
+        assert_eq!(
+            kinds("''"),
+            vec![TokenKind::StringLiteral(String::new())],
+        );
+    }
+
+    #[test]
+    fn string_literal_preserves_internal_whitespace() {
+        assert_eq!(
+            kinds("'a b\tc'"),
+            vec![TokenKind::StringLiteral("a b\tc".to_string())],
+        );
+    }
+
+    #[test]
+    fn unterminated_string_emits_error_token() {
+        assert_eq!(
+            kinds("'oops"),
+            vec![TokenKind::Error(LexError::UnterminatedString)],
+        );
+    }
+
+    #[test]
+    fn string_literal_with_multi_byte_unicode_is_safe() {
+        let toks = lex("'café'");
+        assert_eq!(toks.len(), 1);
+        assert_eq!(
+            toks[0].kind,
+            TokenKind::StringLiteral("café".to_string()),
+        );
+        // Span covers all bytes including the multi-byte é.
+        assert_eq!(toks[0].span, (0, "'café'".len()));
+    }
+
+    #[test]
+    fn each_punct_lexes_to_its_variant() {
+        for &(p, c, _) in Punct::ALL {
+            assert_eq!(
+                kinds(&c.to_string()),
+                vec![TokenKind::Punct(p)],
+                "lexing `{c}`",
+            );
+        }
+    }
+
+    #[test]
+    fn flag_lexes_with_payload_minus_dashes() {
+        assert_eq!(
+            kinds("--all-rows"),
+            vec![TokenKind::Flag("all-rows".to_string())],
+        );
+        assert_eq!(
+            kinds("--create-fk"),
+            vec![TokenKind::Flag("create-fk".to_string())],
+        );
+        assert_eq!(
+            kinds("--force-conversion"),
+            vec![TokenKind::Flag("force-conversion".to_string())],
+        );
+    }
+
+    #[test]
+    fn bare_double_dash_emits_bad_flag_error() {
+        assert_eq!(kinds("--"), vec![TokenKind::Error(LexError::BadFlag)]);
+    }
+
+    #[test]
+    fn unknown_character_emits_error_token() {
+        assert_eq!(
+            kinds("$"),
+            vec![TokenKind::Error(LexError::UnknownChar('$'))],
+        );
+    }
+
+    #[test]
+    fn unknown_character_with_multi_byte_does_not_panic() {
+        // Non-ASCII symbol (`✓` is three bytes in UTF-8) as an
+        // unknown char — span must respect UTF-8 width.
+        let toks = lex("✓");
+        assert_eq!(toks.len(), 1);
+        assert!(matches!(
+            toks[0].kind,
+            TokenKind::Error(LexError::UnknownChar('✓'))
+        ));
+        assert_eq!(toks[0].span, (0, "✓".len()));
+    }
+
+    #[test]
+    fn whitespace_separates_otherwise_adjacent_tokens() {
+        assert_eq!(
+            kinds("create table"),
+            vec![
+                TokenKind::Keyword(Keyword::Create),
+                TokenKind::Keyword(Keyword::Table),
+            ],
+        );
+    }
+
+    #[test]
+    fn create_table_full_command_lexes_to_expected_sequence() {
+        assert_eq!(
+            kinds("create table Customers with pk id:int"),
+            vec![
+                TokenKind::Keyword(Keyword::Create),
+                TokenKind::Keyword(Keyword::Table),
+                TokenKind::Identifier("Customers".to_string()),
+                TokenKind::Keyword(Keyword::With),
+                TokenKind::Keyword(Keyword::Pk),
+                TokenKind::Identifier("id".to_string()),
+                TokenKind::Punct(Punct::Colon),
+                TokenKind::Identifier("int".to_string()),
+            ],
+        );
+    }
+
+    #[test]
+    fn one_to_n_cardinality_lexes_as_number_colon_identifier() {
+        assert_eq!(
+            kinds("1:n"),
+            vec![
+                TokenKind::Number("1".to_string()),
+                TokenKind::Punct(Punct::Colon),
+                TokenKind::Identifier("n".to_string()),
+            ],
+        );
+    }
+
+    #[test]
+    fn insert_with_value_list_lexes_correctly() {
+        assert_eq!(
+            kinds("insert into T values (1, 'hi', null)"),
+            vec![
+                TokenKind::Keyword(Keyword::Insert),
+                TokenKind::Keyword(Keyword::Into),
+                TokenKind::Identifier("T".to_string()),
+                TokenKind::Keyword(Keyword::Values),
+                TokenKind::Punct(Punct::OpenParen),
+                TokenKind::Number("1".to_string()),
+                TokenKind::Punct(Punct::Comma),
+                TokenKind::StringLiteral("hi".to_string()),
+                TokenKind::Punct(Punct::Comma),
+                TokenKind::Keyword(Keyword::Null),
+                TokenKind::Punct(Punct::CloseParen),
+            ],
+        );
+    }
+
+    #[test]
+    fn spans_are_byte_exact_for_simple_input() {
+        let toks = lex("create  table");
+        assert_eq!(toks.len(), 2);
+        assert_eq!(toks[0].span, (0, "create".len()));
+        assert_eq!(toks[1].span, ("create  ".len(), "create  table".len()));
+    }
+
+    #[test]
+    fn trailing_whitespace_is_stripped() {
+        assert_eq!(
+            kinds("create  "),
+            vec![TokenKind::Keyword(Keyword::Create)],
+        );
+    }
+
+    #[test]
+    fn error_tokens_appear_in_stream_alongside_valid_tokens() {
+        // The lexer keeps producing tokens after an error; the
+        // parser will reject the Error token at whatever point
+        // it tries to consume it.
+        assert_eq!(
+            kinds("create $ table"),
+            vec![
+                TokenKind::Keyword(Keyword::Create),
+                TokenKind::Error(LexError::UnknownChar('$')),
+                TokenKind::Keyword(Keyword::Table),
+            ],
+        );
+    }
+
+    #[test]
+    fn adjacent_tokens_need_no_whitespace() {
+        assert_eq!(
+            kinds("(1,'a')"),
+            vec![
+                TokenKind::Punct(Punct::OpenParen),
+                TokenKind::Number("1".to_string()),
+                TokenKind::Punct(Punct::Comma),
+                TokenKind::StringLiteral("a".to_string()),
+                TokenKind::Punct(Punct::CloseParen),
+            ],
+        );
+    }
+
+    #[test]
+    fn minus_before_non_digit_is_unknown_char() {
+        // Only `-<digit>` attaches the sign; `-x` is a stray
+        // minus followed by an ordinary identifier.
+        assert_eq!(
+            kinds("-x"),
+            vec![
+                TokenKind::Error(LexError::UnknownChar('-')),
+                TokenKind::Identifier("x".to_string()),
+            ],
+        );
+    }
+
+    #[test]
+    fn escaped_quote_directly_before_closing_quote() {
+        // `'a'''` is `a`, the `''` escape, then the closing quote.
+        let toks = lex("'a'''");
+        assert_eq!(toks.len(), 1);
+        assert_eq!(toks[0].kind, TokenKind::StringLiteral("a'".to_string()));
+        assert_eq!(toks[0].span, (0, "'a'''".len()));
+    }
+
+    #[test]
+    fn unterminated_string_span_runs_to_end_of_input() {
+        let toks = lex("x 'oops");
+        assert_eq!(toks.len(), 2);
+        assert_eq!(toks[1].kind, TokenKind::Error(LexError::UnterminatedString));
+        assert_eq!(toks[1].span, ("x ".len(), "x 'oops".len()));
+    }
+
+    #[test]
+    fn flag_span_includes_leading_dashes() {
+        let toks = lex("--all-rows");
+        assert_eq!(toks.len(), 1);
+        assert_eq!(toks[0].span, (0, "--all-rows".len()));
+    }
+
+    #[test]
+    fn double_dash_before_whitespace_emits_bad_flag() {
+        // BadFlag is not limited to a trailing `--`: any `--`
+        // with no name attached produces it, and lexing resumes
+        // right after the two dashes.
+        let toks = lex("-- x");
+        assert_eq!(toks.len(), 2);
+        assert_eq!(toks[0].kind, TokenKind::Error(LexError::BadFlag));
+        assert_eq!(toks[0].span, (0, 2));
+        assert_eq!(toks[1].kind, TokenKind::Identifier("x".to_string()));
+    }
+}
@@ -11,6 +11,8 @@

 pub mod action;
 pub mod command;
+pub mod keyword;
+pub mod lexer;
 pub mod parser;
 pub mod shortid;
 pub mod types;