//! Walker-driven highlighting (ADR-0024 §migration Phase F). //! //! `highlight_runs(source)` returns the per-byte highlight class //! assignments for every token shape in `source`. It is the //! single entry point that consumers (input panel, echo lines) //! should use to colour DSL input — there is no separate lexer //! pre-pass. //! //! Strategy: //! //! - Try the walker first. Whatever it consumed end-to-end (entry //! word + matching nodes) contributes `WalkResult::per_byte_class`. //! - For any bytes the walker did not cover — input the walker //! doesn't engage on at all (no registered entry word), trailing //! junk after a partial match, or content past a structural //! failure — fall back to a byte-shape scanner that classifies //! each consumed token by its shape using the same `lex_helpers` //! primitives the walker uses internally. //! //! The two streams are returned in source-byte order; whitespace //! gaps are not represented (the renderer fills them with the //! default foreground colour). use crate::dsl::grammar::HighlightClass; use crate::dsl::walker::context::WalkContext; use crate::dsl::walker::lex_helpers::{ consume_bare_path, consume_flag, consume_ident, consume_number_literal, consume_string_literal, skip_whitespace, }; use crate::dsl::walker::outcome::{ByteClass, WalkBound}; /// Produce the per-byte highlight classes for `source`. /// /// Defaults to `Mode::Simple`. Callers in advanced-mode UIs /// should use [`highlight_runs_in_mode`] so SQL keywords get /// matched and highlighted past the entry word (the simple-mode /// gate at the dispatcher truncates the walker on advanced-only /// commands, ADR-0030 §2). #[must_use] pub fn highlight_runs(source: &str) -> Vec { highlight_runs_in_mode(source, crate::mode::Mode::Simple) } /// Mode-aware [`highlight_runs`] (ADR-0032 §10.6 follow-up). /// /// In `Mode::Advanced` the walker matches every Phase-2 SQL /// token, producing the keyword classes the renderer needs to /// colour `select` / `from` / `where` / `union` / `case` / etc. #[must_use] pub fn highlight_runs_in_mode( source: &str, mode: crate::mode::Mode, ) -> Vec { let mut ctx = WalkContext::new(); ctx.mode = mode; let (result, _cmd) = super::walk(source, WalkBound::EndOfInput, &mut ctx); let mut classes: Vec = result .map(|r| r.per_byte_class) .unwrap_or_default(); let scan_start = classes.last().map_or(0, |c| c.end); scan_remainder(source, scan_start, &mut classes); classes } /// Byte-shape scan from `start` to end of source, appending each /// classified token to `classes`. Whitespace gaps are skipped. fn scan_remainder(source: &str, start: usize, classes: &mut Vec) { let bytes = source.as_bytes(); let mut pos = start; while pos < bytes.len() { pos = skip_whitespace(source, pos); if pos >= bytes.len() { break; } let b = bytes[pos]; // Identifier first — covers keywords-by-shape, since at // the highlight layer we no longer distinguish keyword from // identifier without a successful walker match. if (b.is_ascii_alphabetic() || b == b'_') && let Some((s, e)) = consume_ident(source, pos) { classes.push(ByteClass { start: s, end: e, class: HighlightClass::Identifier, }); pos = e; continue; } if b == b'\'' { // Quoted string. Unterminated → mark the rest as Error // so the user sees the unclosed run highlighted. if let Some(((s, e), _)) = consume_string_literal(source, pos) { classes.push(ByteClass { start: s, end: e, class: HighlightClass::String, }); pos = e; } else { classes.push(ByteClass { start: pos, end: bytes.len(), class: HighlightClass::Error, }); pos = bytes.len(); } continue; } if b == b'-' && bytes.get(pos + 1) == Some(&b'-') { // Flag. `--` without a body is BadFlag → Error. if let Some((s, e)) = consume_flag(source, pos) { classes.push(ByteClass { start: s, end: e, class: HighlightClass::Flag, }); pos = e; } else { classes.push(ByteClass { start: pos, end: pos + 2, class: HighlightClass::Error, }); pos += 2; } continue; } let looks_like_number = b.is_ascii_digit() || (b == b'-' && bytes .get(pos + 1) .copied() .is_some_and(|c| c.is_ascii_digit())); if looks_like_number && let Some((s, e)) = consume_number_literal(source, pos) { classes.push(ByteClass { start: s, end: e, class: HighlightClass::Number, }); pos = e; continue; } if matches!(b, b':' | b'(' | b')' | b',' | b'=' | b'.') { classes.push(ByteClass { start: pos, end: pos + 1, class: HighlightClass::Punct, }); pos += 1; continue; } // Bare-path tail (e.g., trailing `frobulate widgets` past // a partial command match): only used when we know the // remainder isn't structured. Without a grammar context // here we conservatively treat as Error so the user sees // the unknown-shape byte highlighted. // // For multi-byte UTF-8 (emoji, unknown unicode) advance // one whole codepoint as Error. let ch = source[pos..] .chars() .next() .expect("pos < bytes.len() ⇒ at least one char"); let len = ch.len_utf8(); // If the char is alphanumeric (unusual at this fall-through // — should already have been caught above), classify as // Identifier-ish. Otherwise Error. let class = if ch.is_ascii_alphanumeric() || ch == '_' { HighlightClass::Identifier } else if ch.is_whitespace() { // Whitespace is filtered above; this branch is unreachable // in practice. pos += len; continue; } else { HighlightClass::Error }; let _ = consume_bare_path; // silence unused-import lint when not exercised classes.push(ByteClass { start: pos, end: pos + len, class, }); pos += len; } } #[cfg(test)] mod tests { use super::*; fn run(input: &str) -> Vec<(usize, usize, HighlightClass)> { highlight_runs(input) .into_iter() .map(|c| (c.start, c.end, c.class)) .collect() } #[test] fn empty_input_yields_no_runs() { assert!(highlight_runs("").is_empty()); assert!(highlight_runs(" ").is_empty()); } #[test] fn entry_keyword_classified_as_keyword() { assert_eq!(run("quit"), vec![(0, 4, HighlightClass::Keyword)]); } #[test] fn keyword_plus_identifier_via_walker() { // `show data Customers` walks end-to-end. let runs = run("show data Customers"); assert_eq!( runs, vec![ (0, 4, HighlightClass::Keyword), (5, 9, HighlightClass::Keyword), (10, 19, HighlightClass::Identifier), ], ); } #[test] fn unknown_command_word_classified_by_byte_shape() { // Walker doesn't engage; fallback classifies as Identifier. assert_eq!(run("frobulate"), vec![(0, 9, HighlightClass::Identifier)]); } #[test] fn unknown_chars_classified_as_error() { assert_eq!(run("$"), vec![(0, 1, HighlightClass::Error)]); } #[test] fn unterminated_string_classified_as_error_through_to_eof() { assert_eq!(run("'oops"), vec![(0, 5, HighlightClass::Error)]); } #[test] fn string_literal_classified() { assert_eq!(run("'hello'"), vec![(0, 7, HighlightClass::String)]); } #[test] fn flag_classified_via_fallback() { // Walker doesn't engage for a bare `--all-rows`. assert_eq!( run("--all-rows"), vec![(0, 10, HighlightClass::Flag)], ); } #[test] fn bare_double_dash_classified_as_error() { assert_eq!(run("--"), vec![(0, 2, HighlightClass::Error)]); } #[test] fn number_classified_via_fallback() { assert_eq!(run("42"), vec![(0, 2, HighlightClass::Number)]); assert_eq!(run("-5"), vec![(0, 2, HighlightClass::Number)]); assert_eq!(run("3.14"), vec![(0, 4, HighlightClass::Number)]); } #[test] fn punct_classified_via_fallback() { // Bare `:` outside any walker context. assert_eq!(run(":"), vec![(0, 1, HighlightClass::Punct)]); } #[test] fn trailing_tokens_after_partial_walk_are_byte_scanned() { // `quit nonsense` — walker matches `quit`, then trailing // `nonsense` is fallback-classified. let runs = run("quit nonsense"); assert_eq!( runs, vec![ (0, 4, HighlightClass::Keyword), (5, 13, HighlightClass::Identifier), ], ); } #[test] fn whitespace_gaps_are_not_represented_as_runs() { // Two whitespace-separated tokens produce exactly two // class spans; the renderer fills the gap with fg. let runs = run("show table"); assert_eq!(runs.len(), 2); assert_eq!(runs[0].2, HighlightClass::Keyword); assert_eq!(runs[1].2, HighlightClass::Keyword); } #[test] fn full_command_walks_with_each_class() { // `update T set Name='hi' --all-rows` — walker covers it // all end-to-end; the per-byte class slice carries each // node's contribution. let runs = highlight_runs("update T set Name='hi' --all-rows"); let classes: Vec = runs.iter().map(|c| c.class).collect(); assert!(classes.contains(&HighlightClass::Keyword)); assert!(classes.contains(&HighlightClass::Identifier)); assert!(classes.contains(&HighlightClass::String)); assert!(classes.contains(&HighlightClass::Punct)); assert!(classes.contains(&HighlightClass::Flag)); } #[test] fn utf8_unknown_char_advances_one_codepoint() { // ✓ is a 3-byte UTF-8 codepoint; the fallback emits a // 3-byte Error span (not three 1-byte spans). let runs = run("✓"); assert_eq!(runs.len(), 1); assert_eq!(runs[0].2, HighlightClass::Error); assert_eq!(runs[0].1 - runs[0].0, "✓".len()); } #[test] fn string_with_multi_byte_unicode_classified_as_string() { // 'café' is a single string literal; the walker doesn't // engage here (no `'café'` entry keyword), so the fallback // scans and classifies as String. let runs = run("'café'"); assert_eq!(runs.len(), 1); assert_eq!(runs[0].2, HighlightClass::String); assert_eq!(runs[0].1, "'café'".len()); } // ---- ADR-0030 §8 / ADR-0032 — SQL keyword highlighting ---- fn run_advanced(input: &str) -> Vec<(usize, usize, HighlightClass)> { highlight_runs_in_mode(input, crate::mode::Mode::Advanced) .into_iter() .map(|c| (c.start, c.end, c.class)) .collect() } #[test] fn sql_select_keywords_classified() { // ADR-0030 §8 — `select` / `from` get keyword class in // Advanced mode (Simple mode gates SELECT out at the // dispatcher, so only the entry word would highlight). let runs = run_advanced("select * from t"); assert!( runs.iter().any(|(s, e, c)| { *c == HighlightClass::Keyword && (*s, *e) == (0, 6) }), "expected `select` keyword span 0..6; got {runs:?}", ); assert!( runs.iter().any(|(s, e, c)| { *c == HighlightClass::Keyword && (*s, *e) == (9, 13) }), "expected `from` keyword span 9..13; got {runs:?}", ); } #[test] fn sql_expression_operators_classified_as_keywords() { // ADR-0031 §5: LIKE / BETWEEN / IN / IS / AND / OR / NOT // are part of the predicate ladder. Walker matches them // as Word nodes; highlight class = Keyword. let input = "select * from t where a like 'x' and b between 1 and 5"; let runs = run_advanced(input); let keywords: Vec<&str> = runs .iter() .filter(|(_, _, c)| *c == HighlightClass::Keyword) .map(|(s, e, _)| &input[*s..*e]) .collect(); assert!(keywords.contains(&"like"), "no `like`; got {keywords:?}"); assert!(keywords.contains(&"and"), "no `and`; got {keywords:?}"); assert!( keywords.contains(&"between"), "no `between`; got {keywords:?}", ); } #[test] fn sql_case_expression_keywords_classified() { let input = "select case when a = 1 then 'one' else 'other' end from t"; let runs = run_advanced(input); let keywords: Vec<&str> = runs .iter() .filter(|(_, _, c)| *c == HighlightClass::Keyword) .map(|(s, e, _)| &input[*s..*e]) .collect(); for kw in ["case", "when", "then", "else", "end"] { assert!( keywords.contains(&kw), "missing `{kw}` keyword; got {keywords:?}", ); } } #[test] fn sql_dml_keywords_classified() { // ADR-0030 §8 / ADR-0033 — the DML entry words and clause // keywords (INSERT / INTO / VALUES / ON / CONFLICT / // RETURNING / UPDATE / SET / DELETE / FROM) all get the // Keyword class in Advanced mode. 3k cross-cut: the // ambient highlighter covers the DML surface, not just // SELECT. let keywords_of = |input: &'static str| -> Vec<&'static str> { run_advanced(input) .into_iter() .filter(|(_, _, c)| *c == HighlightClass::Keyword) .map(|(s, e, _)| &input[s..e]) .collect() }; let insert = keywords_of( "insert into t (a) values (1) on conflict (a) do update set a = excluded.a returning a", ); for kw in ["insert", "into", "values", "on", "conflict", "do", "update", "set", "returning"] { assert!(insert.contains(&kw), "INSERT/UPSERT: missing `{kw}`; got {insert:?}"); } let update = keywords_of("update t set a = 1 where id = 2 returning a"); for kw in ["update", "set", "where", "returning"] { assert!(update.contains(&kw), "UPDATE: missing `{kw}`; got {update:?}"); } let delete = keywords_of("delete from t where id = 1 returning *"); for kw in ["delete", "from", "where", "returning"] { assert!(delete.contains(&kw), "DELETE: missing `{kw}`; got {delete:?}"); } } }