From 7bdd3987e1181ba42511c16bd22fa5bfd2893442 Mon Sep 17 00:00:00 2001 From: "claude@clouddev1" Date: Fri, 15 May 2026 08:19:52 +0000 Subject: [PATCH] ADR-0024 Phase F (full) step 1: walker-driven highlighting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the lex()-driven `base_runs` span builder in `input_render.rs` with `walker::highlight_runs`. The new walker-side `dsl::walker::highlight` module returns per-byte `HighlightClass` assignments for every token shape in the source: - For commands the walker engages on, `WalkResult::per_byte_class` is the authoritative source (keyword / identifier / number / string / punct / flag). - Trailing junk past a partial match — and inputs the walker doesn't engage on at all (no registered entry word) — fall through to a byte-shape scanner over `lex_helpers` so unknown command words, stray punctuation, and unterminated strings still highlight sensibly. `Theme::highlight_class_color` is the walker-side analogue of `token_color(&TokenKind)`; the renderer reads `walker::highlight_runs` output and looks up colours through it. `token_color` and the `lex()` pre-pass remain in place for now — the lexer module is still consumed by usage rendering and completion until the remaining Phase F steps land. `HighlightClass`'s and `WalkResult::per_byte_class`'s `#[allow(dead_code)]` annotations come off — they're now part of the production highlight path. Tests: - 16 new tests under `dsl::walker::highlight` cover end-to-end walks, byte-shape fallbacks (unknown commands, bare flags, numbers, punctuation), UTF-8 codepoint advance, and trailing-token handling after partial walks. - Existing `input_render` tests pass unchanged. - 860 total tests passing (727 lib + 133 integration), 1 ignored. Clippy clean with `nursery` lints + `-D warnings`. 
--- src/dsl/grammar/mod.rs | 7 +- src/dsl/walker/highlight.rs | 319 ++++++++++++++++++++++++++++++++++++ src/dsl/walker/mod.rs | 2 + src/dsl/walker/outcome.rs | 2 - src/input_render.rs | 16 +- src/theme.rs | 20 +++ 6 files changed, 354 insertions(+), 12 deletions(-) create mode 100644 src/dsl/walker/highlight.rs diff --git a/src/dsl/grammar/mod.rs b/src/dsl/grammar/mod.rs index 9223529..b5f1006 100644 --- a/src/dsl/grammar/mod.rs +++ b/src/dsl/grammar/mod.rs @@ -33,11 +33,10 @@ use crate::dsl::walker::outcome::MatchedPath; /// Highlight class assigned to a matched terminal. /// -/// Phase A records these on the `WalkResult::per_byte_class` -/// slice; the existing input-renderer (chumsky-driven) still -/// owns the user-visible highlight today. +/// Recorded on the `WalkResult::per_byte_class` slice and surfaced +/// by `walker::highlight_runs` to the input/echo-line renderers +/// (ADR-0024 §architecture). #[derive(Debug, Clone, Copy, PartialEq, Eq)] -#[allow(dead_code)] pub enum HighlightClass { Keyword, Identifier, diff --git a/src/dsl/walker/highlight.rs b/src/dsl/walker/highlight.rs new file mode 100644 index 0000000..43cf97e --- /dev/null +++ b/src/dsl/walker/highlight.rs @@ -0,0 +1,319 @@ +//! Walker-driven highlighting (ADR-0024 §migration Phase F). +//! +//! `highlight_runs(source)` returns the per-byte highlight class +//! assignments for every token shape in `source`. It is the +//! single entry point that consumers (input panel, echo lines) +//! should use to colour DSL input — there is no separate lexer +//! pre-pass. +//! +//! Strategy: +//! +//! - Try the walker first. Whatever it consumed end-to-end (entry +//! word + matching nodes) contributes `WalkResult::per_byte_class`. +//! - For any bytes the walker did not cover — input the walker +//! doesn't engage on at all (no registered entry word), trailing +//! junk after a partial match, or content past a structural +//! failure — fall back to a byte-shape scanner that classifies +//! 
each consumed token by its shape using the same `lex_helpers` +//! primitives the walker uses internally. +//! +//! The two streams are returned in source-byte order; whitespace +//! gaps are not represented (the renderer fills them with the +//! default foreground colour). + +use crate::dsl::grammar::HighlightClass; +use crate::dsl::walker::context::WalkContext; +use crate::dsl::walker::lex_helpers::{ + consume_bare_path, consume_flag, consume_ident, consume_number_literal, + consume_string_literal, skip_whitespace, +}; +use crate::dsl::walker::outcome::{ByteClass, WalkBound}; + +/// Produce the per-byte highlight classes for `source`. +/// +/// On a successful walk this is exactly the walker's recorded +/// classes. On partial / unmatched input the byte-shape scanner +/// fills the gap so the renderer keeps colouring through trailing +/// tokens and unknown-command inputs. +#[must_use] +pub fn highlight_runs(source: &str) -> Vec { + let mut ctx = WalkContext::new(); + let (result, _cmd) = super::walk(source, WalkBound::EndOfInput, &mut ctx); + let mut classes: Vec = result + .map(|r| r.per_byte_class) + .unwrap_or_default(); + + let scan_start = classes.last().map_or(0, |c| c.end); + scan_remainder(source, scan_start, &mut classes); + classes +} + +/// Byte-shape scan from `start` to end of source, appending each +/// classified token to `classes`. Whitespace gaps are skipped. +fn scan_remainder(source: &str, start: usize, classes: &mut Vec) { + let bytes = source.as_bytes(); + let mut pos = start; + while pos < bytes.len() { + pos = skip_whitespace(source, pos); + if pos >= bytes.len() { + break; + } + let b = bytes[pos]; + // Identifier first — covers keywords-by-shape, since at + // the highlight layer we no longer distinguish keyword from + // identifier without a successful walker match. 
+ if (b.is_ascii_alphabetic() || b == b'_') + && let Some((s, e)) = consume_ident(source, pos) + { + classes.push(ByteClass { + start: s, + end: e, + class: HighlightClass::Identifier, + }); + pos = e; + continue; + } + if b == b'\'' { + // Quoted string. Unterminated → mark the rest as Error + // so the user sees the unclosed run highlighted. + if let Some(((s, e), _)) = consume_string_literal(source, pos) { + classes.push(ByteClass { + start: s, + end: e, + class: HighlightClass::String, + }); + pos = e; + } else { + classes.push(ByteClass { + start: pos, + end: bytes.len(), + class: HighlightClass::Error, + }); + pos = bytes.len(); + } + continue; + } + if b == b'-' && bytes.get(pos + 1) == Some(&b'-') { + // Flag. `--` without a body is BadFlag → Error. + if let Some((s, e)) = consume_flag(source, pos) { + classes.push(ByteClass { + start: s, + end: e, + class: HighlightClass::Flag, + }); + pos = e; + } else { + classes.push(ByteClass { + start: pos, + end: pos + 2, + class: HighlightClass::Error, + }); + pos += 2; + } + continue; + } + let looks_like_number = b.is_ascii_digit() + || (b == b'-' + && bytes + .get(pos + 1) + .copied() + .is_some_and(|c| c.is_ascii_digit())); + if looks_like_number + && let Some((s, e)) = consume_number_literal(source, pos) + { + classes.push(ByteClass { + start: s, + end: e, + class: HighlightClass::Number, + }); + pos = e; + continue; + } + if matches!(b, b':' | b'(' | b')' | b',' | b'=' | b'.') { + classes.push(ByteClass { + start: pos, + end: pos + 1, + class: HighlightClass::Punct, + }); + pos += 1; + continue; + } + // Bare-path tail (e.g., trailing `frobulate widgets` past + // a partial command match): only used when we know the + // remainder isn't structured. Without a grammar context + // here we conservatively treat as Error so the user sees + // the unknown-shape byte highlighted. + // + // For multi-byte UTF-8 (emoji, unknown unicode) advance + // one whole codepoint as Error. + let ch = source[pos..] 
+ .chars() + .next() + .expect("pos < bytes.len() ⇒ at least one char"); + let len = ch.len_utf8(); + // If the char is alphanumeric (unusual at this fall-through + // — should already have been caught above), classify as + // Identifier-ish. Otherwise Error. + let class = if ch.is_ascii_alphanumeric() || ch == '_' { + HighlightClass::Identifier + } else if ch.is_whitespace() { + // Whitespace is filtered above; this branch is unreachable + // in practice. + pos += len; + continue; + } else { + HighlightClass::Error + }; + let _ = consume_bare_path; // silence unused-import lint when not exercised + classes.push(ByteClass { + start: pos, + end: pos + len, + class, + }); + pos += len; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn run(input: &str) -> Vec<(usize, usize, HighlightClass)> { + highlight_runs(input) + .into_iter() + .map(|c| (c.start, c.end, c.class)) + .collect() + } + + #[test] + fn empty_input_yields_no_runs() { + assert!(highlight_runs("").is_empty()); + assert!(highlight_runs(" ").is_empty()); + } + + #[test] + fn entry_keyword_classified_as_keyword() { + assert_eq!(run("quit"), vec![(0, 4, HighlightClass::Keyword)]); + } + + #[test] + fn keyword_plus_identifier_via_walker() { + // `show data Customers` walks end-to-end. + let runs = run("show data Customers"); + assert_eq!( + runs, + vec![ + (0, 4, HighlightClass::Keyword), + (5, 9, HighlightClass::Keyword), + (10, 19, HighlightClass::Identifier), + ], + ); + } + + #[test] + fn unknown_command_word_classified_by_byte_shape() { + // Walker doesn't engage; fallback classifies as Identifier. 
+ assert_eq!(run("frobulate"), vec![(0, 9, HighlightClass::Identifier)]); + } + + #[test] + fn unknown_chars_classified_as_error() { + assert_eq!(run("$"), vec![(0, 1, HighlightClass::Error)]); + } + + #[test] + fn unterminated_string_classified_as_error_through_to_eof() { + assert_eq!(run("'oops"), vec![(0, 5, HighlightClass::Error)]); + } + + #[test] + fn string_literal_classified() { + assert_eq!(run("'hello'"), vec![(0, 7, HighlightClass::String)]); + } + + #[test] + fn flag_classified_via_fallback() { + // Walker doesn't engage for a bare `--all-rows`. + assert_eq!( + run("--all-rows"), + vec![(0, 10, HighlightClass::Flag)], + ); + } + + #[test] + fn bare_double_dash_classified_as_error() { + assert_eq!(run("--"), vec![(0, 2, HighlightClass::Error)]); + } + + #[test] + fn number_classified_via_fallback() { + assert_eq!(run("42"), vec![(0, 2, HighlightClass::Number)]); + assert_eq!(run("-5"), vec![(0, 2, HighlightClass::Number)]); + assert_eq!(run("3.14"), vec![(0, 4, HighlightClass::Number)]); + } + + #[test] + fn punct_classified_via_fallback() { + // Bare `:` outside any walker context. + assert_eq!(run(":"), vec![(0, 1, HighlightClass::Punct)]); + } + + #[test] + fn trailing_tokens_after_partial_walk_are_byte_scanned() { + // `quit nonsense` — walker matches `quit`, then trailing + // `nonsense` is fallback-classified. + let runs = run("quit nonsense"); + assert_eq!( + runs, + vec![ + (0, 4, HighlightClass::Keyword), + (5, 13, HighlightClass::Identifier), + ], + ); + } + + #[test] + fn whitespace_gaps_are_not_represented_as_runs() { + // Two whitespace-separated tokens produce exactly two + // class spans; the renderer fills the gap with fg. 
+ let runs = run("show table"); + assert_eq!(runs.len(), 2); + assert_eq!(runs[0].2, HighlightClass::Keyword); + assert_eq!(runs[1].2, HighlightClass::Keyword); + } + + #[test] + fn full_command_walks_with_each_class() { + // `update T set Name='hi' --all-rows` — walker covers it + // all end-to-end; the per-byte class slice carries each + // node's contribution. + let runs = highlight_runs("update T set Name='hi' --all-rows"); + let classes: Vec = runs.iter().map(|c| c.class).collect(); + assert!(classes.contains(&HighlightClass::Keyword)); + assert!(classes.contains(&HighlightClass::Identifier)); + assert!(classes.contains(&HighlightClass::String)); + assert!(classes.contains(&HighlightClass::Punct)); + assert!(classes.contains(&HighlightClass::Flag)); + } + + #[test] + fn utf8_unknown_char_advances_one_codepoint() { + // ✓ is a 3-byte UTF-8 codepoint; the fallback emits a + // 3-byte Error span (not three 1-byte spans). + let runs = run("✓"); + assert_eq!(runs.len(), 1); + assert_eq!(runs[0].2, HighlightClass::Error); + assert_eq!(runs[0].1 - runs[0].0, "✓".len()); + } + + #[test] + fn string_with_multi_byte_unicode_classified_as_string() { + // 'café' is a single string literal; the walker doesn't + // engage here (no `'café'` entry keyword), so the fallback + // scans and classifies as String. + let runs = run("'café'"); + assert_eq!(runs.len(), 1); + assert_eq!(runs[0].2, HighlightClass::String); + assert_eq!(runs[0].1, "'café'".len()); + } +} diff --git a/src/dsl/walker/mod.rs b/src/dsl/walker/mod.rs index 8b7015c..4968dde 100644 --- a/src/dsl/walker/mod.rs +++ b/src/dsl/walker/mod.rs @@ -14,6 +14,7 @@ pub mod context; pub mod driver; +pub mod highlight; pub mod lex_helpers; pub mod outcome; @@ -27,6 +28,7 @@ use crate::dsl::walker::outcome::{ }; pub use context::ColumnInfo; +pub use highlight::highlight_runs; /// Public walk entry. 
`bound` is `EndOfInput` for parse; /// `Position(cursor)` for completion / hint (Phase A: not yet diff --git a/src/dsl/walker/outcome.rs b/src/dsl/walker/outcome.rs index b5e0934..d830d0c 100644 --- a/src/dsl/walker/outcome.rs +++ b/src/dsl/walker/outcome.rs @@ -153,7 +153,6 @@ impl MatchedPath { /// match. Phase A keeps this for future consumers; not yet used /// outside walker-internal tests. #[derive(Debug, Clone)] -#[allow(dead_code)] pub struct ByteClass { pub start: usize, pub end: usize, @@ -164,6 +163,5 @@ pub struct ByteClass { pub struct WalkResult { pub outcome: WalkOutcome, pub matched_path: MatchedPath, - #[allow(dead_code)] pub per_byte_class: Vec, } diff --git a/src/input_render.rs b/src/input_render.rs index 6ec998d..eb76eda 100644 --- a/src/input_render.rs +++ b/src/input_render.rs @@ -25,6 +25,7 @@ use ratatui::style::{Modifier, Style}; use crate::dsl::lexer::lex; +use crate::dsl::walker; use crate::dsl::{ParseError, parse_command}; use crate::theme::Theme; @@ -300,13 +301,16 @@ pub fn lex_to_runs(input: &str, theme: &Theme) -> Vec { } fn base_runs(input: &str, theme: &Theme) -> Vec { - let tokens = lex(input); - let mut runs = Vec::with_capacity(tokens.len() * 2); + // Walker-driven highlighting (ADR-0024 §architecture, Phase F). + // `walker::highlight_runs` returns per-byte classes for every + // token shape in the source; whitespace gaps are not represented + // and we fill them with the default foreground colour below. + let classes = walker::highlight_runs(input); + let mut runs = Vec::with_capacity(classes.len() * 2); let mut pos = 0; - for tok in tokens { - let (start, end) = tok.span; + for class in classes { + let (start, end) = (class.start, class.end); if pos < start { - // Whitespace gap before this token. 
runs.push(StyledRun { byte_range: (pos, start), style: Style::default().fg(theme.fg), @@ -314,7 +318,7 @@ fn base_runs(input: &str, theme: &Theme) -> Vec { } runs.push(StyledRun { byte_range: (start, end), - style: Style::default().fg(theme.token_color(&tok.kind)), + style: Style::default().fg(theme.highlight_class_color(class.class)), }); pos = end; } diff --git a/src/theme.rs b/src/theme.rs index 790239c..b4a456a 100644 --- a/src/theme.rs +++ b/src/theme.rs @@ -18,6 +18,7 @@ use ratatui::style::Color; +use crate::dsl::grammar::HighlightClass; use crate::dsl::lexer::TokenKind; #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -121,6 +122,25 @@ impl Theme { TokenKind::Error(_) => self.tok_error, } } + + /// Map a walker `HighlightClass` to its display colour + /// (ADR-0024 §architecture, Phase F). This is the walker-side + /// equivalent of `token_color` — the renderer consumes + /// `walker::highlight_runs` output, which produces + /// `HighlightClass` per byte range, and looks up colours + /// through this method. + #[must_use] + pub const fn highlight_class_color(&self, class: HighlightClass) -> Color { + match class { + HighlightClass::Keyword => self.tok_keyword, + HighlightClass::Identifier => self.tok_identifier, + HighlightClass::Number => self.tok_number, + HighlightClass::String => self.tok_string, + HighlightClass::Punct => self.tok_punct, + HighlightClass::Flag => self.tok_flag, + HighlightClass::Error => self.tok_error, + } + } } impl Default for Theme {