ADR-0024 Phase F (full) step 1: walker-driven highlighting

Replaces the lex()-driven `base_runs` span builder in
`input_render.rs` with `walker::highlight_runs`. The new
walker-side `dsl::walker::highlight` module returns per-byte
`HighlightClass` assignments for every token shape in the source:

- For commands the walker engages on, `WalkResult::per_byte_class`
  is the authoritative source (keyword / identifier / number /
  string / punct / flag).
- Trailing junk past a partial match — and inputs the walker
  doesn't engage on at all (no registered entry word) — fall
  through to a byte-shape scanner over `lex_helpers` so unknown
  command words, stray punctuation, and unterminated strings
  still highlight sensibly.

`Theme::highlight_class_color` is the walker-side analogue of
`token_color(&TokenKind)`; the renderer reads `walker::highlight_runs`
output and looks up colours through it. `token_color` and the
`lex()` pre-pass remain in place for now — the lexer module is
still consumed by usage rendering and completion until the
remaining Phase F steps land.

`HighlightClass`'s and `WalkResult::per_byte_class`'s
`#[allow(dead_code)]` annotations come off — they're now part of
the production highlight path.

Tests:
- 16 new tests under `dsl::walker::highlight` cover end-to-end
  walks, byte-shape fallbacks (unknown commands, bare flags,
  numbers, punctuation), UTF-8 codepoint advance, and trailing-
  token handling after partial walks.
- Existing `input_render` tests pass unchanged.
- 860 total tests passing (727 lib + 133 integration), 1 ignored.

Clippy clean with `nursery` lints + `-D warnings`.
This commit is contained in:
claude@clouddev1
2026-05-15 08:19:52 +00:00
parent b3d3bdfe5b
commit 7bdd3987e1
6 changed files with 354 additions and 12 deletions
+319
View File
@@ -0,0 +1,319 @@
//! Walker-driven highlighting (ADR-0024 §migration Phase F).
//!
//! `highlight_runs(source)` returns the per-byte highlight class
//! assignments for every token shape in `source`. It is the
//! single entry point that consumers (input panel, echo lines)
//! should use to colour DSL input — there is no separate lexer
//! pre-pass.
//!
//! Strategy:
//!
//! - Try the walker first. Whatever it consumed end-to-end (entry
//! word + matching nodes) contributes `WalkResult::per_byte_class`.
//! - For any bytes the walker did not cover — input the walker
//! doesn't engage on at all (no registered entry word), trailing
//! junk after a partial match, or content past a structural
//! failure — fall back to a byte-shape scanner that classifies
//! each consumed token by its shape using the same `lex_helpers`
//! primitives the walker uses internally.
//!
//! The two streams are returned in source-byte order; whitespace
//! gaps are not represented (the renderer fills them with the
//! default foreground colour).
use crate::dsl::grammar::HighlightClass;
use crate::dsl::walker::context::WalkContext;
use crate::dsl::walker::lex_helpers::{
consume_bare_path, consume_flag, consume_ident, consume_number_literal,
consume_string_literal, skip_whitespace,
};
use crate::dsl::walker::outcome::{ByteClass, WalkBound};
/// Produce the per-byte highlight classes for `source`.
///
/// On a successful walk this is exactly the walker's recorded
/// classes. On partial / unmatched input the byte-shape scanner
/// fills the gap so the renderer keeps colouring through trailing
/// tokens and unknown-command inputs.
#[must_use]
pub fn highlight_runs(source: &str) -> Vec<ByteClass> {
let mut ctx = WalkContext::new();
let (result, _cmd) = super::walk(source, WalkBound::EndOfInput, &mut ctx);
let mut classes: Vec<ByteClass> = result
.map(|r| r.per_byte_class)
.unwrap_or_default();
let scan_start = classes.last().map_or(0, |c| c.end);
scan_remainder(source, scan_start, &mut classes);
classes
}
/// Byte-shape scan from `start` to end of source, appending each
/// classified token to `classes`. Whitespace gaps are skipped.
fn scan_remainder(source: &str, start: usize, classes: &mut Vec<ByteClass>) {
let bytes = source.as_bytes();
let mut pos = start;
while pos < bytes.len() {
pos = skip_whitespace(source, pos);
if pos >= bytes.len() {
break;
}
let b = bytes[pos];
// Identifier first — covers keywords-by-shape, since at
// the highlight layer we no longer distinguish keyword from
// identifier without a successful walker match.
if (b.is_ascii_alphabetic() || b == b'_')
&& let Some((s, e)) = consume_ident(source, pos)
{
classes.push(ByteClass {
start: s,
end: e,
class: HighlightClass::Identifier,
});
pos = e;
continue;
}
if b == b'\'' {
// Quoted string. Unterminated → mark the rest as Error
// so the user sees the unclosed run highlighted.
if let Some(((s, e), _)) = consume_string_literal(source, pos) {
classes.push(ByteClass {
start: s,
end: e,
class: HighlightClass::String,
});
pos = e;
} else {
classes.push(ByteClass {
start: pos,
end: bytes.len(),
class: HighlightClass::Error,
});
pos = bytes.len();
}
continue;
}
if b == b'-' && bytes.get(pos + 1) == Some(&b'-') {
// Flag. `--` without a body is BadFlag → Error.
if let Some((s, e)) = consume_flag(source, pos) {
classes.push(ByteClass {
start: s,
end: e,
class: HighlightClass::Flag,
});
pos = e;
} else {
classes.push(ByteClass {
start: pos,
end: pos + 2,
class: HighlightClass::Error,
});
pos += 2;
}
continue;
}
let looks_like_number = b.is_ascii_digit()
|| (b == b'-'
&& bytes
.get(pos + 1)
.copied()
.is_some_and(|c| c.is_ascii_digit()));
if looks_like_number
&& let Some((s, e)) = consume_number_literal(source, pos)
{
classes.push(ByteClass {
start: s,
end: e,
class: HighlightClass::Number,
});
pos = e;
continue;
}
if matches!(b, b':' | b'(' | b')' | b',' | b'=' | b'.') {
classes.push(ByteClass {
start: pos,
end: pos + 1,
class: HighlightClass::Punct,
});
pos += 1;
continue;
}
// Bare-path tail (e.g., trailing `frobulate widgets` past
// a partial command match): only used when we know the
// remainder isn't structured. Without a grammar context
// here we conservatively treat as Error so the user sees
// the unknown-shape byte highlighted.
//
// For multi-byte UTF-8 (emoji, unknown unicode) advance
// one whole codepoint as Error.
let ch = source[pos..]
.chars()
.next()
.expect("pos < bytes.len() ⇒ at least one char");
let len = ch.len_utf8();
// If the char is alphanumeric (unusual at this fall-through
// — should already have been caught above), classify as
// Identifier-ish. Otherwise Error.
let class = if ch.is_ascii_alphanumeric() || ch == '_' {
HighlightClass::Identifier
} else if ch.is_whitespace() {
// Whitespace is filtered above; this branch is unreachable
// in practice.
pos += len;
continue;
} else {
HighlightClass::Error
};
let _ = consume_bare_path; // silence unused-import lint when not exercised
classes.push(ByteClass {
start: pos,
end: pos + len,
class,
});
pos += len;
}
}
#[cfg(test)]
mod tests {
use super::*;
fn run(input: &str) -> Vec<(usize, usize, HighlightClass)> {
highlight_runs(input)
.into_iter()
.map(|c| (c.start, c.end, c.class))
.collect()
}
#[test]
fn empty_input_yields_no_runs() {
assert!(highlight_runs("").is_empty());
assert!(highlight_runs(" ").is_empty());
}
#[test]
fn entry_keyword_classified_as_keyword() {
assert_eq!(run("quit"), vec![(0, 4, HighlightClass::Keyword)]);
}
#[test]
fn keyword_plus_identifier_via_walker() {
// `show data Customers` walks end-to-end.
let runs = run("show data Customers");
assert_eq!(
runs,
vec![
(0, 4, HighlightClass::Keyword),
(5, 9, HighlightClass::Keyword),
(10, 19, HighlightClass::Identifier),
],
);
}
#[test]
fn unknown_command_word_classified_by_byte_shape() {
// Walker doesn't engage; fallback classifies as Identifier.
assert_eq!(run("frobulate"), vec![(0, 9, HighlightClass::Identifier)]);
}
#[test]
fn unknown_chars_classified_as_error() {
assert_eq!(run("$"), vec![(0, 1, HighlightClass::Error)]);
}
#[test]
fn unterminated_string_classified_as_error_through_to_eof() {
assert_eq!(run("'oops"), vec![(0, 5, HighlightClass::Error)]);
}
#[test]
fn string_literal_classified() {
assert_eq!(run("'hello'"), vec![(0, 7, HighlightClass::String)]);
}
#[test]
fn flag_classified_via_fallback() {
// Walker doesn't engage for a bare `--all-rows`.
assert_eq!(
run("--all-rows"),
vec![(0, 10, HighlightClass::Flag)],
);
}
#[test]
fn bare_double_dash_classified_as_error() {
assert_eq!(run("--"), vec![(0, 2, HighlightClass::Error)]);
}
#[test]
fn number_classified_via_fallback() {
assert_eq!(run("42"), vec![(0, 2, HighlightClass::Number)]);
assert_eq!(run("-5"), vec![(0, 2, HighlightClass::Number)]);
assert_eq!(run("3.14"), vec![(0, 4, HighlightClass::Number)]);
}
#[test]
fn punct_classified_via_fallback() {
// Bare `:` outside any walker context.
assert_eq!(run(":"), vec![(0, 1, HighlightClass::Punct)]);
}
#[test]
fn trailing_tokens_after_partial_walk_are_byte_scanned() {
// `quit nonsense` — walker matches `quit`, then trailing
// `nonsense` is fallback-classified.
let runs = run("quit nonsense");
assert_eq!(
runs,
vec![
(0, 4, HighlightClass::Keyword),
(5, 13, HighlightClass::Identifier),
],
);
}
#[test]
fn whitespace_gaps_are_not_represented_as_runs() {
// Two whitespace-separated tokens produce exactly two
// class spans; the renderer fills the gap with fg.
let runs = run("show table");
assert_eq!(runs.len(), 2);
assert_eq!(runs[0].2, HighlightClass::Keyword);
assert_eq!(runs[1].2, HighlightClass::Keyword);
}
#[test]
fn full_command_walks_with_each_class() {
// `update T set Name='hi' --all-rows` — walker covers it
// all end-to-end; the per-byte class slice carries each
// node's contribution.
let runs = highlight_runs("update T set Name='hi' --all-rows");
let classes: Vec<HighlightClass> = runs.iter().map(|c| c.class).collect();
assert!(classes.contains(&HighlightClass::Keyword));
assert!(classes.contains(&HighlightClass::Identifier));
assert!(classes.contains(&HighlightClass::String));
assert!(classes.contains(&HighlightClass::Punct));
assert!(classes.contains(&HighlightClass::Flag));
}
#[test]
fn utf8_unknown_char_advances_one_codepoint() {
// ✓ is a 3-byte UTF-8 codepoint; the fallback emits a
// 3-byte Error span (not three 1-byte spans).
let runs = run("");
assert_eq!(runs.len(), 1);
assert_eq!(runs[0].2, HighlightClass::Error);
assert_eq!(runs[0].1 - runs[0].0, "".len());
}
#[test]
fn string_with_multi_byte_unicode_classified_as_string() {
// 'café' is a single string literal; the walker doesn't
// engage here (no `'café'` entry keyword), so the fallback
// scans and classifies as String.
let runs = run("'café'");
assert_eq!(runs.len(), 1);
assert_eq!(runs[0].2, HighlightClass::String);
assert_eq!(runs[0].1, "'café'".len());
}
}