From deb0948d6cdc2ce00a6a83837777205c5aa5ffcf Mon Sep 17 00:00:00 2001 From: "claude@clouddev1" Date: Fri, 12 Jun 2026 20:36:20 +0000 Subject: [PATCH] feat(seed): year-as-int + conventional choice-set heuristics (#33, #34) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two additive D7 catalogue rules, surfaced while writing the website seed docs. No change to the type fallback, executor, or grammar. #33 — year-like int columns. `published`/`birth_year` were just `int`, so they fell to the unbounded int path and produced nonsense (`9419`). Add an int-gated year rule (after the quantity rule, so `year_count` stays a count): `year`/`*_year`/`published`/`founded` -> a bounded 1950-2025 year (new `YearRecent`), or the dob-style birth window 1945-2007 for `birth`/`born`/`dob` (new `YearBirth`). Plain int; not added to the D9 named-generator vocabulary. #34 — conventional choice sets. A few enum-ish names have a near-canonical small set that reads far better than lorem text. Add a type-gated PickFrom lookup (reusing the existing generator): priority/prio, severity, rating/stars. `status` is deliberately excluded (values too domain-specific) and keeps the D12 advisory; a user IN-CHECK still wins. `priority` leaves ENUM_TOKENS. ADR-0048 Amendment 1; +8 tests (incl. a column-fill integration test that also closes a pre-existing gap on that path). 
--- docs/adr/0048-seed-fake-data-generation.md | 65 +++++++++++ docs/adr/README.md | 2 +- docs/requirements.md | 5 +- src/seed/generators.rs | 52 +++++++++ src/seed/heuristics.rs | 130 ++++++++++++++++++++- src/seed/mod.rs | 7 ++ tests/it/seed.rs | 117 +++++++++++++++++++ 7 files changed, 374 insertions(+), 4 deletions(-) diff --git a/docs/adr/0048-seed-fake-data-generation.md b/docs/adr/0048-seed-fake-data-generation.md index 9dfd1ed..b3cbe83 100644 --- a/docs/adr/0048-seed-fake-data-generation.md +++ b/docs/adr/0048-seed-fake-data-generation.md @@ -317,6 +317,8 @@ with the implementation): | `url`/`website`/`homepage` · `color`/`colour` | URL / hex colour | text | | `price`/`amount`/`cost`/`salary`/`balance`/`total` | currency-range number | numeric | | `age` · `quantity`/`qty`/`stock`/`count` | 18–80 · small int | numeric | +| `year`/`*_year`/`published`/`founded` (Amendment 1) | bounded year (birth window for `birth`/`born`/`dob`, else 1950–2025) | int | +| `priority`/`prio` · `severity` · `rating`/`stars` (Amendment 1) | built-in `PickFrom` value set | text/int | | `date`/`*_date` | date, recent ~3 yr window | date | | `dob`/`birthday` | date, adult window (18–80 yr ago) | date | | `timestamp`/`datetime` · `created_at`/`updated_at`/`*_at` | datetime, recent window (`updated_at` ≥ `created_at`) | datetime | @@ -675,3 +677,66 @@ the regression floor. derive-`IN`-else-friendly-fail tier. - **`set`-driven NULL / per-column report / recursive parent seed:** deferred — see Out of scope. + +## Amendment 1 — year-as-int + conventional choice sets (2026-06-12) + +Two SD2-style refinements to the D7 catalogue, surfaced while writing +the website `seed` docs. Both are additive name rules; no change to D8 +(type fallback), the executor, or the grammar. 
+ +### Issue #33 — year-like `int` columns + +A column such as `published` or `birth_year` was just an `int`, so it +fell through to the unbounded type-based `int` path (D8) and produced +nonsense like `9419` or `1426` — implausible as years, undercutting the +"realistic data" pedagogy. Added an **`int`-gated** year rule, placed +*after* the quantity rule (so `year_count` stays a count): + +- `year` / `*_year` / `published` / `founded` → **`YearRecent`**, a + bounded window of **1950–2025** (75 years relative to the fixed + `REF_YEAR`, wide enough for published books / founding years / + release years; matches the issue's own `between 1950 and 2020` + workaround). +- the same with a `birth` / `born` / `dob` token (e.g. `birth_year`) → + **`YearBirth`**, mirroring the existing `dob → DateAdult` adult birth + window as years (**1945–2007**). + +Both emit a plain `int`. `published` / `founded` are included +(user-confirmed): an `int` so named is almost always a year (a flag +would be `is_published`). The generators are **not** added to the D9 +named-generator vocabulary — explicit control stays with `set +<column> between <low> and <high>`. 
Any name that gains a set is **removed from the enum-ish +advisory trigger** (`priority` left `ENUM_TOKENS`); since the advisory +(D13) only fires on `Generator::Generic`, a `PickFrom` name is excluded +either way, but the removal keeps `is_enum_ish` semantically "names seed +still can't guess". + +**`status` is deliberately excluded** (user-confirmed on the issue): its +real values are too domain-specific (`active/inactive`, +`open/closed/pending`, `draft/published`, …), so it keeps the D12 +"don't guess" stance — generic text + the advisory pointing at `set +status in (…)`. `state` stays its US-state-name generator (D7); +`type`/`kind`/`category`/`stage`/`gender` and `size`/`tier`/`plan` were +considered and left to the advisory. + +**Website follow-up** (tracked on the `website` branch, not here): the +`seed` cast exercises a `tickets` table with `priority`; it should be +re-recorded so the table tightens once `priority` collapses to a short +value — likely subsumed by the pre-publication cast sweep. diff --git a/docs/adr/README.md b/docs/adr/README.md index 3f101b1..13a0fb0 100644 --- a/docs/adr/README.md +++ b/docs/adr/README.md @@ -53,4 +53,4 @@ This directory contains the project's ADRs, recorded per - [ADR-0045 — `create m:n relationship` convenience command (C4)](0045-mn-convenience.md) — **Accepted + implemented 2026-06-10** (closes `requirements.md` **C4**; all forks user-confirmed + a `/runda` DA pass that verified the `do_create_table` reuse against code and corrected the "no PK-less tables" assumption — advanced SQL `create table t (a int)` has none, so a parent-PK guard is retained). 
Implementation corrected a second ADR premise: "the walker already dispatches multiple nodes per entry word" held only in *advanced* mode — two simple-mode spots (dispatcher `decide`, completion continuation-merge) assumed ≤1 DSL form per entry word and were generalized **behaviour-preservingly** (dispatch reduces to the old single-candidate commit; completion merge gated on `simple_count > 1`). Junction echo wired (`render_create_m2n`, round-trips as SQL). `create m:n relationship from to [as ]` generates a junction table with one FK column per parent PK column, a **compound PK over all the FK columns** (the textbook junction — the pair is unique, no duplicate links), and **two 1:n relationships**, all in **one transaction = one undo step** (built by reusing `do_create_table`, which already takes `foreign_keys` + writes relationship metadata — no batch bracketing). Forks all user-chosen: junction PK = compound-over-FKs (vs surrogate serial / no PK); referential actions = **`CASCADE`** on delete+update (vs NO ACTION / RESTRICT); naming = auto `{T1}_{T2}` + optional `as` (vs auto-only); available in **both modes** (Simple-category DSL, like the sibling relationship commands). FK columns named `{parent_table}_{pk_column}` (disambiguates shared `id`; generalises to compound parents via ADR-0043), typed via `fk_target_type` (ADR-0011). A distinct `Command::CreateM2nRelationship` (not lowered to `CreateTable`) preserves command identity (X5) and lets the teaching echo speak in m:n terms. Cross-cutting wiring enumerated: separate `CREATE_M2N` `CommandNode` (own `help_id`/`usage_ids`), `("m","m:n")` completion composite, `HintMode`s, grammar-driven highlighting, `help`/`help create`, `parse_error_pedagogy` near-miss matrix, teaching echo. 
OOS: **self-referential m:n** (`from T to T`) refused outright (user-confirmed "full stop" — directional column-naming is more than this beginner convenience warrants); per-relationship action overrides; extra junction payload columns; m:n diagram echo; renaming the auto-generated relationships - [ADR-0046 — Schema sidebar focus/navigation mode and responsive input & hint layout (UI #20/#21/#23)](0046-sidebar-navigation-and-responsive-input-hint.md) — **Accepted + implemented 2026-06-10, phased A→B→C** (8 commits `9f5f76b`…`22bec61`; closes Gitea **#20** hint jumpiness, **#21** left-column improvements, **#23** long input — all forks user-confirmed, including the persistent show/hide toggle which is **deferred**: the Ctrl-O peek covers #21's "keystroke to show and hide"). Two decisions landed differently from the draft (recorded inline): relationship data on **`App`** not `SchemaCache` (DB2); the nav overlay clears **only the sidebar strip + a one-column gutter**, panels staying visible behind (DC2). Treats the three UI issues as one coupled decision because they share the terminal's width/height budget. **Phase A (input & hint):** the hint panel's height becomes a function of **terminal geometry, fixed between resizes** (not of hint content), eliminating the #20 jump at its source — measured catalog shows ≥ ~54-col right-column width never needs > 2 hint lines, so 3 lines is a rare narrow-terminal-only case; height buckets `H<40` compact (input 1 row + horizontal scroll / hint 2) vs `H≥40` comfortable (input 2 rows soft-wrap / hint 2), output `Min(5)` honoured first under degradation; input gains horizontal scroll (`input_scroll_offset`, single logical `String` — **not** I1 multi-line) and 2-row soft-wrap display when tall, preserving ADR-0027's 6-col indicator reserve. 
**Phase B (sidebar):** the 26-col Tables column is **kept but made optional and richer** (not deleted — pedagogy wins ties) — **width-derived session-only** visibility (visible iff width > 90 or a Ctrl-O peek is active — no stored field; hides at width ≤ 90 so the 90-col screencasts drop it; ADR-0015 format untouched), plus a **relationships panel** rendered narrow with endpoints broken at the arrow, ellipsized — a **separate sibling panel** that **overrides S2**'s nested-list extension model (relationships are cross-table). the full records live on a new **`App.relationships`** field (revised from the ADR's original `SchemaCache.relationship_details` at implementation — `SchemaCache` is walker-facing and needs only the names, kept in `relationships: Vec`; details are UI-only, so `App` mirrors `app.tables` and avoids ~23 fixture edits), delivered by `Database::read_all_relationships` + an `AppEvent::RelationshipsRefreshed`; the two left panels split vertically with the relationships panel floored at 5 rows ("(none)" when empty) and capped at 50 % of the column (DB4). **Phase C (navigation mode):** **`Ctrl-O`** enters a focus cycle (Input → Tables → Relationships → Input; `Esc` exits) orthogonal to the ADR-0003 input mode — **`Ctrl-B` was rejected on review as the default tmux prefix** (unreachable inside tmux); the focused panel **expands to ~40–50 cols as a `Clear` overlay** (right panels stay unchanging underneath) and scrolls via **Up/Down (line) + PageUp/PageDown (page)** (context-rebind, reusing the output-scroll viewport mechanism), with an accent focus border; all non-nav keys inert in nav mode (and nav keys inert while a modal is open). 
Forks all user-chosen: keep-optional-richer (vs remove/narrow); navigation-mode (vs modeless modifier scroll); `Ctrl-O` (Ctrl-B rejected = tmux prefix); overlay (vs layout re-split); inert-non-nav-keys; geometry-fixed hint height; `H<40/≥40` thresholds; session-only persistence; Up/Down line-scroll; **separate relationships panel overriding S2**; **no hint-area toggle** (S4's stale "keyboard-toggleable" claim struck — never implemented, unwanted). A pre-build `/runda` DA pass drove these corrections: caught the `Ctrl-B`/tmux collision, the `SchemaCache` retype that would have broken completion, the 2-row-input/indicator placement, the missing nav-mode key disposition + modal gate, and three unreferenced requirements (S1 evolved, S2 overridden, S4 corrected); also cross-checked open issue **#22** (overlay/annotation layer — separate ADR, adjacent). OOS: true multi-line input (I1); readline shortcuts (I1b); cross-session sidebar persistence; output as a third nav focus; relationship search/edit from the panel; hint-area toggle; #22's annotation layer. 
Accepted consequence: the 90-col visibility threshold makes a terminal's output *narrower* when widened across the boundary (sidebar appears); **Amendment 1, 2026-06-12** (issue #25): DC3's focus accent is now a **non-bold accent colour** (`theme.mode_simple`, blue) rather than bold bright-`fg` — bold box-drawing glyphs render as broken/gapped line-art in the asciinema cast player (and are fragile in some terminals), so `panel_border_style` carries no `Modifier::BOLD` on a border (bold stays fine on text spans); pure style change — the text-only Tier-2 snapshots were unaffected, the Tier-1 assertion was updated, and a render-level test now checks the focused border cells carry the accent and no bold - [ADR-0047 — Demonstration overlay layer (keystroke badges + step captions)](0047-demonstration-overlay-layer.md) — **Accepted 2026-06-10; implemented 2026-06-11, phased A→B→C (closes Gitea #22)** (commits `f879d54`→`2d0f4b2`; no `requirements.md` item — tracked by issue + ADR per convention; all forks user-confirmed + a pre-build `/runda` pass that produced 10 tightening findings and a whole-implementation `/runda` pass that returned PASS, no blockers). An in-app **demonstration mode** (`--demo` flag / `RDBMS_PLAYGROUND_DEMO` env, **off by default, zero footprint when off**) that renders two transient overlays so `autocast` screencasts — and live teaching, and a future guided-lesson system — can show otherwise-invisible interactions. **Keystroke badges** (`[TAB]`, `[ENTER]`, `[UP]`, …): **automatic, app-detected** over a fixed set of glyph-less keys (the app already sees every key, so it re-records for free), label via a pure `demo_badge_label(&KeyEvent)`; the badge **auto-expires on a ~1.5 s timer** that extends the runtime's existing time-boxed-`recv` arm condition (`debounce.is_armed() || badge_pending`; expiry `Instant` in the runtime, `App.demo_badge` the render mirror — mirroring the `input` vs `input_indicator` split). 
**Step captions**: a **stealth, control-code-delimited input buffer** toggled by **`Ctrl+]`** (byte `0x1D` → arrives as `Char('5')+CONTROL`, verified against crossterm 0.29 `parse.rs:110-113`; chosen over `Ctrl+!`, which is **not a single ASCII byte so autocast cannot send it** — the same wall as arrow keys, R4) — typed characters accumulate **invisibly** (prompt untouched, no echo/history), `Backspace` edits, other keys inert, a second `Ctrl+]` **commits** to the caption box (empty commit dismisses); lives in pure-sync `App::update()`, **intercepted before the modal gate** so captions/badges work **over the load picker** (the `#24` projects cast). Both render as **floating flat black-on-yellow rectangles** (solid fill, **no border glyphs** — a one-cell text margin, deliberately unlike the app's bordered panels; user decision post-build, `2d0f4b2`) **at the output panel's inner bottom-right**, drawn **last over modals**, badge **stacked above** the caption, **no layout reflow**; caption **word-wraps to ≤ 3 lines** (3–5 rows), badge fixed 3 rows; clamp/skip guard for tiny terminals; a new **`App.last_output_area: Rect`** (set in `render_output_panel`) gives the top-level draw the anchor. Caption persists **until the next keystroke**; badge suppressed while capturing. Forks all user-chosen: `--demo` activation (vs hidden command / chord); automatic badges (vs scripted); stealth buffer (vs typed-command / preloaded-file); floating bottom-right boxes (vs HUD / banner / subtitle); `Ctrl+]` trigger; wrap-to-3-line captions; ~1.5 s badge / next-keystroke caption timing. Tested test-first across Tier 1 (label fn, capture state machine incl. 
over-modal + demo-off gate, nearest-deadline helper), Tier 2 (insta snapshots: badge/caption/both-stacked at 90×26 light+dark, short-terminal clamp), Tier 3 (`--demo` plumbing, badge set/suppressed, caption-without-input wiring), CLI (`--demo` parse + env fallback) — with an **honest limit** noted: the `tokio` timer wiring inside `run_loop` is exercised via the pure pieces + Tier-3 plumbing, not a standalone integration test of the timeout (same posture as the existing `IndicatorDebounce`). One intentional, user-acknowledged behaviour: `Ctrl-C` is inert while capturing (every non-`Ctrl+]` key is, by spec). Final tally **2290 passing / 0 failing / 0 skipped** (1 long-standing ignored doctest), clippy clean. OOS: scripted/manual badge push; badges for glyph keys; configurable styling/placement; the guided-lesson system itself (own ADR); cross-session/-switch persistence; localised caption content; arrow-only cast interactions (output-pane scroll); wiring the overlays into the website `casts.mjs` scripts (website-branch follow-up). Implementation phased **A** (`--demo` plumbing) → **B** (badges) → **C** (captions) + a flat-rectangle restyle -- [ADR-0048 — `seed` fake-data generation command](0048-seed-fake-data-generation.md) — **Accepted 2026-06-11; Phase 1 + Phase 2 implemented 2026-06-11** (Phase 1 commits `202e25a`→`fbd219b`; design settled with the user across an extended fork dialogue, hardened by a pre-build `/runda` pass (six blockers folded in), a post-implementation `/runda` pass (eight gaps closed — FK/shortid determinism so **D4 holds with no exceptions**, plus six untested ADR decisions), and a Phase-2 pre-build `/runda` pass (which caught the no-date-literal-token reality → the D2 quoted-dates amendment), and a post-implementation `/runda` pass (which added a friendly error for a bounded override on a UNIQUE column — see D2); **2400 tests pass, clippy clean**). Closes `requirements.md` **SD1** and the core of **SD2**; closes the `seed` half of **A1**. 
**Phase 1 shipped:** whole-row `seed [count] [--seed ]` with realistic name-aware generation (the `fake` crate + a type-gated heuristic catalogue, table-context name disambiguation, hand-rolled `product` generator, bounded dates), identifier + constraint uniqueness incl. junction distinct-combos, FK sampling from existing parent rows (empty-parent error), `IN`-CHECK derivation + complex-CHECK advisory, a required-column block guard, `--seed` reproducibility (serial/FK/shortid all deterministic), undo as one batch step, replay as a data write, a capped auto-show preview, the enum/CHECK advisory, and an O(N) single-transaction insert path. **Phase 2 shipped (2026-06-11):** the `set` override clause (D2 — fixed value / pick-list / `as ` / `between` range, **quoted** dates per the D2 amendment, type-aware, override drops the column from the advisory) and the `
.` column-fill form (D1 form 2 — an UPDATE over existing rows, refusing PK/autogen targets, empty-table no-op, FK/unique-respecting, one undo step), with the new `KNOWN_GENERATORS` vocabulary (D9), a range `Generator`, full completion/highlight (`HighlightClass::Function`)/validity (`IdentSource::Generators`)/help/pedagogy wiring, and the D13 advisory's Phase-2/3 wording. Further SD2 increments (custom generators, NULL injection, multi-locale, recursive auto-seed) out of scope. Closes `requirements.md` **SD1** and the core of **SD2**; closes the `seed` half of **A1** (the other being `hint`/**H2**). A dedicated `seed` command (own AST variant + `do_seed` executor, **both modes**) generating **realistic, name-aware** fake data. Two forms: **`seed
[count]`** (new rows, default **20**, capped) and **`seed
.`** (fill a column on existing rows, an UPDATE). Generation adds the **`fake` crate** (v5, English) driven by a **type-gated, token-matched name-heuristic catalogue** (~30 patterns, documented false-positive guards), with **table-context** disambiguating the `name`/`title` family (`products.name`→product, `users.name`→person, `vendors.name`→company), a **hand-rolled `product` generator** (`fake` has no commerce module), **bounded dates** (`date`/`timestamp`/`dob`/`*_at` recognised, recent windows — never "all of history"), the **identifier family** (`id`/`code`/`ref`/`number`, non-FK/non-PK) → **unique sequential**, and **enum-ish names** (`role`/`status`/`type`/…) left generic + a **post-seed Hint advisory** pointing at `set … in (…)`. A **`set` override clause** — `= value` / `in (a,b,c)` / `as ` / `between a and b` (numeric **and** date), reusing ADR-0026 operators — answers the heuristic-miss case. **`--seed `** makes runs reproducible (and enables exact-value tests). **FK** columns sampled uniformly from existing parent rows (**empty parent → friendly error**, no recursion v1); **junction/compound-PK** tables seeded with **distinct combinations**, capped + noted (SD1). A **required-column block guard** refuses rather than NULL-violate a `NOT NULL` column it can't fill (e.g. `NOT NULL blob`). Full ambient wiring (completion incl. a new generator-name vocabulary highlighted as `tok_function`, hints, `help seed`, ADR-0042 near-miss matrix, ADR-0027 validity); **no DSL→SQL teaching echo** (seed is a utility command, not a SQL twin). Honours **X5** — `do_seed` reuses insert/update *mechanics as helpers*, not by emitting `Command::Insert`. Implementation phased: (1) core whole-row seed → (2) `set` overrides → (3) column-fill. 
Deferred (future SD2): recursive auto-seed, NULL injection, multi-locale, user-defined custom generators, full per-column report +- [ADR-0048 — `seed` fake-data generation command](0048-seed-fake-data-generation.md) — **Accepted 2026-06-11; Phase 1 + Phase 2 implemented 2026-06-11** (Phase 1 commits `202e25a`→`fbd219b`; design settled with the user across an extended fork dialogue, hardened by a pre-build `/runda` pass (six blockers folded in), a post-implementation `/runda` pass (eight gaps closed — FK/shortid determinism so **D4 holds with no exceptions**, plus six untested ADR decisions), and a Phase-2 pre-build `/runda` pass (which caught the no-date-literal-token reality → the D2 quoted-dates amendment), and a post-implementation `/runda` pass (which added a friendly error for a bounded override on a UNIQUE column — see D2); **2400 tests pass, clippy clean**). Closes `requirements.md` **SD1** and the core of **SD2**; closes the `seed` half of **A1**. **Phase 1 shipped:** whole-row `seed
<table> [count] [--seed <n>]` with realistic name-aware generation (the `fake` crate + a type-gated heuristic catalogue, table-context name disambiguation, hand-rolled `product` generator, bounded dates), identifier + constraint uniqueness incl. junction distinct-combos, FK sampling from existing parent rows (empty-parent error), `IN`-CHECK derivation + complex-CHECK advisory, a required-column block guard, `--seed` reproducibility (serial/FK/shortid all deterministic), undo as one batch step, replay as a data write, a capped auto-show preview, the enum/CHECK advisory, and an O(N) single-transaction insert path. **Phase 2 shipped (2026-06-11):** the `set` override clause (D2 — fixed value / pick-list / `as <generator>` / `between` range, **quoted** dates per the D2 amendment, type-aware, override drops the column from the advisory) and the `<table>.<column>`
column-fill form (D1 form 2 — an UPDATE over existing rows, refusing PK/autogen targets, empty-table no-op, FK/unique-respecting, one undo step), with the new `KNOWN_GENERATORS` vocabulary (D9), a range `Generator`, full completion/highlight (`HighlightClass::Function`)/validity (`IdentSource::Generators`)/help/pedagogy wiring, and the D13 advisory's Phase-2/3 wording. Further SD2 increments (custom generators, NULL injection, multi-locale, recursive auto-seed) out of scope. Closes `requirements.md` **SD1** and the core of **SD2**; closes the `seed` half of **A1** (the other being `hint`/**H2**). A dedicated `seed` command (own AST variant + `do_seed` executor, **both modes**) generating **realistic, name-aware** fake data. Two forms: **`seed
[count]`** (new rows, default **20**, capped) and **`seed
.`** (fill a column on existing rows, an UPDATE). Generation adds the **`fake` crate** (v5, English) driven by a **type-gated, token-matched name-heuristic catalogue** (~30 patterns, documented false-positive guards), with **table-context** disambiguating the `name`/`title` family (`products.name`→product, `users.name`→person, `vendors.name`→company), a **hand-rolled `product` generator** (`fake` has no commerce module), **bounded dates** (`date`/`timestamp`/`dob`/`*_at` recognised, recent windows — never "all of history"), the **identifier family** (`id`/`code`/`ref`/`number`, non-FK/non-PK) → **unique sequential**, and **enum-ish names** (`role`/`status`/`type`/…) left generic + a **post-seed Hint advisory** pointing at `set … in (…)`. A **`set` override clause** — `= value` / `in (a,b,c)` / `as ` / `between a and b` (numeric **and** date), reusing ADR-0026 operators — answers the heuristic-miss case. **`--seed `** makes runs reproducible (and enables exact-value tests). **FK** columns sampled uniformly from existing parent rows (**empty parent → friendly error**, no recursion v1); **junction/compound-PK** tables seeded with **distinct combinations**, capped + noted (SD1). A **required-column block guard** refuses rather than NULL-violate a `NOT NULL` column it can't fill (e.g. `NOT NULL blob`). Full ambient wiring (completion incl. a new generator-name vocabulary highlighted as `tok_function`, hints, `help seed`, ADR-0042 near-miss matrix, ADR-0027 validity); **no DSL→SQL teaching echo** (seed is a utility command, not a SQL twin). Honours **X5** — `do_seed` reuses insert/update *mechanics as helpers*, not by emitting `Command::Insert`. Implementation phased: (1) core whole-row seed → (2) `set` overrides → (3) column-fill. Deferred (future SD2): recursive auto-seed, NULL injection, multi-locale, user-defined custom generators, full per-column report. 
**Amendment 1, 2026-06-12** (issues #33/#34): two additive D7 catalogue rules — **year-as-int** (`year`/`*_year`/`published`/`founded` → a bounded `int` year, 1950–2025, or the `dob`-style birth window 1945–2007 for `birth`/`born`/`dob`; fixes nonsense like `9419`; `int`-gated, after the quantity rule so `year_count` stays a count; two new `YearRecent`/`YearBirth` generators, *not* added to the D9 vocabulary) and **conventional choice sets** (`priority`/`prio`, `severity`, `rating`/`stars` → type-gated built-in `PickFrom` value sets reusing the existing generator; `priority` leaves `ENUM_TOKENS`). `status` is **deliberately excluded** (user-confirmed — values too domain-specific; keeps the D12 "don't guess" + advisory); a user `IN`-CHECK still wins. Website `seed` cast re-record tracked on the `website` branch diff --git a/docs/requirements.md b/docs/requirements.md index 2222f11..7b2984e 100644 --- a/docs/requirements.md +++ b/docs/requirements.md @@ -696,7 +696,10 @@ since ADR-0027.) `Generator`, and full completion / highlight / validity / help / parse-error-pedagogy wiring. Deferred SD2 increments: user-defined custom generators, NULL injection, multi-locale, - recursive parent auto-seed.)* + recursive parent auto-seed. Later catalogue refinements: + **#33** year-as-int (`year`/`*_year`/`published`/`founded`) and + **#34** conventional choice sets (`priority`/`severity`/`rating`, + `status` excluded) — ADR-0048 Amendment 1.)* ## Query analysis diff --git a/src/seed/generators.rs b/src/seed/generators.rs index a5d6a99..7e81f86 100644 --- a/src/seed/generators.rs +++ b/src/seed/generators.rs @@ -31,6 +31,16 @@ const RECENT_WINDOW_DAYS: i64 = 3 * 365; const ADULT_MIN_DAYS: i64 = 18 * 365; const ADULT_MAX_DAYS: i64 = 80 * 365; +/// Year windows for the `int`-typed year heuristics (issue #33), +/// expressed relative to [`REF_YEAR`] so they advance with releases — +/// the year siblings of the `DateRecent` / `DateAdult` windows above. 
+/// `YearRecent` spans ~75 years (1950–2025 at REF_YEAR=2025), wide +/// enough for `published` / `founded` / `release_year`; `YearBirth` +/// mirrors the adult birth window (1945–2007). +const YEAR_RECENT_SPAN: i32 = 75; +const YEAR_BIRTH_MIN_AGE: i32 = 18; +const YEAR_BIRTH_MAX_AGE: i32 = 80; + /// Produce one value for `generator` against destination type `ty`. #[must_use] pub fn generate_value(generator: &Generator, ty: Type, rng: &mut SeedRng) -> Value { @@ -71,6 +81,13 @@ pub fn generate_value(generator: &Generator, ty: Type, rng: &mut SeedRng) -> Val Generator::CurrencyAmount => currency_amount(ty, rng), Generator::Age => Value::Number(rng.random_range(18..=80).to_string()), Generator::SmallInt => Value::Number(rng.random_range(1..=100).to_string()), + Generator::YearRecent => { + Value::Number(rng.random_range((REF_YEAR - YEAR_RECENT_SPAN)..=REF_YEAR).to_string()) + } + Generator::YearBirth => Value::Number( + rng.random_range((REF_YEAR - YEAR_BIRTH_MAX_AGE)..=(REF_YEAR - YEAR_BIRTH_MIN_AGE)) + .to_string(), + ), Generator::DateRecent => Value::Text(format_date(random_past_date(rng, 0, RECENT_WINDOW_DAYS))), Generator::DateAdult => { Value::Text(format_date(random_past_date(rng, ADULT_MIN_DAYS, ADULT_MAX_DAYS))) @@ -489,6 +506,41 @@ mod tests { assert!(matches!(v, Value::Number(_)), "numeric pick should be a Number: {v:?}"); } + #[test] + fn year_generators_stay_within_their_bounded_windows() { + // Issue #33: both year generators emit a plain `int` inside a + // bounded, plausible window — never the unbounded-int nonsense. 
+ let mut rng = make_rng(Some(7)); + for _ in 0..300 { + let Value::Number(s) = generate_value(&Generator::YearRecent, Type::Int, &mut rng) + else { + panic!("YearRecent must be a Number") + }; + let n: i32 = s.parse().unwrap(); + assert!((1950..=2025).contains(&n), "YearRecent {n} out of [1950,2025]"); + } + for _ in 0..300 { + let Value::Number(s) = generate_value(&Generator::YearBirth, Type::Int, &mut rng) + else { + panic!("YearBirth must be a Number") + }; + let n: i32 = s.parse().unwrap(); + assert!((1945..=2007).contains(&n), "YearBirth {n} out of [1945,2007]"); + } + } + + #[test] + fn year_generators_are_deterministic_for_a_fixed_seed() { + assert_eq!( + gen_once(&Generator::YearRecent, Type::Int, 42), + gen_once(&Generator::YearRecent, Type::Int, 42), + ); + assert_eq!( + gen_once(&Generator::YearBirth, Type::Int, 42), + gen_once(&Generator::YearBirth, Type::Int, 42), + ); + } + #[test] fn int_range_stays_within_inclusive_bounds() { let g = Generator::Range { low: "10".into(), high: "20".into() }; diff --git a/src/seed/heuristics.rs b/src/seed/heuristics.rs index d62f78a..3162dd2 100644 --- a/src/seed/heuristics.rs +++ b/src/seed/heuristics.rs @@ -57,9 +57,14 @@ fn choose_generator_inner(table: &str, col: &ColumnSpec) -> Generator { /// the post-seed advisory; such columns still receive generic text. #[must_use] pub fn is_enum_ish(name: &str) -> bool { + // `priority` is intentionally absent: issue #34 gave it a built-in + // value set (low/medium/high · 1/2/3), so it is no longer "filled + // generically" and must not trigger the D13 advisory. `severity` / + // `rating` / `stars` were never here. `status` stays — it is + // deliberately left to the advisory (no built-in set). 
const ENUM_TOKENS: &[&str] = &[ "role", "status", "state", "type", "kind", "category", "level", - "tier", "stage", "priority", "gender", + "tier", "stage", "gender", ]; let toks = tokens(name); toks.iter().any(|t| ENUM_TOKENS.contains(&t.as_str())) @@ -150,6 +155,49 @@ fn match_name_generator(table: &str, toks: &[String], ty: Type) -> Option Vec { out } +/// A `PickFrom` generator from string-literal values (issue #34's +/// conventional choice sets). `literal_to_value` interprets each entry +/// by the destination type at generation time (an `int` column turns +/// `"1"` into a number). +fn pick_from(values: &[&str]) -> Generator { + Generator::PickFrom(values.iter().map(|s| (*s).to_string()).collect()) +} + fn has_token(toks: &[String], t: &str) -> bool { toks.iter().any(|x| x == t) } @@ -412,11 +468,81 @@ mod tests { assert!(is_enum_ish("status")); assert!(is_enum_ish("role")); assert!(is_enum_ish("order_state")); - assert!(is_enum_ish("priority")); + // Issue #34: `priority` gained a built-in value set, so it is no + // longer advised (it is no longer "filled generically"). + assert!(!is_enum_ish("priority")); + assert!(!is_enum_ish("severity")); + assert!(!is_enum_ish("rating")); assert!(!is_enum_ish("email")); assert!(!is_enum_ish("first_name")); } + #[test] + fn year_like_int_columns_map_to_bounded_years() { + // Issue #33: `int`-gated year heuristics. `birth`/`born`/`dob` + // years pick the birth window; the rest a recent window. 
+ assert_eq!(choose("authors", "birth_year", Type::Int), Generator::YearBirth); + assert_eq!(choose("authors", "birthYear", Type::Int), Generator::YearBirth); + assert_eq!(choose("u", "year_born", Type::Int), Generator::YearBirth); + assert_eq!(choose("books", "year", Type::Int), Generator::YearRecent); + assert_eq!(choose("films", "release_year", Type::Int), Generator::YearRecent); + assert_eq!(choose("books", "published", Type::Int), Generator::YearRecent); + assert_eq!(choose("companies", "founded", Type::Int), Generator::YearRecent); + // Type-gated: a text `year` is not a bounded-year int. + assert_eq!(choose("books", "year", Type::Text), Generator::Generic); + // `year_count` is a count, not a year — the quantity rule wins. + assert_eq!(choose("t", "year_count", Type::Int), Generator::SmallInt); + } + + #[test] + fn conventional_choice_sets_map_to_pick_from() { + // Issue #34: type-gated built-in value sets. + assert_eq!( + choose("tickets", "priority", Type::Text), + Generator::PickFrom(vec!["low".into(), "medium".into(), "high".into()]), + ); + assert_eq!( + choose("tickets", "prio", Type::Int), + Generator::PickFrom(vec!["1".into(), "2".into(), "3".into()]), + ); + assert_eq!( + choose("bugs", "severity", Type::Text), + Generator::PickFrom(vec!["low".into(), "medium".into(), "high".into(), "critical".into()]), + ); + assert_eq!( + choose("bugs", "severity", Type::Int), + Generator::PickFrom(vec!["1".into(), "2".into(), "3".into(), "4".into()]), + ); + assert_eq!( + choose("reviews", "rating", Type::Int), + Generator::PickFrom(vec!["1".into(), "2".into(), "3".into(), "4".into(), "5".into()]), + ); + assert_eq!( + choose("reviews", "stars", Type::Int), + Generator::PickFrom(vec!["1".into(), "2".into(), "3".into(), "4".into(), "5".into()]), + ); + } + + #[test] + fn status_is_left_to_the_advisory_not_given_a_set() { + // User-confirmed (issue #34): `status` keeps the D12 "don't + // guess" stance — generic text + the advisory, no built-in set. 
+ assert_eq!(choose("orders", "status", Type::Text), Generator::Generic); + assert!(is_enum_ish("status")); + } + + #[test] + fn a_declared_in_check_still_wins_over_a_built_in_set() { + // The CHECK is the user's explicit intent; it precedes the + // issue-#34 default set for the same name. + let mut spec = ColumnSpec::plain("priority", Type::Text); + spec.check_in_values = Some(vec!["p1".into(), "p2".into()]); + assert_eq!( + choose_generator("tickets", &spec), + Generator::PickFrom(vec!["p1".into(), "p2".into()]), + ); + } + #[test] fn enum_ish_columns_fall_through_to_generic() { // No special generator — generic text + the advisory flags them. diff --git a/src/seed/mod.rs b/src/seed/mod.rs index 1a4d424..452097b 100644 --- a/src/seed/mod.rs +++ b/src/seed/mod.rs @@ -149,6 +149,13 @@ pub enum Generator { Age, /// A small positive integer (quantities, counts). SmallInt, + /// A plausible recent year as a plain `int` — `year` / `*_year` / + /// `published` / `founded` columns (issue #33). Bounded window so the + /// type-based `int` fallback can't emit nonsense like `9419`. + YearRecent, + /// A plausible birth year as a plain `int` — `birth_year` and kin + /// (issue #33), the year-typed sibling of [`Self::DateAdult`]. + YearBirth, // — Temporal (bounded windows, D8) — /// A date within the last few years. DateRecent, diff --git a/tests/it/seed.rs b/tests/it/seed.rs index a4a1bc2..eba3e3c 100644 --- a/tests/it/seed.rs +++ b/tests/it/seed.rs @@ -281,6 +281,123 @@ fn seed_populates_a_table_and_persists_rows() { assert!(csv.contains('@'), "seeded emails should appear in the CSV:\n{csv}"); } +/// Parse a seeded table's CSV into per-column value lists (simple +/// comma-split — the values under test carry no commas/quotes). 
+fn csv_columns(csv: &str) -> (Vec<String>, Vec<Vec<String>>) { + let mut lines = csv.lines().filter(|l| !l.trim().is_empty()); + let header: Vec<String> = lines.next().unwrap().split(',').map(str::to_string).collect(); + let rows: Vec<Vec<String>> = + lines.map(|l| l.split(',').map(str::to_string).collect()).collect(); + (header, rows) +} + +fn column_values(csv: &str, col: &str) -> Vec<String> { + let (header, rows) = csv_columns(csv); + let idx = header.iter().position(|h| h == col).expect("column present"); + rows.iter().map(|r| r[idx].clone()).collect() +} + +#[test] +fn seed_year_and_choice_set_heuristics() { + // Issues #33 (year-like int columns) + #34 (conventional choice + // sets). A fixed `--seed` makes the values deterministic; we assert + // membership in the bounded windows / value sets rather than exact + // strings (robust to RNG-internals changes, still proves the + // heuristic fired — the type fallback would produce 9419 / lorem). + let (project, db, _dir) = open_project_db(); + let rt = rt(); + rt.block_on(db.create_table( + "Records".to_string(), + vec![ + ColumnSpec::new("id", Type::Serial), + ColumnSpec::new("birth_year", Type::Int), + ColumnSpec::new("published", Type::Int), + ColumnSpec::new("priority", Type::Text), + ColumnSpec::new("severity", Type::Text), + ColumnSpec::new("rating", Type::Int), + ], + vec!["id".to_string()], + None, + )) + .expect("create Records"); + + rt.block_on(db.seed("Records".into(), None, Some(30), Vec::new(), Some(99), Some("seed Records 30".into()))) + .expect("seed succeeds"); + let csv = read_csv(&project, "Records").expect("Records CSV exists"); + + for y in column_values(&csv, "birth_year") { + let n: i32 = y.parse().expect("birth_year is an int"); + assert!((1945..=2007).contains(&n), "birth_year {n} must be a plausible birth year"); + } + for y in column_values(&csv, "published") { + let n: i32 = y.parse().expect("published is an int"); + assert!((1950..=2025).contains(&n), "published {n} must be a plausible recent year"); + } + for p in
column_values(&csv, "priority") { + assert!(["low", "medium", "high"].contains(&p.as_str()), "priority `{p}` must be low/medium/high"); + } + for s in column_values(&csv, "severity") { + assert!( + ["low", "medium", "high", "critical"].contains(&s.as_str()), + "severity `{s}` must be low/medium/high/critical", + ); + } + for r in column_values(&csv, "rating") { + let n: i32 = r.parse().expect("rating is an int"); + assert!((1..=5).contains(&n), "rating {n} must be 1–5"); + } +} + +#[test] +fn seed_column_fill_uses_choice_set_heuristic() { + // The `seed <table>.<column>` column-fill path (an UPDATE over + // existing rows) shares `choose_generator`, so issue #34's value + // sets apply there too. Insert rows with `priority` left NULL, then + // fill just that column and confirm it collapses to the set. + let (project, db, _dir) = open_project_db(); + let rt = rt(); + rt.block_on(db.create_table( + "Tasks".to_string(), + vec![ + ColumnSpec::new("id", Type::Serial), + ColumnSpec::new("title", Type::Text), + ColumnSpec::new("priority", Type::Text), + ], + vec!["id".to_string()], + None, + )) + .expect("create Tasks"); + for t in ["a", "b", "c", "d"] { + rt.block_on(db.insert( + "Tasks".to_string(), + Some(vec!["title".to_string()]), + vec![Value::Text(t.to_string())], + None, + )) + .expect("insert row"); + } + + rt.block_on(db.seed( + "Tasks".into(), + Some("priority".into()), + None, + Vec::new(), + Some(5), + Some("seed Tasks.priority".into()), + )) + .expect("column-fill priority"); + + let csv = read_csv(&project, "Tasks").expect("Tasks CSV"); + let priorities = column_values(&csv, "priority"); + assert_eq!(priorities.len(), 4, "every existing row is filled:\n{csv}"); + for p in priorities { + assert!( + ["low", "medium", "high"].contains(&p.as_str()), + "column-fill priority `{p}` must be low/medium/high", + ); + } +} + #[test] fn seed_count_defaults_to_twenty() { let (project, db, _dir) = open_project_db();