feat(seed): year-as-int + conventional choice-set heuristics (#33, #34)

Two additive D7 catalogue rules, surfaced while writing the website seed
docs. No change to the type fallback, executor, or grammar.

#33 — year-like int columns. `published`/`birth_year` were just `int`, so
they fell to the unbounded int path and produced nonsense (`9419`). Add an
int-gated year rule (after the quantity rule, so `year_count` stays a
count): `year`/`*_year`/`published`/`founded` -> a bounded 1950-2025 year
(new `YearRecent`), or the dob-style birth window 1945-2007 for
`birth`/`born`/`dob` (new `YearBirth`). Plain int; not added to the D9
named-generator vocabulary.

#34 — conventional choice sets. A few enum-ish names have a near-canonical
small set that reads far better than lorem text. Add a type-gated PickFrom
lookup (reusing the existing generator): priority/prio, severity,
rating/stars. `status` is deliberately excluded (values too
domain-specific) and keeps the D12 advisory; a user IN-CHECK still wins.
`priority` leaves ENUM_TOKENS.

ADR-0048 Amendment 1; +8 tests (incl. a column-fill integration test that
also closes a pre-existing gap on that path).
This commit is contained in:
claude@clouddev1
2026-06-12 20:36:20 +00:00
parent fde50ce3bf
commit deb0948d6c
7 changed files with 374 additions and 4 deletions
+52
View File
@@ -31,6 +31,16 @@ const RECENT_WINDOW_DAYS: i64 = 3 * 365;
const ADULT_MIN_DAYS: i64 = 18 * 365;
const ADULT_MAX_DAYS: i64 = 80 * 365;
/// Year windows for the `int`-typed year heuristics (issue #33),
/// expressed relative to [`REF_YEAR`] so they advance with releases —
/// the year siblings of the `DateRecent` / `DateAdult` windows above.
/// `YearRecent` spans ~75 years (1950–2025 at REF_YEAR=2025), wide
/// enough for `published` / `founded` / `release_year`; `YearBirth`
/// mirrors the adult birth window (1945–2007 at REF_YEAR=2025).
///
/// Number of years before `REF_YEAR` at which the `YearRecent` window
/// opens; the window closes at `REF_YEAR` itself.
const YEAR_RECENT_SPAN: i32 = 75;
/// Youngest age for `YearBirth`: the latest birth year is
/// `REF_YEAR - YEAR_BIRTH_MIN_AGE`.
const YEAR_BIRTH_MIN_AGE: i32 = 18;
/// Oldest age for `YearBirth`: the earliest birth year is
/// `REF_YEAR - YEAR_BIRTH_MAX_AGE`.
const YEAR_BIRTH_MAX_AGE: i32 = 80;
/// Produce one value for `generator` against destination type `ty`.
#[must_use]
pub fn generate_value(generator: &Generator, ty: Type, rng: &mut SeedRng) -> Value {
@@ -71,6 +81,13 @@ pub fn generate_value(generator: &Generator, ty: Type, rng: &mut SeedRng) -> Val
Generator::CurrencyAmount => currency_amount(ty, rng),
Generator::Age => Value::Number(rng.random_range(18..=80).to_string()),
Generator::SmallInt => Value::Number(rng.random_range(1..=100).to_string()),
Generator::YearRecent => {
Value::Number(rng.random_range((REF_YEAR - YEAR_RECENT_SPAN)..=REF_YEAR).to_string())
}
Generator::YearBirth => Value::Number(
rng.random_range((REF_YEAR - YEAR_BIRTH_MAX_AGE)..=(REF_YEAR - YEAR_BIRTH_MIN_AGE))
.to_string(),
),
Generator::DateRecent => Value::Text(format_date(random_past_date(rng, 0, RECENT_WINDOW_DAYS))),
Generator::DateAdult => {
Value::Text(format_date(random_past_date(rng, ADULT_MIN_DAYS, ADULT_MAX_DAYS)))
@@ -489,6 +506,41 @@ mod tests {
assert!(matches!(v, Value::Number(_)), "numeric pick should be a Number: {v:?}");
}
#[test]
fn year_generators_stay_within_their_bounded_windows() {
    // Issue #33: each year generator must yield a plain number that
    // lands inside its bounded window, rather than an arbitrary `int`.
    // The literal bounds below are the windows as they stand at
    // REF_YEAR=2025. One RNG is shared, so `YearRecent` is sampled
    // first and `YearBirth` second.
    let mut rng = make_rng(Some(7));
    let cases = [
        (Generator::YearRecent, "YearRecent", 1950_i32, 2025_i32),
        (Generator::YearBirth, "YearBirth", 1945_i32, 2007_i32),
    ];
    for (generator, label, lo, hi) in &cases {
        for _ in 0..300 {
            let year: i32 = match generate_value(generator, Type::Int, &mut rng) {
                Value::Number(text) => text.parse().unwrap(),
                _ => panic!("{label} must be a Number"),
            };
            assert!((*lo..=*hi).contains(&year), "{label} {year} out of [{lo},{hi}]");
        }
    }
}
#[test]
fn year_generators_are_deterministic_for_a_fixed_seed() {
    // Two independent runs with the same seed must agree, for both
    // year generators (recent first, then birth).
    for generator in [Generator::YearRecent, Generator::YearBirth] {
        let first = gen_once(&generator, Type::Int, 42);
        let second = gen_once(&generator, Type::Int, 42);
        assert_eq!(first, second);
    }
}
#[test]
fn int_range_stays_within_inclusive_bounds() {
let g = Generator::Range { low: "10".into(), high: "20".into() };
+128 -2
View File
@@ -57,9 +57,14 @@ fn choose_generator_inner(table: &str, col: &ColumnSpec) -> Generator {
/// the post-seed advisory; such columns still receive generic text.
#[must_use]
pub fn is_enum_ish(name: &str) -> bool {
// `priority` is intentionally absent: issue #34 gave it a built-in
// value set (low/medium/high · 1/2/3), so it is no longer "filled
// generically" and must not trigger the D13 advisory. `severity` /
// `rating` / `stars` were never here. `status` stays — it is
// deliberately left to the advisory (no built-in set).
const ENUM_TOKENS: &[&str] = &[
"role", "status", "state", "type", "kind", "category", "level",
"tier", "stage", "priority", "gender",
"tier", "stage", "gender",
];
let toks = tokens(name);
toks.iter().any(|t| ENUM_TOKENS.contains(&t.as_str()))
@@ -150,6 +155,49 @@ fn match_name_generator(table: &str, toks: &[String], ty: Type) -> Option<Genera
if numeric && has_any(toks, &["quantity", "qty", "stock", "count"]) {
return Some(Generator::SmallInt);
}
// — Year-as-int (issue #33) — bounded plausible years so the `int`
// type fallback (D8) can't emit nonsense like `9419`. `int`-gated
// (years are whole numbers) and placed *after* the quantity rule so
// `year_count` (a count of years) stays a `SmallInt`. `birth`/`born`/
// `dob` + year picks the birth window — the int sibling of the
// `dob → DateAdult` rule above — otherwise a recent window covers
// `year` / `*_year` / `published` / `founded`.
if matches!(ty, Type::Int)
&& (has_token(toks, "year") || has_any(toks, &["published", "founded"]))
{
return Some(if has_any(toks, &["birth", "born", "dob"]) {
Generator::YearBirth
} else {
Generator::YearRecent
});
}
// — Conventional choice sets (issue #34) — a few enum-ish names have
// a near-canonical small value set that reads far better than lorem
// text. Type-gated; reuses `PickFrom`. Names *without* a canonical
// set (`status`, `role`, `type`, …) stay unmatched → generic text +
// the D12/D13 advisory. `status` is deliberately excluded: its real
// values are too domain-specific (user-confirmed, issue #34). A
// user-declared `IN`-CHECK still wins — it is resolved before this.
if has_any(toks, &["priority", "prio"]) {
if text {
return Some(pick_from(&["low", "medium", "high"]));
}
if matches!(ty, Type::Int) {
return Some(pick_from(&["1", "2", "3"]));
}
}
if has_token(toks, "severity") {
if text {
return Some(pick_from(&["low", "medium", "high", "critical"]));
}
if matches!(ty, Type::Int) {
return Some(pick_from(&["1", "2", "3", "4"]));
}
}
if matches!(ty, Type::Int) && has_any(toks, &["rating", "stars"]) {
return Some(pick_from(&["1", "2", "3", "4", "5"]));
}
// — Temporal (bounded, D8) —
if matches!(ty, Type::Date) && has_any(toks, &["dob", "birthday", "birthdate"]) {
@@ -267,6 +315,14 @@ fn tokens(name: &str) -> Vec<String> {
out
}
/// Build a `PickFrom` generator out of string-literal choices (used by
/// issue #34's conventional choice sets). Every entry is stored as an
/// owned string; per the original note, `literal_to_value` interprets
/// each entry by the destination type at generation time (an `int`
/// column turns `"1"` into a number).
fn pick_from(values: &[&str]) -> Generator {
    let choices = values.iter().copied().map(str::to_owned).collect();
    Generator::PickFrom(choices)
}
/// Whether `t` is exactly equal to one of the tokens in `toks`
/// (whole-token, case-sensitive comparison; an empty slice never matches).
fn has_token(toks: &[String], t: &str) -> bool {
    toks.iter().map(String::as_str).any(|candidate| candidate == t)
}
@@ -412,11 +468,81 @@ mod tests {
assert!(is_enum_ish("status"));
assert!(is_enum_ish("role"));
assert!(is_enum_ish("order_state"));
assert!(is_enum_ish("priority"));
// Issue #34: `priority` gained a built-in value set, so it is no
// longer advised (it is no longer "filled generically").
assert!(!is_enum_ish("priority"));
assert!(!is_enum_ish("severity"));
assert!(!is_enum_ish("rating"));
assert!(!is_enum_ish("email"));
assert!(!is_enum_ish("first_name"));
}
#[test]
fn year_like_int_columns_map_to_bounded_years() {
    // Issue #33: the year heuristics only apply to `int` columns.
    let int_choice = |table: &str, column: &str| choose(table, column, Type::Int);

    // A year combined with `birth` / `born` selects the birth window.
    assert_eq!(int_choice("authors", "birth_year"), Generator::YearBirth);
    assert_eq!(int_choice("authors", "birthYear"), Generator::YearBirth);
    assert_eq!(int_choice("u", "year_born"), Generator::YearBirth);

    // Every other year-like name gets the recent window.
    assert_eq!(int_choice("books", "year"), Generator::YearRecent);
    assert_eq!(int_choice("films", "release_year"), Generator::YearRecent);
    assert_eq!(int_choice("books", "published"), Generator::YearRecent);
    assert_eq!(int_choice("companies", "founded"), Generator::YearRecent);

    // The rule is type-gated: a text `year` column is not a bounded year.
    assert_eq!(choose("books", "year", Type::Text), Generator::Generic);

    // `year_count` counts years, so the earlier quantity rule claims it.
    assert_eq!(int_choice("t", "year_count"), Generator::SmallInt);
}
#[test]
fn conventional_choice_sets_map_to_pick_from() {
    // Issue #34: built-in value sets, chosen by column name and type.

    /// Expected `PickFrom` generator for the given literal choices.
    fn set_of(values: &[&str]) -> Generator {
        Generator::PickFrom(values.iter().map(|v| (*v).to_owned()).collect())
    }

    // priority / prio: three levels, as words for text and digits for int.
    assert_eq!(choose("tickets", "priority", Type::Text), set_of(&["low", "medium", "high"]));
    assert_eq!(choose("tickets", "prio", Type::Int), set_of(&["1", "2", "3"]));

    // severity: four levels, again words for text and digits for int.
    assert_eq!(
        choose("bugs", "severity", Type::Text),
        set_of(&["low", "medium", "high", "critical"]),
    );
    assert_eq!(choose("bugs", "severity", Type::Int), set_of(&["1", "2", "3", "4"]));

    // rating / stars: a five-point scale on int columns.
    let five_point = set_of(&["1", "2", "3", "4", "5"]);
    assert_eq!(choose("reviews", "rating", Type::Int), five_point);
    assert_eq!(choose("reviews", "stars", Type::Int), five_point);
}
#[test]
fn status_is_left_to_the_advisory_not_given_a_set() {
    // Issue #34 (user-confirmed): `status` gets no built-in set and keeps
    // the D12 "don't guess" stance. It is still filled with generic text
    // and is still flagged as enum-ish for the advisory.
    let chosen = choose("orders", "status", Type::Text);
    assert_eq!(chosen, Generator::Generic);

    let still_advised = is_enum_ish("status");
    assert!(still_advised);
}
#[test]
fn a_declared_in_check_still_wins_over_a_built_in_set() {
    // A user-declared `IN` CHECK is explicit intent, so it must take
    // precedence over the issue-#34 default set for the same column name.
    let spec = {
        let mut column = ColumnSpec::plain("priority", Type::Text);
        column.check_in_values = Some(vec!["p1".into(), "p2".into()]);
        column
    };
    let expected = Generator::PickFrom(vec!["p1".into(), "p2".into()]);
    assert_eq!(choose_generator("tickets", &spec), expected);
}
#[test]
fn enum_ish_columns_fall_through_to_generic() {
// No special generator — generic text + the advisory flags them.
+7
View File
@@ -149,6 +149,13 @@ pub enum Generator {
Age,
/// A small positive integer (quantities, counts).
SmallInt,
/// A plausible recent year as a plain `int` — `year` / `*_year` /
/// `published` / `founded` columns (issue #33). Bounded window so the
/// type-based `int` fallback can't emit nonsense like `9419`.
YearRecent,
/// A plausible birth year as a plain `int` — `birth_year` and kin
/// (issue #33), the year-typed sibling of [`Self::DateAdult`].
YearBirth,
// — Temporal (bounded windows, D8) —
/// A date within the last few years.
DateRecent,