feat(seed): fake-data generation library + fake dependency (ADR-0048 P1.1)

The pure generation half of `seed` — no command wiring yet:
- src/seed/: ColumnSpec + Generator model and a seeded StdRng; the
  type-gated name-heuristic catalogue (D7) with documented
  false-positive guards; table-context name disambiguation (D11);
  identifier (D10) and enum-ish (D12) detection; per-type + bounded-date
  generators (D8); the hand-rolled product generator (D9); and PickFrom
  for IN-CHECK / enum lists.
- Adds the `fake` crate (v5, default features). Verified: single rand
  0.10.1 (no duplication), determinism via one seeded StdRng driving
  both fake and the hand-rolled generators, security-clean across
  osv/grype/trivy.
- ADR-0048 D3 updated to record the dependency verification.

32 Tier-1 tests (exact-value via fixed --seed); 1673 lib tests pass,
clippy all-targets clean.
This commit is contained in:
claude@clouddev1
2026-06-11 15:35:17 +00:00
parent 0af7f56821
commit 202e25a94f
7 changed files with 1072 additions and 16 deletions
+383
View File
@@ -0,0 +1,383 @@
//! Value production: turn a [`Generator`] + a seeded RNG into a
//! [`Value`] (ADR-0048 D8/D9). Realistic generators come from the
//! `fake` crate (English locale); `product` is hand-rolled (D9, no
//! commerce module exists); dates are generated against a **fixed
//! reference epoch** so a `--seed` run is fully reproducible without
//! depending on the wall clock (D8 bounded windows).
//!
//! The stateful markers ([`Generator::IdentitySequential`],
//! [`Generator::ForeignKeySample`]) are resolved by the executor with
//! database context; if one reaches here un-intercepted it falls back
//! to type-based generation rather than panicking.
use chrono::{Datelike, NaiveDate};
use fake::Fake;
use rand::RngExt;
use crate::dsl::types::Type;
use crate::dsl::value::Value;
use crate::seed::{Generator, SeedRng};
/// Fixed anchor for bounded date/datetime windows. Using a constant
/// (rather than `now()`) keeps `--seed` output reproducible across days
/// and makes tests deterministic. It advances with releases.
const REF_YEAR: i32 = 2025;
/// Month of the fixed reference date (fed to `NaiveDate::from_ymd_opt`).
const REF_MONTH: u32 = 6;
/// Day-of-month of the fixed reference date.
const REF_DAY: u32 = 1;
/// `~3 years` window for "recent" dates, in days.
const RECENT_WINDOW_DAYS: i64 = 3 * 365;
/// Adult birth window (≈18–80 years ago), in days. This is the near
/// bound: the fewest days before the reference date. Years are counted
/// as 365 days, so the bounds are approximate (leap days are ignored).
const ADULT_MIN_DAYS: i64 = 18 * 365;
/// Far bound of the adult birth window: the most days before the
/// reference date.
const ADULT_MAX_DAYS: i64 = 80 * 365;
/// Produce one value for `generator` against destination type `ty`.
#[must_use]
pub fn generate_value(generator: &Generator, ty: Type, rng: &mut SeedRng) -> Value {
use fake::faker::address::en as addr;
use fake::faker::company::en as company;
use fake::faker::internet::en as net;
use fake::faker::job::en as job;
use fake::faker::lorem::en as lorem;
use fake::faker::name::en as name;
use fake::faker::phone_number::en as phone;
match generator {
Generator::FirstName => Value::Text(name::FirstName().fake_with_rng(rng)),
Generator::LastName => Value::Text(name::LastName().fake_with_rng(rng)),
Generator::FullName => Value::Text(name::Name().fake_with_rng(rng)),
Generator::Email => Value::Text(net::FreeEmail().fake_with_rng(rng)),
Generator::Username => Value::Text(net::Username().fake_with_rng(rng)),
Generator::Password => Value::Text(net::Password(8..16).fake_with_rng(rng)),
Generator::Phone => Value::Text(phone::PhoneNumber().fake_with_rng(rng)),
Generator::City => Value::Text(addr::CityName().fake_with_rng(rng)),
Generator::Country => Value::Text(addr::CountryName().fake_with_rng(rng)),
Generator::StateName => Value::Text(addr::StateName().fake_with_rng(rng)),
Generator::Street => Value::Text(addr::StreetName().fake_with_rng(rng)),
Generator::ZipCode => Value::Text(addr::ZipCode().fake_with_rng(rng)),
Generator::Company => Value::Text(company::CompanyName().fake_with_rng(rng)),
Generator::JobTitle => Value::Text(job::Title().fake_with_rng(rng)),
Generator::ProductName => Value::Text(product_name(rng)),
Generator::Sentence => Value::Text(lorem::Sentence(5..12).fake_with_rng(rng)),
Generator::Paragraph => Value::Text(lorem::Paragraph(2..4).fake_with_rng(rng)),
Generator::Url => {
let word: String = lorem::Word().fake_with_rng(rng);
let suffix: String = net::DomainSuffix().fake_with_rng(rng);
Value::Text(format!("https://{word}.{suffix}"))
}
// Hand-rolled — `fake`'s color module is feature-gated (it pulls
// an extra crate); a hex colour is trivial from the RNG.
Generator::HexColor => Value::Text(format!("#{:06X}", rng.random_range(0..0x0100_0000))),
Generator::CurrencyAmount => currency_amount(ty, rng),
Generator::Age => Value::Number(rng.random_range(18..=80).to_string()),
Generator::SmallInt => Value::Number(rng.random_range(1..=100).to_string()),
Generator::DateRecent => Value::Text(format_date(random_past_date(rng, 0, RECENT_WINDOW_DAYS))),
Generator::DateAdult => {
Value::Text(format_date(random_past_date(rng, ADULT_MIN_DAYS, ADULT_MAX_DAYS)))
}
Generator::DateTimeRecent => Value::Text(random_recent_datetime(rng)),
Generator::Boolean => Value::Bool(rng.random_range(0..2) == 1),
Generator::PickFrom(values) if !values.is_empty() => {
let chosen: &String = pick(rng, values);
literal_to_value(chosen, ty)
}
// Un-intercepted markers + an empty pick list → type-based.
Generator::PickFrom(_)
| Generator::IdentitySequential
| Generator::ForeignKeySample
| Generator::Generic => generic_for_type(ty, rng),
}
}
/// Type-based fallback generation (D8).
///
/// Produces a value shaped for `ty` when no name heuristic applies:
/// two-to-three lorem words for text, an integer in `1..=10_000` for
/// `int`/`serial`, two-decimal numbers for `real`/`decimal`, a coin flip
/// for `bool`, and a recent date / datetime. Only `blob` yields
/// [`Value::Null`]; `serial` and `shortid` still get a concrete value
/// here even though the executor is expected to handle them itself
/// (autogen / block guard).
///
/// NOTE(review): the `shortid` arm calls `crate::dsl::shortid::generate()`
/// without passing `rng`, so it presumably is not driven by `--seed` —
/// confirm whether that is acceptable for reproducible runs.
fn generic_for_type(ty: Type, rng: &mut SeedRng) -> Value {
    match ty {
        Type::Text => {
            let words: Vec<String> = fake::faker::lorem::en::Words(2..4).fake_with_rng(rng);
            Value::Text(words.join(" "))
        }
        Type::ShortId => Value::Text(crate::dsl::shortid::generate()),
        Type::Int | Type::Serial => Value::Number(rng.random_range(1..=10_000).to_string()),
        Type::Real => {
            // Draw whole hundredths, then scale, so the value has at
            // most two meaningful decimals.
            let hundredths: i32 = rng.random_range(0..100_000);
            let scaled = f64::from(hundredths) / 100.0;
            Value::Number(format!("{scaled:.2}"))
        }
        Type::Decimal => {
            // Integer and fractional parts are drawn separately and
            // joined textually, avoiding any float round-trip.
            let whole = rng.random_range(0..10_000);
            let fraction = rng.random_range(0..100);
            Value::Number(format!("{whole}.{fraction:02}"))
        }
        Type::Bool => Value::Bool(rng.random_range(0..2) == 1),
        Type::Date => {
            let date = random_past_date(rng, 0, RECENT_WINDOW_DAYS);
            Value::Text(format_date(date))
        }
        Type::DateTime => Value::Text(random_recent_datetime(rng)),
        Type::Blob => Value::Null,
    }
}
/// Wrap a fixed-list literal as the right `Value` shape for `ty` (used
/// by `PickFrom` — enum / `IN`-CHECK values).
///
/// Numeric types keep the literal verbatim as a number; `bool` is true
/// only for `"1"` or a case-insensitive `"true"`; every other type
/// carries the literal through as text.
fn literal_to_value(s: &str, ty: Type) -> Value {
    match ty {
        Type::Bool => Value::Bool(s.eq_ignore_ascii_case("true") || s == "1"),
        Type::Int | Type::Serial | Type::Real | Type::Decimal => Value::Number(s.to_owned()),
        Type::Text | Type::ShortId | Type::Date | Type::DateTime | Type::Blob => {
            Value::Text(s.to_owned())
        }
    }
}
/// A money-shaped amount: whole for `int`/`serial`, two-decimal for the
/// fractional numeric types.
///
/// The whole part is always drawn first from `1..=1_000`; `real` and
/// `decimal` then draw a second value in `0..100` for the cents.
fn currency_amount(ty: Type, rng: &mut SeedRng) -> Value {
    let units: i32 = rng.random_range(1..=1_000);
    let rendered = if matches!(ty, Type::Real | Type::Decimal) {
        let cents: i32 = rng.random_range(0..100);
        format!("{units}.{cents:02}")
    } else {
        // int / serial / anything else numeric → whole amount.
        units.to_string()
    };
    Value::Number(rendered)
}
// — the hand-rolled `product` generator (D9) —
/// Candidate first words of a generated product name (capitalised
/// adjectives).
const PRODUCT_ADJECTIVES: &[&str] = &[
"Sleek", "Rustic", "Ergonomic", "Handcrafted", "Refined", "Modern",
"Vintage", "Compact", "Premium", "Lightweight", "Durable", "Elegant",
"Sturdy", "Smooth", "Gorgeous", "Intelligent", "Practical", "Awesome",
"Incredible", "Recycled",
];
/// Candidate middle words of a generated product name (capitalised
/// materials).
const PRODUCT_MATERIALS: &[&str] = &[
"Wooden", "Copper", "Granite", "Cotton", "Steel", "Leather", "Bamboo",
"Plastic", "Ceramic", "Glass", "Concrete", "Rubber", "Bronze", "Marble",
"Linen", "Silk", "Aluminum", "Wool", "Gold", "Carbon",
];
/// Candidate last words of a generated product name (capitalised
/// nouns).
const PRODUCT_NOUNS: &[&str] = &[
"Chair", "Lamp", "Table", "Bottle", "Backpack", "Keyboard", "Mug",
"Shoes", "Jacket", "Watch", "Wallet", "Bench", "Hat", "Gloves",
"Towel", "Ball", "Bike", "Knife", "Pillow", "Blanket",
];
/// Build a three-word product name — adjective, material, noun — with
/// one RNG draw per word, in that order.
fn product_name(rng: &mut SeedRng) -> String {
    let adjective = pick(rng, PRODUCT_ADJECTIVES);
    let material = pick(rng, PRODUCT_MATERIALS);
    let noun = pick(rng, PRODUCT_NOUNS);
    format!("{adjective} {material} {noun}")
}
// — bounded dates (D8) —
/// The fixed reference date built from `REF_YEAR` / `REF_MONTH` /
/// `REF_DAY`.
///
/// # Panics
///
/// Panics if the constants do not form a valid calendar date.
const fn reference_date() -> NaiveDate {
    if let Some(date) = NaiveDate::from_ymd_opt(REF_YEAR, REF_MONTH, REF_DAY) {
        date
    } else {
        panic!("reference date constants must be valid")
    }
}
/// A date between `min_days_ago` and `max_days_ago` before the
/// reference epoch (inclusive).
///
/// An offset that does not fit in `i32` is treated as zero, and a
/// target day chrono cannot represent falls back to the reference date
/// itself.
fn random_past_date(rng: &mut SeedRng, min_days_ago: i64, max_days_ago: i64) -> NaiveDate {
    let offset = rng.random_range(min_days_ago..=max_days_ago);
    let offset = i32::try_from(offset).unwrap_or(0);
    let anchor = reference_date();
    let target_day = anchor.num_days_from_ce() - offset;
    NaiveDate::from_num_days_from_ce_opt(target_day).unwrap_or(anchor)
}
/// Render `date` as `YYYY-MM-DD`.
fn format_date(date: NaiveDate) -> String {
    const ISO_DATE: &str = "%Y-%m-%d";
    date.format(ISO_DATE).to_string()
}
/// A recent datetime: a recent date plus a random time-of-day, rendered
/// as `YYYY-MM-DDTHH:MM:SS`.
///
/// Draw order is date, hour, minute, second.
fn random_recent_datetime(rng: &mut SeedRng) -> String {
    let day = format_date(random_past_date(rng, 0, RECENT_WINDOW_DAYS));
    let hour = rng.random_range(0..24);
    let minute = rng.random_range(0..60);
    let second = rng.random_range(0..60);
    format!("{day}T{hour:02}:{minute:02}:{second:02}")
}
/// Pick a uniformly random element from a non-empty slice.
///
/// Callers must guarantee `items` is non-empty; the index is drawn from
/// `0..items.len()`.
fn pick<'a, T>(rng: &mut SeedRng, items: &'a [T]) -> &'a T {
    let index = rng.random_range(0..items.len());
    &items[index]
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::seed::make_rng;
    use pretty_assertions::assert_eq;

    /// Generate one value from a fresh RNG seeded with `seed`.
    fn gen_once(generator: &Generator, ty: Type, seed: u64) -> Value {
        let mut rng = make_rng(Some(seed));
        generate_value(generator, ty, &mut rng)
    }

    /// The date `days` days before the fixed reference date. Uses a
    /// checked conversion rather than an `as` cast so a negative window
    /// constant fails loudly instead of wrapping.
    fn days_before_reference(days: i64) -> NaiveDate {
        let days = u64::try_from(days).expect("window constants are non-negative");
        reference_date()
            .checked_sub_days(chrono::Days::new(days))
            .expect("window fits in chrono's date range")
    }

    #[test]
    fn generation_is_deterministic_for_a_fixed_seed() {
        for generator in [
            Generator::FullName,
            Generator::Email,
            Generator::ProductName,
            Generator::DateRecent,
            Generator::CurrencyAmount,
        ] {
            let a = gen_once(&generator, Type::Text, 7);
            let b = gen_once(&generator, Type::Text, 7);
            assert_eq!(a, b, "{generator:?} must reproduce for a fixed seed");
        }
    }

    #[test]
    fn text_generators_produce_nonempty_text() {
        for generator in [
            Generator::FirstName,
            Generator::LastName,
            Generator::FullName,
            Generator::Email,
            Generator::Username,
            Generator::Company,
            Generator::City,
            Generator::ProductName,
        ] {
            let v = gen_once(&generator, Type::Text, 3);
            match v {
                Value::Text(s) => {
                    assert!(!s.trim().is_empty(), "{generator:?} produced empty text");
                }
                other => panic!("{generator:?} produced non-text {other:?}"),
            }
        }
    }

    #[test]
    fn email_looks_like_an_email() {
        let v = gen_once(&Generator::Email, Type::Text, 11);
        let Value::Text(s) = v else { panic!("not text") };
        assert!(s.contains('@'), "email should contain @: {s}");
    }

    #[test]
    fn product_name_is_three_capitalised_words() {
        let v = gen_once(&Generator::ProductName, Type::Text, 99);
        let Value::Text(s) = v else { panic!("not text") };
        let words: Vec<&str> = s.split(' ').collect();
        assert_eq!(words.len(), 3, "product name should be 3 words: {s}");
        for w in words {
            assert!(w.chars().next().unwrap().is_ascii_uppercase(), "word `{w}` not capitalised");
        }
    }

    #[test]
    fn hex_color_is_hash_plus_six_hex_digits() {
        let mut rng = make_rng(Some(12));
        for _ in 0..50 {
            let Value::Text(s) = generate_value(&Generator::HexColor, Type::Text, &mut rng) else {
                panic!("not text")
            };
            let digits = s
                .strip_prefix('#')
                .unwrap_or_else(|| panic!("missing # prefix: {s}"));
            assert_eq!(digits.len(), 6, "hex colour should have 6 digits: {s}");
            assert!(digits.chars().all(|c| c.is_ascii_hexdigit()), "non-hex digit in {s}");
        }
    }

    #[test]
    fn recent_dates_fall_within_the_bounded_window() {
        let mut rng = make_rng(Some(1));
        let earliest = days_before_reference(RECENT_WINDOW_DAYS);
        let latest = reference_date();
        for _ in 0..200 {
            let v = generate_value(&Generator::DateRecent, Type::Date, &mut rng);
            let Value::Text(s) = v else { panic!("date not text") };
            let d = NaiveDate::parse_from_str(&s, "%Y-%m-%d").expect("valid ISO date");
            assert!(d >= earliest && d <= latest, "date {d} outside recent window");
        }
    }

    #[test]
    fn dob_dates_fall_within_the_adult_window() {
        let mut rng = make_rng(Some(2));
        let earliest = days_before_reference(ADULT_MAX_DAYS);
        let latest = days_before_reference(ADULT_MIN_DAYS);
        for _ in 0..200 {
            let v = generate_value(&Generator::DateAdult, Type::Date, &mut rng);
            let Value::Text(s) = v else { panic!("date not text") };
            let d = NaiveDate::parse_from_str(&s, "%Y-%m-%d").expect("valid ISO date");
            assert!(d >= earliest && d <= latest, "dob {d} outside adult window");
        }
    }

    #[test]
    fn datetime_is_iso_shaped() {
        let v = gen_once(&Generator::DateTimeRecent, Type::DateTime, 5);
        let Value::Text(s) = v else { panic!("not text") };
        assert!(s.contains('T'), "datetime needs a T separator: {s}");
        // Parses as a naive datetime.
        chrono::NaiveDateTime::parse_from_str(&s, "%Y-%m-%dT%H:%M:%S")
            .unwrap_or_else(|e| panic!("invalid datetime {s}: {e}"));
    }

    #[test]
    fn currency_is_whole_for_int_and_fractional_for_decimal() {
        let Value::Number(int_amt) = gen_once(&Generator::CurrencyAmount, Type::Int, 4) else {
            panic!("not a number")
        };
        assert!(!int_amt.contains('.'), "int currency should be whole: {int_amt}");
        let Value::Number(dec_amt) = gen_once(&Generator::CurrencyAmount, Type::Decimal, 4) else {
            panic!("not a number")
        };
        assert!(dec_amt.contains('.'), "decimal currency should have cents: {dec_amt}");
    }

    #[test]
    fn age_is_in_human_range() {
        let mut rng = make_rng(Some(8));
        for _ in 0..100 {
            let Value::Number(a) = generate_value(&Generator::Age, Type::Int, &mut rng) else {
                panic!("age not a number")
            };
            let n: i64 = a.parse().unwrap();
            assert!((18..=80).contains(&n), "age {n} out of range");
        }
    }

    #[test]
    fn pick_from_chooses_a_listed_value() {
        let generator = Generator::PickFrom(vec!["active".into(), "closed".into()]);
        let mut rng = make_rng(Some(6));
        for _ in 0..50 {
            let Value::Text(s) = generate_value(&generator, Type::Text, &mut rng) else {
                panic!("not text")
            };
            assert!(matches!(s.as_str(), "active" | "closed"), "unexpected pick {s}");
        }
    }

    #[test]
    fn pick_from_wraps_numeric_values_as_numbers() {
        let generator = Generator::PickFrom(vec!["1".into(), "2".into(), "3".into()]);
        let mut rng = make_rng(Some(6));
        let v = generate_value(&generator, Type::Int, &mut rng);
        assert!(matches!(v, Value::Number(_)), "numeric pick should be a Number: {v:?}");
    }

    #[test]
    fn pick_from_maps_bool_literals() {
        // Single-element lists make the pick deterministic regardless of seed.
        for (literal, expected) in [("TRUE", true), ("true", true), ("1", true), ("0", false), ("no", false)] {
            let generator = Generator::PickFrom(vec![literal.into()]);
            assert_eq!(
                gen_once(&generator, Type::Bool, 6),
                Value::Bool(expected),
                "bool literal `{literal}` mapped incorrectly"
            );
        }
    }

    #[test]
    fn empty_pick_list_falls_back_to_type_based_generation() {
        // An empty list must not reach `pick` (which needs a non-empty slice).
        let generator = Generator::PickFrom(Vec::new());
        assert!(matches!(gen_once(&generator, Type::Int, 1), Value::Number(_)));
        assert!(matches!(gen_once(&generator, Type::Text, 1), Value::Text(_)));
    }

    #[test]
    fn markers_fall_back_to_type_based_generation() {
        // An un-intercepted marker must not panic; it generates by type.
        let v = gen_once(&Generator::IdentitySequential, Type::Text, 1);
        assert!(matches!(v, Value::Text(_)));
        let v = gen_once(&Generator::ForeignKeySample, Type::Int, 1);
        assert!(matches!(v, Value::Number(_)));
    }

    #[test]
    fn generic_fallback_matches_each_type() {
        let mut rng = make_rng(Some(0));
        assert!(matches!(generate_value(&Generator::Generic, Type::Text, &mut rng), Value::Text(_)));
        assert!(matches!(generate_value(&Generator::Generic, Type::Int, &mut rng), Value::Number(_)));
        assert!(matches!(generate_value(&Generator::Generic, Type::Bool, &mut rng), Value::Bool(_)));
        assert!(matches!(generate_value(&Generator::Generic, Type::Blob, &mut rng), Value::Null));
        // shortid fallback is a valid base58 id.
        let Value::Text(sid) = generate_value(&Generator::Generic, Type::ShortId, &mut rng) else {
            panic!("shortid not text")
        };
        assert!(crate::dsl::shortid::validate(&sid).is_ok(), "invalid shortid {sid}");
        // The remaining types: numeric shapes are Numbers, temporal shapes are Text.
        for ty in [Type::Serial, Type::Real, Type::Decimal] {
            assert!(matches!(generate_value(&Generator::Generic, ty, &mut rng), Value::Number(_)));
        }
        for ty in [Type::Date, Type::DateTime] {
            assert!(matches!(generate_value(&Generator::Generic, ty, &mut rng), Value::Text(_)));
        }
    }
}