feat(seed): fake-data generation library + fake dependency (ADR-0048 P1.1)
The pure generation half of `seed` — no command wiring yet:

- src/seed/: ColumnSpec + Generator model and a seeded StdRng; the type-gated name-heuristic catalogue (D7) with documented false-positive guards; table-context name disambiguation (D11); identifier (D10) and enum-ish (D12) detection; per-type + bounded-date generators (D8); the hand-rolled product generator (D9); and PickFrom for IN-CHECK / enum lists.
- Adds the `fake` crate (v5, default features). Verified: single rand 0.10.1 (no duplication), determinism via one seeded StdRng driving both fake and the hand-rolled generators, security-clean across osv/grype/trivy.
- ADR-0048 D3 updated to record the dependency verification.

32 Tier-1 tests (exact-value via fixed --seed); 1673 lib tests pass, clippy all-targets clean.
This commit is contained in:
@@ -0,0 +1,383 @@
|
||||
//! Value production: turn a [`Generator`] + a seeded RNG into a
|
||||
//! [`Value`] (ADR-0048 D8/D9). Realistic generators come from the
|
||||
//! `fake` crate (English locale); `product` is hand-rolled (D9, no
|
||||
//! commerce module exists); dates are generated against a **fixed
|
||||
//! reference epoch** so a `--seed` run is fully reproducible without
|
||||
//! depending on the wall clock (D8 bounded windows).
|
||||
//!
|
||||
//! The stateful markers ([`Generator::IdentitySequential`],
|
||||
//! [`Generator::ForeignKeySample`]) are resolved by the executor with
|
||||
//! database context; if one reaches here un-intercepted it falls back
|
||||
//! to type-based generation rather than panicking.
|
||||
|
||||
use chrono::{Datelike, NaiveDate};
|
||||
use fake::Fake;
|
||||
use rand::RngExt;
|
||||
|
||||
use crate::dsl::types::Type;
|
||||
use crate::dsl::value::Value;
|
||||
use crate::seed::{Generator, SeedRng};
|
||||
|
||||
/// Year of the fixed reference date that anchors every bounded
/// date/datetime window. A constant (rather than `now()`) keeps `--seed`
/// output reproducible across days and makes tests deterministic. It
/// advances with releases.
const REF_YEAR: i32 = 2025;
/// Month component of the fixed reference date.
const REF_MONTH: u32 = 6;
/// Day-of-month component of the fixed reference date.
const REF_DAY: u32 = 1;

/// Year length used by the windows below: a flat 365 days, with no
/// allowance for leap days (hence the "~" / "≈" in their descriptions).
const DAYS_PER_YEAR: i64 = 365;

/// `~3 years` window for "recent" dates, in days.
const RECENT_WINDOW_DAYS: i64 = 3 * DAYS_PER_YEAR;
/// Near edge of the adult birth window (≈18 years ago), in days.
const ADULT_MIN_DAYS: i64 = 18 * DAYS_PER_YEAR;
/// Far edge of the adult birth window (≈80 years ago), in days.
const ADULT_MAX_DAYS: i64 = 80 * DAYS_PER_YEAR;
|
||||
|
||||
/// Produce one value for `generator` against destination type `ty`.
|
||||
#[must_use]
|
||||
pub fn generate_value(generator: &Generator, ty: Type, rng: &mut SeedRng) -> Value {
|
||||
use fake::faker::address::en as addr;
|
||||
use fake::faker::company::en as company;
|
||||
use fake::faker::internet::en as net;
|
||||
use fake::faker::job::en as job;
|
||||
use fake::faker::lorem::en as lorem;
|
||||
use fake::faker::name::en as name;
|
||||
use fake::faker::phone_number::en as phone;
|
||||
|
||||
match generator {
|
||||
Generator::FirstName => Value::Text(name::FirstName().fake_with_rng(rng)),
|
||||
Generator::LastName => Value::Text(name::LastName().fake_with_rng(rng)),
|
||||
Generator::FullName => Value::Text(name::Name().fake_with_rng(rng)),
|
||||
Generator::Email => Value::Text(net::FreeEmail().fake_with_rng(rng)),
|
||||
Generator::Username => Value::Text(net::Username().fake_with_rng(rng)),
|
||||
Generator::Password => Value::Text(net::Password(8..16).fake_with_rng(rng)),
|
||||
Generator::Phone => Value::Text(phone::PhoneNumber().fake_with_rng(rng)),
|
||||
Generator::City => Value::Text(addr::CityName().fake_with_rng(rng)),
|
||||
Generator::Country => Value::Text(addr::CountryName().fake_with_rng(rng)),
|
||||
Generator::StateName => Value::Text(addr::StateName().fake_with_rng(rng)),
|
||||
Generator::Street => Value::Text(addr::StreetName().fake_with_rng(rng)),
|
||||
Generator::ZipCode => Value::Text(addr::ZipCode().fake_with_rng(rng)),
|
||||
Generator::Company => Value::Text(company::CompanyName().fake_with_rng(rng)),
|
||||
Generator::JobTitle => Value::Text(job::Title().fake_with_rng(rng)),
|
||||
Generator::ProductName => Value::Text(product_name(rng)),
|
||||
Generator::Sentence => Value::Text(lorem::Sentence(5..12).fake_with_rng(rng)),
|
||||
Generator::Paragraph => Value::Text(lorem::Paragraph(2..4).fake_with_rng(rng)),
|
||||
Generator::Url => {
|
||||
let word: String = lorem::Word().fake_with_rng(rng);
|
||||
let suffix: String = net::DomainSuffix().fake_with_rng(rng);
|
||||
Value::Text(format!("https://{word}.{suffix}"))
|
||||
}
|
||||
// Hand-rolled — `fake`'s color module is feature-gated (it pulls
|
||||
// an extra crate); a hex colour is trivial from the RNG.
|
||||
Generator::HexColor => Value::Text(format!("#{:06X}", rng.random_range(0..0x0100_0000))),
|
||||
Generator::CurrencyAmount => currency_amount(ty, rng),
|
||||
Generator::Age => Value::Number(rng.random_range(18..=80).to_string()),
|
||||
Generator::SmallInt => Value::Number(rng.random_range(1..=100).to_string()),
|
||||
Generator::DateRecent => Value::Text(format_date(random_past_date(rng, 0, RECENT_WINDOW_DAYS))),
|
||||
Generator::DateAdult => {
|
||||
Value::Text(format_date(random_past_date(rng, ADULT_MIN_DAYS, ADULT_MAX_DAYS)))
|
||||
}
|
||||
Generator::DateTimeRecent => Value::Text(random_recent_datetime(rng)),
|
||||
Generator::Boolean => Value::Bool(rng.random_range(0..2) == 1),
|
||||
Generator::PickFrom(values) if !values.is_empty() => {
|
||||
let chosen: &String = pick(rng, values);
|
||||
literal_to_value(chosen, ty)
|
||||
}
|
||||
// Un-intercepted markers + an empty pick list → type-based.
|
||||
Generator::PickFrom(_)
|
||||
| Generator::IdentitySequential
|
||||
| Generator::ForeignKeySample
|
||||
| Generator::Generic => generic_for_type(ty, rng),
|
||||
}
|
||||
}
|
||||
|
||||
/// Type-based fallback generation (D8). Never produces NULL for a
|
||||
/// generatable type; `blob`/`serial`/`shortid` are handled by the
|
||||
/// executor (autogen / block guard) and yield NULL here only as a
|
||||
/// last resort.
|
||||
fn generic_for_type(ty: Type, rng: &mut SeedRng) -> Value {
|
||||
use fake::faker::lorem::en as lorem;
|
||||
match ty {
|
||||
Type::Text => {
|
||||
let words: Vec<String> = lorem::Words(2..4).fake_with_rng(rng);
|
||||
Value::Text(words.join(" "))
|
||||
}
|
||||
Type::ShortId => Value::Text(crate::dsl::shortid::generate()),
|
||||
Type::Int => Value::Number(rng.random_range(1..=10_000).to_string()),
|
||||
Type::Serial => Value::Number(rng.random_range(1..=10_000).to_string()),
|
||||
Type::Real => {
|
||||
let n: f64 = rng.random_range(0..100_000) as f64 / 100.0;
|
||||
Value::Number(format!("{n:.2}"))
|
||||
}
|
||||
Type::Decimal => {
|
||||
let dollars = rng.random_range(0..10_000);
|
||||
let cents = rng.random_range(0..100);
|
||||
Value::Number(format!("{dollars}.{cents:02}"))
|
||||
}
|
||||
Type::Bool => Value::Bool(rng.random_range(0..2) == 1),
|
||||
Type::Date => Value::Text(format_date(random_past_date(rng, 0, RECENT_WINDOW_DAYS))),
|
||||
Type::DateTime => Value::Text(random_recent_datetime(rng)),
|
||||
Type::Blob => Value::Null,
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrap a fixed-list literal as the right `Value` shape for `ty` (used
|
||||
/// by `PickFrom` — enum / `IN`-CHECK values).
|
||||
fn literal_to_value(s: &str, ty: Type) -> Value {
|
||||
match ty {
|
||||
Type::Int | Type::Serial | Type::Real | Type::Decimal => Value::Number(s.to_string()),
|
||||
Type::Bool => Value::Bool(matches!(s.to_ascii_lowercase().as_str(), "true" | "1")),
|
||||
_ => Value::Text(s.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
/// A money-shaped amount: whole for `int`/`serial`, two-decimal for the
|
||||
/// fractional numeric types.
|
||||
fn currency_amount(ty: Type, rng: &mut SeedRng) -> Value {
|
||||
match ty {
|
||||
Type::Real | Type::Decimal => {
|
||||
let dollars = rng.random_range(1..=1_000);
|
||||
let cents = rng.random_range(0..100);
|
||||
Value::Number(format!("{dollars}.{cents:02}"))
|
||||
}
|
||||
// int / serial / anything else numeric → whole amount.
|
||||
_ => Value::Number(rng.random_range(1..=1_000).to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
// — the hand-rolled `product` generator (D9) —
|
||||
|
||||
const PRODUCT_ADJECTIVES: &[&str] = &[
|
||||
"Sleek", "Rustic", "Ergonomic", "Handcrafted", "Refined", "Modern",
|
||||
"Vintage", "Compact", "Premium", "Lightweight", "Durable", "Elegant",
|
||||
"Sturdy", "Smooth", "Gorgeous", "Intelligent", "Practical", "Awesome",
|
||||
"Incredible", "Recycled",
|
||||
];
|
||||
const PRODUCT_MATERIALS: &[&str] = &[
|
||||
"Wooden", "Copper", "Granite", "Cotton", "Steel", "Leather", "Bamboo",
|
||||
"Plastic", "Ceramic", "Glass", "Concrete", "Rubber", "Bronze", "Marble",
|
||||
"Linen", "Silk", "Aluminum", "Wool", "Gold", "Carbon",
|
||||
];
|
||||
const PRODUCT_NOUNS: &[&str] = &[
|
||||
"Chair", "Lamp", "Table", "Bottle", "Backpack", "Keyboard", "Mug",
|
||||
"Shoes", "Jacket", "Watch", "Wallet", "Bench", "Hat", "Gloves",
|
||||
"Towel", "Ball", "Bike", "Knife", "Pillow", "Blanket",
|
||||
];
|
||||
|
||||
fn product_name(rng: &mut SeedRng) -> String {
|
||||
format!(
|
||||
"{} {} {}",
|
||||
pick(rng, PRODUCT_ADJECTIVES),
|
||||
pick(rng, PRODUCT_MATERIALS),
|
||||
pick(rng, PRODUCT_NOUNS),
|
||||
)
|
||||
}
|
||||
|
||||
// — bounded dates (D8) —
|
||||
|
||||
const fn reference_date() -> NaiveDate {
|
||||
match NaiveDate::from_ymd_opt(REF_YEAR, REF_MONTH, REF_DAY) {
|
||||
Some(d) => d,
|
||||
None => panic!("reference date constants must be valid"),
|
||||
}
|
||||
}
|
||||
|
||||
/// A date between `min_days_ago` and `max_days_ago` before the
|
||||
/// reference epoch (inclusive).
|
||||
fn random_past_date(rng: &mut SeedRng, min_days_ago: i64, max_days_ago: i64) -> NaiveDate {
|
||||
let days_ago = rng.random_range(min_days_ago..=max_days_ago);
|
||||
let ce = reference_date().num_days_from_ce();
|
||||
let target = ce - i32::try_from(days_ago).unwrap_or(0);
|
||||
NaiveDate::from_num_days_from_ce_opt(target).unwrap_or_else(reference_date)
|
||||
}
|
||||
|
||||
fn format_date(date: NaiveDate) -> String {
|
||||
date.format("%Y-%m-%d").to_string()
|
||||
}
|
||||
|
||||
/// A recent datetime: a recent date plus a random time-of-day, rendered
|
||||
/// as `YYYY-MM-DDTHH:MM:SS`.
|
||||
fn random_recent_datetime(rng: &mut SeedRng) -> String {
|
||||
let date = random_past_date(rng, 0, RECENT_WINDOW_DAYS);
|
||||
let h = rng.random_range(0..24);
|
||||
let m = rng.random_range(0..60);
|
||||
let s = rng.random_range(0..60);
|
||||
format!("{}T{h:02}:{m:02}:{s:02}", format_date(date))
|
||||
}
|
||||
|
||||
/// Pick a uniformly random element from a non-empty slice.
|
||||
fn pick<'a, T>(rng: &mut SeedRng, items: &'a [T]) -> &'a T {
|
||||
&items[rng.random_range(0..items.len())]
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::seed::make_rng;
    use pretty_assertions::assert_eq;

    /// Generate a single value from a fresh RNG seeded with `seed`.
    fn gen_once(generator: &Generator, ty: Type, seed: u64) -> Value {
        generate_value(generator, ty, &mut make_rng(Some(seed)))
    }

    /// Unwrap the payload of a `Value::Text`, panicking with `msg` for
    /// any other variant.
    #[track_caller]
    fn expect_text(value: Value, msg: &str) -> String {
        let Value::Text(text) = value else { panic!("{msg}") };
        text
    }

    /// Unwrap the payload of a `Value::Number`, panicking with `msg` for
    /// any other variant.
    #[track_caller]
    fn expect_number(value: Value, msg: &str) -> String {
        let Value::Number(digits) = value else { panic!("{msg}") };
        digits
    }

    #[test]
    fn generation_is_deterministic_for_a_fixed_seed() {
        let generators = [
            Generator::FullName,
            Generator::Email,
            Generator::ProductName,
            Generator::DateRecent,
            Generator::CurrencyAmount,
        ];
        for generator in &generators {
            let first = gen_once(generator, Type::Text, 7);
            let second = gen_once(generator, Type::Text, 7);
            assert_eq!(first, second, "{generator:?} must reproduce for a fixed seed");
        }
    }

    #[test]
    fn text_generators_produce_nonempty_text() {
        let generators = [
            Generator::FirstName,
            Generator::LastName,
            Generator::FullName,
            Generator::Email,
            Generator::Username,
            Generator::Company,
            Generator::City,
            Generator::ProductName,
        ];
        for generator in &generators {
            match gen_once(generator, Type::Text, 3) {
                Value::Text(text) => {
                    assert!(!text.trim().is_empty(), "{generator:?} produced empty text");
                }
                other => panic!("{generator:?} produced non-text {other:?}"),
            }
        }
    }

    #[test]
    fn email_looks_like_an_email() {
        let email = expect_text(gen_once(&Generator::Email, Type::Text, 11), "not text");
        assert!(email.contains('@'), "email should contain @: {email}");
    }

    #[test]
    fn product_name_is_three_capitalised_words() {
        let name = expect_text(gen_once(&Generator::ProductName, Type::Text, 99), "not text");
        assert_eq!(name.split(' ').count(), 3, "product name should be 3 words: {name}");
        for word in name.split(' ') {
            let initial = word.chars().next().unwrap();
            assert!(initial.is_ascii_uppercase(), "word `{word}` not capitalised");
        }
    }

    #[test]
    fn recent_dates_fall_within_the_bounded_window() {
        let latest = reference_date();
        let earliest = latest
            .checked_sub_days(chrono::Days::new(RECENT_WINDOW_DAYS.unsigned_abs()))
            .unwrap();
        let window = earliest..=latest;

        let mut rng = make_rng(Some(1));
        for _ in 0..200 {
            let value = generate_value(&Generator::DateRecent, Type::Date, &mut rng);
            let raw = expect_text(value, "date not text");
            let d = NaiveDate::parse_from_str(&raw, "%Y-%m-%d").expect("valid ISO date");
            assert!(window.contains(&d), "date {d} outside recent window");
        }
    }

    #[test]
    fn dob_dates_fall_within_the_adult_window() {
        let anchor = reference_date();
        let earliest = anchor
            .checked_sub_days(chrono::Days::new(ADULT_MAX_DAYS.unsigned_abs()))
            .unwrap();
        let latest = anchor
            .checked_sub_days(chrono::Days::new(ADULT_MIN_DAYS.unsigned_abs()))
            .unwrap();
        let window = earliest..=latest;

        let mut rng = make_rng(Some(2));
        for _ in 0..200 {
            let value = generate_value(&Generator::DateAdult, Type::Date, &mut rng);
            let raw = expect_text(value, "date not text");
            let d = NaiveDate::parse_from_str(&raw, "%Y-%m-%d").expect("valid ISO date");
            assert!(window.contains(&d), "dob {d} outside adult window");
        }
    }

    #[test]
    fn datetime_is_iso_shaped() {
        let value = gen_once(&Generator::DateTimeRecent, Type::DateTime, 5);
        let stamp = expect_text(value, "not text");
        assert!(stamp.contains('T'), "datetime needs a T separator: {stamp}");
        // Parses as a naive datetime.
        if let Err(e) = chrono::NaiveDateTime::parse_from_str(&stamp, "%Y-%m-%dT%H:%M:%S") {
            panic!("invalid datetime {stamp}: {e}");
        }
    }

    #[test]
    fn currency_is_whole_for_int_and_fractional_for_decimal() {
        let int_amt = expect_number(
            gen_once(&Generator::CurrencyAmount, Type::Int, 4),
            "not a number",
        );
        assert!(!int_amt.contains('.'), "int currency should be whole: {int_amt}");

        let dec_amt = expect_number(
            gen_once(&Generator::CurrencyAmount, Type::Decimal, 4),
            "not a number",
        );
        assert!(dec_amt.contains('.'), "decimal currency should have cents: {dec_amt}");
    }

    #[test]
    fn age_is_in_human_range() {
        let mut rng = make_rng(Some(8));
        for _ in 0..100 {
            let value = generate_value(&Generator::Age, Type::Int, &mut rng);
            let n: i64 = expect_number(value, "age not a number").parse().unwrap();
            assert!((18..=80).contains(&n), "age {n} out of range");
        }
    }

    #[test]
    fn pick_from_chooses_a_listed_value() {
        let generator = Generator::PickFrom(vec!["active".into(), "closed".into()]);
        let mut rng = make_rng(Some(6));
        for _ in 0..50 {
            let value = generate_value(&generator, Type::Text, &mut rng);
            let s = expect_text(value, "not text");
            assert!(["active", "closed"].contains(&s.as_str()), "unexpected pick {s}");
        }
    }

    #[test]
    fn pick_from_wraps_numeric_values_as_numbers() {
        let generator = Generator::PickFrom(vec!["1".into(), "2".into(), "3".into()]);
        let v = generate_value(&generator, Type::Int, &mut make_rng(Some(6)));
        assert!(matches!(v, Value::Number(_)), "numeric pick should be a Number: {v:?}");
    }

    #[test]
    fn markers_fall_back_to_type_based_generation() {
        // An un-intercepted marker must not panic; it generates by type.
        let from_identity = gen_once(&Generator::IdentitySequential, Type::Text, 1);
        assert!(matches!(from_identity, Value::Text(_)));

        let from_foreign_key = gen_once(&Generator::ForeignKeySample, Type::Int, 1);
        assert!(matches!(from_foreign_key, Value::Number(_)));
    }

    #[test]
    fn generic_fallback_matches_each_type() {
        // One RNG is shared across the draws, in this exact order.
        let mut rng = make_rng(Some(0));

        let text = generate_value(&Generator::Generic, Type::Text, &mut rng);
        assert!(matches!(text, Value::Text(_)));

        let int = generate_value(&Generator::Generic, Type::Int, &mut rng);
        assert!(matches!(int, Value::Number(_)));

        let boolean = generate_value(&Generator::Generic, Type::Bool, &mut rng);
        assert!(matches!(boolean, Value::Bool(_)));

        let blob = generate_value(&Generator::Generic, Type::Blob, &mut rng);
        assert!(matches!(blob, Value::Null));

        // shortid fallback is a valid base58 id.
        let sid = expect_text(
            generate_value(&Generator::Generic, Type::ShortId, &mut rng),
            "shortid not text",
        );
        assert!(crate::dsl::shortid::validate(&sid).is_ok(), "invalid shortid {sid}");
    }
}
|
||||
Reference in New Issue
Block a user