diff --git a/Cargo.lock b/Cargo.lock index 78a8c41..c1fc106 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -419,6 +419,12 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "deunicode" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abd57806937c9cc163efc8ea3910e00a62e2aeb0b8119f1793a978088f8f6b04" + [[package]] name = "diff" version = "0.1.13" @@ -518,6 +524,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "fake" +version = "5.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea6be833b323a56361118a747470a45a1bcd5c52a2ec9b1e40c83dafe687e453" +dependencies = [ + "deunicode", + "either", + "rand 0.10.1", +] + [[package]] name = "fallible-iterator" version = "0.3.0" @@ -1527,6 +1544,7 @@ dependencies = [ "crossterm", "csv", "directories", + "fake", "futures-util", "gethostname", "insta", diff --git a/Cargo.toml b/Cargo.toml index f3b74d9..10c5fd4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,14 @@ chrono = { version = "0.4.44", default-features = false, features = ["clock"] } crossterm = { version = "0.29.0", features = ["event-stream"] } csv = "1.4.0" directories = "6.0.0" +# Realistic fake-data generators for the `seed` command (ADR-0048): +# names, emails, addresses, companies, lorem, etc. Default features +# only — the basic fakers need no flags; date/datetime values are +# generated in-house (rand + the existing `chrono`) for the bounded +# windows ADR-0048 D8 requires, so `fake`'s `chrono` feature is +# deliberately omitted. No commerce/product module exists, so the +# `product` generator is hand-rolled (D9). 
+fake = "5" futures-util = "0.3.32" gethostname = "1.1.0" rand = "0.10.1" diff --git a/docs/adr/0048-seed-fake-data-generation.md b/docs/adr/0048-seed-fake-data-generation.md index bae83b2..9a35949 100644 --- a/docs/adr/0048-seed-fake-data-generation.md +++ b/docs/adr/0048-seed-fake-data-generation.md @@ -170,23 +170,29 @@ companies, phone numbers, lorem text, dates. Generation is driven by a per-column **generator** chosen by the heuristics (D7) or the override (D2), falling back to **type-based** generation (D8). -**Two open implementation-time verifications** (flagged honestly, to -be resolved when the dependency is locked, not assumed here): +**Implementation-time verifications (resolved 2026-06-11 when the +dependency was added):** -- **`rand` de-duplication.** The project is on `rand 0.10.1`; `fake` - brings its own `rand`. Confirm a single `rand` version resolves (a - duplicate is harmless but should be a conscious outcome, and - `shortid.rs` + the seed RNG must share the version we standardise - on). -- **`fake` module inventory.** Confirm which generators v5 actually - ships (strong prior: it has Name/Internet/Address/Company/Lorem/ - Chrono/Currency/Job/Color but **no commerce/product module** — see - D9), and the minimal feature-flag set needed (derive, chrono-backed - dates). -- **Security (new-dependency posture).** `fake` and its transitive - tree must be scanned (`trivy fs`, `grype`, `osv-scanner`) before - merge, per the global new-dependency rule; findings documented, not - silently accepted. +- **`rand` de-duplication — clean.** `fake` 5.1.0 depends on + `rand = "0.10"`, the **same major** as the project's `rand 0.10.1`, + so `cargo tree -e normal` resolves a **single** `rand 0.10.1` (no + runtime duplication; the `rand 0.8.6` visible to `cargo tree -i + rand` is only `fake`'s own dev-dependency, never compiled for us). 
+ Consequence for D4: one seeded `rand 0.10` `StdRng` can drive + **both** `fake`'s `fake_with_rng` and the hand-rolled generators — + determinism is single-RNG, single-version, and shares `shortid.rs`'s + `rand` version. +- **`fake` module inventory / features — confirmed.** Default features + (`["either"]`) cover the core string fakers used here + (Name/Internet/Address/Company/Lorem/PhoneNumber); `fake`'s `chrono` + feature is **deliberately omitted** (dates generated in-house for + D8's bounded windows). No commerce/product module exists → `product` + is hand-rolled (D9). (The exact faker call sites are pinned when the + generation library is built.) +- **Security (new-dependency posture) — clean.** The `fake` tree (296 + packages total) scanned clean by **all three** mandated scanners: + `osv-scanner` (no issues), `grype` (no vulnerabilities), `trivy fs + --scanners vuln` (0). No findings to document or accept. ### D4 — Determinism: `--seed ` (fork, user-chosen: "optional flag") diff --git a/src/lib.rs b/src/lib.rs index 0b21d7a..aa39b75 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,6 +23,7 @@ pub mod output_render; pub mod persistence; pub mod project; pub mod runtime; +pub mod seed; pub mod theme; pub mod type_change; pub mod ui; diff --git a/src/seed/generators.rs b/src/seed/generators.rs new file mode 100644 index 0000000..c1e4cab --- /dev/null +++ b/src/seed/generators.rs @@ -0,0 +1,383 @@ +//! Value production: turn a [`Generator`] + a seeded RNG into a +//! [`Value`] (ADR-0048 D8/D9). Realistic generators come from the +//! `fake` crate (English locale); `product` is hand-rolled (D9, no +//! commerce module exists); dates are generated against a **fixed +//! reference epoch** so a `--seed` run is fully reproducible without +//! depending on the wall clock (D8 bounded windows). +//! +//! The stateful markers ([`Generator::IdentitySequential`], +//! [`Generator::ForeignKeySample`]) are resolved by the executor with +//! 
+//! database context; if one reaches here un-intercepted it falls back
+//! to type-based generation rather than panicking.
+
+use chrono::{Datelike, NaiveDate};
+use fake::Fake;
+use rand::RngExt;
+
+use crate::dsl::types::Type;
+use crate::dsl::value::Value;
+use crate::seed::{Generator, SeedRng};
+
+/// Fixed anchor for bounded date/datetime windows. Using a constant
+/// (rather than `now()`) keeps `--seed` output reproducible across days
+/// and makes tests deterministic. It advances with releases.
+const REF_YEAR: i32 = 2025;
+const REF_MONTH: u32 = 6;
+const REF_DAY: u32 = 1;
+
+/// `~3 years` window for "recent" dates, in days.
+const RECENT_WINDOW_DAYS: i64 = 3 * 365;
+/// Adult birth window (≈18–80 years ago), in days.
+// Both bounds use 365-day years, so they are approximations that ignore
+// leap days.
+const ADULT_MIN_DAYS: i64 = 18 * 365;
+const ADULT_MAX_DAYS: i64 = 80 * 365;
+
+/// Produce one value for `generator` against destination type `ty`.
+///
+/// * `generator` — the per-column generator to run.
+/// * `ty` — the destination column type. Only the arms that shape their
+///   output by type consult it (`CurrencyAmount`, `PickFrom`) along with
+///   the type-based fallback.
+/// * `rng` — the RNG every draw is taken from. Each arm consumes its
+///   draws in a fixed order, so the same RNG state yields the same value.
+///
+/// Text-like generators return [`Value::Text`]; numeric ones return
+/// [`Value::Number`] holding the number rendered as a string; `Boolean`
+/// returns [`Value::Bool`]. Dates and datetimes are returned as text.
+#[must_use]
+pub fn generate_value(generator: &Generator, ty: Type, rng: &mut SeedRng) -> Value {
+    // English-locale fakers, aliased per module to keep the arms short.
+    use fake::faker::address::en as addr;
+    use fake::faker::company::en as company;
+    use fake::faker::internet::en as net;
+    use fake::faker::job::en as job;
+    use fake::faker::lorem::en as lorem;
+    use fake::faker::name::en as name;
+    use fake::faker::phone_number::en as phone;
+
+    match generator {
+        Generator::FirstName => Value::Text(name::FirstName().fake_with_rng(rng)),
+        Generator::LastName => Value::Text(name::LastName().fake_with_rng(rng)),
+        Generator::FullName => Value::Text(name::Name().fake_with_rng(rng)),
+        Generator::Email => Value::Text(net::FreeEmail().fake_with_rng(rng)),
+        Generator::Username => Value::Text(net::Username().fake_with_rng(rng)),
+        Generator::Password => Value::Text(net::Password(8..16).fake_with_rng(rng)),
+        Generator::Phone => Value::Text(phone::PhoneNumber().fake_with_rng(rng)),
+        Generator::City => Value::Text(addr::CityName().fake_with_rng(rng)),
+        Generator::Country => Value::Text(addr::CountryName().fake_with_rng(rng)),
+        Generator::StateName => Value::Text(addr::StateName().fake_with_rng(rng)),
+        Generator::Street => Value::Text(addr::StreetName().fake_with_rng(rng)),
+        Generator::ZipCode => Value::Text(addr::ZipCode().fake_with_rng(rng)),
+        Generator::Company => Value::Text(company::CompanyName().fake_with_rng(rng)),
+        Generator::JobTitle => Value::Text(job::Title().fake_with_rng(rng)),
+        Generator::ProductName => Value::Text(product_name(rng)),
+        Generator::Sentence => Value::Text(lorem::Sentence(5..12).fake_with_rng(rng)),
+        Generator::Paragraph => Value::Text(lorem::Paragraph(2..4).fake_with_rng(rng)),
+        Generator::Url => {
+            // Two draws, word first then suffix; the order is part of the
+            // seeded output.
+            let word: String = lorem::Word().fake_with_rng(rng);
+            let suffix: String = net::DomainSuffix().fake_with_rng(rng);
+            Value::Text(format!("https://{word}.{suffix}"))
+        }
+        // Hand-rolled — `fake`'s color module is feature-gated (it pulls
+        // an extra crate); a hex colour is trivial from the RNG.
+        // The exclusive upper bound 0x0100_0000 makes 0xFFFFFF the largest
+        // value, so the result always fits six hex digits.
+        Generator::HexColor => Value::Text(format!("#{:06X}", rng.random_range(0..0x0100_0000))),
+        Generator::CurrencyAmount => currency_amount(ty, rng),
+        Generator::Age => Value::Number(rng.random_range(18..=80).to_string()),
+        Generator::SmallInt => Value::Number(rng.random_range(1..=100).to_string()),
+        Generator::DateRecent => Value::Text(format_date(random_past_date(rng, 0, RECENT_WINDOW_DAYS))),
+        Generator::DateAdult => {
+            Value::Text(format_date(random_past_date(rng, ADULT_MIN_DAYS, ADULT_MAX_DAYS)))
+        }
+        Generator::DateTimeRecent => Value::Text(random_recent_datetime(rng)),
+        Generator::Boolean => Value::Bool(rng.random_range(0..2) == 1),
+        // The guard keeps `pick` away from an empty list; the empty case
+        // is handled by the fallback arm below.
+        Generator::PickFrom(values) if !values.is_empty() => {
+            let chosen: &String = pick(rng, values);
+            literal_to_value(chosen, ty)
+        }
+        // Un-intercepted markers + an empty pick list → type-based.
+        Generator::PickFrom(_)
+        | Generator::IdentitySequential
+        | Generator::ForeignKeySample
+        | Generator::Generic => generic_for_type(ty, rng),
+    }
+}
+
+/// Type-based fallback generation (D8).
Never produces NULL for a +/// generatable type; `blob`/`serial`/`shortid` are handled by the +/// executor (autogen / block guard) and yield NULL here only as a +/// last resort. +fn generic_for_type(ty: Type, rng: &mut SeedRng) -> Value { + use fake::faker::lorem::en as lorem; + match ty { + Type::Text => { + let words: Vec = lorem::Words(2..4).fake_with_rng(rng); + Value::Text(words.join(" ")) + } + Type::ShortId => Value::Text(crate::dsl::shortid::generate()), + Type::Int => Value::Number(rng.random_range(1..=10_000).to_string()), + Type::Serial => Value::Number(rng.random_range(1..=10_000).to_string()), + Type::Real => { + let n: f64 = rng.random_range(0..100_000) as f64 / 100.0; + Value::Number(format!("{n:.2}")) + } + Type::Decimal => { + let dollars = rng.random_range(0..10_000); + let cents = rng.random_range(0..100); + Value::Number(format!("{dollars}.{cents:02}")) + } + Type::Bool => Value::Bool(rng.random_range(0..2) == 1), + Type::Date => Value::Text(format_date(random_past_date(rng, 0, RECENT_WINDOW_DAYS))), + Type::DateTime => Value::Text(random_recent_datetime(rng)), + Type::Blob => Value::Null, + } +} + +/// Wrap a fixed-list literal as the right `Value` shape for `ty` (used +/// by `PickFrom` — enum / `IN`-CHECK values). +fn literal_to_value(s: &str, ty: Type) -> Value { + match ty { + Type::Int | Type::Serial | Type::Real | Type::Decimal => Value::Number(s.to_string()), + Type::Bool => Value::Bool(matches!(s.to_ascii_lowercase().as_str(), "true" | "1")), + _ => Value::Text(s.to_string()), + } +} + +/// A money-shaped amount: whole for `int`/`serial`, two-decimal for the +/// fractional numeric types. +fn currency_amount(ty: Type, rng: &mut SeedRng) -> Value { + match ty { + Type::Real | Type::Decimal => { + let dollars = rng.random_range(1..=1_000); + let cents = rng.random_range(0..100); + Value::Number(format!("{dollars}.{cents:02}")) + } + // int / serial / anything else numeric → whole amount. 
+ _ => Value::Number(rng.random_range(1..=1_000).to_string()), + } +} + +// — the hand-rolled `product` generator (D9) — + +const PRODUCT_ADJECTIVES: &[&str] = &[ + "Sleek", "Rustic", "Ergonomic", "Handcrafted", "Refined", "Modern", + "Vintage", "Compact", "Premium", "Lightweight", "Durable", "Elegant", + "Sturdy", "Smooth", "Gorgeous", "Intelligent", "Practical", "Awesome", + "Incredible", "Recycled", +]; +const PRODUCT_MATERIALS: &[&str] = &[ + "Wooden", "Copper", "Granite", "Cotton", "Steel", "Leather", "Bamboo", + "Plastic", "Ceramic", "Glass", "Concrete", "Rubber", "Bronze", "Marble", + "Linen", "Silk", "Aluminum", "Wool", "Gold", "Carbon", +]; +const PRODUCT_NOUNS: &[&str] = &[ + "Chair", "Lamp", "Table", "Bottle", "Backpack", "Keyboard", "Mug", + "Shoes", "Jacket", "Watch", "Wallet", "Bench", "Hat", "Gloves", + "Towel", "Ball", "Bike", "Knife", "Pillow", "Blanket", +]; + +fn product_name(rng: &mut SeedRng) -> String { + format!( + "{} {} {}", + pick(rng, PRODUCT_ADJECTIVES), + pick(rng, PRODUCT_MATERIALS), + pick(rng, PRODUCT_NOUNS), + ) +} + +// — bounded dates (D8) — + +const fn reference_date() -> NaiveDate { + match NaiveDate::from_ymd_opt(REF_YEAR, REF_MONTH, REF_DAY) { + Some(d) => d, + None => panic!("reference date constants must be valid"), + } +} + +/// A date between `min_days_ago` and `max_days_ago` before the +/// reference epoch (inclusive). +fn random_past_date(rng: &mut SeedRng, min_days_ago: i64, max_days_ago: i64) -> NaiveDate { + let days_ago = rng.random_range(min_days_ago..=max_days_ago); + let ce = reference_date().num_days_from_ce(); + let target = ce - i32::try_from(days_ago).unwrap_or(0); + NaiveDate::from_num_days_from_ce_opt(target).unwrap_or_else(reference_date) +} + +fn format_date(date: NaiveDate) -> String { + date.format("%Y-%m-%d").to_string() +} + +/// A recent datetime: a recent date plus a random time-of-day, rendered +/// as `YYYY-MM-DDTHH:MM:SS`. 
+fn random_recent_datetime(rng: &mut SeedRng) -> String {
+    // Draw order is date, hour, minute, second; it is part of the seeded
+    // output.
+    let date = random_past_date(rng, 0, RECENT_WINDOW_DAYS);
+    let h = rng.random_range(0..24);
+    let m = rng.random_range(0..60);
+    let s = rng.random_range(0..60);
+    format!("{}T{h:02}:{m:02}:{s:02}", format_date(date))
+}
+
+/// Pick a uniformly random element from a non-empty slice.
+///
+/// # Panics
+///
+/// Panics when `items` is empty (the index range `0..0` is empty), so
+/// callers must check for emptiness first.
+fn pick<'a, T>(rng: &mut SeedRng, items: &'a [T]) -> &'a T {
+    &items[rng.random_range(0..items.len())]
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::seed::make_rng;
+    use pretty_assertions::assert_eq;
+
+    /// Generate a single value from a freshly seeded RNG.
+    fn gen_once(generator: &Generator, ty: Type, seed: u64) -> Value {
+        let mut rng = make_rng(Some(seed));
+        generate_value(generator, ty, &mut rng)
+    }
+
+    #[test]
+    fn generation_is_deterministic_for_a_fixed_seed() {
+        for generator in [
+            Generator::FullName,
+            Generator::Email,
+            Generator::ProductName,
+            Generator::DateRecent,
+            Generator::CurrencyAmount,
+        ] {
+            let a = gen_once(&generator, Type::Text, 7);
+            let b = gen_once(&generator, Type::Text, 7);
+            assert_eq!(a, b, "{generator:?} must reproduce for a fixed seed");
+        }
+    }
+
+    #[test]
+    fn text_generators_produce_nonempty_text() {
+        for generator in [
+            Generator::FirstName,
+            Generator::LastName,
+            Generator::FullName,
+            Generator::Email,
+            Generator::Username,
+            Generator::Company,
+            Generator::City,
+            Generator::ProductName,
+        ] {
+            let v = gen_once(&generator, Type::Text, 3);
+            match v {
+                Value::Text(s) => assert!(!s.trim().is_empty(), "{generator:?} produced empty text"),
+                other => panic!("{generator:?} produced non-text {other:?}"),
+            }
+        }
+    }
+
+    #[test]
+    fn email_looks_like_an_email() {
+        let v = gen_once(&Generator::Email, Type::Text, 11);
+        let Value::Text(s) = v else { panic!("not text") };
+        assert!(s.contains('@'), "email should contain @: {s}");
+    }
+
+    #[test]
+    fn product_name_is_three_capitalised_words() {
+        let v = gen_once(&Generator::ProductName, Type::Text, 99);
+        let Value::Text(s) = v else { panic!("not text") };
+        let words: Vec<&str> = s.split(' ').collect();
+        assert_eq!(words.len(), 3, "product name should be 3 words: {s}");
+        for w in words {
+            assert!(w.chars().next().unwrap().is_ascii_uppercase(), "word `{w}` not capitalised");
+        }
+    }
+
+    #[test]
+    fn recent_dates_fall_within_the_bounded_window() {
+        let mut rng = make_rng(Some(1));
+        // Window bounds are recomputed independently of `random_past_date`.
+        let earliest = reference_date()
+            .checked_sub_days(chrono::Days::new(RECENT_WINDOW_DAYS as u64))
+            .unwrap();
+        let latest = reference_date();
+        for _ in 0..200 {
+            let v = generate_value(&Generator::DateRecent, Type::Date, &mut rng);
+            let Value::Text(s) = v else { panic!("date not text") };
+            let d = NaiveDate::parse_from_str(&s, "%Y-%m-%d").expect("valid ISO date");
+            assert!(d >= earliest && d <= latest, "date {d} outside recent window");
+        }
+    }
+
+    #[test]
+    fn dob_dates_fall_within_the_adult_window() {
+        let mut rng = make_rng(Some(2));
+        let earliest = reference_date()
+            .checked_sub_days(chrono::Days::new(ADULT_MAX_DAYS as u64))
+            .unwrap();
+        let latest = reference_date()
+            .checked_sub_days(chrono::Days::new(ADULT_MIN_DAYS as u64))
+            .unwrap();
+        for _ in 0..200 {
+            let v = generate_value(&Generator::DateAdult, Type::Date, &mut rng);
+            let Value::Text(s) = v else { panic!("date not text") };
+            let d = NaiveDate::parse_from_str(&s, "%Y-%m-%d").expect("valid ISO date");
+            assert!(d >= earliest && d <= latest, "dob {d} outside adult window");
+        }
+    }
+
+    #[test]
+    fn datetime_is_iso_shaped() {
+        let v = gen_once(&Generator::DateTimeRecent, Type::DateTime, 5);
+        let Value::Text(s) = v else { panic!("not text") };
+        assert!(s.contains('T'), "datetime needs a T separator: {s}");
+        // Parses as a naive datetime.
+        chrono::NaiveDateTime::parse_from_str(&s, "%Y-%m-%dT%H:%M:%S")
+            .unwrap_or_else(|e| panic!("invalid datetime {s}: {e}"));
+    }
+
+    #[test]
+    fn currency_is_whole_for_int_and_fractional_for_decimal() {
+        let Value::Number(int_amt) = gen_once(&Generator::CurrencyAmount, Type::Int, 4) else {
+            panic!("not a number")
+        };
+        assert!(!int_amt.contains('.'), "int currency should be whole: {int_amt}");
+        let Value::Number(dec_amt) = gen_once(&Generator::CurrencyAmount, Type::Decimal, 4) else {
+            panic!("not a number")
+        };
+        assert!(dec_amt.contains('.'), "decimal currency should have cents: {dec_amt}");
+    }
+
+    #[test]
+    fn age_is_in_human_range() {
+        let mut rng = make_rng(Some(8));
+        for _ in 0..100 {
+            let Value::Number(a) = generate_value(&Generator::Age, Type::Int, &mut rng) else {
+                panic!("age not a number")
+            };
+            let n: i64 = a.parse().unwrap();
+            assert!((18..=80).contains(&n), "age {n} out of range");
+        }
+    }
+
+    #[test]
+    fn pick_from_chooses_a_listed_value() {
+        let generator = Generator::PickFrom(vec!["active".into(), "closed".into()]);
+        let mut rng = make_rng(Some(6));
+        for _ in 0..50 {
+            let Value::Text(s) = generate_value(&generator, Type::Text, &mut rng) else {
+                panic!("not text")
+            };
+            assert!(matches!(s.as_str(), "active" | "closed"), "unexpected pick {s}");
+        }
+    }
+
+    #[test]
+    fn pick_from_wraps_numeric_values_as_numbers() {
+        let generator = Generator::PickFrom(vec!["1".into(), "2".into(), "3".into()]);
+        let mut rng = make_rng(Some(6));
+        let v = generate_value(&generator, Type::Int, &mut rng);
+        assert!(matches!(v, Value::Number(_)), "numeric pick should be a Number: {v:?}");
+    }
+
+    #[test]
+    fn markers_fall_back_to_type_based_generation() {
+        // An un-intercepted marker must not panic; it generates by type.
+        let v = gen_once(&Generator::IdentitySequential, Type::Text, 1);
+        assert!(matches!(v, Value::Text(_)));
+        let v = gen_once(&Generator::ForeignKeySample, Type::Int, 1);
+        assert!(matches!(v, Value::Number(_)));
+    }
+
+    #[test]
+    fn generic_fallback_matches_each_type() {
+        // One RNG is shared across all calls; only the value shape is
+        // asserted, not the concrete values.
+        let mut rng = make_rng(Some(0));
+        assert!(matches!(generate_value(&Generator::Generic, Type::Text, &mut rng), Value::Text(_)));
+        assert!(matches!(generate_value(&Generator::Generic, Type::Int, &mut rng), Value::Number(_)));
+        assert!(matches!(generate_value(&Generator::Generic, Type::Bool, &mut rng), Value::Bool(_)));
+        assert!(matches!(generate_value(&Generator::Generic, Type::Blob, &mut rng), Value::Null));
+        // shortid fallback is a valid base58 id.
+        let Value::Text(sid) = generate_value(&Generator::Generic, Type::ShortId, &mut rng) else {
+            panic!("shortid not text")
+        };
+        assert!(crate::dsl::shortid::validate(&sid).is_ok(), "invalid shortid {sid}");
+    }
+}
diff --git a/src/seed/heuristics.rs b/src/seed/heuristics.rs
new file mode 100644
index 0000000..d62f78a
--- /dev/null
+++ b/src/seed/heuristics.rs
@@ -0,0 +1,440 @@
+//! Generator selection: the name-aware, type-gated catalogue (ADR-0048
+//! D7), table-context disambiguation for `name`/`title` (D11), the
+//! identifier-family rule (D10), and enum-ish detection (D12).
+//!
+//! Selection is **token-based**: a column name is split on `_`, `-` and
+//! camelCase boundaries, lowercased, and matched against an
+//! ordered, most-specific-first list. Each rule is **type-gated** — a
+//! name match only fires when the column's type is compatible, so a
+//! column called `email` typed `int` falls through to type-based
+//! generation rather than producing a string. Documented false-positive
+//! guards keep `username`/`filename` away from the bare person-name
+//! rule.
+
+use tracing::trace;
+
+use crate::dsl::types::Type;
+use crate::seed::{ColumnSpec, Generator};
+
+/// Choose the generator for a column (ADR-0048 D7/D10/D11/D12).
+///
+/// Resolution order: a foreign-key column is handed to the executor, a
+/// column with a non-empty `IN`-CHECK list draws from that list, then
+/// the ordered name catalogue is consulted, and finally the type-based
+/// fallback applies. The decision is reported at `trace` level.
+#[must_use]
+pub fn choose_generator(table: &str, col: &ColumnSpec) -> Generator {
+    let picked = choose_generator_inner(table, col);
+    trace!(
+        table = table,
+        column = %col.name,
+        ty = %col.ty,
+        chosen = ?picked,
+        "seed: chose generator for column"
+    );
+    picked
+}
+
+/// The selection logic behind [`choose_generator`], without the tracing.
+fn choose_generator_inner(table: &str, col: &ColumnSpec) -> Generator {
+    // Foreign keys are populated by sampling existing parent rows (D14);
+    // that is the executor's job, so only a marker is returned here.
+    if col.is_foreign_key {
+        return Generator::ForeignKeySample;
+    }
+
+    // A simple `col IN (…)` CHECK supplies the values directly (D17),
+    // which covers the common enum-as-CHECK pattern. An empty list is
+    // treated as if there were no CHECK.
+    match col.check_in_values.as_ref().filter(|allowed| !allowed.is_empty()) {
+        Some(allowed) => Generator::PickFrom(allowed.clone()),
+        None => {
+            let name_tokens = tokens(&col.name);
+            match_name_generator(table, &name_tokens, col.ty).unwrap_or(Generator::Generic)
+        }
+    }
+}
+
+/// Report whether a column name reads like an enum / fixed-value set for
+/// which no sensible generic generator exists (D12). The executor uses
+/// this for its post-seed advisory; such columns still get generic text.
+#[must_use]
+pub fn is_enum_ish(name: &str) -> bool {
+    const ENUM_TOKENS: &[&str] = &[
+        "role", "status", "state", "type", "kind", "category", "level",
+        "tier", "stage", "priority", "gender",
+    ];
+    tokens(name)
+        .iter()
+        .any(|tok| ENUM_TOKENS.contains(&tok.as_str()))
+}
+
+/// The name catalogue, ordered most-specific-first. Yields `None` when
+/// no rule matches (→ type-based fallback) and also when a name matches
+/// but the column's type fails that rule's gate.
+///
+/// Rule order matters. Qualified names such as `company_name`,
+/// `job_title`, `state_name` or `city_name` must reach their specific
+/// rule before the bare `name`/`title` table-context rule, which would
+/// otherwise claim every column carrying a `name` or `title` token.
+fn match_name_generator(table: &str, toks: &[String], ty: Type) -> Option<Generator> {
+    let text = type_is_text(ty);
+    let numeric = ty.is_numeric();
+
+    // — Person —
+    if text && (has_any(toks, &["fname", "firstname"]) || has_seq(toks, "first", "name")) {
+        return Some(Generator::FirstName);
+    }
+    if text
+        && (has_any(toks, &["lname", "lastname", "surname"]) || has_seq(toks, "last", "name"))
+    {
+        return Some(Generator::LastName);
+    }
+    if text && (has_any(toks, &["username", "login", "handle"]) || has_seq(toks, "user", "name")) {
+        return Some(Generator::Username);
+    }
+    if text && has_any(toks, &["email", "emails"]) {
+        return Some(Generator::Email);
+    }
+    if text && has_any(toks, &["password", "passwd", "pwd"]) {
+        return Some(Generator::Password);
+    }
+    if text && has_any(toks, &["phone", "mobile", "cell", "tel", "telephone"]) {
+        return Some(Generator::Phone);
+    }
+
+    // — Address —
+    if text && has_any(toks, &["city", "town"]) {
+        return Some(Generator::City);
+    }
+    if text && has_token(toks, "country") {
+        return Some(Generator::Country);
+    }
+    // `province` / explicit `state_name`/`state_abbr` → a real state name.
+    // Bare `state` is left to enum-ish (it usually means status), so we
+    // require `province` or a `state` token paired with name/abbr.
+    if text
+        && (has_token(toks, "province")
+            || (has_token(toks, "state") && has_any(toks, &["name", "abbr"])))
+    {
+        return Some(Generator::StateName);
+    }
+    if text && has_any(toks, &["street", "address", "addr"]) {
+        return Some(Generator::Street);
+    }
+    if text && has_any(toks, &["zip", "zipcode", "postcode", "postal"]) {
+        return Some(Generator::ZipCode);
+    }
+
+    // — Organisation / job —
+    if text && has_any(toks, &["company", "employer", "org", "organization", "organisation"]) {
+        return Some(Generator::Company);
+    }
+    if text && has_any(toks, &["job", "position", "profession", "occupation"]) {
+        return Some(Generator::JobTitle);
+    }
+
+    // — bare `name` / `title` → table-context (D11) —
+    // Runs after the person, address and organisation rules so that
+    // qualified names (`state_name`, `company_name`, `job_title`, …) have
+    // already returned. Structural names like `filename`/`table_name` are
+    // rejected by the false-positive guard.
+    if text && has_any(toks, &["name", "title"]) && !is_name_false_positive(toks) {
+        return Some(name_by_table_context(table));
+    }
+
+    // — Free text —
+    if text
+        && has_any(
+            toks,
+            &["description", "bio", "notes", "note", "summary", "comment", "comments", "about"],
+        )
+    {
+        return Some(Generator::Sentence);
+    }
+    if text && has_any(toks, &["url", "website", "homepage", "link"]) {
+        return Some(Generator::Url);
+    }
+    if text && has_any(toks, &["color", "colour"]) {
+        return Some(Generator::HexColor);
+    }
+
+    // — Numeric —
+    if numeric
+        && has_any(
+            toks,
+            &["price", "amount", "cost", "salary", "balance", "total", "fee", "revenue"],
+        )
+    {
+        return Some(Generator::CurrencyAmount);
+    }
+    if numeric && has_token(toks, "age") {
+        return Some(Generator::Age);
+    }
+    if numeric && has_any(toks, &["quantity", "qty", "stock", "count"]) {
+        return Some(Generator::SmallInt);
+    }
+
+    // — Temporal (bounded, D8) —
+    if matches!(ty, Type::Date) && has_any(toks, &["dob", "birthday", "birthdate"]) {
+        return Some(Generator::DateAdult);
+    }
+    if matches!(ty, Type::Date) && has_token(toks, "date") {
+        return Some(Generator::DateRecent);
+    }
+    if matches!(ty, Type::DateTime) && has_any(toks, &["timestamp", "datetime", "at"]) {
+        return Some(Generator::DateTimeRecent);
+    }
+
+    // — Boolean —
+    // An `is_`/`has_` prefix or one of the common flag words.
+    if matches!(ty, Type::Bool)
+        && (matches!(toks.first().map(String::as_str), Some("is" | "has"))
+            || has_any(toks, &["active", "enabled", "verified", "deleted"]))
+    {
+        return Some(Generator::Boolean);
+    }
+
+    // — Identifier family (D10) — late so phone/email/etc. win first.
+    if matches!(ty, Type::Int | Type::Text) && is_identifier_name(toks) {
+        return Some(Generator::IdentitySequential);
+    }
+
+    None
+}
+
+/// Resolve a bare `name`/`title` column by the **table** it lives in
+/// (D11): product-ish → a product name, company-ish → a company name,
+/// person-ish → a person name, otherwise a generic person name.
+///
+/// The table name is tokenised the same way column names are, so
+/// `order_items` and `orderItems` both count as product-ish.
+fn name_by_table_context(table: &str) -> Generator {
+    const PRODUCTY: &[&str] = &[
+        "product", "products", "item", "items", "good", "goods",
+        "merchandise", "catalog", "catalogue", "inventory", "sku", "skus",
+    ];
+    const COMPANYISH: &[&str] = &[
+        "company", "companies", "vendor", "vendors", "supplier",
+        "suppliers", "manufacturer", "manufacturers", "brand", "brands",
+        "organization", "organisation",
+    ];
+    const PERSONISH: &[&str] = &[
+        "user", "users", "customer", "customers", "person", "people",
+        "employee", "employees", "member", "members", "contact",
+        "contacts", "author", "authors", "student", "students",
+    ];
+    let toks = tokens(table);
+    if has_any(&toks, PRODUCTY) {
+        Generator::ProductName
+    } else if has_any(&toks, COMPANYISH) {
+        Generator::Company
+    } else if has_any(&toks, PERSONISH) {
+        Generator::FullName
+    } else {
+        // Unknown table: a person name is the most generally useful
+        // default for a bare `name` column.
+        Generator::FullName
+    }
+}
+
+/// Names ending in `name`/`title` that are NOT person names. The
+/// specific `first`/`last`/`user` cases are matched earlier and return
+/// before this guard; this catches structural names.
+fn is_name_false_positive(toks: &[String]) -> bool { + const NON_PERSON: &[&str] = &[ + "file", "table", "host", "domain", "field", "class", "tag", + "event", "path", "col", "column", "db", "schema", "index", "key", + "page", "node", "type", + ]; + has_any(toks, NON_PERSON) && has_any(toks, &["name", "title"]) +} + +/// Identifier-family names (D10): treated as unique identifiers. FK +/// columns never reach here (handled in [`choose_generator`]). +fn is_identifier_name(toks: &[String]) -> bool { + const ID_TOKENS: &[&str] = &["id", "code", "sku", "ref", "reference", "barcode"]; + if has_any(toks, ID_TOKENS) { + return true; + } + // `*_number` / `*_no` as an identifier, but only when qualified + // (a bare `number`/`no` is too ambiguous, and `phone_number` already + // matched the phone rule earlier). + toks.len() >= 2 && has_any(toks, &["number", "no"]) +} + +// — token utilities — + +/// Split a column/table name into lowercase tokens on `_`, `-`, spaces, +/// and camelCase boundaries. `created_at` → [`created`, `at`]; +/// `firstName` → [`first`, `name`]; `DOB` → [`dob`]. +fn tokens(name: &str) -> Vec { + let mut out = Vec::new(); + let mut cur = String::new(); + let mut prev_was_lower_or_digit = false; + for ch in name.chars() { + if ch == '_' || ch == '-' || ch == ' ' { + if !cur.is_empty() { + out.push(std::mem::take(&mut cur)); + } + prev_was_lower_or_digit = false; + continue; + } + // camelCase boundary: an uppercase letter following a lowercase + // letter or digit starts a new token. 
+ if ch.is_ascii_uppercase() && prev_was_lower_or_digit && !cur.is_empty() { + out.push(std::mem::take(&mut cur)); + } + cur.push(ch.to_ascii_lowercase()); + prev_was_lower_or_digit = ch.is_ascii_lowercase() || ch.is_ascii_digit(); + } + if !cur.is_empty() { + out.push(cur); + } + out +} + +fn has_token(toks: &[String], t: &str) -> bool { + toks.iter().any(|x| x == t) +} + +fn has_any(toks: &[String], candidates: &[&str]) -> bool { + candidates.iter().any(|c| has_token(toks, c)) +} + +/// Whether `a` is immediately followed by `b` in the token list — for +/// matching split compound names like `first name` / `user name`. +fn has_seq(toks: &[String], a: &str, b: &str) -> bool { + toks.windows(2).any(|w| w[0] == a && w[1] == b) +} + +/// Text-typed for heuristic purposes — `text`, `shortid`, plus the +/// text-backed `decimal`/`date`/`datetime` are excluded here because +/// those have their own dedicated gates; only `text`/`shortid` accept +/// free-text generators. +const fn type_is_text(ty: Type) -> bool { + matches!(ty, Type::Text | Type::ShortId) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::seed::ColumnSpec; + use pretty_assertions::assert_eq; + + fn choose(table: &str, name: &str, ty: Type) -> Generator { + choose_generator(table, &ColumnSpec::plain(name, ty)) + } + + #[test] + fn person_name_fields_map_to_name_generators() { + assert_eq!(choose("users", "first_name", Type::Text), Generator::FirstName); + assert_eq!(choose("users", "firstName", Type::Text), Generator::FirstName); + assert_eq!(choose("users", "last_name", Type::Text), Generator::LastName); + assert_eq!(choose("users", "surname", Type::Text), Generator::LastName); + } + + #[test] + fn contact_fields_map_correctly() { + assert_eq!(choose("users", "email", Type::Text), Generator::Email); + assert_eq!(choose("users", "work_email", Type::Text), Generator::Email); + assert_eq!(choose("users", "username", Type::Text), Generator::Username); + assert_eq!(choose("users", "user_name", 
Type::Text), Generator::Username); + assert_eq!(choose("users", "phone", Type::Text), Generator::Phone); + assert_eq!(choose("accounts", "password", Type::Text), Generator::Password); + } + + #[test] + fn address_fields_map_correctly() { + assert_eq!(choose("a", "city", Type::Text), Generator::City); + assert_eq!(choose("a", "country", Type::Text), Generator::Country); + assert_eq!(choose("a", "street", Type::Text), Generator::Street); + assert_eq!(choose("a", "zip", Type::Text), Generator::ZipCode); + assert_eq!(choose("a", "postcode", Type::Text), Generator::ZipCode); + assert_eq!(choose("a", "province", Type::Text), Generator::StateName); + } + + #[test] + fn bare_name_uses_table_context() { + // D11 — the same column name resolves differently by table. + assert_eq!(choose("products", "name", Type::Text), Generator::ProductName); + assert_eq!(choose("items", "title", Type::Text), Generator::ProductName); + assert_eq!(choose("users", "name", Type::Text), Generator::FullName); + assert_eq!(choose("customers", "name", Type::Text), Generator::FullName); + assert_eq!(choose("vendors", "name", Type::Text), Generator::Company); + // Unknown table → person name default. + assert_eq!(choose("widgets", "name", Type::Text), Generator::FullName); + } + + #[test] + fn name_false_positives_do_not_become_person_names() { + // These must NOT resolve to a person/product name. + assert_ne!(choose("files", "filename", Type::Text), Generator::FullName); + assert_ne!(choose("meta", "table_name", Type::Text), Generator::FullName); + // They fall through to a generic / non-person generator. + assert_eq!(choose("files", "filename", Type::Text), Generator::Generic); + } + + #[test] + fn numeric_name_heuristics_are_type_gated() { + // `price` on a numeric column → currency; on text → falls through. 
+ assert_eq!(choose("p", "price", Type::Int), Generator::CurrencyAmount); + assert_eq!(choose("p", "price", Type::Decimal), Generator::CurrencyAmount); + assert_eq!(choose("p", "price", Type::Text), Generator::Generic); + assert_eq!(choose("u", "age", Type::Int), Generator::Age); + assert_eq!(choose("o", "quantity", Type::Int), Generator::SmallInt); + } + + #[test] + fn email_on_wrong_type_falls_through() { + // The type gate: an `email` int column does NOT get a string — + // it falls through to type-based generation. + assert_eq!(choose("u", "email", Type::Int), Generator::Generic); + } + + #[test] + fn temporal_fields_are_bounded_and_type_gated() { + assert_eq!(choose("u", "dob", Type::Date), Generator::DateAdult); + assert_eq!(choose("o", "order_date", Type::Date), Generator::DateRecent); + assert_eq!(choose("o", "created_at", Type::DateTime), Generator::DateTimeRecent); + assert_eq!(choose("o", "timestamp", Type::DateTime), Generator::DateTimeRecent); + // Wrong type (`int`) → no date generator; falls through to `Generic`. 
+ assert_eq!(choose("o", "order_date", Type::Int), Generator::Generic); + } + + #[test] + fn boolean_fields_map_to_boolean() { + assert_eq!(choose("u", "is_active", Type::Bool), Generator::Boolean); + assert_eq!(choose("u", "has_paid", Type::Bool), Generator::Boolean); + assert_eq!(choose("u", "enabled", Type::Bool), Generator::Boolean); + } + + #[test] + fn identifier_family_is_unique_sequential() { + assert_eq!(choose("t", "code", Type::Text), Generator::IdentitySequential); + assert_eq!(choose("t", "sku", Type::Text), Generator::IdentitySequential); + assert_eq!(choose("t", "order_number", Type::Int), Generator::IdentitySequential); + assert_eq!(choose("t", "external_id", Type::Int), Generator::IdentitySequential); + } + + #[test] + fn foreign_key_columns_defer_to_executor() { + let mut spec = ColumnSpec::plain("user_id", Type::Int); + spec.is_foreign_key = true; + assert_eq!(choose_generator("orders", &spec), Generator::ForeignKeySample); + } + + #[test] + fn check_in_values_become_pick_from() { + let mut spec = ColumnSpec::plain("status", Type::Text); + spec.check_in_values = Some(vec!["active".into(), "closed".into()]); + assert_eq!( + choose_generator("orders", &spec), + Generator::PickFrom(vec!["active".into(), "closed".into()]) + ); + } + + #[test] + fn enum_ish_names_are_detected_for_the_advisory() { + assert!(is_enum_ish("status")); + assert!(is_enum_ish("role")); + assert!(is_enum_ish("order_state")); + assert!(is_enum_ish("priority")); + assert!(!is_enum_ish("email")); + assert!(!is_enum_ish("first_name")); + } + + #[test] + fn enum_ish_columns_fall_through_to_generic() { + // No special generator — they get `Generic`; the advisory flags them. 
+ assert_eq!(choose("orders", "status", Type::Text), Generator::Generic); + assert_eq!(choose("users", "role", Type::Text), Generator::Generic); + } + + #[test] + fn unmatched_columns_use_type_based_fallback() { + assert_eq!(choose("t", "some_freeform_field", Type::Text), Generator::Generic); + } + + #[test] + fn tokenizer_splits_on_all_boundaries() { + assert_eq!(tokens("created_at"), vec!["created", "at"]); + assert_eq!(tokens("firstName"), vec!["first", "name"]); + assert_eq!(tokens("DOB"), vec!["dob"]); + assert_eq!(tokens("user-email"), vec!["user", "email"]); + assert_eq!(tokens("HTTPStatus"), vec!["httpstatus"]); + } +} diff --git a/src/seed/mod.rs b/src/seed/mod.rs new file mode 100644 index 0000000..1f2bd50 --- /dev/null +++ b/src/seed/mod.rs @@ -0,0 +1,200 @@ +//! Pure fake-data generation library for the `seed` command (ADR-0048). +//! +//! This module is the **generation half** of `seed`: given a column's +//! shape (name, type, constraints), it chooses a *generator* and turns +//! a seeded RNG into plausible [`Value`]s. It is deliberately decoupled +//! from `db.rs` — it knows nothing about SQLite, the worker thread, or +//! persistence — so it stays pure and unit-testable, with exact-value +//! assertions made possible by the seedable RNG (ADR-0048 D4). +//! +//! The executor (`db.rs::do_seed`) adapts the real schema into +//! [`ColumnSpec`]s, calls [`choose_generator`] per column, and then +//! [`generate_value`] per row — except for the *stateful* markers +//! ([`Generator::IdentitySequential`], [`Generator::ForeignKeySample`]) +//! which need database context (existing rows, the running sequence) +//! and so are resolved by the executor, not here. +//! +//! Layout: +//! - this file — the public types ([`ColumnSpec`], [`Generator`], +//! [`SeedRng`]) and the RNG constructor ([`make_rng`]). +//! - [`heuristics`] — [`choose_generator`] + the name-aware catalogue +//! (D7), table-context disambiguation (D11), identifier (D10) and +//! enum-ish (D12) detection. +//! 
- [`generators`] — [`generate_value`]: per-generator value +//! production, the hand-rolled `product` generator (D9) and the +//! bounded date windows (D8). + +mod generators; +mod heuristics; + +pub use generators::generate_value; +pub use heuristics::{choose_generator, is_enum_ish}; + +use rand::rngs::StdRng; +use rand::{RngExt, SeedableRng}; + +use crate::dsl::types::Type; + +/// The RNG that drives all seed generation. +/// +/// A single seeded `StdRng` feeds both `fake`'s `fake_with_rng` and the +/// hand-rolled generators, so a `--seed` value fully determines the +/// output (ADR-0048 D4). `rand 0.10`'s `StdRng` satisfies `fake`'s +/// `RngExt` bound (it re-exports `rand::RngExt`), so the same handle +/// works on both sides. +pub type SeedRng = StdRng; + +/// Build the seed RNG. +/// +/// With `Some(seed)` the stream is reproducible; with `None` it is +/// seeded from entropy (via the thread RNG) so each run differs. +/// Seeding `StdRng` from a single `u64` in both cases keeps +/// construction uniform and avoids `rand`'s churn-prone from-entropy +/// constructors. +#[must_use] +pub fn make_rng(seed: Option<u64>) -> SeedRng { + let seed = seed.unwrap_or_else(|| rand::rng().random::<u64>()); + StdRng::seed_from_u64(seed) +} + +/// A column described in just enough detail to choose and run a +/// generator. Built by the executor from the real schema; kept +/// independent of `db.rs`'s `ReadColumn` so this library stays pure. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ColumnSpec { + /// The column's name — the primary signal for generator choice. + pub name: String, + /// The user-facing playground type — gates every name heuristic. + pub ty: Type, + /// `NOT NULL` — the executor uses this for the block guard (D1); + /// generation always produces a value, so it is informational here. + pub not_null: bool, + /// Part of the table's primary key. + pub primary_key: bool, + /// Carries a `UNIQUE` constraint (or is a single-column PK). 
+ pub unique: bool, + /// A foreign-key column — generation is the executor's job + /// (sample an existing parent row, D14), so [`choose_generator`] + /// returns [`Generator::ForeignKeySample`]. + pub is_foreign_key: bool, + /// Values parsed from a simple `col IN ('a', 'b', …)` CHECK + /// constraint (D17). When present, generation draws from them so + /// the common enum-as-CHECK pattern "just works". + pub check_in_values: Option<Vec<String>>, +} + +impl ColumnSpec { + /// Convenience constructor for a plain, unconstrained column — + /// used heavily in tests. + #[cfg(test)] + #[must_use] + pub fn plain(name: &str, ty: Type) -> Self { + Self { + name: name.to_string(), + ty, + not_null: false, + primary_key: false, + unique: false, + is_foreign_key: false, + check_in_values: None, + } + } +} + +/// The chosen generation strategy for a column. +/// +/// Most variants are *stateless* — [`generate_value`] turns them into a +/// [`Value`] from the RNG alone. Two are *stateful markers* that the +/// executor must intercept (they need database context): +/// [`Self::IdentitySequential`] (the running `MAX+offset` sequence, +/// D10) and [`Self::ForeignKeySample`] (draw from existing parent +/// rows, D14). For safety [`generate_value`] treats an un-intercepted +/// marker as [`Self::Generic`] rather than panicking. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Generator { + // — Person — + FirstName, + LastName, + /// A full person name (table-context default for `name`/`title`). + FullName, + Email, + Username, + Password, + Phone, + // — Address — + City, + Country, + StateName, + Street, + ZipCode, + // — Organisation / commerce — + Company, + JobTitle, + /// Hand-rolled `{adjective} {material} {noun}` (D9) — `fake` has no + /// commerce module. + ProductName, + // — Free text — + Sentence, + Paragraph, + Url, + HexColor, + // — Numeric — + /// A money-shaped amount (whole for `int`, two-decimal otherwise). + CurrencyAmount, + /// A plausible human age (18–80). 
+ Age, + /// A small positive integer (quantities, counts). + SmallInt, + // — Temporal (bounded windows, D8) — + /// A date within the last few years. + DateRecent, + /// A date in an adult birth window (≈18–80 years ago) — for `dob`. + DateAdult, + /// A datetime within the last few years. + DateTimeRecent, + // — Boolean — + Boolean, + // — Stateful markers (executor-resolved) — + /// Unique sequential identifier (D10): the executor supplies + /// `MAX(col)+offset`. Chosen for identifier-named non-FK columns. + IdentitySequential, + /// FK column (D14): the executor samples an existing parent key. + ForeignKeySample, + // — List / fallback — + /// Uniform pick from a fixed list — a simple `IN`-CHECK (D17), an + /// enum, or a future `set in (…)` override. + PickFrom(Vec<String>), + /// Type-based fallback (D8) when no name heuristic matches. + Generic, +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn same_seed_yields_identical_rng_streams() { + let mut a = make_rng(Some(42)); + let mut b = make_rng(Some(42)); + let xs: Vec<u64> = (0..8).map(|_| a.random::<u64>()).collect(); + let ys: Vec<u64> = (0..8).map(|_| b.random::<u64>()).collect(); + assert_eq!(xs, ys, "a fixed seed must reproduce the stream"); + } + + #[test] + fn different_seeds_yield_different_streams() { + let mut a = make_rng(Some(1)); + let mut b = make_rng(Some(2)); + let xs: Vec<u64> = (0..8).map(|_| a.random::<u64>()).collect(); + let ys: Vec<u64> = (0..8).map(|_| b.random::<u64>()).collect(); + assert_ne!(xs, ys); + } + + #[test] + fn unseeded_rng_constructs_without_panicking() { + // Entropy-seeded path: just exercise it. + let mut rng = make_rng(None); + let _ = rng.random::<u64>(); + } +}