feat(seed): fake-data generation library + fake dependency (ADR-0048 P1.1)
The pure generation half of `seed` — no command wiring yet:
- src/seed/: ColumnSpec + Generator model and a seeded StdRng; the type-gated name-heuristic catalogue (D7) with documented false-positive guards; table-context name disambiguation (D11); identifier (D10) and enum-ish (D12) detection; per-type + bounded-date generators (D8); the hand-rolled product generator (D9); and PickFrom for IN-CHECK / enum lists.
- Adds the `fake` crate (v5, default features). Verified: single rand 0.10.1 (no duplication), determinism via one seeded StdRng driving both fake and the hand-rolled generators, security-clean across osv/grype/trivy.
- ADR-0048 D3 updated to record the dependency verification.

32 Tier-1 tests (exact-value via fixed --seed); 1673 lib tests pass, clippy all-targets clean.
This commit is contained in:
+200
@@ -0,0 +1,200 @@
|
||||
//! Pure fake-data generation library for the `seed` command (ADR-0048).
|
||||
//!
|
||||
//! This module is the **generation half** of `seed`: given a column's
|
||||
//! shape (name, type, constraints), it chooses a *generator* and turns
|
||||
//! a seeded RNG into plausible [`Value`]s. It is deliberately decoupled
|
||||
//! from `db.rs` — it knows nothing about SQLite, the worker thread, or
|
||||
//! persistence — so it stays pure and unit-testable, with exact-value
|
||||
//! assertions made possible by the seedable RNG (ADR-0048 D4).
|
||||
//!
|
||||
//! The executor (`db.rs::do_seed`) adapts the real schema into
|
||||
//! [`ColumnSpec`]s, calls [`choose_generator`] per column, and then
|
||||
//! [`generate_value`] per row — except for the *stateful* markers
|
||||
//! ([`Generator::IdentitySequential`], [`Generator::ForeignKeySample`])
|
||||
//! which need database context (existing rows, the running sequence)
|
||||
//! and so are resolved by the executor, not here.
|
||||
//!
|
||||
//! Layout:
|
||||
//! - this file — the public types ([`ColumnSpec`], [`Generator`],
|
||||
//! [`SeedRng`]) and the RNG constructor.
|
||||
//! - [`heuristics`] — [`choose_generator`] + the name-aware catalogue
|
||||
//! (D7), table-context disambiguation (D11), identifier (D10) and
|
||||
//! enum-ish (D12) detection.
|
||||
//! - [`generators`] — [`generate_value`]: per-generator value
|
||||
//! production, the hand-rolled `product` generator (D9) and the
|
||||
//! bounded date windows (D8).
|
||||
|
||||
mod generators;
|
||||
mod heuristics;
|
||||
|
||||
pub use generators::generate_value;
|
||||
pub use heuristics::{choose_generator, is_enum_ish};
|
||||
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{RngExt, SeedableRng};
|
||||
|
||||
use crate::dsl::types::Type;
|
||||
|
||||
/// The RNG that drives all seed generation.
|
||||
///
|
||||
/// A single seeded `StdRng` feeds both `fake`'s `fake_with_rng` and the
|
||||
/// hand-rolled generators, so a `--seed` value fully determines the
|
||||
/// output (ADR-0048 D4). `rand 0.10`'s `StdRng` satisfies `fake`'s
|
||||
/// `RngExt` bound (it re-exports `rand::RngExt`), so the same handle
|
||||
/// works on both sides.
|
||||
pub type SeedRng = StdRng;
|
||||
|
||||
/// Build the seed RNG.
|
||||
///
|
||||
/// With `Some(seed)` the stream is reproducible; with `None` it is
|
||||
/// seeded from entropy (via the thread RNG) so each run differs.
|
||||
/// Seeding `StdRng` from a single `u64` in both cases keeps
|
||||
/// construction uniform and avoids `rand`'s churn-prone from-entropy
|
||||
/// constructors.
|
||||
#[must_use]
|
||||
pub fn make_rng(seed: Option<u64>) -> SeedRng {
|
||||
let seed = seed.unwrap_or_else(|| rand::rng().random::<u64>());
|
||||
StdRng::seed_from_u64(seed)
|
||||
}
|
||||
|
||||
/// A column described in just enough detail to choose and run a
|
||||
/// generator. Built by the executor from the real schema; kept
|
||||
/// independent of `db.rs`'s `ReadColumn` so this library stays pure.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ColumnSpec {
|
||||
/// The column's name — the primary signal for generator choice.
|
||||
pub name: String,
|
||||
/// The user-facing playground type — gates every name heuristic.
|
||||
pub ty: Type,
|
||||
/// `NOT NULL` — the executor uses this for the block guard (D1);
|
||||
/// generation always produces a value, so it is informational here.
|
||||
pub not_null: bool,
|
||||
/// Part of the table's primary key.
|
||||
pub primary_key: bool,
|
||||
/// Carries a `UNIQUE` constraint (or is a single-column PK).
|
||||
pub unique: bool,
|
||||
/// A foreign-key column — generation is the executor's job
|
||||
/// (sample an existing parent row, D14), so [`choose_generator`]
|
||||
/// returns [`Generator::ForeignKeySample`].
|
||||
pub is_foreign_key: bool,
|
||||
/// Values parsed from a simple `col IN ('a', 'b', …)` CHECK
|
||||
/// constraint (D17). When present, generation draws from them so
|
||||
/// the common enum-as-CHECK pattern "just works".
|
||||
pub check_in_values: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
impl ColumnSpec {
|
||||
/// Convenience constructor for a plain, unconstrained column —
|
||||
/// used heavily in tests.
|
||||
#[cfg(test)]
|
||||
#[must_use]
|
||||
pub fn plain(name: &str, ty: Type) -> Self {
|
||||
Self {
|
||||
name: name.to_string(),
|
||||
ty,
|
||||
not_null: false,
|
||||
primary_key: false,
|
||||
unique: false,
|
||||
is_foreign_key: false,
|
||||
check_in_values: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The generation strategy selected for a column.
///
/// Most variants are *stateless*: [`generate_value`] produces a
/// `Value` for them from the RNG alone. Two are *stateful markers*
/// that need database context and must be intercepted by the executor:
/// [`Self::IdentitySequential`] (the running `MAX+offset` sequence,
/// D10) and [`Self::ForeignKeySample`] (a draw from existing parent
/// rows, D14). For safety, [`generate_value`] treats a marker that was
/// not intercepted as [`Self::Generic`] instead of panicking.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Generator {
    // — Person —
    FirstName,
    LastName,
    /// Full person name (table-context default for `name`/`title`).
    FullName,
    Email,
    Username,
    Password,
    Phone,

    // — Address —
    City,
    Country,
    StateName,
    Street,
    ZipCode,

    // — Organisation / commerce —
    Company,
    JobTitle,
    /// Hand-rolled `{adjective} {material} {noun}` (D9), because `fake`
    /// has no commerce module.
    ProductName,

    // — Free text —
    Sentence,
    Paragraph,
    Url,
    HexColor,

    // — Numeric —
    /// Money-shaped amount: whole for `int`, two decimals otherwise.
    CurrencyAmount,
    /// Plausible human age (18–80).
    Age,
    /// Small positive integer (quantities, counts).
    SmallInt,

    // — Temporal (bounded windows, D8) —
    /// Date within the last few years.
    DateRecent,
    /// Date in an adult birth window (≈18–80 years ago) — for `dob`.
    DateAdult,
    /// Datetime within the last few years.
    DateTimeRecent,

    // — Boolean —
    Boolean,

    // — Stateful markers (executor-resolved) —
    /// Unique sequential identifier (D10); the executor supplies
    /// `MAX(col)+offset`. Chosen for identifier-named non-FK columns.
    IdentitySequential,
    /// Foreign-key column (D14); the executor samples an existing
    /// parent key.
    ForeignKeySample,

    // — List / fallback —
    /// Uniform pick from a fixed list: a simple `IN`-CHECK (D17), an
    /// enum, or a future `set <col> in (…)` override.
    PickFrom(Vec<String>),
    /// Type-based fallback (D8), used when no name heuristic matches.
    Generic,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// How many consecutive values each stream comparison looks at.
    const DRAWS: usize = 8;

    /// Pull `DRAWS` consecutive `u64`s from `rng`.
    fn draw(rng: &mut SeedRng) -> Vec<u64> {
        std::iter::repeat_with(|| rng.random::<u64>())
            .take(DRAWS)
            .collect()
    }

    #[test]
    fn same_seed_yields_identical_rng_streams() {
        let mut first = make_rng(Some(42));
        let mut second = make_rng(Some(42));
        let xs = draw(&mut first);
        let ys = draw(&mut second);
        assert_eq!(xs, ys, "a fixed seed must reproduce the stream");
    }

    #[test]
    fn different_seeds_yield_different_streams() {
        let mut first = make_rng(Some(1));
        let mut second = make_rng(Some(2));
        let xs = draw(&mut first);
        let ys = draw(&mut second);
        assert_ne!(xs, ys);
    }

    #[test]
    fn unseeded_rng_constructs_without_panicking() {
        // Entropy-seeded path: only checks that construction and one
        // draw complete; the value itself is not inspected.
        let mut rng = make_rng(None);
        let _ = rng.random::<u64>();
    }
}
|
||||
Reference in New Issue
Block a user