feat(seed): fake-data generation library + fake dependency (ADR-0048 P1.1)

The pure generation half of `seed` — no command wiring yet:
- src/seed/: ColumnSpec + Generator model and a seeded StdRng; the
  type-gated name-heuristic catalogue (D7) with documented
  false-positive guards; table-context name disambiguation (D11);
  identifier (D10) and enum-ish (D12) detection; per-type + bounded-date
  generators (D8); the hand-rolled product generator (D9); and PickFrom
  for IN-CHECK / enum lists.
- Adds the `fake` crate (v5, default features). Verified: single rand
  0.10.1 (no duplication), determinism via one seeded StdRng driving
  both fake and the hand-rolled generators, security-clean across
  osv/grype/trivy.
- ADR-0048 D3 updated to record the dependency verification.

32 Tier-1 tests (exact-value via fixed --seed); 1673 lib tests pass,
clippy all-targets clean.
This commit is contained in:
claude@clouddev1
2026-06-11 15:35:17 +00:00
parent 0af7f56821
commit 202e25a94f
7 changed files with 1072 additions and 16 deletions
+200
View File
@@ -0,0 +1,200 @@
//! Pure fake-data generation library for the `seed` command (ADR-0048).
//!
//! This module is the **generation half** of `seed`: given a column's
//! shape (name, type, constraints), it chooses a *generator* and turns
//! a seeded RNG into plausible [`Value`]s. It is deliberately decoupled
//! from `db.rs` — it knows nothing about SQLite, the worker thread, or
//! persistence — so it stays pure and unit-testable, with exact-value
//! assertions made possible by the seedable RNG (ADR-0048 D4).
//!
//! The executor (`db.rs::do_seed`) adapts the real schema into
//! [`ColumnSpec`]s, calls [`choose_generator`] per column, and then
//! [`generate_value`] per row — except for the *stateful* markers
//! ([`Generator::IdentitySequential`], [`Generator::ForeignKeySample`])
//! which need database context (existing rows, the running sequence)
//! and so are resolved by the executor, not here.
//!
//! Layout:
//! - this file — the public types ([`ColumnSpec`], [`Generator`],
//! [`SeedRng`]) and the RNG constructor.
//! - [`heuristics`] — [`choose_generator`] + the name-aware catalogue
//! (D7), table-context disambiguation (D11), identifier (D10) and
//! enum-ish (D12) detection.
//! - [`generators`] — [`generate_value`]: per-generator value
//! production, the hand-rolled `product` generator (D9) and the
//! bounded date windows (D8).
mod generators;
mod heuristics;
pub use generators::generate_value;
pub use heuristics::{choose_generator, is_enum_ish};
use rand::rngs::StdRng;
use rand::{RngExt, SeedableRng};
use crate::dsl::types::Type;
/// The RNG that drives all seed generation.
///
/// A single seeded `StdRng` feeds both `fake`'s `fake_with_rng` and the
/// hand-rolled generators, so a `--seed` value fully determines the
/// output (ADR-0048 D4). `rand 0.10`'s `StdRng` satisfies `fake`'s
/// `RngExt` bound (`fake` re-exports `rand::RngExt`), so the same handle
/// works on both sides.
///
/// Use [`make_rng`] to construct one.
pub type SeedRng = StdRng;
/// Construct the [`SeedRng`] used for a seeding run.
///
/// * `Some(seed)` — the supplied value seeds the generator, so the
///   output stream is reproducible from run to run.
/// * `None` — a fresh `u64` is drawn from the thread RNG and used as
///   the seed, so each run differs.
///
/// Both paths seed `StdRng` from a single `u64`, which keeps
/// construction uniform and avoids `rand`'s churn-prone from-entropy
/// constructors.
#[must_use]
pub fn make_rng(seed: Option<u64>) -> SeedRng {
    let seed_value = match seed {
        Some(fixed) => fixed,
        // No explicit seed: pull one from the thread RNG.
        None => rand::rng().random::<u64>(),
    };
    StdRng::seed_from_u64(seed_value)
}
/// A column described in just enough detail to choose and run a
/// generator. Built by the executor from the real schema; kept
/// independent of `db.rs`'s `ReadColumn` so this library stays pure.
///
/// All fields are public plain data; the struct carries no behaviour
/// beyond the derived `Debug`/`Clone`/equality impls.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ColumnSpec {
/// The column's name — the primary signal for generator choice.
pub name: String,
/// The user-facing playground type — gates every name heuristic.
pub ty: Type,
/// `true` when the column is `NOT NULL`. The executor uses this for
/// the block guard (D1); generation always produces a value, so it is
/// informational here.
pub not_null: bool,
/// `true` when the column is part of the table's primary key.
pub primary_key: bool,
/// `true` when the column carries a `UNIQUE` constraint (or is a
/// single-column PK).
pub unique: bool,
/// `true` for a foreign-key column — generation is the executor's job
/// (sample an existing parent row, D14), so [`choose_generator`]
/// returns [`Generator::ForeignKeySample`].
pub is_foreign_key: bool,
/// Values parsed from a simple `col IN ('a', 'b', …)` CHECK
/// constraint (D17). When present (`Some`), generation draws from
/// them so the common enum-as-CHECK pattern "just works".
pub check_in_values: Option<Vec<String>>,
}
impl ColumnSpec {
    /// Test-only shorthand for a plain column: the given name and type,
    /// every constraint flag cleared and no `IN`-CHECK value list.
    #[cfg(test)]
    #[must_use]
    pub fn plain(name: &str, ty: Type) -> Self {
        // Own the name up front so the literal below can use shorthand.
        let name = String::from(name);
        Self {
            name,
            ty,
            check_in_values: None,
            is_foreign_key: false,
            unique: false,
            primary_key: false,
            not_null: false,
        }
    }
}
/// The chosen generation strategy for a column.
///
/// Most variants are *stateless* — [`generate_value`] turns them into a
/// [`Value`] from the RNG alone. Two are *stateful markers* that the
/// executor must intercept (they need database context):
/// [`Self::IdentitySequential`] (the running `MAX+offset` sequence,
/// D10) and [`Self::ForeignKeySample`] (draw from existing parent
/// rows, D14). For safety [`generate_value`] treats an un-intercepted
/// marker as [`Self::Generic`] rather than panicking.
///
/// Only [`Self::PickFrom`] carries data (its candidate list); every
/// other variant is a unit variant.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Generator {
// — Person —
FirstName,
LastName,
/// A full person name (table-context default for `name`/`title`).
FullName,
Email,
Username,
Password,
Phone,
// — Address —
City,
Country,
StateName,
Street,
ZipCode,
// — Organisation / commerce —
Company,
JobTitle,
/// Hand-rolled `{adjective} {material} {noun}` (D9) — `fake` has no
/// commerce module.
ProductName,
// — Free text —
Sentence,
Paragraph,
Url,
HexColor,
// — Numeric —
/// A money-shaped amount (whole for `int`, two-decimal otherwise).
CurrencyAmount,
/// A plausible human age (18 to 80).
Age,
/// A small positive integer (quantities, counts).
SmallInt,
// — Temporal (bounded windows, D8) —
/// A date within the last few years.
DateRecent,
/// A date in an adult birth window (≈18 to 80 years ago) — for `dob`.
DateAdult,
/// A datetime within the last few years.
DateTimeRecent,
// — Boolean —
Boolean,
// — Stateful markers (executor-resolved) —
/// Unique sequential identifier (D10): the executor supplies
/// `MAX(col)+offset`. Chosen for identifier-named non-FK columns.
IdentitySequential,
/// FK column (D14): the executor samples an existing parent key.
ForeignKeySample,
// — List / fallback —
/// Uniform pick from a fixed list — a simple `IN`-CHECK (D17), an
/// enum, or a future `set <col> in (…)` override.
PickFrom(Vec<String>),
/// Type-based fallback (D8) when no name heuristic matches.
Generic,
}
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// How many values each stream comparison draws.
    const DRAWS: usize = 8;

    /// Pull the next `DRAWS` `u64`s out of `rng`, in order.
    fn sample(rng: &mut SeedRng) -> Vec<u64> {
        std::iter::repeat_with(|| rng.random::<u64>())
            .take(DRAWS)
            .collect()
    }

    /// Two RNGs built from the same explicit seed emit the same values.
    #[test]
    fn same_seed_yields_identical_rng_streams() {
        let mut first = make_rng(Some(42));
        let mut second = make_rng(Some(42));
        let xs = sample(&mut first);
        let ys = sample(&mut second);
        assert_eq!(xs, ys, "a fixed seed must reproduce the stream");
    }

    /// Distinct explicit seeds must not collapse onto one stream.
    #[test]
    fn different_seeds_yield_different_streams() {
        let mut first = make_rng(Some(1));
        let mut second = make_rng(Some(2));
        let xs = sample(&mut first);
        let ys = sample(&mut second);
        assert_ne!(xs, ys);
    }

    /// The `None` (entropy-seeded) path only needs to construct and
    /// produce a value; the value itself is unpredictable by design.
    #[test]
    fn unseeded_rng_constructs_without_panicking() {
        let mut entropy_rng = make_rng(None);
        let _drawn: u64 = entropy_rng.random();
    }
}