Files
rdbms-playground/src/seed/mod.rs
T
claude@clouddev1 9c135010ba feat(seed): uniqueness, junction distinct-combos, IN-CHECK (ADR-0048 P1.3b)
do_seed now enforces value uniqueness and derives enum values:
- Uniqueness groups (D10): the user-fillable PK, compound UNIQUE
  constraints, and single-column UNIQUE / identifier columns stay
  distinct across the batch and against existing rows (retry per row).
  Junction distinct-combos fall out of PK-tuple uniqueness and cap at
  the available parent combinations (logged when capped; the
  user-facing note arrives with the advisory in P1.3c).
- Identifier-int columns get a monotonic sequence past MAX(col) (D10),
  so they never collide.
- IN-CHECK derivation (D17): a simple `col IN ('a','b')` CHECK becomes
  the value source via the new, unit-tested seed::parse_in_check_values,
  so the enum-as-CHECK pattern just works.

8 parser unit tests + 4 integration tests (unique column, identifier
sequencing, junction cap, IN-check enum). 2343 pass / 0 fail / 0 skip,
clippy all-targets clean.

Deferred to P1.3c: dedicated SeedResult + capped preview (D18) + the
enum/CHECK advisory incl. the cap note (D12/D13); P1.3d: multi-row path.
2026-06-11 18:50:05 +00:00

203 lines
7.2 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Pure fake-data generation library for the `seed` command (ADR-0048).
//!
//! This module is the **generation half** of `seed`: given a column's
//! shape (name, type, constraints), it chooses a *generator* and turns
//! a seeded RNG into plausible [`Value`]s. It is deliberately decoupled
//! from `db.rs` — it knows nothing about SQLite, the worker thread, or
//! persistence — so it stays pure and unit-testable, with exact-value
//! assertions made possible by the seedable RNG (ADR-0048 D4).
//!
//! The executor (`db.rs::do_seed`) adapts the real schema into
//! [`ColumnSpec`]s, calls [`choose_generator`] per column, and then
//! [`generate_value`] per row — except for the *stateful* markers
//! ([`Generator::IdentitySequential`], [`Generator::ForeignKeySample`])
//! which need database context (existing rows, the running sequence)
//! and so are resolved by the executor, not here.
//!
//! Layout:
//! - this file — the public types ([`ColumnSpec`], [`Generator`],
//! [`SeedRng`]) and the RNG constructor.
//! - [`heuristics`] — [`choose_generator`] + the name-aware catalogue
//! (D7), table-context disambiguation (D11), identifier (D10) and
//! enum-ish (D12) detection.
//! - [`generators`] — [`generate_value`]: per-generator value
//! production, the hand-rolled `product` generator (D9) and the
//! bounded date windows (D8).
mod check;
mod generators;
mod heuristics;
pub use check::parse_in_check_values;
pub use generators::generate_value;
pub use heuristics::{choose_generator, is_enum_ish};
use rand::rngs::StdRng;
use rand::{RngExt, SeedableRng};
use crate::dsl::types::Type;
/// The RNG that drives all seed generation.
///
/// A single seeded `StdRng` feeds both `fake`'s `fake_with_rng` and the
/// hand-rolled generators, so a `--seed` value fully determines the
/// output (ADR-0048 D4). `rand 0.10`'s `StdRng` satisfies `fake`'s
/// `RngExt` bound (it re-exports `rand::RngExt`), so the same handle
/// works on both sides.
///
/// Construct one with [`make_rng`].
pub type SeedRng = StdRng;
/// Build the seed RNG.
///
/// With `Some(seed)` the stream is reproducible; with `None` it is
/// seeded from entropy (via the thread RNG) so each run differs.
/// Seeding `StdRng` from a single `u64` in both cases keeps
/// construction uniform and avoids `rand`'s churn-prone from-entropy
/// constructors.
#[must_use]
pub fn make_rng(seed: Option<u64>) -> SeedRng {
let seed = seed.unwrap_or_else(|| rand::rng().random::<u64>());
StdRng::seed_from_u64(seed)
}
/// One column, reduced to the facts generator selection needs.
///
/// A column described in just enough detail to choose and run a
/// generator. Built by the executor from the real schema; kept
/// independent of `db.rs`'s `ReadColumn` so this library stays pure.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ColumnSpec {
/// The column's name — the primary signal for generator choice.
pub name: String,
/// The user-facing playground type — gates every name heuristic.
pub ty: Type,
/// `NOT NULL` — the executor uses this for the block guard (D1);
/// generation always produces a value, so it is informational here.
pub not_null: bool,
/// Part of the table's primary key.
pub primary_key: bool,
/// Carries a `UNIQUE` constraint (or is a single-column PK).
pub unique: bool,
/// A foreign-key column — generation is the executor's job
/// (sample an existing parent row, D14), so [`choose_generator`]
/// returns [`Generator::ForeignKeySample`].
pub is_foreign_key: bool,
/// Values parsed from a simple `col IN ('a', 'b', …)` CHECK
/// constraint (D17). When present, generation draws from them so
/// the common enum-as-CHECK pattern "just works".
///
/// NOTE(review): `None` presumably means the column has no CHECK of
/// that simple shape — confirm against the executor that builds
/// these specs.
pub check_in_values: Option<Vec<String>>,
}
impl ColumnSpec {
    /// Test-only shorthand: a column with the given `name` and `ty` and
    /// every constraint switched off (nullable, not a key, not unique,
    /// not a foreign key, no `IN`-CHECK value list).
    #[cfg(test)]
    #[must_use]
    pub fn plain(name: &str, ty: Type) -> Self {
        let name = String::from(name);
        Self {
            name,
            ty,
            check_in_values: None,
            is_foreign_key: false,
            unique: false,
            primary_key: false,
            not_null: false,
        }
    }
}
/// The chosen generation strategy for a column.
///
/// Most variants are *stateless* — [`generate_value`] turns them into a
/// [`Value`] from the RNG alone. Two are *stateful markers* that the
/// executor must intercept (they need database context):
/// [`Self::IdentitySequential`] (the running `MAX+offset` sequence,
/// D10) and [`Self::ForeignKeySample`] (draw from existing parent
/// rows, D14). For safety [`generate_value`] treats an un-intercepted
/// marker as [`Self::Generic`] rather than panicking.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Generator {
// — Person —
FirstName,
LastName,
/// A full person name (table-context default for `name`/`title`).
FullName,
Email,
Username,
Password,
Phone,
// — Address —
City,
Country,
StateName,
Street,
ZipCode,
// — Organisation / commerce —
Company,
JobTitle,
/// Hand-rolled `{adjective} {material} {noun}` (D9) — `fake` has no
/// commerce module.
ProductName,
// — Free text —
Sentence,
Paragraph,
Url,
HexColor,
// — Numeric —
/// A money-shaped amount (whole for `int`, two-decimal otherwise).
CurrencyAmount,
/// A plausible human age (presumably 18–80; confirm the exact bounds
/// against `generators`).
Age,
/// A small positive integer (quantities, counts).
SmallInt,
// — Temporal (bounded windows, D8) —
/// A date within the last few years.
DateRecent,
/// A date in an adult birth window (presumably ≈18–80 years ago;
/// confirm the exact bounds against `generators`) — for `dob`.
DateAdult,
/// A datetime within the last few years.
DateTimeRecent,
// — Boolean —
Boolean,
// — Stateful markers (executor-resolved) —
/// Unique sequential identifier (D10): the executor supplies
/// `MAX(col)+offset`. Chosen for identifier-named non-FK columns.
IdentitySequential,
/// FK column (D14): the executor samples an existing parent key.
ForeignKeySample,
// — List / fallback —
/// Uniform pick from a fixed list — a simple `IN`-CHECK (D17), an
/// enum, or a future `set <col> in (…)` override.
PickFrom(Vec<String>),
/// Type-based fallback (D8) when no name heuristic matches.
Generic,
}
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// How many values each stream comparison draws.
    const SAMPLE_LEN: usize = 8;

    /// Pull the next [`SAMPLE_LEN`] `u64` draws out of `rng`.
    fn sample(rng: &mut SeedRng) -> Vec<u64> {
        std::iter::repeat_with(|| rng.random::<u64>())
            .take(SAMPLE_LEN)
            .collect()
    }

    /// Two generators built from the same seed must agree draw-for-draw.
    #[test]
    fn same_seed_yields_identical_rng_streams() {
        let mut first = make_rng(Some(42));
        let mut second = make_rng(Some(42));
        let lhs = sample(&mut first);
        let rhs = sample(&mut second);
        assert_eq!(lhs, rhs, "a fixed seed must reproduce the stream");
    }

    /// Distinct seeds must not produce the same opening sequence.
    #[test]
    fn different_seeds_yield_different_streams() {
        let mut first = make_rng(Some(1));
        let mut second = make_rng(Some(2));
        let lhs = sample(&mut first);
        let rhs = sample(&mut second);
        assert_ne!(lhs, rhs);
    }

    /// The entropy-seeded path has no value to pin; just exercise it.
    #[test]
    fn unseeded_rng_constructs_without_panicking() {
        let mut entropy_rng = make_rng(None);
        let _ = entropy_rng.random::<u64>();
    }
}