feat(seed): set override clause + column-fill (ADR-0048 Phase 2)

Build the two SD2 surfaces Phase 1 deferred:

- `set` override clause (D2): comma-separated per-column pins —
  `= 'v'` (fixed), `in ('a','b')` (pick-list), `as <generator>`
  (named), `between x and y` (range; numeric and quoted dates).
  Type-aware via the typed `current_column_value` slot; an override
  drops its column from the generic-fill advisory (D13). Folded from
  the flat matched path (build_seed_overrides) and applied to the
  per-column plan (apply_seed_overrides).
- `<table>.<column>` column-fill (D1 form 2): an UPDATE over existing
  rows. It refuses PK/autogen targets, treats an empty table as a no-op,
  samples the parent for an FK target, generates collision-free values
  for UNIQUE/identifier targets, and is one undo step; `set` may only
  adjust the filled column.

Supporting work: KNOWN_GENERATORS vocabulary + generator_for_name
(src/seed/vocabulary.rs, D9); a range Generator + range_bounds_reason;
IdentSource::Generators and HighlightClass::Function; completion of the
generator vocabulary after `as` and the set/.col column slots; the
typing-time validity indicator for an unknown generator; help,
parse-error pedagogy rows, and the D13 advisory's Phase-2/3 wording.

A bounded override (fixed value / too-short pick-list) on a
single-column-UNIQUE target is a friendly error rather than a silent
uniqueness cap (post-implementation /runda finding, user-chosen).

Dates in the range form are quoted (no date-literal token exists);
ADR-0048 D2 amended accordingly. Both modes (D5); reproducible (D4).
This commit is contained in:
claude@clouddev1
2026-06-12 09:44:30 +00:00
parent 78c38e8b33
commit a12facc784
20 changed files with 1913 additions and 65 deletions
+478 -14
View File
@@ -33,7 +33,8 @@ use tracing::{debug, info, warn};
use crate::dsl::action::ReferentialAction;
use crate::dsl::command::{
ChangeColumnMode, Command, CompareOp, Constraint, ConstraintKind, Expr, IndexSelector,
Operand, Predicate, RelationshipSelector, RowFilter, SqlForeignKey,
Operand, Predicate, RelationshipSelector, RowFilter, SeedOverride, SeedOverrideKind,
SqlForeignKey,
};
use crate::dsl::ColumnSpec;
use crate::dsl::shortid;
@@ -723,7 +724,9 @@ enum Request {
/// snapshot wraps the whole seed via `snapshot_then`.
Seed {
table: String,
target_column: Option<String>,
count: Option<u64>,
overrides: Vec<SeedOverride>,
rng_seed: Option<u64>,
source: Option<String>,
reply: oneshot::Sender<Result<SeedResult, DbError>>,
@@ -1517,18 +1520,22 @@ impl Database {
recv.await.map_err(|_| DbError::WorkerGone)?
}
/// Populate a table with generated fake data (ADR-0048, SD1).
/// Populate a table with generated fake data (ADR-0048, SD1/SD2).
pub async fn seed(
&self,
table: String,
target_column: Option<String>,
count: Option<u64>,
overrides: Vec<SeedOverride>,
rng_seed: Option<u64>,
source: Option<String>,
) -> Result<SeedResult, DbError> {
let (reply, recv) = oneshot::channel();
self.send(Request::Seed {
table,
target_column,
count,
overrides,
rng_seed,
source,
reply,
@@ -2694,7 +2701,9 @@ fn handle_request(
}
Request::Seed {
table,
target_column,
count,
overrides,
rng_seed,
source,
reply,
@@ -2706,7 +2715,9 @@ fn handle_request(
persistence,
source.as_deref(),
&table,
target_column.as_deref(),
count,
&overrides,
rng_seed,
));
}
@@ -2938,7 +2949,10 @@ fn do_list_names_for(
}
Ok(out)
}
IdentSource::NewName | IdentSource::Types | IdentSource::Free => Ok(Vec::new()),
IdentSource::NewName
| IdentSource::Types
| IdentSource::Generators
| IdentSource::Free => Ok(Vec::new()),
}
}
@@ -8808,14 +8822,13 @@ fn sample_parent_key_tuples(
Ok(tuples)
}
/// Populate a table with generated fake data (ADR-0048, SD1).
/// Populate a table with generated fake data (ADR-0048, SD1/SD2).
///
/// **Phase 1.** Generates whole rows and inserts them one at a time
/// through [`do_insert`] — reusing all the existing per-value
/// validation, autogen autofill, FK-error enrichment and persistence
/// machinery. The whole seed is a single undo step (the worker wraps
/// the call in one `snapshot_then`) and writes exactly one
/// `history.log` line (only the first row carries the `source`).
/// Generates whole rows and inserts them in one transaction, reusing the
/// per-value validation, autogen autofill, FK-error enrichment and
/// persistence machinery via [`insert_one_row`]. The whole seed is a
/// single undo step (the worker wraps the call in one `snapshot_then`)
/// and writes exactly one `history.log` line.
///
/// Foreign-key columns are filled by sampling existing parent rows
/// (D14); a compound FK reads all its child columns from one sampled
@@ -8823,16 +8836,20 @@ fn sample_parent_key_tuples(
/// `NOT NULL blob` column (which seed cannot generate) is refused by
/// the block guard (D1); a nullable blob is omitted (→ NULL).
///
/// Deferred: identifier/constraint uniqueness incl. junction
/// distinct-combos (D10), the `IN`-CHECK value derivation (D17), the
/// efficient single-transaction multi-row path, the capped auto-show
/// preview (D18), and the enum/CHECK advisory (D12/D13).
/// **Phase 2 (SD2):** when `target_column` is `Some`, this delegates to
/// [`do_seed_column_fill`] (fill one column across existing rows, D1
/// form 2). `overrides` carries the `set <col> …` clause (D2): per-column
/// pins that replace the heuristic generator and drop the column from the
/// generic-fill advisory (D13).
#[allow(clippy::too_many_arguments)]
fn do_seed(
conn: &Connection,
persistence: Option<&Persistence>,
source: Option<&str>,
table: &str,
target_column: Option<&str>,
count: Option<u64>,
overrides: &[SeedOverride],
rng_seed: Option<u64>,
) -> Result<SeedResult, DbError> {
use crate::seed;
@@ -8840,6 +8857,14 @@ fn do_seed(
let canonical_table = require_canonical_table(conn, table)?;
let table = canonical_table.as_str();
// Column-fill (D1 form 2) is a distinct UPDATE path.
if let Some(col) = target_column {
return do_seed_column_fill(
conn, persistence, source, table, col, count, overrides, rng_seed,
);
}
let n = count.unwrap_or(DEFAULT_SEED_COUNT);
debug!(table = %table, count = n, "seed");
if n > MAX_SEED_COUNT {
@@ -8937,6 +8962,17 @@ fn do_seed(
}
}
// Apply the `set <col> …` overrides (D2): each replaces the named
// column's plan with the pinned generator and removes it from the
// generic-fill advisory (the user chose its values deliberately,
// D13). An override that names a non-fillable column is a friendly
// error; a bounded value source (fixed / pick-list) that can't supply
// enough distinct values for a single-column-UNIQUE target is refused
// up front rather than silently capped (DA finding). FK / type binding
// still apply — a value that violates a constraint surfaces through the
// existing FK-error guard.
apply_seed_overrides(&schema, overrides, n, &col_names, &mut plans, &mut advisory_columns)?;
// Uniqueness groups (ADR-0048 D10): value tuples that must stay
// distinct across the batch and against existing rows — the
// user-fillable PK (so junction distinct-combos fall out of this),
@@ -9131,6 +9167,434 @@ fn do_seed(
})
}
/// Fold the `set <col> …` clause (ADR-0048 D2) into the per-column
/// generation plan.
///
/// Overrides are processed in order. For each one:
///
/// 1. the column is located (ASCII case-insensitively) in `col_names`;
///    a name that is not there — unknown, or an auto-generated column —
///    is reported as a friendly error;
/// 2. the column's declared type is read from `schema`, defaulting to
///    `Type::Text` when the column has no user type;
/// 3. [`seed_override_capacity_guard`] runs against `row_count`;
/// 4. the entry of `plans` at the column's position in `col_names` is
///    replaced by the plan from [`seed_override_plan`];
/// 5. the column is removed from `advisory_columns` (D13 — the user
///    chose those values deliberately).
///
/// # Errors
///
/// Returns `DbError::Unsupported` for a non-fillable column, and
/// propagates any error from the capacity guard or plan construction.
/// Overrides applied before the failing one stay applied.
fn apply_seed_overrides(
    schema: &ReadSchema,
    overrides: &[SeedOverride],
    row_count: u64,
    col_names: &[String],
    plans: &mut [SeedColPlan],
    advisory_columns: &mut Vec<String>,
) -> Result<(), DbError> {
    for ov in overrides {
        let target: &str = &ov.column;

        // `plans` is indexed by the column's position in `col_names`.
        let slot = match col_names
            .iter()
            .position(|name| name.eq_ignore_ascii_case(target))
        {
            Some(slot) => slot,
            None => {
                return Err(DbError::Unsupported(format!(
                    "cannot apply `set {col} …`: `{col}` is not a fillable column of this \
                     table (it is unknown, or an auto-generated column).",
                    col = ov.column,
                )));
            }
        };

        let declared = schema
            .columns
            .iter()
            .find(|c| c.name.eq_ignore_ascii_case(target));
        let ty = match declared.and_then(|c| c.user_type) {
            Some(user_type) => user_type,
            None => Type::Text,
        };

        // Guard first, then build: a capacity refusal wins over a plan error.
        seed_override_capacity_guard(schema, target, &ov.kind, row_count)?;
        plans[slot] = seed_override_plan(&ov.kind, ty, target)?;
        advisory_columns.retain(|name| !name.eq_ignore_ascii_case(target));
    }
    Ok(())
}
/// Up-front refusal for a **bounded** override on a single-column-UNIQUE
/// target.
///
/// A fixed value offers one distinct value; a pick-list offers as many
/// as it has distinct literals. When that is fewer than the rows to fill
/// (`row_count`, treated as at least 1) and the column is unique on its
/// own — either flagged `unique`, or the sole primary-key column — the
/// run is refused with a friendly error instead of letting the
/// uniqueness machinery silently cap it (DA finding; the ADR left this
/// interaction open and the user chose a friendly error).
///
/// Named generators and ranges are treated as effectively unbounded and
/// always pass here; if one does exhaust, the existing
/// distinct-combination cap (D14) still applies.
/// NOTE(review): uniqueness groups elsewhere in this file are tagged
/// D10 — confirm which decision number the cap belongs to.
///
/// # Errors
///
/// `DbError::Unsupported` when the override cannot supply enough
/// distinct values, or when a pick-list entry is not a usable literal
/// (see [`seed_override_literal`]).
fn seed_override_capacity_guard(
    schema: &ReadSchema,
    column: &str,
    kind: &SeedOverrideKind,
    row_count: u64,
) -> Result<(), DbError> {
    use std::collections::HashSet;

    let distinct: usize = match kind {
        // Unbounded-enough sources — leave to the cap if they exhaust.
        SeedOverrideKind::Generator(_) | SeedOverrideKind::Range { .. } => return Ok(()),
        SeedOverrideKind::Fixed(_) => 1,
        SeedOverrideKind::PickList(values) => values
            .iter()
            .map(|v| seed_override_literal(v, column))
            .collect::<Result<HashSet<String>, DbError>>()?
            .len(),
    };

    let needed = row_count.max(1);
    if distinct as u64 >= needed {
        return Ok(());
    }

    // Only single-column uniqueness forces a cap: a compound UNIQUE or
    // compound PK can still be satisfied by varying the *other* columns.
    let flagged_unique = matches!(
        schema
            .columns
            .iter()
            .find(|c| c.name.eq_ignore_ascii_case(column)),
        Some(c) if c.unique
    );
    let sole_primary_key = matches!(
        &schema.primary_key[..],
        [only] if only.eq_ignore_ascii_case(column)
    );
    if !(flagged_unique || sole_primary_key) {
        return Ok(());
    }

    Err(DbError::Unsupported(format!(
        "cannot fill {row_count} rows: `set {column} …` offers only {distinct} distinct \
         value(s), but `{column}` is UNIQUE. Use a generator (e.g. `as email`) or a list \
         of at least {row_count} values."
    )))
}
/// Turn one `set` override into the `SeedColPlan` that produces its
/// values (ADR-0048 D2). `Fixed`/`PickList` become a `PickFrom` over the
/// literal(s); `Generator` resolves the curated name (unknown → friendly
/// error); `Range` validates its bounds against the column type *before*
/// generation (an incompatible bound → friendly error).
fn seed_override_plan(
kind: &SeedOverrideKind,
ty: Type,
column: &str,
) -> Result<SeedColPlan, DbError> {
use crate::seed::Generator;
let generator = match kind {
SeedOverrideKind::Fixed(v) => Generator::PickFrom(vec![seed_override_literal(v, column)?]),
SeedOverrideKind::PickList(vs) => {
let lits = vs
.iter()
.map(|v| seed_override_literal(v, column))
.collect::<Result<Vec<_>, _>>()?;
Generator::PickFrom(lits)
}
SeedOverrideKind::Generator(name) => {
crate::seed::generator_for_name(name).ok_or_else(|| {
DbError::Unsupported(format!(
"unknown generator `{name}` in `set {column} as {name}`. \
Known generators: {}.",
crate::seed::KNOWN_GENERATORS.join(", "),
))
})?
}
SeedOverrideKind::Range { low, high } => {
let lo = seed_override_literal(low, column)?;
let hi = seed_override_literal(high, column)?;
if let Some(reason) = crate::seed::range_bounds_reason(ty, &lo, &hi) {
return Err(DbError::Unsupported(format!(
"cannot apply `set {column} between …`: {reason}."
)));
}
Generator::Range { low: lo, high: hi }
}
};
Ok(SeedColPlan::Generated { generator, ty })
}
/// Render an override value as the literal string fed to a `PickFrom` /
/// `Range` generator (re-typed per column later by `generate_value`).
///
/// Numbers and text contribute their stored string unchanged; booleans
/// become `"true"` / `"false"`.
///
/// # Errors
///
/// `null` is refused with `DbError::Unsupported` — seed always fills a
/// value (NULL injection is out of scope, ADR-0048 Out-of-scope).
/// `column` is used only to name the column in that message.
fn seed_override_literal(value: &Value, column: &str) -> Result<String, DbError> {
    let literal = match value {
        Value::Null => {
            return Err(DbError::Unsupported(format!(
                "`set {column} = null` is not supported — seed always fills a value."
            )));
        }
        Value::Bool(flag) => flag.to_string(),
        Value::Number(raw) | Value::Text(raw) => raw.clone(),
    };
    Ok(literal)
}
/// Column-fill (ADR-0048 D1 form 2): fill one column across the table's
/// **existing** rows (an UPDATE), the natural follow-up to `add column`.
///
/// Refuses PK and auto-generated (`serial`/`shortid`/`blob`) targets;
/// an empty table is a friendly no-op. The `set` clause may only adjust
/// the column being filled (the rest of the per-column heuristics do not
/// apply — there is exactly one column). A UNIQUE / identifier target
/// gets collision-free values (generated distinct from *every* existing
/// value in the column, so no row-by-row UPDATE can transiently collide);
/// an FK target samples an existing parent key (D14). The whole fill is
/// one transaction → one undo step (D15), persisted once (commit-db-last).
#[allow(clippy::too_many_arguments)]
fn do_seed_column_fill(
conn: &Connection,
persistence: Option<&Persistence>,
source: Option<&str>,
table: &str,
column: &str,
count: Option<u64>,
overrides: &[SeedOverride],
rng_seed: Option<u64>,
) -> Result<SeedResult, DbError> {
use crate::seed;
use rand::RngExt;
debug!(table = %table, column = %column, "seed column-fill");
// A row count is meaningless when filling existing rows (D1 form 2).
if count.is_some() {
return Err(DbError::Unsupported(format!(
"`seed {table}.{column}` fills existing rows, so it takes no row count \
(drop the number)."
)));
}
let schema = read_schema(conn, table)?;
let col = schema
.columns
.iter()
.find(|c| c.name.eq_ignore_ascii_case(column))
.ok_or_else(|| {
DbError::Unsupported(format!("cannot fill `{table}.{column}`: no such column."))
})?;
let canonical_col = col.name.clone();
let ty = col.user_type.unwrap_or(Type::Text);
// Refuse identity / auto-generated / un-generatable targets (D1).
if col.primary_key {
return Err(DbError::Unsupported(format!(
"cannot fill `{table}.{canonical_col}`: it is part of the primary key — \
you don't fill an identity column."
)));
}
if matches!(ty, Type::Serial | Type::ShortId) {
return Err(DbError::Unsupported(format!(
"cannot fill `{table}.{canonical_col}`: `{}` columns generate their own \
values automatically.",
ty.keyword(),
)));
}
if matches!(ty, Type::Blob) {
return Err(DbError::Unsupported(format!(
"cannot fill `{table}.{canonical_col}`: seed cannot generate `blob` values."
)));
}
// The `set` clause may only adjust the filled column (user decision).
for ov in overrides {
if !ov.column.eq_ignore_ascii_case(&canonical_col) {
return Err(DbError::Unsupported(format!(
"in `seed {table}.{canonical_col}`, `set` can only adjust \
`{canonical_col}` (the column being filled), not `{}`.",
ov.column,
)));
}
}
// Existing rowids in a deterministic order (D4 reproducibility).
let rowids: Vec<i64> = {
let sql = format!(
"SELECT rowid FROM \"{}\" ORDER BY rowid",
table.replace('"', "\"\"")
);
let mut stmt = conn.prepare(&sql).map_err(DbError::from_rusqlite)?;
stmt.query_map([], |r| r.get::<_, i64>(0))
.map_err(DbError::from_rusqlite)?
.collect::<Result<Vec<_>, _>>()
.map_err(DbError::from_rusqlite)?
};
// Empty table → friendly no-op (D1).
if rowids.is_empty() {
return Ok(SeedResult {
table: table.to_string(),
requested: 0,
produced: 0,
data: DataResult {
table_name: table.to_string(),
columns: Vec::new(),
column_types: Vec::new(),
rows: Vec::new(),
},
advisory_columns: Vec::new(),
});
}
// FK target → sample an existing parent key column (D14).
let fk_sample: Option<Vec<Value>> = {
let fk = schema.foreign_keys.iter().find(|fk| {
fk.child_columns
.iter()
.any(|c| c.eq_ignore_ascii_case(&canonical_col))
});
match fk {
Some(fk) => {
// Single-column position within the FK (column-fill targets
// one column; a compound FK filled one column at a time is
// unusual but we sample that column's parent values).
let pos = fk
.child_columns
.iter()
.position(|c| c.eq_ignore_ascii_case(&canonical_col))
.unwrap_or(0);
let parent_col = fk.parent_columns.get(pos).cloned().unwrap_or_default();
let tuples = sample_parent_key_tuples(conn, &fk.parent_table, &[parent_col])?;
if tuples.is_empty() {
return Err(DbError::Unsupported(format!(
"cannot fill `{table}.{canonical_col}`: parent table `{}` has no \
rows to reference. Seed or insert into `{}` first.",
fk.parent_table, fk.parent_table,
)));
}
Some(tuples.into_iter().map(|mut t| t.remove(0)).collect())
}
None => None,
}
};
// The value source: an override (if present) else the heuristic.
let mut advisory_columns: Vec<String> = Vec::new();
let plan: SeedColPlan = if let Some(ov) = overrides
.iter()
.find(|o| o.column.eq_ignore_ascii_case(&canonical_col))
{
// Same capacity guard as whole-row: a bounded override that can't
// give enough distinct values for a UNIQUE column across the
// existing rows is refused up front, not silently capped.
seed_override_capacity_guard(&schema, &canonical_col, &ov.kind, rowids.len() as u64)?;
seed_override_plan(&ov.kind, ty, &canonical_col)?
} else if fk_sample.is_some() {
SeedColPlan::ForeignKey { fk_idx: 0, pos: 0 }
} else if matches!(ty, Type::ShortId) {
SeedColPlan::ShortId // unreachable (refused above), kept for totality
} else {
let check_in_values = col
.check
.as_deref()
.and_then(|chk| seed::parse_in_check_values(chk, &canonical_col));
let spec = seed::ColumnSpec {
name: canonical_col.clone(),
ty,
not_null: col.notnull,
primary_key: col.primary_key,
unique: col.unique,
is_foreign_key: false,
check_in_values,
};
let generator = seed::choose_generator(table, &spec);
if matches!(generator, crate::seed::Generator::Generic)
&& (seed::is_enum_ish(&canonical_col)
|| (col.check.is_some() && spec.check_in_values.is_none()))
{
advisory_columns.push(canonical_col.clone());
}
SeedColPlan::Generated { generator, ty }
};
// Collision-free generation for UNIQUE / identifier targets: seed the
// used-set with EVERY existing value of the column so a generated
// value never matches a not-yet-updated row (no transient UNIQUE
// violation) nor a value already assigned this batch (ADR-0048 D10).
let enforce_unique = col.unique
|| matches!(
&plan,
SeedColPlan::Generated {
generator: crate::seed::Generator::IdentitySequential,
..
}
);
let mut used: std::collections::HashSet<String> = std::collections::HashSet::new();
if enforce_unique {
for tuple in
sample_parent_key_tuples(conn, table, std::slice::from_ref(&canonical_col))?
{
used.insert(seed_value_list_key(&tuple));
}
}
let seq_base = if matches!(
&plan,
SeedColPlan::Generated {
generator: crate::seed::Generator::IdentitySequential,
..
}
) && matches!(ty, Type::Int)
{
Some(seed_max_int(conn, table, &canonical_col)?)
} else {
None
};
const MAX_ATTEMPTS: u32 = 200;
let mut rng = seed::make_rng(rng_seed);
let tx = conn
.unchecked_transaction()
.map_err(DbError::from_rusqlite)?;
let update_sql = format!(
"UPDATE \"{}\" SET \"{}\" = ?1 WHERE rowid = ?2",
table.replace('"', "\"\""),
canonical_col.replace('"', "\"\""),
);
let mut produced: u64 = 0;
for (offset, rowid) in rowids.iter().enumerate() {
let mut attempt = 0u32;
let value = loop {
let v = match &plan {
SeedColPlan::ForeignKey { .. } => {
let samples = fk_sample.as_ref().expect("fk plan implies samples");
samples[rng.random_range(0..samples.len())].clone()
}
SeedColPlan::ShortId => {
Value::Text(crate::dsl::shortid::generate_with_rng(&mut rng))
}
SeedColPlan::Generated { generator, ty }
if matches!(generator, crate::seed::Generator::IdentitySequential)
&& matches!(ty, Type::Int) =>
{
Value::Number((seq_base.unwrap_or(0) + produced as i64 + 1).to_string())
}
SeedColPlan::Generated { generator, ty } => {
seed::generate_value(generator, *ty, &mut rng)
}
};
if enforce_unique {
let key = seed_value_list_key(std::slice::from_ref(&v));
if used.contains(&key) {
attempt += 1;
if attempt >= MAX_ATTEMPTS {
break v; // give up on distinctness; DB may reject
}
continue;
}
used.insert(key);
}
break v;
};
let bound = impl_value_for(&schema, &canonical_col, &value)?;
let params: Vec<rusqlite::types::Value> =
vec![bound_to_sqlite_value(&bound), rusqlite::types::Value::Integer(*rowid)];
execute_with_fk_enrichment(conn, table, &update_sql, &params)?;
produced += 1;
let _ = offset;
}
let changes = Changes {
schema_dirty: false,
rewritten_tables: vec![table.to_string()],
..Changes::default()
};
finalize_persistence(conn, persistence, source, &changes)?;
tx.commit().map_err(DbError::from_rusqlite)?;
// Preview the first capped rows (D18).
let preview: Vec<i64> = rowids.iter().take(SEED_PREVIEW_CAP).copied().collect();
let data = query_rows_by_rowid(conn, table, &preview)?;
Ok(SeedResult {
table: table.to_string(),
requested: produced,
produced,
data,
advisory_columns,
})
}
/// Build and execute a single-row `INSERT` — column resolution, value
/// binding, `serial`/`shortid` autofill, and the FK-enriched execute —
/// returning `(rows_affected, new rowid)`.