From 9c135010ba329d8e7c65dc61e8f42f85c2cb80b7 Mon Sep 17 00:00:00 2001 From: "claude@clouddev1" Date: Thu, 11 Jun 2026 18:50:05 +0000 Subject: [PATCH] feat(seed): uniqueness, junction distinct-combos, IN-CHECK (ADR-0048 P1.3b) do_seed now enforces value uniqueness and derives enum values: - Uniqueness groups (D10): the user-fillable PK, compound UNIQUE constraints, and single-column UNIQUE / identifier columns stay distinct across the batch and against existing rows (retry per row). Junction distinct-combos fall out of PK-tuple uniqueness and cap at the available parent combinations (logged when capped; the user-facing note arrives with the advisory in P1.3c). - Identifier-int columns get a monotonic sequence past MAX(col) (D10), so they never collide. - IN-CHECK derivation (D17): a simple `col IN ('a','b')` CHECK becomes the value source via the new, unit-tested seed::parse_in_check_values, so the enum-as-CHECK pattern just works. 8 parser unit tests + 4 integration tests (unique column, identifier sequencing, junction cap, IN-check enum). 2343 pass / 0 fail / 0 skip, clippy all-targets clean. Deferred to P1.3c: dedicated SeedResult + capped preview (D18) + the enum/CHECK advisory incl. the cap note (D12/D13); P1.3d: multi-row path. --- src/db.rs | 210 ++++++++++++++++++++++++++++++++++++++++------ src/seed/check.rs | 193 ++++++++++++++++++++++++++++++++++++++++++ src/seed/mod.rs | 2 + tests/it/seed.rs | 175 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 555 insertions(+), 25 deletions(-) create mode 100644 src/seed/check.rs diff --git a/src/db.rs b/src/db.rs index ab7f7c1..038cce9 100644 --- a/src/db.rs +++ b/src/db.rs @@ -8701,6 +8701,34 @@ enum SeedColPlan { ForeignKey { fk_idx: usize, pos: usize }, } +/// Collision key for a positional list of seeded values, used to keep +/// uniqueness groups (PK tuple, UNIQUE columns) distinct (ADR-0048 D10). +/// `\u{1}` separates fields; `\u{0}` marks NULL. 
+fn seed_value_list_key(values: &[Value]) -> String { + let mut key = String::new(); + for v in values { + match v { + Value::Number(s) | Value::Text(s) => key.push_str(s), + Value::Bool(b) => key.push(if *b { 'T' } else { 'F' }), + Value::Null => key.push('\u{0}'), + } + key.push('\u{1}'); + } + key +} + +/// `COALESCE(MAX(col), 0)` — the base for sequencing identifier-int +/// columns (ADR-0048 D10) so generated ids continue past existing rows. +fn seed_max_int(conn: &Connection, table: &str, column: &str) -> Result { + let sql = format!( + "SELECT COALESCE(MAX(\"{}\"), 0) FROM \"{}\"", + column.replace('"', "\"\""), + table.replace('"', "\"\"") + ); + conn.query_row(&sql, [], |r| r.get::<_, i64>(0)) + .map_err(DbError::from_rusqlite) +} + /// Sample existing parent-key tuples for FK generation (ADR-0048 D14). /// /// Returns one `Value` tuple per distinct parent row in @@ -8835,6 +8863,12 @@ fn do_seed( if let Some(&(fk_idx, pos)) = fk_child_pos.get(c.name.as_str()) { plans.push(SeedColPlan::ForeignKey { fk_idx, pos }); } else { + // A simple `col IN ('a','b')` CHECK becomes the value source + // (D17) so the enum-as-CHECK pattern just works. + let check_in_values = c + .check + .as_deref() + .and_then(|chk| seed::parse_in_check_values(chk, &c.name)); let spec = seed::ColumnSpec { name: c.name.clone(), ty, @@ -8842,44 +8876,170 @@ fn do_seed( primary_key: c.primary_key, unique: c.unique, is_foreign_key: false, - // `IN`-CHECK derivation is a later phase. - check_in_values: None, + check_in_values, }; let generator = seed::choose_generator(table, &spec); plans.push(SeedColPlan::Generated { generator, ty }); } } + // Uniqueness groups (ADR-0048 D10): value tuples that must stay + // distinct across the batch and against existing rows — the + // user-fillable PK (so junction distinct-combos fall out of this), + // each compound UNIQUE constraint, and each single-column UNIQUE or + // identifier-named column. 
Each group is a list of indices into + // `col_names` / `plans`. + let col_index: std::collections::HashMap<&str, usize> = col_names + .iter() + .enumerate() + .map(|(i, name)| (name.as_str(), i)) + .collect(); + let project_group = |cols: &[String]| -> Vec { + cols.iter() + .filter_map(|c| col_index.get(c.as_str()).copied()) + .collect() + }; + let mut unique_groups: Vec> = Vec::new(); + let pk_group = project_group(&schema.primary_key); + if !pk_group.is_empty() { + unique_groups.push(pk_group); + } + for uc in &schema.unique_constraints { + let g = project_group(uc); + if !g.is_empty() { + unique_groups.push(g); + } + } + for (i, name) in col_names.iter().enumerate() { + let unique_col = schema + .columns + .iter() + .find(|c| &c.name == name) + .is_some_and(|c| c.unique); + let is_identifier = matches!( + &plans[i], + SeedColPlan::Generated { + generator: crate::seed::Generator::IdentitySequential, + .. + } + ); + if unique_col || is_identifier { + unique_groups.push(vec![i]); + } + } + + // Sequence base for identifier-int columns (D10): start past the + // current MAX so generated ids continue cleanly. + let mut seq_base: std::collections::HashMap = std::collections::HashMap::new(); + for (i, plan) in plans.iter().enumerate() { + if let SeedColPlan::Generated { generator, ty } = plan + && matches!(generator, crate::seed::Generator::IdentitySequential) + && matches!(ty, Type::Int) + { + seq_base.insert(i, seed_max_int(conn, table, &col_names[i])?); + } + } + + // Pre-load each group's existing tuples so generation never + // collides with rows already present. + let mut used: Vec> = + vec![std::collections::HashSet::new(); unique_groups.len()]; + for (gi, group) in unique_groups.iter().enumerate() { + let cols: Vec = group.iter().map(|&i| col_names[i].clone()).collect(); + for tuple in sample_parent_key_tuples(conn, table, &cols)? { + used[gi].insert(seed_value_list_key(&tuple)); + } + } + + // Retry cap per row: when the unique space is exhausted (e.g. 
a + // junction requested more rows than there are parent combinations), + // stop and cap rather than spin (D14). + const MAX_ATTEMPTS: u32 = 200; + let mut rng = seed::make_rng(rng_seed); let mut rows_affected = 0usize; let mut last_data: Option = None; + let mut accepted: u64 = 0; + let mut capped = false; - for i in 0..n { - // One sampled parent row per FK for this row, so a compound FK's - // children stay consistent. - let fk_choice: Vec = fk_samples - .iter() - .map(|tuples| rng.random_range(0..tuples.len())) - .collect(); + while accepted < n { + let mut attempt = 0u32; + let inserted = loop { + // One sampled parent row per FK for this attempt, so a + // compound FK's children stay consistent. + let fk_choice: Vec = fk_samples + .iter() + .map(|tuples| rng.random_range(0..tuples.len())) + .collect(); + let values: Vec = plans + .iter() + .enumerate() + .map(|(i, plan)| match plan { + SeedColPlan::ForeignKey { fk_idx, pos } => { + fk_samples[*fk_idx][fk_choice[*fk_idx]][*pos].clone() + } + SeedColPlan::Generated { generator, ty } + if matches!(generator, crate::seed::Generator::IdentitySequential) + && matches!(ty, Type::Int) => + { + // Monotonic past existing rows → inherently unique. 
+ Value::Number((seq_base[&i] + accepted as i64 + 1).to_string()) + } + SeedColPlan::Generated { generator, ty } => { + seed::generate_value(generator, *ty, &mut rng) + } + }) + .collect(); - let values: Vec = plans - .iter() - .map(|plan| match plan { - SeedColPlan::Generated { generator, ty } => { - seed::generate_value(generator, *ty, &mut rng) + let keys: Vec = unique_groups + .iter() + .map(|group| { + let projected: Vec = + group.iter().map(|&i| values[i].clone()).collect(); + seed_value_list_key(&projected) + }) + .collect(); + if keys.iter().enumerate().any(|(gi, k)| used[gi].contains(k)) { + attempt += 1; + if attempt >= MAX_ATTEMPTS { + capped = true; + break None; } - SeedColPlan::ForeignKey { fk_idx, pos } => { - fk_samples[*fk_idx][fk_choice[*fk_idx]][*pos].clone() - } - }) - .collect(); + continue; + } + for (gi, k) in keys.into_iter().enumerate() { + used[gi].insert(k); + } + // Only the first inserted row carries the `source`, so the + // whole seed writes exactly one `history.log` line. + let row_source = if accepted == 0 { source } else { None }; + break Some(do_insert( + conn, + persistence, + row_source, + table, + Some(&col_names), + &values, + )?); + }; + match inserted { + Some(result) => { + rows_affected += result.rows_affected; + last_data = Some(result.data); + accepted += 1; + } + None => break, + } + } - // Only the first row carries the `source`, so the whole seed - // writes exactly one `history.log` line. 
//! Parse a simple `<column> IN ('a', 'b', …)` CHECK into its allowed
//! value list (ADR-0048 D17), so the common enum-as-CHECK pattern seeds
//! from the permitted values instead of generic text. Anything more
//! complex (ranges, expressions, multi-column, non-literal items,
//! extra conditions after the list) returns `None`; the executor then
//! best-effort generates and lets a violation surface through the
//! friendly-error layer.

/// Character cursor shared by the list and literal readers.
type CharCursor<'a> = std::iter::Peekable<std::str::Chars<'a>>;

/// Extract the string-literal values of a `<column> IN ( … )` CHECK.
///
/// * `check` — the bare CHECK expression (without the `CHECK (…)` wrapper).
/// * `column` — the column the expression must constrain.
///
/// Case-insensitive on the `IN` keyword and the column name; tolerates a
/// quoted column (`"status"`, `[status]` or `` `status` ``). Every list
/// item must be a single-quoted string literal (`''` is an embedded
/// quote), and nothing but whitespace may follow the closing `)`.
///
/// Returns `None` for any other shape, including an empty list, a
/// `NOT IN`, a dangling comma, or a list followed by further conditions
/// (the listed values alone would not be guaranteed to satisfy those).
#[must_use]
pub fn parse_in_check_values(check: &str, column: &str) -> Option<Vec<String>> {
    let (in_idx, paren_open) = find_in_paren(check)?;
    // `in_idx` points at an ASCII byte, so slicing there is always on a
    // char boundary.
    if !lhs_is_column(&check[..in_idx], column) {
        return None;
    }
    let values = extract_quoted_list(&check[paren_open..])?;
    (!values.is_empty()).then_some(values)
}

/// Whether `b` can be part of an unquoted ASCII identifier.
const fn is_ident_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'_'
}

/// Find the `IN` keyword (as a whole word, outside string literals and
/// double-quoted identifiers) that is followed by `(`.
///
/// Returns `(byte index of IN, byte index of the opening paren)`, or
/// `None` when the expression contains no such keyword.
fn find_in_paren(check: &str) -> Option<(usize, usize)> {
    let bytes = check.as_bytes();
    // The quote byte we are currently inside, if any. A doubled quote
    // (`''` / `""`) closes and immediately reopens, which nets out right.
    let mut open_quote: Option<u8> = None;
    for (i, &b) in bytes.iter().enumerate() {
        if let Some(q) = open_quote {
            if b == q {
                open_quote = None;
            }
            continue;
        }
        if b == b'\'' || b == b'"' {
            open_quote = Some(b);
            continue;
        }
        let is_in = b.eq_ignore_ascii_case(&b'i')
            && bytes
                .get(i + 1)
                .is_some_and(|n| n.eq_ignore_ascii_case(&b'n'));
        if !is_in {
            continue;
        }
        // Word-boundary test so `min(` or `inner` never match.
        let after = i + 2;
        let starts_word = i == 0 || !is_ident_byte(bytes[i - 1]);
        let ends_word = !bytes.get(after).is_some_and(|n| is_ident_byte(*n));
        if !(starts_word && ends_word) {
            continue;
        }
        let gap = bytes[after..]
            .iter()
            .take_while(|c| c.is_ascii_whitespace())
            .count();
        let paren = after + gap;
        if bytes.get(paren) == Some(&b'(') {
            return Some((i, paren));
        }
    }
    None
}

/// Whether the text left of `IN` is exactly `column`, optionally wrapped
/// in one of SQLite's identifier quoting styles (`"c"`, `` `c` ``, `[c]`).
/// The comparison ignores ASCII case, as SQLite identifiers do.
fn lhs_is_column(lhs: &str, column: &str) -> bool {
    let lhs = lhs.trim();
    let bare = [('"', '"'), ('`', '`'), ('[', ']')]
        .iter()
        .find_map(|&(open, close)| lhs.strip_prefix(open)?.strip_suffix(close))
        .unwrap_or(lhs);
    bare.eq_ignore_ascii_case(column)
}

/// Advance `chars` past any run of whitespace.
fn skip_whitespace(chars: &mut CharCursor<'_>) {
    while chars.next_if(|c| c.is_whitespace()).is_some() {}
}

/// Parse `( 'a', 'b', … )` from a string starting at `(` into the
/// unescaped literals.
///
/// Returns `None` if any item is not a pure quoted literal, if a comma is
/// not followed by another literal, if the list is unterminated, or if
/// anything other than whitespace follows the closing `)`. An empty list
/// `()` yields an empty vector.
fn extract_quoted_list(s: &str) -> Option<Vec<String>> {
    let mut chars = s.chars().peekable();
    if chars.next()? != '(' {
        return None;
    }
    let mut values = Vec::new();
    skip_whitespace(&mut chars);
    if chars.next_if_eq(&')').is_none() {
        loop {
            // `read_quoted` rejects anything that is not a literal, which
            // also rules out a dangling comma such as `('a',)`.
            values.push(read_quoted(&mut chars)?);
            skip_whitespace(&mut chars);
            match chars.next()? {
                ',' => skip_whitespace(&mut chars),
                ')' => break,
                _ => return None,
            }
        }
    }
    // Further conditions after the list mean the listed values are not
    // the whole constraint, so refuse to treat them as the value source.
    chars.all(char::is_whitespace).then_some(values)
}

/// Read a single-quoted string literal (cursor at the opening `'`),
/// unescaping `''` to `'`. Returns `None` when the cursor is not at a
/// quote or the literal is unterminated.
fn read_quoted(chars: &mut CharCursor<'_>) -> Option<String> {
    if chars.next()? != '\'' {
        return None;
    }
    let mut out = String::new();
    loop {
        match chars.next()? {
            '\'' if chars.next_if_eq(&'\'').is_some() => out.push('\''),
            '\'' => return Some(out),
            c => out.push(c),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_a_simple_in_check() {
        assert_eq!(
            parse_in_check_values("status IN ('active', 'closed')", "status"),
            Some(vec!["active".to_string(), "closed".to_string()])
        );
    }

    #[test]
    fn tolerates_a_quoted_column_and_lowercase_in() {
        assert_eq!(
            parse_in_check_values("\"status\" in ('a','b','c')", "status"),
            Some(vec!["a".into(), "b".into(), "c".into()])
        );
    }

    #[test]
    fn tolerates_bracket_and_backtick_quoting() {
        assert_eq!(
            parse_in_check_values("[status] IN ('a')", "status"),
            Some(vec!["a".into()])
        );
        assert_eq!(
            parse_in_check_values("`status` IN ('a')", "status"),
            Some(vec!["a".into()])
        );
    }

    #[test]
    fn unescapes_embedded_quotes() {
        assert_eq!(
            parse_in_check_values("note IN ('it''s', 'ok')", "note"),
            Some(vec!["it's".into(), "ok".into()])
        );
    }

    #[test]
    fn handles_commas_and_parens_inside_literals() {
        assert_eq!(
            parse_in_check_values("label IN ('a, b', 'c)d')", "label"),
            Some(vec!["a, b".into(), "c)d".into()])
        );
    }

    #[test]
    fn handles_an_apostrophe_inside_a_quoted_identifier() {
        assert_eq!(
            parse_in_check_values("\"it's\" IN ('a')", "it's"),
            Some(vec!["a".into()])
        );
    }

    #[test]
    fn rejects_non_literal_lists() {
        assert_eq!(parse_in_check_values("n IN (1, 2, 3)", "n"), None);
    }

    #[test]
    fn rejects_non_in_checks() {
        assert_eq!(parse_in_check_values("age >= 0", "age"), None);
        assert_eq!(parse_in_check_values("length(name) > 0", "name"), None);
    }

    #[test]
    fn rejects_when_lhs_is_a_different_column() {
        assert_eq!(parse_in_check_values("status IN ('a')", "role"), None);
    }

    #[test]
    fn rejects_not_in_and_empty_lists() {
        assert_eq!(parse_in_check_values("status NOT IN ('a')", "status"), None);
        assert_eq!(parse_in_check_values("status IN ()", "status"), None);
    }

    #[test]
    fn rejects_trailing_conditions_and_dangling_commas() {
        assert_eq!(
            parse_in_check_values("status IN ('a') AND length(status) > 3", "status"),
            None
        );
        assert_eq!(parse_in_check_values("status IN ('a',)", "status"), None);
        assert_eq!(parse_in_check_values("status IN ('a'", "status"), None);
    }

    #[test]
    fn does_not_trip_on_in_inside_a_word_or_literal() {
        // `min` contains "in" but is not the IN operator.
        assert_eq!(parse_in_check_values("min(x) > 0", "x"), None);
    }
}
+mod check; mod generators; mod heuristics; +pub use check::parse_in_check_values; pub use generators::generate_value; pub use heuristics::{choose_generator, is_enum_ish}; diff --git a/tests/it/seed.rs b/tests/it/seed.rs index b082757..4984dac 100644 --- a/tests/it/seed.rs +++ b/tests/it/seed.rs @@ -298,3 +298,178 @@ fn seed_omits_a_nullable_blob_column() { let csv = read_csv(&project, "Files").expect("Files CSV"); assert_eq!(data_row_count(&csv), 3); } + +// — uniqueness, junction distinct-combos, IN-CHECK (D10 / D14 / D17) — + +/// The `n`th comma-separated field of each data row (the generated +/// values here never contain commas). +fn nth_column_values(csv: &str, n: usize) -> Vec { + csv.lines() + .filter(|l| !l.trim().is_empty()) + .skip(1) + .map(|l| l.split(',').nth(n).unwrap_or_default().trim().to_string()) + .collect() +} + +#[test] +fn seed_keeps_unique_columns_distinct() { + let (project, db, _dir) = open_project_db(); + let rt = rt(); + let mut label = ColumnSpec::new("label", Type::Text); + label.unique = true; + rt.block_on(db.create_table( + "Tags".to_string(), + vec![ColumnSpec::new("id", Type::Serial), label], + vec!["id".to_string()], + None, + )) + .expect("create Tags"); + + let res = rt + .block_on(db.seed("Tags".into(), Some(8), Some(3), Some("seed Tags 8".into()))) + .expect("seed"); + assert_eq!(res.rows_affected, 8); + + let csv = read_csv(&project, "Tags").expect("Tags CSV"); + let labels = nth_column_values(&csv, 1); + let distinct: std::collections::HashSet<&String> = labels.iter().collect(); + assert_eq!(distinct.len(), labels.len(), "UNIQUE column has duplicates:\n{csv}"); +} + +#[test] +fn seed_sequences_identifier_int_columns() { + let (project, db, _dir) = open_project_db(); + let rt = rt(); + // `code` is an identifier-named int (D10) but not a constraint — + // uniqueness comes from the identifier rule. 
+ rt.block_on(db.create_table( + "Items".to_string(), + vec![ + ColumnSpec::new("id", Type::Serial), + ColumnSpec::new("code", Type::Int), + ColumnSpec::new("name", Type::Text), + ], + vec!["id".to_string()], + None, + )) + .expect("create Items"); + + let res = rt + .block_on(db.seed("Items".into(), Some(5), Some(1), Some("seed Items 5".into()))) + .expect("seed"); + assert_eq!(res.rows_affected, 5); + + let csv = read_csv(&project, "Items").expect("Items CSV"); + let codes: Vec = nth_column_values(&csv, 1) + .iter() + .map(|s| s.parse().expect("code is an int")) + .collect(); + let distinct: std::collections::HashSet = codes.iter().copied().collect(); + assert_eq!(distinct.len(), 5, "identifier ints must be unique: {codes:?}"); +} + +#[test] +fn seed_junction_produces_distinct_combinations_and_caps() { + let (project, db, _dir) = open_project_db(); + let rt = rt(); + rt.block_on(async { + // Two parents, 2 rows each → 2x2 = 4 possible (a, b) pairs. + for t in ["P1", "P2"] { + db.create_table( + t.to_string(), + vec![ + ColumnSpec::new("id", Type::Serial), + ColumnSpec::new("name", Type::Text), + ], + vec!["id".to_string()], + None, + ) + .await + .expect("create parent"); + db.seed(t.into(), Some(2), Some(1), Some(format!("seed {t} 2"))) + .await + .expect("seed parent"); + } + // Junction with a compound PK over its two FK columns. 
+ db.create_table( + "J".to_string(), + vec![ColumnSpec::new("a", Type::Int), ColumnSpec::new("b", Type::Int)], + vec!["a".to_string(), "b".to_string()], + None, + ) + .await + .expect("create J"); + db.add_relationship( + None, + "P1".into(), + vec!["id".into()], + "J".into(), + vec!["a".into()], + ReferentialAction::NoAction, + ReferentialAction::NoAction, + false, + None, + ) + .await + .expect("fk a"); + db.add_relationship( + None, + "P2".into(), + vec!["id".into()], + "J".into(), + vec!["b".into()], + ReferentialAction::NoAction, + ReferentialAction::NoAction, + false, + None, + ) + .await + .expect("fk b"); + + // Requesting 10 caps at the 4 available distinct combinations. + let res = db + .seed("J".into(), Some(10), Some(7), Some("seed J 10".into())) + .await + .expect("seed J"); + assert_eq!(res.rows_affected, 4, "junction caps at available combos"); + }); + + let csv = read_csv(&project, "J").expect("J CSV"); + let pairs: Vec = csv + .lines() + .filter(|l| !l.trim().is_empty()) + .skip(1) + .map(str::to_string) + .collect(); + let distinct: std::collections::HashSet<&String> = pairs.iter().collect(); + assert_eq!(distinct.len(), pairs.len(), "junction rows must be distinct:\n{csv}"); +} + +#[test] +fn seed_draws_enum_values_from_an_in_check() { + let (project, db, _dir) = open_project_db(); + let rt = rt(); + let mut status = ColumnSpec::new("status", Type::Text); + status.check_sql = Some("status IN ('active', 'closed')".to_string()); + rt.block_on(db.create_table( + "Tickets".to_string(), + vec![ColumnSpec::new("id", Type::Serial), status], + vec!["id".to_string()], + None, + )) + .expect("create Tickets"); + + // Every generated status must satisfy the CHECK, so all rows insert. 
+ let res = rt + .block_on(db.seed("Tickets".into(), Some(12), Some(2), Some("seed Tickets 12".into()))) + .expect("seed"); + assert_eq!(res.rows_affected, 12, "all rows insert — values satisfy the CHECK"); + + let csv = read_csv(&project, "Tickets").expect("Tickets CSV"); + for v in nth_column_values(&csv, 1) { + assert!( + matches!(v.as_str(), "active" | "closed"), + "status `{v}` was not drawn from the IN check:\n{csv}" + ); + } +}