Iteration 3: existence-only load + rebuild from text on missing .db

When the runtime opens a project whose playground.db is missing, it now rebuilds the database from project.yaml + data/<table>.csv per ADR-0015 §7. The rebuild path: 1. Parses project.yaml (serde_yml). Unknown versions / types / actions surface as PersistenceFatal. 2. Recreates each user table with FK constraints inline (PRAGMA foreign_keys=OFF), then populates the column-type, relationship, and project metadata tables. 3. Loads each table's CSV via a hand-rolled reader that preserves the NULL-vs-empty distinction (the csv crate doesn't expose whether a field was quoted; ours does). 4. Runs PRAGMA foreign_key_check before commit; any violation aborts. 5. Restores foreign_keys=ON regardless of success. Row-level failures get DbError::RebuildRowFailed with row number, file, table, and a friendly per-type detail. They land in the runtime as a fatal stderr message ("unable to load row N from `data/T.csv` into table `T`: ...") before the alternate screen is entered. created_at from project.yaml overwrites the configure-time placeholder so timestamps round-trip stably. Tests: 307 passing (267 lib + 9 + 5 new + 9 + 17), 0 failing, 0 skipped. Clippy clean with nursery lints.
2026-05-07 22:11:45 +00:00
parent 5410075398
commit f0fc063756
8 changed files with 1244 additions and 5 deletions
@@ -16,6 +16,12 @@
 //! quote, newline). We handle the empty-string-vs-null
 //! distinction manually by always quoting non-null empty
 //! Text and never quoting Null.
+//
+// `pub(crate)` items below are re-exported from
+// `persistence::mod.rs`; the db worker reaches them via that
+// path. Clippy's `redundant_pub_crate` lint flags this
+// pattern, but it's load-bearing here.
+#![allow(clippy::redundant_pub_crate)]

 use std::io::Write as _;

@@ -172,6 +178,182 @@ fn format_real(f: f64) -> String {
    }
 }

+/// Parsed CSV records: header row + zero or more data rows.
+///
+/// Each cell records whether it was syntactically quoted in
+/// the source — that's the bit we need to distinguish NULL
+/// (empty unquoted) from `""` (empty quoted). The `csv`
+/// crate doesn't expose this, which is why we hand-roll the
+/// reader to pair with the hand-rolled writer above.
+#[derive(Debug, PartialEq, Eq)]
+pub(crate) struct ParsedCsv {
+    /// Cell contents of the first record, used as the column
+    /// names. The quoting flag of header cells is discarded.
+    pub header: Vec<String>,
+    /// Every record after the first, in file order. Row widths
+    /// are not checked against `header` by the tokenizer.
+    pub rows: Vec<Vec<RawCell>>,
+}
+
+/// One CSV field as it appeared in the source, before any
+/// per-type decoding.
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub(crate) struct RawCell {
+    /// Field text with the surrounding quotes removed and each
+    /// `""` escape collapsed to a single `"`.
+    pub content: String,
+    /// `true` when the field started with a `"` in the source.
+    /// An empty cell with this flag unset decodes to NULL.
+    pub was_quoted: bool,
+}
+
+/// Failures the CSV tokenizer can report.
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum CsvError {
+    /// The body produced no records at all, so there is no
+    /// header row to return.
+    #[error("CSV is empty")]
+    Empty,
+    /// The bytes making up a field were not valid UTF-8.
+    #[error("invalid UTF-8 in CSV body")]
+    InvalidUtf8,
+    /// A field opened with `"` and the input ended before the
+    /// matching closing quote.
+    #[error("unterminated quoted field")]
+    UnterminatedQuote,
+}
+
+/// Tokenize a CSV body into a header record plus data rows.
+///
+/// The first record becomes `header` (cell contents only);
+/// every later record becomes one entry of `rows`. Each cell
+/// keeps a `was_quoted` flag so the caller can distinguish an
+/// empty unquoted field (NULL) from an empty quoted field
+/// (`""`).
+///
+/// Records end at `\n`, `\r\n`, a lone `\r`, or end of input.
+/// A blank line is kept as a record holding one empty
+/// unquoted cell, and a trailing comma at end of input yields
+/// a final empty unquoted cell even without a closing
+/// newline. Record widths are not validated here.
+///
+/// Leniency: text found between a closing quote and the next
+/// delimiter (most commonly a trailing space) is appended to
+/// that quoted cell instead of being rejected. It never
+/// starts a new cell.
+///
+/// # Errors
+///
+/// - `CsvError::Empty` when the body contains no records.
+/// - `CsvError::UnterminatedQuote` when a quoted field is
+///   never closed.
+/// - `CsvError::InvalidUtf8` when a field is not valid UTF-8.
+pub(crate) fn parse_csv(body: &str) -> Result<ParsedCsv, CsvError> {
+    let mut records: Vec<Vec<RawCell>> = Vec::new();
+    let mut current: Vec<RawCell> = Vec::new();
+    let bytes = body.as_bytes();
+    let n = bytes.len();
+    let mut i = 0;
+
+    while i < n {
+        let (mut cell, advance) = parse_field(&bytes[i..])?;
+        i += advance;
+
+        // An unquoted field already stops on a delimiter, so a
+        // non-zero run here can only follow a closing quote.
+        // Fold the whole run into the cell in one go; handling
+        // it byte-by-byte would re-enter field parsing and
+        // emit a spurious empty cell.
+        let stray_len = bytes[i..]
+            .iter()
+            .position(|&b| matches!(b, b',' | b'\n' | b'\r'))
+            .unwrap_or(n - i);
+        if stray_len > 0 {
+            // Both ends sit next to ASCII bytes (or the end of
+            // the body), so the range is on char boundaries
+            // and multi-byte characters survive intact.
+            let stray = body
+                .get(i..i + stray_len)
+                .ok_or(CsvError::InvalidUtf8)?;
+            cell.content.push_str(stray);
+            i += stray_len;
+        }
+        current.push(cell);
+
+        match bytes.get(i) {
+            Some(&b',') => {
+                i += 1;
+                if i == n {
+                    // Trailing comma with no final newline: the
+                    // record still has one more (NULL) field.
+                    current.push(RawCell {
+                        content: String::new(),
+                        was_quoted: false,
+                    });
+                }
+            }
+            Some(&b'\r') => {
+                i += 1;
+                if bytes.get(i) == Some(&b'\n') {
+                    i += 1;
+                }
+                records.push(std::mem::take(&mut current));
+            }
+            Some(_) => {
+                // Only `\n` can reach this arm: the scan above
+                // stopped on a delimiter and the other two are
+                // handled in the arms before it.
+                i += 1;
+                records.push(std::mem::take(&mut current));
+            }
+            None => {
+                records.push(std::mem::take(&mut current));
+            }
+        }
+    }
+    // Non-empty only when the input ended right after a comma.
+    if !current.is_empty() {
+        records.push(current);
+    }
+
+    if records.is_empty() {
+        return Err(CsvError::Empty);
+    }
+    let header_record = records.remove(0);
+    let header: Vec<String> = header_record.into_iter().map(|c| c.content).collect();
+    Ok(ParsedCsv {
+        header,
+        rows: records,
+    })
+}
+
+/// Read one field from the front of `bytes`.
+///
+/// Returns the decoded cell together with the number of bytes
+/// consumed. A field that opens with `"` runs to its closing
+/// quote, with `""` decoded as a literal quote; the consumed
+/// count covers both surrounding quotes. Any other field runs
+/// up to, but not including, the next `,`, `\n`, `\r`, or the
+/// end of the slice.
+fn parse_field(bytes: &[u8]) -> Result<(RawCell, usize), CsvError> {
+    let Some((&b'"', quoted_body)) = bytes.split_first() else {
+        // Unquoted: everything before the first delimiter.
+        let end = bytes
+            .iter()
+            .position(|&b| matches!(b, b',' | b'\n' | b'\r'))
+            .unwrap_or(bytes.len());
+        let content =
+            String::from_utf8(bytes[..end].to_vec()).map_err(|_| CsvError::InvalidUtf8)?;
+        let cell = RawCell {
+            content,
+            was_quoted: false,
+        };
+        return Ok((cell, end));
+    };
+
+    let mut unescaped: Vec<u8> = Vec::new();
+    let mut pos = 0;
+    while let Some(&byte) = quoted_body.get(pos) {
+        if byte != b'"' {
+            unescaped.push(byte);
+            pos += 1;
+            continue;
+        }
+        if quoted_body.get(pos + 1) == Some(&b'"') {
+            // Doubled quote: one literal `"` in the content.
+            unescaped.push(b'"');
+            pos += 2;
+            continue;
+        }
+        // Closing quote. `pos` is relative to the text after the
+        // opening quote, so add one for each surrounding quote.
+        let content = String::from_utf8(unescaped).map_err(|_| CsvError::InvalidUtf8)?;
+        let cell = RawCell {
+            content,
+            was_quoted: true,
+        };
+        return Ok((cell, pos + 2));
+    }
+    Err(CsvError::UnterminatedQuote)
+}
+
+/// Turn one tokenized cell into a `CellValue` according to
+/// the column's declared type.
+///
+/// An empty cell that was not quoted is NULL for every type.
+/// Otherwise text-like types keep the content verbatim,
+/// integers and reals are parsed, booleans must be exactly
+/// `true` or `false` (stored as integer 1 / 0), and blobs are
+/// standard base64.
+///
+/// The `Err` string is a human-readable detail meant to be
+/// embedded in the fatal banner of ADR-0015 §7 ("unable to
+/// load row N from data/T.csv into table T: …").
+pub(crate) fn decode_cell(ty: Type, cell: &RawCell) -> Result<CellValue, String> {
+    let raw = cell.content.as_str();
+    if raw.is_empty() && !cell.was_quoted {
+        return Ok(CellValue::Null);
+    }
+    match ty {
+        Type::Int | Type::Serial => match raw.parse::<i64>() {
+            Ok(n) => Ok(CellValue::Integer(n)),
+            Err(_) => Err(format!("expected an integer, got `{}`", raw)),
+        },
+        Type::Real => match raw.parse::<f64>() {
+            Ok(f) => Ok(CellValue::Real(f)),
+            Err(_) => Err(format!("expected a real number, got `{}`", raw)),
+        },
+        Type::Bool => {
+            let flag = match raw {
+                "true" => 1,
+                "false" => 0,
+                other => return Err(format!("expected `true` or `false`, got `{other}`")),
+            };
+            Ok(CellValue::Integer(flag))
+        }
+        Type::Blob => match base64::engine::general_purpose::STANDARD.decode(raw.as_bytes()) {
+            Ok(decoded) => Ok(CellValue::Blob(decoded)),
+            Err(e) => Err(format!("invalid base64 blob: {e}")),
+        },
+        Type::Text | Type::Date | Type::DateTime | Type::Decimal | Type::ShortId => {
+            Ok(CellValue::Text(raw.to_owned()))
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -290,6 +472,95 @@ mod tests {
        assert!(s.contains("2026-05-07,2026-05-07T14:30:12Z"));
    }

+    // Writer -> reader round trip: serialize a two-row table
+    // and check the tokenizer recovers the header and the raw
+    // text of each cell.
+    #[test]
+    fn parse_round_trips_simple_table() {
+        let table = TableSnapshot {
+            name: "Customers".to_string(),
+            columns: vec![col("id", Type::Serial), col("Name", Type::Text)],
+            rows: vec![
+                vec![CellValue::Integer(1), CellValue::Text("Alice".to_string())],
+                vec![CellValue::Integer(2), CellValue::Text("Bob".to_string())],
+            ],
+        };
+        let body = serialize_table(&table).unwrap();
+        let parsed = parse_csv(std::str::from_utf8(&body).unwrap()).unwrap();
+        assert_eq!(parsed.header, vec!["id", "Name"]);
+        assert_eq!(parsed.rows.len(), 2);
+        // Cells are still raw strings here; typing happens in
+        // `decode_cell`.
+        assert_eq!(parsed.rows[0][0].content, "1");
+        assert_eq!(parsed.rows[0][1].content, "Alice");
+        assert_eq!(parsed.rows[1][1].content, "Bob");
+    }
+
+    #[test]
+    fn parse_distinguishes_null_from_empty_string() {
+        // One-column file: the header, a blank line (NULL), and
+        // a quoted empty field (the empty string).
+        let parsed = parse_csv("Name\n\n\"\"\n").unwrap();
+        assert_eq!(parsed.rows.len(), 2);
+
+        let null_cell = &parsed.rows[0][0];
+        assert!(!null_cell.was_quoted);
+        assert_eq!(null_cell.content, "");
+
+        let empty_cell = &parsed.rows[1][0];
+        assert!(empty_cell.was_quoted);
+        assert_eq!(empty_cell.content, "");
+
+        // Same content, different decoded value.
+        assert!(matches!(
+            decode_cell(Type::Text, null_cell).unwrap(),
+            CellValue::Null
+        ));
+        assert!(matches!(
+            decode_cell(Type::Text, empty_cell).unwrap(),
+            CellValue::Text(s) if s.is_empty()
+        ));
+    }
+
+    #[test]
+    fn parse_handles_rfc4180_escapes() {
+        // Row 1 embeds a comma inside quotes; row 2 embeds
+        // doubled quotes.
+        let parsed = parse_csv("Name\n\"hello, world\"\n\"she said \"\"hi\"\"\"\n").unwrap();
+        let with_comma = &parsed.rows[0][0];
+        let with_quotes = &parsed.rows[1][0];
+        assert_eq!(with_comma.content, "hello, world");
+        assert_eq!(with_quotes.content, "she said \"hi\"");
+    }
+
+    // Serialize one row holding an Int, a Real, a Bool and a
+    // Blob, then check `decode_cell` turns each raw cell back
+    // into the expected `CellValue`.
+    #[test]
+    fn parse_decodes_per_type() {
+        // Rows match the round-trip produced by serialize_table.
+        let table = TableSnapshot {
+            name: "T".to_string(),
+            columns: vec![
+                col("n", Type::Int),
+                col("r", Type::Real),
+                col("b", Type::Bool),
+                col("blob", Type::Blob),
+            ],
+            rows: vec![vec![
+                CellValue::Integer(42),
+                CellValue::Real(std::f64::consts::PI),
+                CellValue::Integer(1),
+                CellValue::Blob(b"hi".to_vec()),
+            ]],
+        };
+        let body = serialize_table(&table).unwrap();
+        let parsed = parse_csv(std::str::from_utf8(&body).unwrap()).unwrap();
+        let row = &parsed.rows[0];
+        assert!(matches!(decode_cell(Type::Int, &row[0]).unwrap(), CellValue::Integer(42)));
+        // Reals are compared with a tolerance rather than for
+        // exact equality.
+        match decode_cell(Type::Real, &row[1]).unwrap() {
+            CellValue::Real(f) => assert!((f - std::f64::consts::PI).abs() < 1e-12),
+            other => panic!("got {other:?}"),
+        }
+        // Bool decodes to an integer 1 / 0, not a dedicated variant.
+        assert!(matches!(decode_cell(Type::Bool, &row[2]).unwrap(), CellValue::Integer(1)));
+        assert!(matches!(decode_cell(Type::Blob, &row[3]).unwrap(), CellValue::Blob(b) if b == b"hi"));
+    }
+
+    #[test]
+    fn parse_rejects_unterminated_quotes() {
+        // The data row opens a quote that never closes.
+        assert!(matches!(
+            parse_csv("Name\n\"oops").expect_err("must error"),
+            CsvError::UnterminatedQuote
+        ));
+    }
+
+    #[test]
+    fn decode_cell_reports_friendly_error_for_bad_int() {
+        let bad = RawCell {
+            was_quoted: false,
+            content: String::from("abc"),
+        };
+        let message = decode_cell(Type::Int, &bad).expect_err("must error");
+        // The detail names the expected type and echoes the
+        // offending text.
+        for needle in ["integer", "abc"] {
+            assert!(message.contains(needle));
+        }
+    }
+
    #[test]
    fn row_width_mismatch_errors() {
        let err = serialize_table(&TableSnapshot {
@@ -22,10 +22,15 @@ use crate::dsl::action::ReferentialAction;
 use crate::dsl::types::Type;
 use crate::project::{DATA_DIR, HISTORY_LOG, PROJECT_YAML};

+// Submodules are private; the few items the db worker needs
+// during rebuild (ADR-0015 §7) are re-exported below.
 mod csv_io;
 mod history;
 mod yaml;

+pub(crate) use csv_io::{decode_cell, parse_csv};
+pub(crate) use yaml::parse_schema;
+
 /// Owns persistence to a single project on disk. Cheap to
 /// move; the db worker holds one instance for its lifetime.
 #[derive(Debug, Clone)]
@@ -1,16 +1,27 @@
-//! Hand-rolled `project.yaml` writer (ADR-0015 §3).
+//! `project.yaml` writer (hand-rolled, ADR-0015 §3) and
+//! reader (`serde_yml`, ADR-0015 §7).
 //!
 //! The schema YAML uses a small, fixed set of structures —
 //! tables, columns, relationships — and the values it carries
 //! are all known-safe (identifiers from the DSL, types from
 //! the fixed `Type` enum, action names from `ReferentialAction`).
 //! Hand-rolling the writer avoids pulling a YAML serializer
-//! dep just for this file. The reader (Iteration 3) will use
-//! a real YAML parser.
+//! dep just for the write path; the read path uses
+//! `serde_yml` because we need to handle whatever the user
+//! (or a future migrator, or a hand-edit) puts in there.
+//
+// `pub(crate)` items in this private submodule are
+// re-exported from `persistence::mod.rs`; that path is what
+// the db worker uses. Clippy's `redundant_pub_crate` lint
+// flags this pattern, but it's load-bearing here.
+#![allow(clippy::redundant_pub_crate)]

 use std::fmt::Write as _;

+use serde::Deserialize;
+
 use crate::dsl::action::ReferentialAction;
+use crate::dsl::types::Type;

 use super::{ColumnSchema, RelationshipSchema, SchemaSnapshot, TableSchema};

@@ -144,6 +155,133 @@ const fn is_safe_yaml_char(c: char) -> bool {
    c.is_ascii_alphanumeric() || matches!(c, '_' | '-' | '.' | ':')
 }

+/// Read a `project.yaml` body back into a `SchemaSnapshot`.
+///
+/// The body is deserialized into private wire structs that
+/// mirror what `serialize_schema` writes, then converted:
+/// column type names are parsed into `Type`, referential
+/// action names into `ReferentialAction`.
+///
+/// # Errors
+///
+/// Checks run in this order and stop at the first failure:
+/// `YamlError::Syntax` (the body does not deserialize),
+/// `YamlError::UnsupportedVersion` (version is not 1),
+/// `YamlError::UnknownType` (tables, then columns, in file
+/// order), `YamlError::UnknownAction` (relationships in file
+/// order, `on_delete` before `on_update`). The rebuild path
+/// turns these into a fatal banner per ADR-0015 §8.
+pub(crate) fn parse_schema(body: &str) -> Result<SchemaSnapshot, YamlError> {
+    let RawProject {
+        version,
+        project,
+        tables,
+        relationships,
+    } = serde_yml::from_str::<RawProject>(body).map_err(|e| YamlError::Syntax(e.to_string()))?;
+    if version != 1 {
+        return Err(YamlError::UnsupportedVersion(version));
+    }
+
+    let tables = tables
+        .into_iter()
+        .map(|table| -> Result<TableSchema, YamlError> {
+            let RawTable {
+                name,
+                primary_key,
+                columns,
+            } = table;
+            let columns = columns
+                .into_iter()
+                .map(|column| match column.user_type.parse::<Type>() {
+                    Ok(user_type) => Ok(ColumnSchema {
+                        name: column.name,
+                        user_type,
+                    }),
+                    Err(_) => Err(YamlError::UnknownType {
+                        table: name.clone(),
+                        column: column.name,
+                        raw: column.user_type,
+                    }),
+                })
+                .collect::<Result<Vec<_>, _>>()?;
+            Ok(TableSchema {
+                name,
+                primary_key,
+                columns,
+            })
+        })
+        .collect::<Result<Vec<_>, _>>()?;
+
+    let relationships = relationships
+        .into_iter()
+        .map(|rel| -> Result<RelationshipSchema, YamlError> {
+            let Some(on_delete) = parse_action(&rel.on_delete) else {
+                return Err(YamlError::UnknownAction(rel.on_delete));
+            };
+            let Some(on_update) = parse_action(&rel.on_update) else {
+                return Err(YamlError::UnknownAction(rel.on_update));
+            };
+            Ok(RelationshipSchema {
+                name: rel.name,
+                parent_table: rel.parent.table,
+                parent_column: rel.parent.column,
+                child_table: rel.child.table,
+                child_column: rel.child.column,
+                on_delete,
+                on_update,
+            })
+        })
+        .collect::<Result<Vec<_>, _>>()?;
+
+    Ok(SchemaSnapshot {
+        created_at: project.created_at,
+        tables,
+        relationships,
+    })
+}
+
+/// Reasons `parse_schema` can reject a `project.yaml` body.
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum YamlError {
+    /// The body did not deserialize into the expected shape;
+    /// carries the deserializer's message as text.
+    #[error("project.yaml syntax error: {0}")]
+    Syntax(String),
+    /// The `version` field held something other than 1.
+    #[error("unsupported project.yaml version: {0} (expected 1)")]
+    UnsupportedVersion(u32),
+    /// A column's `type` string did not parse as a `Type`.
+    #[error("unknown user-facing column type `{raw}` for `{table}.{column}`")]
+    UnknownType {
+        /// Table that owns the offending column.
+        table: String,
+        /// Name of the offending column.
+        column: String,
+        /// The type string exactly as written in the file.
+        raw: String,
+    },
+    /// An `on_delete` / `on_update` value was not one of the
+    /// action names `parse_action` recognizes.
+    #[error("unknown referential action `{0}`")]
+    UnknownAction(String),
+}
+
+/// Map the on-disk spelling of a referential action to its
+/// enum value. Matching is exact and case-sensitive; any
+/// other string yields `None`.
+fn parse_action(s: &str) -> Option<ReferentialAction> {
+    let action = match s {
+        "cascade" => ReferentialAction::Cascade,
+        "set_null" => ReferentialAction::SetNull,
+        "restrict" => ReferentialAction::Restrict,
+        "no_action" => ReferentialAction::NoAction,
+        _ => return None,
+    };
+    Some(action)
+}
+
+/// Wire shape of the whole `project.yaml` document, as
+/// deserialized by `parse_schema` before validation.
+#[derive(Deserialize)]
+struct RawProject {
+    /// Format version; `parse_schema` accepts only 1.
+    version: u32,
+    /// Project-level metadata block.
+    project: RawProjectMeta,
+    /// Table definitions; an absent key deserializes as empty.
+    #[serde(default)]
+    tables: Vec<RawTable>,
+    /// Relationship definitions; an absent key deserializes as empty.
+    #[serde(default)]
+    relationships: Vec<RawRelationship>,
+}
+
+/// Wire shape of the `project:` block.
+#[derive(Deserialize)]
+struct RawProjectMeta {
+    /// Creation timestamp, kept as the string found in the
+    /// file and copied into `SchemaSnapshot::created_at`.
+    created_at: String,
+}
+
+/// Wire shape of one entry under `tables:`.
+#[derive(Deserialize)]
+struct RawTable {
+    /// Table name.
+    name: String,
+    /// Primary-key column names, in the order written.
+    primary_key: Vec<String>,
+    /// Column definitions, in the order written.
+    columns: Vec<RawColumn>,
+}
+
+/// Wire shape of one column entry.
+#[derive(Deserialize)]
+struct RawColumn {
+    /// Column name.
+    name: String,
+    /// Type name as written under the YAML key `type`; parsed
+    /// into `Type` by `parse_schema`.
+    #[serde(rename = "type")]
+    user_type: String,
+}
+
+/// Wire shape of one entry under `relationships:`.
+#[derive(Deserialize)]
+struct RawRelationship {
+    /// Relationship name.
+    name: String,
+    /// Referenced (parent) side of the relationship.
+    parent: RawEndpoint,
+    /// Referencing (child) side of the relationship.
+    child: RawEndpoint,
+    /// Action name for deletes; validated by `parse_action`.
+    on_delete: String,
+    /// Action name for updates; validated by `parse_action`.
+    on_update: String,
+}
+
+/// Wire shape of a relationship endpoint: a table/column pair.
+#[derive(Deserialize)]
+struct RawEndpoint {
+    /// Table name of this endpoint.
+    table: String,
+    /// Column name of this endpoint.
+    column: String,
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -235,6 +373,77 @@ mod tests {
        assert_eq!(quote_if_needed("with\"quote"), "\"with\\\"quote\"");
    }

+    // The hand-rolled writer and the serde-based reader must
+    // agree: serializing a snapshot and parsing it back yields
+    // an equal value.
+    #[test]
+    fn write_then_read_round_trips() {
+        let original = snapshot();
+        let body = serialize_schema(&original);
+        let parsed = parse_schema(&body).expect("parse schema");
+        assert_eq!(parsed, original);
+    }
+
+    // A document with empty table and relationship lists is
+    // valid, and `created_at` comes back as the same string.
+    #[test]
+    fn parses_minimal_yaml_with_no_tables() {
+        let body = "\
version: 1
project:
  created_at: 2026-05-07T14:30:12Z
tables: []
relationships: []
";
+        let parsed = parse_schema(body).expect("parse minimal");
+        assert_eq!(parsed.tables.len(), 0);
+        assert_eq!(parsed.relationships.len(), 0);
+        assert_eq!(parsed.created_at, "2026-05-07T14:30:12Z");
+    }
+
+    #[test]
+    fn rejects_unknown_version() {
+        // Well-formed document, but the version is not 1.
+        let body = "version: 9\nproject:\n  created_at: x\ntables: []\nrelationships: []\n";
+        let other = parse_schema(body);
+        assert!(
+            matches!(other, Err(YamlError::UnsupportedVersion(9))),
+            "expected UnsupportedVersion(9), got {other:?}"
+        );
+    }
+
+    // A column whose `type` is not a known type name must
+    // fail with `UnknownType` carrying the raw string.
+    #[test]
+    fn rejects_unknown_column_type() {
+        let body = "\
version: 1
project:
  created_at: x
tables:
  - name: T
    primary_key: [id]
    columns:
      - { name: id, type: bogus }
relationships: []
";
+        match parse_schema(body) {
+            Err(YamlError::UnknownType { raw, .. }) => assert_eq!(raw, "bogus"),
+            other => panic!("expected UnknownType, got {other:?}"),
+        }
+    }
+
+    // An unrecognized `on_delete` value must fail with
+    // `UnknownAction` carrying the offending string.
+    #[test]
+    fn rejects_unknown_action() {
+        let body = "\
version: 1
project:
  created_at: x
tables: []
relationships:
  - name: R
    parent: { table: A, column: id }
    child:  { table: B, column: aid }
    on_delete: blow_up
    on_update: no_action
";
+        match parse_schema(body) {
+            Err(YamlError::UnknownAction(s)) => assert_eq!(s, "blow_up"),
+            other => panic!("expected UnknownAction, got {other:?}"),
+        }
+    }
+
    #[test]
    fn preserves_compound_primary_key_order() {
        let body = serialize_schema(&SchemaSnapshot {