//! Per-type CSV writer (ADR-0015 §4). //! //! Encoding rules per type are exactly as specified in the //! ADR; the cell-level encoder lives in `encode_cell`. The //! `csv` crate handles RFC 4180 quoting around our encoded //! strings. //! //! NULL representation: an empty unquoted field. The `csv` //! crate's writer emits a non-quoted empty field for an empty //! string by default; we map `CellValue::Null` to that, and //! `CellValue::Text(String::new())` to a *quoted* empty //! field (`""`) by emitting a sentinel that round-trips. //! //! For the writer, the trick is: `WriterBuilder::quote_style(QuoteStyle::Necessary)` //! is the default and quotes only when needed (separator, //! quote, newline). We handle the empty-string-vs-null //! distinction manually by always quoting non-null empty //! Text and never quoting Null. // // `pub(crate)` items below are re-exported from // `persistence::mod.rs`; the db worker reaches them via that // path. Clippy's `redundant_pub_crate` lint flags this // pattern, but it's load-bearing here. #![allow(clippy::redundant_pub_crate)] use std::io::Write as _; use base64::Engine as _; use crate::dsl::types::Type; use super::{CellValue, TableSnapshot}; /// Serialize a `TableSnapshot` to a CSV body. Returns the raw /// bytes (UTF-8) ready to be written to disk. pub(super) fn serialize_table(table: &TableSnapshot) -> Result, String> { // We bypass the `csv` crate for cell-level emission so the // NULL-vs-empty distinction stays under our control. The // header and per-line framing are still simple enough to // emit directly. let mut out: Vec = Vec::new(); write_record( &mut out, table.columns.iter().map(|c| Cell::Plain(c.name.clone())), )?; for row in &table.rows { if row.len() != table.columns.len() { return Err(format!( "row width {} does not match column count {} for table `{}`", row.len(), table.columns.len(), table.name, )); } let mut cells: Vec = Vec::with_capacity(row.len()); for (col, value) in table.columns.iter().zip(row.iter()) { cells.push(encode_cell(col.user_type, value)?); } write_record(&mut out, cells.into_iter())?; } Ok(out) } /// One cell to write. `Plain` is unquoted; `Quoted` is /// always RFC 4180 double-quoted (used for the empty-string /// vs NULL distinction). enum Cell { Plain(String), Quoted(String), } /// Emit a record (header or row) to `out`. Adds the trailing /// `\n` (RFC 4180 says CRLF, but `\n` is universally accepted /// and matches what every CSV reader on every platform /// handles cleanly; line endings are deliberately uniform /// across our generated artefacts). fn write_record>(out: &mut Vec, cells: I) -> Result<(), String> { let mut first = true; for cell in cells { if !first { out.push(b','); } first = false; match cell { Cell::Plain(s) => { if needs_quoting(&s) { write_quoted(out, &s); } else { out.write_all(s.as_bytes()).map_err(|e| e.to_string())?; } } Cell::Quoted(s) => write_quoted(out, &s), } } out.push(b'\n'); Ok(()) } fn write_quoted(out: &mut Vec, s: &str) { out.push(b'"'); for &b in s.as_bytes() { if b == b'"' { out.extend_from_slice(b"\"\""); } else { out.push(b); } } out.push(b'"'); } fn needs_quoting(s: &str) -> bool { s.bytes().any(|b| matches!(b, b',' | b'"' | b'\n' | b'\r')) } /// Encode a single cell per type (ADR-0015 §4 table). Returns /// the cell wrapped in `Plain` or `Quoted` as appropriate for /// the NULL/empty distinction. fn encode_cell(ty: Type, value: &CellValue) -> Result { if matches!(value, CellValue::Null) { return Ok(Cell::Plain(String::new())); } match ty { Type::Text => match value { CellValue::Text(s) if s.is_empty() => Ok(Cell::Quoted(String::new())), CellValue::Text(s) => Ok(Cell::Plain(s.clone())), other => Err(format!("expected text, got {other:?}")), }, Type::Int => match value { CellValue::Integer(n) => Ok(Cell::Plain(n.to_string())), other => Err(format!("expected int, got {other:?}")), }, Type::Real => match value { CellValue::Real(f) => Ok(Cell::Plain(format_real(*f))), other => Err(format!("expected real, got {other:?}")), }, Type::Decimal => match value { // Decimals are stored as TEXT to preserve precision. CellValue::Text(s) if s.is_empty() => Ok(Cell::Quoted(String::new())), CellValue::Text(s) => Ok(Cell::Plain(s.clone())), other => Err(format!("expected decimal (text), got {other:?}")), }, Type::Bool => match value { CellValue::Integer(0) => Ok(Cell::Plain("false".to_string())), CellValue::Integer(1) => Ok(Cell::Plain("true".to_string())), other => Err(format!("expected bool (0 or 1), got {other:?}")), }, Type::Date | Type::DateTime => match value { CellValue::Text(s) if s.is_empty() => Ok(Cell::Quoted(String::new())), CellValue::Text(s) => Ok(Cell::Plain(s.clone())), other => Err(format!("expected date/datetime (text), got {other:?}")), }, Type::Blob => match value { CellValue::Blob(bytes) => Ok(Cell::Plain(base64::engine::general_purpose::STANDARD.encode(bytes))), other => Err(format!("expected blob, got {other:?}")), }, Type::Serial => match value { CellValue::Integer(n) => Ok(Cell::Plain(n.to_string())), other => Err(format!("expected serial (int), got {other:?}")), }, Type::ShortId => match value { CellValue::Text(s) if s.is_empty() => Ok(Cell::Quoted(String::new())), CellValue::Text(s) => Ok(Cell::Plain(s.clone())), other => Err(format!("expected shortid (text), got {other:?}")), }, } } fn format_real(f: f64) -> String { if f.is_nan() { "nan".to_string() } else if f.is_infinite() { if f > 0.0 { "inf".to_string() } else { "-inf".to_string() } } else { // Default `{}` formatting on f64 emits a shortest // round-tripping decimal — exactly what the ADR asks // for. format!("{f}") } } /// Parsed CSV records: header row + zero or more data rows. /// /// Each cell records whether it was syntactically quoted in /// the source — that's the bit we need to distinguish NULL /// (empty unquoted) from `""` (empty quoted). The `csv` /// crate doesn't expose this, which is why we hand-roll the /// reader to pair with the hand-rolled writer above. #[derive(Debug, PartialEq, Eq)] pub(crate) struct ParsedCsv { pub header: Vec, pub rows: Vec>, } #[derive(Debug, PartialEq, Eq, Clone)] pub(crate) struct RawCell { pub content: String, pub was_quoted: bool, } #[derive(Debug)] pub(crate) enum CsvError { Empty, InvalidUtf8, UnterminatedQuote, } impl std::fmt::Display for CsvError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let key = match self { Self::Empty => "persistence.csv.empty", Self::InvalidUtf8 => "persistence.csv.invalid_utf8", Self::UnterminatedQuote => "persistence.csv.unterminated_quote", }; f.write_str(&crate::friendly::translate(key, &[])) } } impl std::error::Error for CsvError {} /// Tokenize a CSV body. Returns the header (column names from /// the first record) and the data rows. Each cell preserves a /// `was_quoted` flag so the caller can distinguish an empty /// unquoted field (NULL) from an empty quoted field (`""`). pub(crate) fn parse_csv(body: &str) -> Result { let mut records: Vec> = Vec::new(); let mut current: Vec = Vec::new(); let bytes = body.as_bytes(); let mut i = 0; let n = bytes.len(); while i < n { let (cell, advance) = parse_field(&bytes[i..])?; i += advance; current.push(cell); match bytes.get(i) { Some(&b',') => i += 1, Some(&b'\n') => { i += 1; records.push(std::mem::take(&mut current)); } Some(&b'\r') => { i += 1; if bytes.get(i) == Some(&b'\n') { i += 1; } records.push(std::mem::take(&mut current)); } None => { records.push(std::mem::take(&mut current)); } Some(&other) => { // A non-structural byte after a quoted field — // shouldn't happen with our well-formed writer. // Treat as part of an unquoted continuation by // appending to the last cell. We choose to // tolerate rather than error since the most // common cause is a trailing space, which we // can roll into the cell. let last = current .last_mut() .or_else(|| records.last_mut().and_then(|r| r.last_mut())); if let Some(c) = last { c.content.push(other as char); } i += 1; } } } if !current.is_empty() { records.push(current); } if records.is_empty() { return Err(CsvError::Empty); } let header_record = records.remove(0); let header: Vec = header_record.into_iter().map(|c| c.content).collect(); Ok(ParsedCsv { header, rows: records, }) } fn parse_field(bytes: &[u8]) -> Result<(RawCell, usize), CsvError> { if bytes.first() == Some(&b'"') { let mut content_bytes: Vec = Vec::new(); let mut i = 1; while i < bytes.len() { match bytes[i] { b'"' => { if bytes.get(i + 1) == Some(&b'"') { content_bytes.push(b'"'); i += 2; } else { let content = String::from_utf8(content_bytes).map_err(|_| CsvError::InvalidUtf8)?; return Ok(( RawCell { content, was_quoted: true, }, i + 1, )); } } other => { content_bytes.push(other); i += 1; } } } Err(CsvError::UnterminatedQuote) } else { let mut i = 0; while i < bytes.len() { match bytes[i] { b',' | b'\n' | b'\r' => break, _ => i += 1, } } let content = String::from_utf8(bytes[..i].to_vec()).map_err(|_| CsvError::InvalidUtf8)?; Ok(( RawCell { content, was_quoted: false, }, i, )) } } /// Decode one parsed cell into a `CellValue` per the column's /// declared type. Returns an error string the caller can /// embed in a fatal banner per ADR-0015 §7 ("unable to load /// row N from data/T.csv into table T: …"). pub(crate) fn decode_cell(ty: Type, cell: &RawCell) -> Result { if !cell.was_quoted && cell.content.is_empty() { return Ok(CellValue::Null); } match ty { Type::Text | Type::Date | Type::DateTime | Type::Decimal | Type::ShortId => { Ok(CellValue::Text(cell.content.clone())) } Type::Int | Type::Serial => cell .content .parse::() .map(CellValue::Integer) .map_err(|_| format!("expected an integer, got `{}`", cell.content)), Type::Real => cell .content .parse::() .map(CellValue::Real) .map_err(|_| format!("expected a real number, got `{}`", cell.content)), Type::Bool => match cell.content.as_str() { "true" => Ok(CellValue::Integer(1)), "false" => Ok(CellValue::Integer(0)), other => Err(format!("expected `true` or `false`, got `{other}`")), }, Type::Blob => base64::engine::general_purpose::STANDARD .decode(cell.content.as_bytes()) .map(CellValue::Blob) .map_err(|e| format!("invalid base64 blob: {e}")), } } #[cfg(test)] mod tests { use super::*; use crate::persistence::ColumnSchema; fn col(name: &str, ty: Type) -> ColumnSchema { ColumnSchema { name: name.to_string(), user_type: ty, unique: false, not_null: false, default: None, check: None, } } #[test] fn empty_table_emits_header_only() { let body = serialize_table(&TableSnapshot { name: "Customers".to_string(), columns: vec![col("id", Type::Serial), col("Name", Type::Text)], rows: vec![], }) .unwrap(); assert_eq!(String::from_utf8(body).unwrap(), "id,Name\n"); } #[test] fn null_is_empty_unquoted_field() { let body = serialize_table(&TableSnapshot { name: "T".to_string(), columns: vec![col("Name", Type::Text)], rows: vec![vec![CellValue::Null]], }) .unwrap(); assert_eq!(String::from_utf8(body).unwrap(), "Name\n\n"); } #[test] fn empty_string_is_double_quoted() { let body = serialize_table(&TableSnapshot { name: "T".to_string(), columns: vec![col("Name", Type::Text)], rows: vec![vec![CellValue::Text(String::new())]], }) .unwrap(); assert_eq!(String::from_utf8(body).unwrap(), "Name\n\"\"\n"); } #[test] fn text_with_comma_or_quote_is_rfc4180_quoted() { let body = serialize_table(&TableSnapshot { name: "T".to_string(), columns: vec![col("Name", Type::Text)], rows: vec![ vec![CellValue::Text("hello, world".to_string())], vec![CellValue::Text("she said \"hi\"".to_string())], ], }) .unwrap(); let s = String::from_utf8(body).unwrap(); assert!(s.contains("\"hello, world\"")); assert!(s.contains("\"she said \"\"hi\"\"\"")); } #[test] fn ints_and_reals_round_trip_simply() { let body = serialize_table(&TableSnapshot { name: "T".to_string(), columns: vec![col("n", Type::Int), col("r", Type::Real)], rows: vec![ vec![CellValue::Integer(42), CellValue::Real(std::f64::consts::PI)], vec![CellValue::Integer(-7), CellValue::Real(0.0)], ], }) .unwrap(); let s = String::from_utf8(body).unwrap(); let lines: Vec<&str> = s.trim_end().lines().collect(); assert_eq!(lines[0], "n,r"); assert!(lines[1].starts_with("42,")); assert_eq!(lines[2], "-7,0"); } #[test] fn bools_use_words_not_digits() { let body = serialize_table(&TableSnapshot { name: "T".to_string(), columns: vec![col("b", Type::Bool)], rows: vec![ vec![CellValue::Integer(1)], vec![CellValue::Integer(0)], ], }) .unwrap(); let s = String::from_utf8(body).unwrap(); assert_eq!(s, "b\ntrue\nfalse\n"); } #[test] fn blobs_use_base64() { let body = serialize_table(&TableSnapshot { name: "T".to_string(), columns: vec![col("blob", Type::Blob)], rows: vec![vec![CellValue::Blob(b"hello".to_vec())]], }) .unwrap(); let s = String::from_utf8(body).unwrap(); assert!(s.contains("aGVsbG8=")); } #[test] fn dates_and_datetimes_pass_through() { let body = serialize_table(&TableSnapshot { name: "T".to_string(), columns: vec![col("d", Type::Date), col("ts", Type::DateTime)], rows: vec![vec![ CellValue::Text("2026-05-07".to_string()), CellValue::Text("2026-05-07T14:30:12Z".to_string()), ]], }) .unwrap(); let s = String::from_utf8(body).unwrap(); assert!(s.contains("2026-05-07,2026-05-07T14:30:12Z")); } #[test] fn parse_round_trips_simple_table() { let table = TableSnapshot { name: "Customers".to_string(), columns: vec![col("id", Type::Serial), col("Name", Type::Text)], rows: vec![ vec![CellValue::Integer(1), CellValue::Text("Alice".to_string())], vec![CellValue::Integer(2), CellValue::Text("Bob".to_string())], ], }; let body = serialize_table(&table).unwrap(); let parsed = parse_csv(std::str::from_utf8(&body).unwrap()).unwrap(); assert_eq!(parsed.header, vec!["id", "Name"]); assert_eq!(parsed.rows.len(), 2); assert_eq!(parsed.rows[0][0].content, "1"); assert_eq!(parsed.rows[0][1].content, "Alice"); assert_eq!(parsed.rows[1][1].content, "Bob"); } #[test] fn parse_distinguishes_null_from_empty_string() { // Header "Name", then two rows: NULL (empty unquoted) // and "" (empty quoted). let body = "Name\n\n\"\"\n"; let parsed = parse_csv(body).unwrap(); assert_eq!(parsed.rows.len(), 2); assert!(!parsed.rows[0][0].was_quoted); assert_eq!(parsed.rows[0][0].content, ""); assert!(parsed.rows[1][0].was_quoted); assert_eq!(parsed.rows[1][0].content, ""); let null = decode_cell(Type::Text, &parsed.rows[0][0]).unwrap(); let empty = decode_cell(Type::Text, &parsed.rows[1][0]).unwrap(); assert!(matches!(null, CellValue::Null)); assert!(matches!(empty, CellValue::Text(s) if s.is_empty())); } #[test] fn parse_handles_rfc4180_escapes() { let body = "Name\n\"hello, world\"\n\"she said \"\"hi\"\"\"\n"; let parsed = parse_csv(body).unwrap(); assert_eq!(parsed.rows[0][0].content, "hello, world"); assert_eq!(parsed.rows[1][0].content, "she said \"hi\""); } #[test] fn parse_decodes_per_type() { // Rows match the round-trip produced by serialize_table. let table = TableSnapshot { name: "T".to_string(), columns: vec![ col("n", Type::Int), col("r", Type::Real), col("b", Type::Bool), col("blob", Type::Blob), ], rows: vec![vec![ CellValue::Integer(42), CellValue::Real(std::f64::consts::PI), CellValue::Integer(1), CellValue::Blob(b"hi".to_vec()), ]], }; let body = serialize_table(&table).unwrap(); let parsed = parse_csv(std::str::from_utf8(&body).unwrap()).unwrap(); let row = &parsed.rows[0]; assert!(matches!(decode_cell(Type::Int, &row[0]).unwrap(), CellValue::Integer(42))); match decode_cell(Type::Real, &row[1]).unwrap() { CellValue::Real(f) => assert!((f - std::f64::consts::PI).abs() < 1e-12), other => panic!("got {other:?}"), } assert!(matches!(decode_cell(Type::Bool, &row[2]).unwrap(), CellValue::Integer(1))); assert!(matches!(decode_cell(Type::Blob, &row[3]).unwrap(), CellValue::Blob(b) if b == b"hi")); } #[test] fn parse_rejects_unterminated_quotes() { let err = parse_csv("Name\n\"oops").expect_err("must error"); assert!(matches!(err, CsvError::UnterminatedQuote)); } #[test] fn decode_cell_reports_friendly_error_for_bad_int() { let cell = RawCell { content: "abc".to_string(), was_quoted: false }; let err = decode_cell(Type::Int, &cell).expect_err("must error"); assert!(err.contains("integer")); assert!(err.contains("abc")); } #[test] fn row_width_mismatch_errors() { let err = serialize_table(&TableSnapshot { name: "T".to_string(), columns: vec![col("a", Type::Int), col("b", Type::Int)], rows: vec![vec![CellValue::Integer(1)]], }) .unwrap_err(); assert!(err.contains("row width")); } }