Files
rdbms-playground/src/persistence/csv_io.rs
T
claude@clouddev1 6ca297579e round-5 follow-up r2: migrate all thiserror Display attributes to catalog
Completes the i18n sweep started in the previous commit. All
remaining hand-rolled user-facing English strings inside
thiserror #[error(...)] attributes have been moved into the
catalog. Drops the thiserror dependency entirely.

Fifteen error types migrated:

- dsl::action::UnknownAction         → parse.custom.unknown_action
- dsl::parser::ParseError            → parse.error_wrapper + parse.empty
- dsl::value::ValueError             → value.{type_mismatch,format}
- persistence::csv_io::CsvError      → persistence.csv.*
- persistence::mod::PersistenceError → persistence.{io,encode}
- persistence::yaml::YamlError       → persistence.yaml.*
- persistence::migrations::MigrateError → persistence.migrate.*
- project::lock::LockError           → project.lock.*
- project::naming::NamingError       → project.naming.*
- project::naming::UserNameError     → project.user_name.*
- project::mod::ProjectError         → project.{path_not_found,...}
- project::mod::SafeDeleteError      → project.safe_delete.*
- archive::ArchiveError              → archive.*
- cli::ArgsError                     → cli.*
- db::DbError                        → db.error.*

Pattern per type: drop thiserror::Error derive, write manual
Display calling crate::t!(), keep #[from] semantics via
explicit From impls, override Error::source() where applicable
so #[source]-style chaining is preserved.

Why this matters (user rationale): "fine to have fallbacks for
errors that are purely technical, but lift the output to a
place where it can be localized later and where an adjustment
with friendly text is easily possible if any of them become
part of the happy path." All surface strings now live in
en-US.yaml and can be reworded or localized without touching
Rust source.

Tests: 769 passing, 0 failed, 1 ignored. Clippy clean with
-D warnings. Cargo.toml: drop thiserror = "2.0.18".
2026-05-13 21:24:51 +00:00

585 lines
20 KiB
Rust

//! Per-type CSV writer and reader (ADR-0015 §4).
//!
//! Encoding rules per type are exactly as specified in the
//! ADR; the cell-level encoder lives in `encode_cell` and the
//! matching decoder in `decode_cell`. RFC 4180 quoting is
//! applied by the hand-rolled `write_record` / `write_quoted`
//! helpers in this module rather than by the `csv` crate.
//!
//! NULL representation: an empty unquoted field. We map
//! `CellValue::Null` to that, and
//! `CellValue::Text(String::new())` to a *quoted* empty
//! field (`""`). The pair round-trips because the reader
//! records a per-cell `was_quoted` flag.
//!
//! For the writer, the quoting rule mirrors what the `csv`
//! crate calls `QuoteStyle::Necessary`: quote only when
//! needed (separator, quote, newline). We handle the
//! empty-string-vs-null distinction manually by always
//! quoting non-null empty Text and never quoting Null.
//
// `pub(crate)` items below are re-exported from
// `persistence::mod.rs`; the db worker reaches them via that
// path. Clippy's `redundant_pub_crate` lint flags this
// pattern, but it's load-bearing here.
#![allow(clippy::redundant_pub_crate)]
use std::io::Write as _;
use base64::Engine as _;
use crate::dsl::types::Type;
use super::{CellValue, TableSnapshot};
/// Render a `TableSnapshot` as a CSV body and return it as raw UTF-8 bytes,
/// ready to be written to disk.
///
/// The first record is the header (the column names). Every following record
/// is one row, with each value encoded by `encode_cell` according to its
/// column's `user_type`.
///
/// # Errors
///
/// Returns a message when a row's width differs from the column count, or
/// when `encode_cell` or `write_record` reports a failure.
pub(super) fn serialize_table(table: &TableSnapshot) -> Result<Vec<u8>, String> {
    // Cells are emitted by hand rather than through the `csv` crate so the
    // NULL-vs-empty-string distinction stays under our control.
    let column_count = table.columns.len();
    let mut buf: Vec<u8> = Vec::new();

    let header = table
        .columns
        .iter()
        .map(|column| Cell::Plain(column.name.clone()));
    write_record(&mut buf, header)?;

    for values in &table.rows {
        if values.len() != column_count {
            return Err(format!(
                "row width {} does not match column count {} for table `{}`",
                values.len(),
                column_count,
                table.name,
            ));
        }
        // Encode the whole row first; the first failing cell aborts the
        // serialization before anything from this row is written.
        let encoded = table
            .columns
            .iter()
            .zip(values.iter())
            .map(|(column, value)| encode_cell(column.user_type, value))
            .collect::<Result<Vec<Cell>, String>>()?;
        write_record(&mut buf, encoded.into_iter())?;
    }

    Ok(buf)
}
/// One cell to write.
///
/// The variant tells `write_record` how to treat quoting: `Plain` content is
/// written as-is and only wrapped in RFC 4180 double quotes when
/// `needs_quoting` says it must be, while `Quoted` content is always
/// double-quoted (used for the empty-string vs NULL distinction).
enum Cell {
/// Written unquoted unless the content contains a separator, quote or
/// line break. An empty `Plain` is how NULL is emitted.
Plain(String),
/// Always written inside double quotes, even when empty.
Quoted(String),
}
/// Append one record (header or data row) to `out`.
///
/// Cells are separated by `,` and the record is terminated with a single
/// `\n`. RFC 4180 specifies CRLF, but a bare `\n` is accepted by CSV readers
/// everywhere and keeps line endings uniform across our generated artefacts.
///
/// `Cell::Quoted` is always written double-quoted; `Cell::Plain` is quoted
/// only when `needs_quoting` reports that its content requires it.
///
/// # Errors
///
/// Returns the stringified I/O error if writing into `out` fails.
fn write_record<I: Iterator<Item = Cell>>(out: &mut Vec<u8>, cells: I) -> Result<(), String> {
    for (position, cell) in cells.enumerate() {
        if position > 0 {
            out.push(b',');
        }
        // Reduce both variants to "text + must it be quoted?".
        let (text, quote) = match cell {
            Cell::Quoted(text) => (text, true),
            Cell::Plain(text) => {
                let quote = needs_quoting(&text);
                (text, quote)
            }
        };
        if quote {
            write_quoted(out, &text);
        } else {
            out.write_all(text.as_bytes()).map_err(|e| e.to_string())?;
        }
    }
    out.push(b'\n');
    Ok(())
}
/// Append `s` to `out` as an RFC 4180 quoted field: wrapped in double quotes,
/// with every embedded `"` doubled to `""`. All other bytes are copied
/// through unchanged.
fn write_quoted(out: &mut Vec<u8>, s: &str) {
    out.push(b'"');
    // Splitting on `"` yields the runs between quotes; re-joining them with
    // `""` doubles each embedded quote.
    let mut runs = s.split('"');
    if let Some(first) = runs.next() {
        out.extend_from_slice(first.as_bytes());
    }
    for run in runs {
        out.extend_from_slice(b"\"\"");
        out.extend_from_slice(run.as_bytes());
    }
    out.push(b'"');
}
/// Report whether `s` must be double-quoted to survive as a single CSV
/// field, i.e. whether it contains a comma, a double quote, a line feed or a
/// carriage return.
fn needs_quoting(s: &str) -> bool {
    s.contains(|c: char| matches!(c, ',' | '"' | '\n' | '\r'))
}
/// Encode a single cell per type (ADR-0015 §4 table). Returns
/// the cell wrapped in `Plain` or `Quoted` as appropriate for
/// the NULL/empty distinction.
fn encode_cell(ty: Type, value: &CellValue) -> Result<Cell, String> {
if matches!(value, CellValue::Null) {
return Ok(Cell::Plain(String::new()));
}
match ty {
Type::Text => match value {
CellValue::Text(s) if s.is_empty() => Ok(Cell::Quoted(String::new())),
CellValue::Text(s) => Ok(Cell::Plain(s.clone())),
other => Err(format!("expected text, got {other:?}")),
},
Type::Int => match value {
CellValue::Integer(n) => Ok(Cell::Plain(n.to_string())),
other => Err(format!("expected int, got {other:?}")),
},
Type::Real => match value {
CellValue::Real(f) => Ok(Cell::Plain(format_real(*f))),
other => Err(format!("expected real, got {other:?}")),
},
Type::Decimal => match value {
// Decimals are stored as TEXT to preserve precision.
CellValue::Text(s) if s.is_empty() => Ok(Cell::Quoted(String::new())),
CellValue::Text(s) => Ok(Cell::Plain(s.clone())),
other => Err(format!("expected decimal (text), got {other:?}")),
},
Type::Bool => match value {
CellValue::Integer(0) => Ok(Cell::Plain("false".to_string())),
CellValue::Integer(1) => Ok(Cell::Plain("true".to_string())),
other => Err(format!("expected bool (0 or 1), got {other:?}")),
},
Type::Date | Type::DateTime => match value {
CellValue::Text(s) if s.is_empty() => Ok(Cell::Quoted(String::new())),
CellValue::Text(s) => Ok(Cell::Plain(s.clone())),
other => Err(format!("expected date/datetime (text), got {other:?}")),
},
Type::Blob => match value {
CellValue::Blob(bytes) => Ok(Cell::Plain(base64::engine::general_purpose::STANDARD.encode(bytes))),
other => Err(format!("expected blob, got {other:?}")),
},
Type::Serial => match value {
CellValue::Integer(n) => Ok(Cell::Plain(n.to_string())),
other => Err(format!("expected serial (int), got {other:?}")),
},
Type::ShortId => match value {
CellValue::Text(s) if s.is_empty() => Ok(Cell::Quoted(String::new())),
CellValue::Text(s) => Ok(Cell::Plain(s.clone())),
other => Err(format!("expected shortid (text), got {other:?}")),
},
}
}
/// Render an `f64` for a CSV cell.
///
/// NaN is written as `nan` and the infinities as `inf` / `-inf`. Every
/// finite value uses the default `Display` formatting, which emits the
/// shortest decimal that round-trips — what the ADR asks for.
fn format_real(f: f64) -> String {
    match f {
        v if v.is_nan() => "nan".to_string(),
        v if v == f64::INFINITY => "inf".to_string(),
        v if v == f64::NEG_INFINITY => "-inf".to_string(),
        finite => finite.to_string(),
    }
}
/// Parsed CSV records: header row + zero or more data rows.
///
/// Each data cell records whether it was syntactically quoted in
/// the source — that's the bit we need to distinguish NULL
/// (empty unquoted) from `""` (empty quoted). The `csv`
/// crate doesn't expose this, which is why we hand-roll the
/// reader to pair with the hand-rolled writer above.
///
/// Produced by `parse_csv`. Header cells are reduced to their text only;
/// the quoting flag is kept for data cells.
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct ParsedCsv {
/// Column names taken from the first record of the input.
pub header: Vec<String>,
/// Every record after the first, in input order, one `RawCell` per field.
pub rows: Vec<Vec<RawCell>>,
}
/// One tokenized CSV field, before any per-type decoding.
///
/// `decode_cell` treats an empty `content` with `was_quoted == false` as
/// NULL; an empty `content` with `was_quoted == true` is a real empty value.
#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) struct RawCell {
/// Field text with the surrounding quotes removed and `""` escapes
/// collapsed to a single `"`.
pub content: String,
/// `true` when the field started with a double quote in the source.
pub was_quoted: bool,
}
/// Failures the CSV tokenizer (`parse_csv` / `parse_field`) can report.
///
/// The user-facing text for each variant comes from the message catalog
/// via the `Display` impl, keyed under `persistence.csv.*`.
#[derive(Debug)]
pub(crate) enum CsvError {
/// The input contained no records at all, so there is no header.
Empty,
/// The bytes of a field were not valid UTF-8.
InvalidUtf8,
/// A quoted field was opened but the input ended before its closing quote.
UnterminatedQuote,
}
/// User-facing rendering of a `CsvError`.
///
/// Each variant maps to a `persistence.csv.*` catalog key, which is handed
/// to `crate::friendly::translate` with no arguments; the returned text is
/// written to the formatter unchanged.
impl std::fmt::Display for CsvError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let rendered = crate::friendly::translate(
            match self {
                Self::Empty => "persistence.csv.empty",
                Self::InvalidUtf8 => "persistence.csv.invalid_utf8",
                Self::UnterminatedQuote => "persistence.csv.unterminated_quote",
            },
            &[],
        );
        f.write_str(&rendered)
    }
}
// No variant wraps an underlying error, so the trait's default methods
// (including a `source()` of `None`) are used as-is.
impl std::error::Error for CsvError {}
/// Tokenize a CSV body. Returns the header (column names from
/// the first record) and the data rows. Each cell preserves a
/// `was_quoted` flag so the caller can distinguish an empty
/// unquoted field (NULL) from an empty quoted field (`""`).
///
/// Fields are separated by `,`; records end at `\n`, `\r\n`, a bare `\r`, or
/// the end of the input. A trailing `,` at the very end of the input is
/// followed by one final empty field, so a last record without a line break
/// keeps its full width.
///
/// Stray text between a closing quote and the next separator (most commonly
/// a trailing space) is tolerated rather than rejected: the whole run up to
/// the next separator is appended, as UTF-8 text, to the cell it follows.
///
/// # Errors
///
/// * `CsvError::Empty` — the input contains no records.
/// * `CsvError::UnterminatedQuote` — a quoted field is never closed.
/// * `CsvError::InvalidUtf8` — a field's bytes are not valid UTF-8.
pub(crate) fn parse_csv(body: &str) -> Result<ParsedCsv, CsvError> {
    let bytes = body.as_bytes();
    let n = bytes.len();
    let mut records: Vec<Vec<RawCell>> = Vec::new();
    let mut current: Vec<RawCell> = Vec::new();
    let mut i = 0;
    // Set right after a `,`: a field must follow even when the input ends
    // there, otherwise a trailing empty field would be lost.
    let mut field_pending = false;

    while i < n || field_pending {
        field_pending = false;

        let (mut cell, advance) = parse_field(&bytes[i..])?;
        i += advance;

        // `parse_field` stops right after a closing quote. Anything between
        // that quote and the next separator belongs to the same cell; take
        // the run as a whole so multi-byte characters stay intact and the
        // remainder is not mistaken for a new field.
        let stray_len = bytes[i..]
            .iter()
            .position(|&b| matches!(b, b',' | b'\n' | b'\r'))
            .unwrap_or(n - i);
        if stray_len > 0 {
            let stray = std::str::from_utf8(&bytes[i..i + stray_len])
                .map_err(|_| CsvError::InvalidUtf8)?;
            cell.content.push_str(stray);
            i += stray_len;
        }
        current.push(cell);

        match bytes.get(i) {
            Some(&b',') => {
                i += 1;
                field_pending = true;
            }
            Some(&b'\n') => {
                i += 1;
                records.push(std::mem::take(&mut current));
            }
            Some(&b'\r') => {
                i += 1;
                // Treat `\r\n` as a single record terminator.
                if bytes.get(i) == Some(&b'\n') {
                    i += 1;
                }
                records.push(std::mem::take(&mut current));
            }
            None => {
                // Last record without a trailing line break.
                records.push(std::mem::take(&mut current));
            }
            Some(_) => unreachable!("stray-text scan stops only at a separator or end of input"),
        }
    }

    // Every path out of the loop has already flushed `current` into
    // `records`, so the first record (if any) is the header.
    let mut records = records.into_iter();
    let header: Vec<String> = records
        .next()
        .ok_or(CsvError::Empty)?
        .into_iter()
        .map(|cell| cell.content)
        .collect();
    Ok(ParsedCsv {
        header,
        rows: records.collect(),
    })
}
/// Read one field from the start of `bytes`.
///
/// Returns the parsed cell together with the number of bytes consumed.
///
/// * A field that starts with `"` is read up to its closing quote, with each
///   `""` pair collapsed to a single `"`. The consumed count includes both
///   surrounding quotes; whatever follows the closing quote is left for the
///   caller.
/// * Any other field runs up to (not including) the next `,`, `\n` or `\r`,
///   or to the end of `bytes`.
///
/// # Errors
///
/// * `CsvError::UnterminatedQuote` — `bytes` ends inside a quoted field.
/// * `CsvError::InvalidUtf8` — the field's content is not valid UTF-8.
fn parse_field(bytes: &[u8]) -> Result<(RawCell, usize), CsvError> {
    if bytes.first() != Some(&b'"') {
        // Unquoted: everything up to the next separator or end of input.
        let len = bytes
            .iter()
            .position(|&b| matches!(b, b',' | b'\n' | b'\r'))
            .unwrap_or(bytes.len());
        let content = std::str::from_utf8(&bytes[..len])
            .map_err(|_| CsvError::InvalidUtf8)?
            .to_owned();
        return Ok((
            RawCell {
                content,
                was_quoted: false,
            },
            len,
        ));
    }

    // Quoted: start after the opening quote and unescape as we go.
    let mut unescaped: Vec<u8> = Vec::new();
    let mut pos = 1;
    while let Some(&byte) = bytes.get(pos) {
        if byte != b'"' {
            unescaped.push(byte);
            pos += 1;
            continue;
        }
        if bytes.get(pos + 1) == Some(&b'"') {
            // `""` inside a quoted field is an escaped quote.
            unescaped.push(b'"');
            pos += 2;
            continue;
        }
        // A lone quote closes the field.
        let content = String::from_utf8(unescaped).map_err(|_| CsvError::InvalidUtf8)?;
        return Ok((
            RawCell {
                content,
                was_quoted: true,
            },
            pos + 1,
        ));
    }
    Err(CsvError::UnterminatedQuote)
}
/// Turn one tokenized cell into a `CellValue` according to the column's
/// declared type.
///
/// An empty, unquoted cell is NULL for every type. Otherwise:
///
/// * text-backed types (`Text`, `Date`, `DateTime`, `Decimal`, `ShortId`)
///   keep the content verbatim;
/// * `Int` / `Serial` parse as `i64`, `Real` as `f64`;
/// * `Bool` accepts exactly `true` or `false` and maps them to integer 1 / 0;
/// * `Blob` is decoded from standard base64.
///
/// # Errors
///
/// Returns an error string the caller can embed in a fatal banner per
/// ADR-0015 §7 ("unable to load row N from data/T.csv into table T: …").
pub(crate) fn decode_cell(ty: Type, cell: &RawCell) -> Result<CellValue, String> {
    let raw = cell.content.as_str();
    if raw.is_empty() && !cell.was_quoted {
        return Ok(CellValue::Null);
    }
    match ty {
        Type::Int | Type::Serial => match raw.parse::<i64>() {
            Ok(n) => Ok(CellValue::Integer(n)),
            Err(_) => Err(format!("expected an integer, got `{raw}`")),
        },
        Type::Real => match raw.parse::<f64>() {
            Ok(x) => Ok(CellValue::Real(x)),
            Err(_) => Err(format!("expected a real number, got `{raw}`")),
        },
        Type::Bool => match raw {
            "true" => Ok(CellValue::Integer(1)),
            "false" => Ok(CellValue::Integer(0)),
            other => Err(format!("expected `true` or `false`, got `{other}`")),
        },
        Type::Blob => match base64::engine::general_purpose::STANDARD.decode(raw.as_bytes()) {
            Ok(decoded) => Ok(CellValue::Blob(decoded)),
            Err(e) => Err(format!("invalid base64 blob: {e}")),
        },
        Type::Text | Type::Date | Type::DateTime | Type::Decimal | Type::ShortId => {
            Ok(CellValue::Text(raw.to_owned()))
        }
    }
}
// Unit tests for the CSV writer (`serialize_table`), the tokenizer
// (`parse_csv`) and the per-type decoder (`decode_cell`).
#[cfg(test)]
mod tests {
use super::*;
use crate::persistence::ColumnSchema;
/// Build a non-unique column of the given name and type.
fn col(name: &str, ty: Type) -> ColumnSchema {
ColumnSchema { name: name.to_string(), user_type: ty, unique: false }
}
/// A table without rows serializes to just the header line.
#[test]
fn empty_table_emits_header_only() {
let body = serialize_table(&TableSnapshot {
name: "Customers".to_string(),
columns: vec![col("id", Type::Serial), col("Name", Type::Text)],
rows: vec![],
})
.unwrap();
assert_eq!(String::from_utf8(body).unwrap(), "id,Name\n");
}
/// NULL in a single-column table produces an empty line after the header.
#[test]
fn null_is_empty_unquoted_field() {
let body = serialize_table(&TableSnapshot {
name: "T".to_string(),
columns: vec![col("Name", Type::Text)],
rows: vec![vec![CellValue::Null]],
})
.unwrap();
assert_eq!(String::from_utf8(body).unwrap(), "Name\n\n");
}
/// A non-null empty string is written as `""`, not as an empty field.
#[test]
fn empty_string_is_double_quoted() {
let body = serialize_table(&TableSnapshot {
name: "T".to_string(),
columns: vec![col("Name", Type::Text)],
rows: vec![vec![CellValue::Text(String::new())]],
})
.unwrap();
assert_eq!(String::from_utf8(body).unwrap(), "Name\n\"\"\n");
}
/// Text containing a comma or a quote is wrapped in quotes, with embedded
/// quotes doubled.
#[test]
fn text_with_comma_or_quote_is_rfc4180_quoted() {
let body = serialize_table(&TableSnapshot {
name: "T".to_string(),
columns: vec![col("Name", Type::Text)],
rows: vec![
vec![CellValue::Text("hello, world".to_string())],
vec![CellValue::Text("she said \"hi\"".to_string())],
],
})
.unwrap();
let s = String::from_utf8(body).unwrap();
assert!(s.contains("\"hello, world\""));
assert!(s.contains("\"she said \"\"hi\"\"\""));
}
/// Integers and reals are written in plain decimal form; `0.0` comes out
/// as `0`.
#[test]
fn ints_and_reals_round_trip_simply() {
let body = serialize_table(&TableSnapshot {
name: "T".to_string(),
columns: vec![col("n", Type::Int), col("r", Type::Real)],
rows: vec![
vec![CellValue::Integer(42), CellValue::Real(std::f64::consts::PI)],
vec![CellValue::Integer(-7), CellValue::Real(0.0)],
],
})
.unwrap();
let s = String::from_utf8(body).unwrap();
let lines: Vec<&str> = s.trim_end().lines().collect();
assert_eq!(lines[0], "n,r");
assert!(lines[1].starts_with("42,"));
assert_eq!(lines[2], "-7,0");
}
/// Bool columns store 1/0 but are written as `true` / `false`.
#[test]
fn bools_use_words_not_digits() {
let body = serialize_table(&TableSnapshot {
name: "T".to_string(),
columns: vec![col("b", Type::Bool)],
rows: vec![
vec![CellValue::Integer(1)],
vec![CellValue::Integer(0)],
],
})
.unwrap();
let s = String::from_utf8(body).unwrap();
assert_eq!(s, "b\ntrue\nfalse\n");
}
/// Blob bytes are written as standard base64 (`hello` -> `aGVsbG8=`).
#[test]
fn blobs_use_base64() {
let body = serialize_table(&TableSnapshot {
name: "T".to_string(),
columns: vec![col("blob", Type::Blob)],
rows: vec![vec![CellValue::Blob(b"hello".to_vec())]],
})
.unwrap();
let s = String::from_utf8(body).unwrap();
assert!(s.contains("aGVsbG8="));
}
/// Date and datetime text is written verbatim.
#[test]
fn dates_and_datetimes_pass_through() {
let body = serialize_table(&TableSnapshot {
name: "T".to_string(),
columns: vec![col("d", Type::Date), col("ts", Type::DateTime)],
rows: vec![vec![
CellValue::Text("2026-05-07".to_string()),
CellValue::Text("2026-05-07T14:30:12Z".to_string()),
]],
})
.unwrap();
let s = String::from_utf8(body).unwrap();
assert!(s.contains("2026-05-07,2026-05-07T14:30:12Z"));
}
/// Serializing then tokenizing a small table yields the same header and
/// cell text.
#[test]
fn parse_round_trips_simple_table() {
let table = TableSnapshot {
name: "Customers".to_string(),
columns: vec![col("id", Type::Serial), col("Name", Type::Text)],
rows: vec![
vec![CellValue::Integer(1), CellValue::Text("Alice".to_string())],
vec![CellValue::Integer(2), CellValue::Text("Bob".to_string())],
],
};
let body = serialize_table(&table).unwrap();
let parsed = parse_csv(std::str::from_utf8(&body).unwrap()).unwrap();
assert_eq!(parsed.header, vec!["id", "Name"]);
assert_eq!(parsed.rows.len(), 2);
assert_eq!(parsed.rows[0][0].content, "1");
assert_eq!(parsed.rows[0][1].content, "Alice");
assert_eq!(parsed.rows[1][1].content, "Bob");
}
/// The tokenizer keeps the quoting flag, and the decoder uses it to tell
/// NULL from an empty string.
#[test]
fn parse_distinguishes_null_from_empty_string() {
// Header "Name", then two rows: NULL (empty unquoted)
// and "" (empty quoted).
let body = "Name\n\n\"\"\n";
let parsed = parse_csv(body).unwrap();
assert_eq!(parsed.rows.len(), 2);
assert!(!parsed.rows[0][0].was_quoted);
assert_eq!(parsed.rows[0][0].content, "");
assert!(parsed.rows[1][0].was_quoted);
assert_eq!(parsed.rows[1][0].content, "");
let null = decode_cell(Type::Text, &parsed.rows[0][0]).unwrap();
let empty = decode_cell(Type::Text, &parsed.rows[1][0]).unwrap();
assert!(matches!(null, CellValue::Null));
assert!(matches!(empty, CellValue::Text(s) if s.is_empty()));
}
/// Quoted fields may contain commas, and `""` is unescaped to `"`.
#[test]
fn parse_handles_rfc4180_escapes() {
let body = "Name\n\"hello, world\"\n\"she said \"\"hi\"\"\"\n";
let parsed = parse_csv(body).unwrap();
assert_eq!(parsed.rows[0][0].content, "hello, world");
assert_eq!(parsed.rows[1][0].content, "she said \"hi\"");
}
/// Int, Real, Bool and Blob values survive serialize -> parse -> decode.
#[test]
fn parse_decodes_per_type() {
// Rows match the round-trip produced by serialize_table.
let table = TableSnapshot {
name: "T".to_string(),
columns: vec![
col("n", Type::Int),
col("r", Type::Real),
col("b", Type::Bool),
col("blob", Type::Blob),
],
rows: vec![vec![
CellValue::Integer(42),
CellValue::Real(std::f64::consts::PI),
CellValue::Integer(1),
CellValue::Blob(b"hi".to_vec()),
]],
};
let body = serialize_table(&table).unwrap();
let parsed = parse_csv(std::str::from_utf8(&body).unwrap()).unwrap();
let row = &parsed.rows[0];
assert!(matches!(decode_cell(Type::Int, &row[0]).unwrap(), CellValue::Integer(42)));
match decode_cell(Type::Real, &row[1]).unwrap() {
CellValue::Real(f) => assert!((f - std::f64::consts::PI).abs() < 1e-12),
other => panic!("got {other:?}"),
}
assert!(matches!(decode_cell(Type::Bool, &row[2]).unwrap(), CellValue::Integer(1)));
assert!(matches!(decode_cell(Type::Blob, &row[3]).unwrap(), CellValue::Blob(b) if b == b"hi"));
}
/// A quoted field that never closes is reported as `UnterminatedQuote`.
#[test]
fn parse_rejects_unterminated_quotes() {
let err = parse_csv("Name\n\"oops").expect_err("must error");
assert!(matches!(err, CsvError::UnterminatedQuote));
}
/// The decode error for a bad integer names both the expected kind and
/// the offending text.
#[test]
fn decode_cell_reports_friendly_error_for_bad_int() {
let cell = RawCell { content: "abc".to_string(), was_quoted: false };
let err = decode_cell(Type::Int, &cell).expect_err("must error");
assert!(err.contains("integer"));
assert!(err.contains("abc"));
}
/// A row with fewer values than columns is rejected by the serializer.
#[test]
fn row_width_mismatch_errors() {
let err = serialize_table(&TableSnapshot {
name: "T".to_string(),
columns: vec![col("a", Type::Int), col("b", Type::Int)],
rows: vec![vec![CellValue::Integer(1)]],
})
.unwrap_err();
assert!(err.contains("row width"));
}
}