rdbms-playground/src/persistence/csv_io.rs

//! Per-type CSV writer and reader (ADR-0015 §4).
//!
//! Encoding rules per type are exactly as specified in the
//! ADR; the cell-level encoder lives in `encode_cell` and its
//! inverse in `decode_cell`. RFC 4180 quoting and tokenizing
//! are hand-rolled in this module rather than delegated to
//! the `csv` crate, so that the NULL-vs-empty-string
//! distinction stays under our control: the crate does not
//! expose whether a field was syntactically quoted.
//!
//! NULL representation: an empty unquoted field. We map
//! `CellValue::Null` to that, and
//! `CellValue::Text(String::new())` to a *quoted* empty
//! field (`""`), so the two round-trip as distinct values.
//!
//! For the writer, a field is quoted only when needed
//! (separator, quote, newline). We handle the
//! empty-string-vs-null distinction manually by always
//! quoting non-null empty Text and never quoting Null. The
//! reader records a `was_quoted` flag per cell so the decoder
//! can tell the two apart again.
//
// `pub(crate)` items below are re-exported from
// `persistence::mod.rs`; the db worker reaches them via that
// path. Clippy's `redundant_pub_crate` lint flags this
// pattern, but it's load-bearing here.
#![allow(clippy::redundant_pub_crate)]

use std::io::Write as _;

use base64::Engine as _;

use crate::dsl::types::Type;

use super::{CellValue, TableSnapshot};

/// Render a `TableSnapshot` as a CSV document.
///
/// The first record is the header (one field per column name);
/// every following record is one table row, with each value
/// encoded according to its column's declared type.
///
/// # Errors
///
/// Returns a message when a row's width differs from the
/// column count, or when a value does not match the type of
/// the column it sits in.
pub(super) fn serialize_table(table: &TableSnapshot) -> Result<Vec<u8>, String> {
    // Records are framed by hand (not via the `csv` crate) so
    // that we decide, per cell, whether it is quoted — that is
    // what keeps NULL and the empty string distinguishable.
    let mut buf: Vec<u8> = Vec::new();

    let header = table
        .columns
        .iter()
        .map(|column| Cell::Plain(column.name.clone()));
    write_record(&mut buf, header)?;

    let width = table.columns.len();
    for row in &table.rows {
        if row.len() != width {
            return Err(format!(
                "row width {} does not match column count {} for table `{}`",
                row.len(),
                width,
                table.name,
            ));
        }
        // Encode the whole row first so a type mismatch is
        // reported before any part of the row is emitted.
        let encoded = table
            .columns
            .iter()
            .zip(row.iter())
            .map(|(column, value)| encode_cell(column.user_type, value))
            .collect::<Result<Vec<Cell>, String>>()?;
        write_record(&mut buf, encoded.into_iter())?;
    }

    Ok(buf)
}

/// One cell to write.
///
/// The variant decides the quoting policy `write_record`
/// applies, which is how the empty-string vs NULL distinction
/// is carried into the output.
enum Cell {
    /// Written bare, unless the content contains a comma,
    /// double quote, CR or LF — then it is RFC 4180 quoted.
    /// An empty `Plain` is how NULL is represented.
    Plain(String),
    /// Always RFC 4180 double-quoted, even when empty (used
    /// for the non-null empty string, emitted as `""`).
    Quoted(String),
}

/// Append one record (header or data row) to `out`.
///
/// Fields are comma-separated and the record is terminated by
/// a bare `\n`. RFC 4180 nominally asks for CRLF, but `\n` is
/// accepted by CSV readers everywhere and keeps line endings
/// uniform across our generated artefacts.
///
/// A `Cell::Quoted` is always wrapped in double quotes; a
/// `Cell::Plain` is wrapped only when its content requires it.
fn write_record<I: Iterator<Item = Cell>>(out: &mut Vec<u8>, cells: I) -> Result<(), String> {
    for (index, cell) in cells.enumerate() {
        // Separator goes *between* fields, never before the first.
        if index > 0 {
            out.push(b',');
        }
        let (text, force_quotes) = match cell {
            Cell::Plain(text) => (text, false),
            Cell::Quoted(text) => (text, true),
        };
        if force_quotes || needs_quoting(&text) {
            write_quoted(out, &text);
        } else {
            out.write_all(text.as_bytes())
                .map_err(|err| err.to_string())?;
        }
    }
    out.push(b'\n');
    Ok(())
}

/// Append `s` to `out` as an RFC 4180 quoted field: wrapped in
/// double quotes, with every embedded `"` doubled to `""`.
fn write_quoted(out: &mut Vec<u8>, s: &str) {
    out.push(b'"');
    // Splitting on the quote character yields the runs between
    // quotes; re-joining them with `""` doubles each original
    // quote.
    for (index, segment) in s.split('"').enumerate() {
        if index > 0 {
            out.extend_from_slice(b"\"\"");
        }
        out.extend_from_slice(segment.as_bytes());
    }
    out.push(b'"');
}

/// Whether `s` contains a character that forces RFC 4180
/// quoting: the field separator, a double quote, or a line
/// break (LF or CR).
fn needs_quoting(s: &str) -> bool {
    s.contains(|c: char| matches!(c, ',' | '"' | '\n' | '\r'))
}

/// Encode one value for a column of type `ty` (ADR-0015 §4
/// table).
///
/// The result says both *what* to write and *how* to quote it:
/// NULL becomes an empty `Cell::Plain`, a non-null empty string
/// becomes an empty `Cell::Quoted`, and everything else is a
/// `Cell::Plain` carrying the textual encoding.
///
/// # Errors
///
/// Returns a message naming the expected kind when `value`'s
/// variant does not match what `ty` is stored as.
fn encode_cell(ty: Type, value: &CellValue) -> Result<Cell, String> {
    match (ty, value) {
        // NULL is type-independent: always an empty bare field.
        (_, CellValue::Null) => Ok(Cell::Plain(String::new())),

        // Text-backed types (decimals are stored as TEXT to
        // preserve precision). Only the empty string needs the
        // forced quotes that set it apart from NULL.
        (
            Type::Text | Type::Decimal | Type::Date | Type::DateTime | Type::ShortId,
            CellValue::Text(s),
        ) => Ok(if s.is_empty() {
            Cell::Quoted(String::new())
        } else {
            Cell::Plain(s.clone())
        }),

        // Integer-backed types.
        (Type::Int | Type::Serial, CellValue::Integer(n)) => Ok(Cell::Plain(n.to_string())),
        (Type::Bool, CellValue::Integer(0)) => Ok(Cell::Plain("false".to_string())),
        (Type::Bool, CellValue::Integer(1)) => Ok(Cell::Plain("true".to_string())),

        (Type::Real, CellValue::Real(f)) => Ok(Cell::Plain(format_real(*f))),

        (Type::Blob, CellValue::Blob(bytes)) => Ok(Cell::Plain(
            base64::engine::general_purpose::STANDARD.encode(bytes),
        )),

        // Anything that reaches this point is a value whose
        // variant does not fit the column type.
        (Type::Text, other) => Err(format!("expected text, got {other:?}")),
        (Type::Int, other) => Err(format!("expected int, got {other:?}")),
        (Type::Real, other) => Err(format!("expected real, got {other:?}")),
        (Type::Decimal, other) => Err(format!("expected decimal (text), got {other:?}")),
        (Type::Bool, other) => Err(format!("expected bool (0 or 1), got {other:?}")),
        (Type::Date | Type::DateTime, other) => {
            Err(format!("expected date/datetime (text), got {other:?}"))
        }
        (Type::Blob, other) => Err(format!("expected blob, got {other:?}")),
        (Type::Serial, other) => Err(format!("expected serial (int), got {other:?}")),
        (Type::ShortId, other) => Err(format!("expected shortid (text), got {other:?}")),
    }
}

/// Textual form of a REAL value.
///
/// Finite numbers use `f64`'s default `Display` output, which
/// is the shortest decimal that parses back to the same value —
/// what the ADR asks for. Non-finite values are spelled `nan`,
/// `inf` and `-inf`.
fn format_real(f: f64) -> String {
    if f.is_finite() {
        return f.to_string();
    }
    let spelled = if f.is_nan() {
        "nan"
    } else if f > 0.0 {
        "inf"
    } else {
        "-inf"
    };
    spelled.to_string()
}

/// Parsed CSV records: header row + zero or more data rows.
///
/// Each cell records whether it was syntactically quoted in
/// the source — that's the bit we need to distinguish NULL
/// (empty unquoted) from `""` (empty quoted). The `csv`
/// crate doesn't expose this, which is why we hand-roll the
/// reader to pair with the hand-rolled writer above.
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct ParsedCsv {
    /// Field contents of the first record (the column names);
    /// quoting information is dropped for the header.
    pub header: Vec<String>,
    /// Every record after the first, in file order. Row widths
    /// are not validated against the header here.
    pub rows: Vec<Vec<RawCell>>,
}

/// One tokenized CSV field, before any per-type decoding.
#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) struct RawCell {
    /// Field text with the surrounding quotes removed and
    /// doubled quotes (`""`) collapsed to a single `"`.
    pub content: String,
    /// `true` when the field started with a double quote in the
    /// source. An empty field with `was_quoted == false` decodes
    /// to NULL; with `was_quoted == true` it is the empty string.
    pub was_quoted: bool,
}

/// Reasons tokenizing a CSV body can fail.
///
/// The variants carry no payload, so the type is cheap to copy
/// and can be compared directly (e.g. with `assert_eq!`), in
/// line with the other parser types in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum CsvError {
    /// The body contained no records at all (not even a header).
    Empty,
    /// A field's bytes were not valid UTF-8.
    InvalidUtf8,
    /// A quoted field was opened but the input ended before its
    /// closing quote.
    UnterminatedQuote,
}

impl std::fmt::Display for CsvError {
    /// Writes the user-facing message for this error, looked up
    /// by message key through `crate::friendly::translate` (no
    /// substitution arguments are passed).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = crate::friendly::translate(
            match self {
                Self::Empty => "persistence.csv.empty",
                Self::InvalidUtf8 => "persistence.csv.invalid_utf8",
                Self::UnterminatedQuote => "persistence.csv.unterminated_quote",
            },
            &[],
        );
        f.write_str(&message)
    }
}

// Marker impl: `CsvError` has no underlying source error, so the
// trait's default methods are sufficient.
impl std::error::Error for CsvError {}

/// Tokenize a CSV body. Returns the header (column names from
/// the first record) and the data rows. Each cell preserves a
/// `was_quoted` flag so the caller can distinguish an empty
/// unquoted field (NULL) from an empty quoted field (`""`).
///
/// Records end at `\n`, `\r\n`, a lone `\r`, or end of input.
/// A final record without a trailing newline is accepted, and a
/// trailing comma at end of input yields a final empty field
/// rather than being dropped. Row widths are not checked here.
///
/// Text that follows a closing quote before the next delimiter
/// (most commonly a stray trailing space) is tolerated: the
/// whole run is appended to that field's content.
///
/// # Errors
///
/// * `CsvError::Empty` — the body contains no records.
/// * `CsvError::UnterminatedQuote` — a quoted field never closes.
/// * `CsvError::InvalidUtf8` — propagated from `parse_field`.
pub(crate) fn parse_csv(body: &str) -> Result<ParsedCsv, CsvError> {
    let bytes = body.as_bytes();
    let n = bytes.len();
    let mut records: Vec<Vec<RawCell>> = Vec::new();
    let mut current: Vec<RawCell> = Vec::new();
    let mut i = 0;
    // Set right after a comma: one more field is owed even if
    // the input ends there (`a,` is two fields, not one).
    let mut field_owed = false;

    while i < n || field_owed {
        field_owed = false;
        let (mut cell, advance) = parse_field(&bytes[i..])?;
        i += advance;

        // An unquoted field already runs up to the next
        // delimiter, so anything found here trails a closing
        // quote — shouldn't happen with our well-formed writer.
        // Absorb the whole run into the cell in one go. Both
        // ends of the slice sit next to ASCII bytes (the closing
        // quote and a delimiter, or end of input), so slicing
        // `body` is on character boundaries and multi-byte
        // characters survive intact.
        let rest = &bytes[i..];
        let stray = rest
            .iter()
            .position(|&b| matches!(b, b',' | b'\n' | b'\r'))
            .unwrap_or(rest.len());
        if stray > 0 {
            cell.content.push_str(&body[i..i + stray]);
            i += stray;
        }
        current.push(cell);

        // `i` now sits on a delimiter or at end of input.
        match bytes.get(i).copied() {
            Some(b',') => {
                i += 1;
                field_owed = true;
            }
            terminator => {
                i += match terminator {
                    // CRLF counts as a single record terminator.
                    Some(b'\r') if bytes.get(i + 1) == Some(&b'\n') => 2,
                    Some(_) => 1,
                    None => 0,
                };
                records.push(std::mem::take(&mut current));
            }
        }
    }

    // The first record is the header; everything after it is data.
    let mut records = records.into_iter();
    let header: Vec<String> = records
        .next()
        .ok_or(CsvError::Empty)?
        .into_iter()
        .map(|c| c.content)
        .collect();
    Ok(ParsedCsv {
        header,
        rows: records.collect(),
    })
}

/// Read one field from the start of `bytes`.
///
/// Returns the cell together with the number of bytes consumed.
/// For a quoted field that count includes both quotes; for an
/// unquoted field it stops just before the delimiter (`,`, `\n`
/// or `\r`) or at end of input, so the delimiter itself is left
/// for the caller.
///
/// # Errors
///
/// * `CsvError::UnterminatedQuote` — the field opens with `"`
///   but the input ends before a closing quote.
/// * `CsvError::InvalidUtf8` — the field's bytes are not UTF-8.
fn parse_field(bytes: &[u8]) -> Result<(RawCell, usize), CsvError> {
    if bytes.first() != Some(&b'"') {
        // Unquoted: everything up to the next delimiter.
        let end = bytes
            .iter()
            .position(|&b| b == b',' || b == b'\n' || b == b'\r')
            .unwrap_or(bytes.len());
        let content = std::str::from_utf8(&bytes[..end])
            .map_err(|_| CsvError::InvalidUtf8)?
            .to_owned();
        let cell = RawCell {
            content,
            was_quoted: false,
        };
        return Ok((cell, end));
    }

    // Quoted: scan from just past the opening quote, collapsing
    // `""` into a single quote, until the lone closing quote.
    let mut unescaped: Vec<u8> = Vec::new();
    let mut pos = 1;
    loop {
        match (bytes.get(pos).copied(), bytes.get(pos + 1).copied()) {
            (None, _) => return Err(CsvError::UnterminatedQuote),
            (Some(b'"'), Some(b'"')) => {
                unescaped.push(b'"');
                pos += 2;
            }
            (Some(b'"'), _) => {
                let content = String::from_utf8(unescaped).map_err(|_| CsvError::InvalidUtf8)?;
                let cell = RawCell {
                    content,
                    was_quoted: true,
                };
                return Ok((cell, pos + 1));
            }
            (Some(byte), _) => {
                unescaped.push(byte);
                pos += 1;
            }
        }
    }
}

/// Turn one tokenized cell into a `CellValue` according to the
/// column's declared type.
///
/// An empty, unquoted cell is NULL for every type. Otherwise
/// text-backed types keep the content verbatim, integers and
/// reals are parsed, booleans must be spelled `true`/`false`,
/// and blobs are base64-decoded.
///
/// # Errors
///
/// Returns a message the caller can embed in a fatal banner per
/// ADR-0015 §7 ("unable to load row N from data/T.csv into
/// table T: …").
pub(crate) fn decode_cell(ty: Type, cell: &RawCell) -> Result<CellValue, String> {
    let raw = cell.content.as_str();
    if raw.is_empty() && !cell.was_quoted {
        return Ok(CellValue::Null);
    }
    match ty {
        Type::Text | Type::Date | Type::DateTime | Type::Decimal | Type::ShortId => {
            Ok(CellValue::Text(raw.to_owned()))
        }
        Type::Int | Type::Serial => match raw.parse::<i64>() {
            Ok(n) => Ok(CellValue::Integer(n)),
            Err(_) => Err(format!("expected an integer, got `{raw}`")),
        },
        Type::Real => match raw.parse::<f64>() {
            Ok(f) => Ok(CellValue::Real(f)),
            Err(_) => Err(format!("expected a real number, got `{raw}`")),
        },
        Type::Bool => match raw {
            "true" => Ok(CellValue::Integer(1)),
            "false" => Ok(CellValue::Integer(0)),
            other => Err(format!("expected `true` or `false`, got `{other}`")),
        },
        Type::Blob => match base64::engine::general_purpose::STANDARD.decode(raw.as_bytes()) {
            Ok(bytes) => Ok(CellValue::Blob(bytes)),
            Err(e) => Err(format!("invalid base64 blob: {e}")),
        },
    }
}

// Unit tests for the CSV writer (`serialize_table`), the
// tokenizer (`parse_csv`) and the per-type decoder
// (`decode_cell`), including writer -> reader round trips.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::persistence::ColumnSchema;

    /// Shorthand for a non-unique column of the given type.
    fn col(name: &str, ty: Type) -> ColumnSchema {
        ColumnSchema { name: name.to_string(), user_type: ty, unique: false }
    }

    // A table with no rows still produces the header record.
    #[test]
    fn empty_table_emits_header_only() {
        let body = serialize_table(&TableSnapshot {
            name: "Customers".to_string(),
            columns: vec![col("id", Type::Serial), col("Name", Type::Text)],
            rows: vec![],
        })
        .unwrap();
        assert_eq!(String::from_utf8(body).unwrap(), "id,Name\n");
    }

    // NULL in a single-column table is an empty line.
    #[test]
    fn null_is_empty_unquoted_field() {
        let body = serialize_table(&TableSnapshot {
            name: "T".to_string(),
            columns: vec![col("Name", Type::Text)],
            rows: vec![vec![CellValue::Null]],
        })
        .unwrap();
        assert_eq!(String::from_utf8(body).unwrap(), "Name\n\n");
    }

    // The non-null empty string is written as `""` so it stays
    // distinguishable from NULL.
    #[test]
    fn empty_string_is_double_quoted() {
        let body = serialize_table(&TableSnapshot {
            name: "T".to_string(),
            columns: vec![col("Name", Type::Text)],
            rows: vec![vec![CellValue::Text(String::new())]],
        })
        .unwrap();
        assert_eq!(String::from_utf8(body).unwrap(), "Name\n\"\"\n");
    }

    // Separators and quotes inside text force quoting, with
    // embedded quotes doubled.
    #[test]
    fn text_with_comma_or_quote_is_rfc4180_quoted() {
        let body = serialize_table(&TableSnapshot {
            name: "T".to_string(),
            columns: vec![col("Name", Type::Text)],
            rows: vec![
                vec![CellValue::Text("hello, world".to_string())],
                vec![CellValue::Text("she said \"hi\"".to_string())],
            ],
        })
        .unwrap();
        let s = String::from_utf8(body).unwrap();
        assert!(s.contains("\"hello, world\""));
        assert!(s.contains("\"she said \"\"hi\"\"\""));
    }

    // Only the writer side is checked here: integers print as
    // decimal digits and 0.0 prints as `0`.
    #[test]
    fn ints_and_reals_round_trip_simply() {
        let body = serialize_table(&TableSnapshot {
            name: "T".to_string(),
            columns: vec![col("n", Type::Int), col("r", Type::Real)],
            rows: vec![
                vec![CellValue::Integer(42), CellValue::Real(std::f64::consts::PI)],
                vec![CellValue::Integer(-7), CellValue::Real(0.0)],
            ],
        })
        .unwrap();
        let s = String::from_utf8(body).unwrap();
        let lines: Vec<&str> = s.trim_end().lines().collect();
        assert_eq!(lines[0], "n,r");
        assert!(lines[1].starts_with("42,"));
        assert_eq!(lines[2], "-7,0");
    }

    // Booleans are stored as 0/1 but written as `false`/`true`.
    #[test]
    fn bools_use_words_not_digits() {
        let body = serialize_table(&TableSnapshot {
            name: "T".to_string(),
            columns: vec![col("b", Type::Bool)],
            rows: vec![
                vec![CellValue::Integer(1)],
                vec![CellValue::Integer(0)],
            ],
        })
        .unwrap();
        let s = String::from_utf8(body).unwrap();
        assert_eq!(s, "b\ntrue\nfalse\n");
    }

    // `aGVsbG8=` is the standard base64 encoding of `hello`.
    #[test]
    fn blobs_use_base64() {
        let body = serialize_table(&TableSnapshot {
            name: "T".to_string(),
            columns: vec![col("blob", Type::Blob)],
            rows: vec![vec![CellValue::Blob(b"hello".to_vec())]],
        })
        .unwrap();
        let s = String::from_utf8(body).unwrap();
        assert!(s.contains("aGVsbG8="));
    }

    // Date and datetime text is written verbatim.
    #[test]
    fn dates_and_datetimes_pass_through() {
        let body = serialize_table(&TableSnapshot {
            name: "T".to_string(),
            columns: vec![col("d", Type::Date), col("ts", Type::DateTime)],
            rows: vec![vec![
                CellValue::Text("2026-05-07".to_string()),
                CellValue::Text("2026-05-07T14:30:12Z".to_string()),
            ]],
        })
        .unwrap();
        let s = String::from_utf8(body).unwrap();
        assert!(s.contains("2026-05-07,2026-05-07T14:30:12Z"));
    }

    // Writer output fed straight back into the tokenizer.
    #[test]
    fn parse_round_trips_simple_table() {
        let table = TableSnapshot {
            name: "Customers".to_string(),
            columns: vec![col("id", Type::Serial), col("Name", Type::Text)],
            rows: vec![
                vec![CellValue::Integer(1), CellValue::Text("Alice".to_string())],
                vec![CellValue::Integer(2), CellValue::Text("Bob".to_string())],
            ],
        };
        let body = serialize_table(&table).unwrap();
        let parsed = parse_csv(std::str::from_utf8(&body).unwrap()).unwrap();
        assert_eq!(parsed.header, vec!["id", "Name"]);
        assert_eq!(parsed.rows.len(), 2);
        assert_eq!(parsed.rows[0][0].content, "1");
        assert_eq!(parsed.rows[0][1].content, "Alice");
        assert_eq!(parsed.rows[1][1].content, "Bob");
    }

    #[test]
    fn parse_distinguishes_null_from_empty_string() {
        // Header "Name", then two rows: NULL (empty unquoted)
        // and "" (empty quoted).
        let body = "Name\n\n\"\"\n";
        let parsed = parse_csv(body).unwrap();
        assert_eq!(parsed.rows.len(), 2);
        assert!(!parsed.rows[0][0].was_quoted);
        assert_eq!(parsed.rows[0][0].content, "");
        assert!(parsed.rows[1][0].was_quoted);
        assert_eq!(parsed.rows[1][0].content, "");

        let null = decode_cell(Type::Text, &parsed.rows[0][0]).unwrap();
        let empty = decode_cell(Type::Text, &parsed.rows[1][0]).unwrap();
        assert!(matches!(null, CellValue::Null));
        assert!(matches!(empty, CellValue::Text(s) if s.is_empty()));
    }

    // Quoted fields may contain commas, and `""` reads back as
    // a single quote.
    #[test]
    fn parse_handles_rfc4180_escapes() {
        let body = "Name\n\"hello, world\"\n\"she said \"\"hi\"\"\"\n";
        let parsed = parse_csv(body).unwrap();
        assert_eq!(parsed.rows[0][0].content, "hello, world");
        assert_eq!(parsed.rows[1][0].content, "she said \"hi\"");
    }

    #[test]
    fn parse_decodes_per_type() {
        // Rows match the round-trip produced by serialize_table.
        let table = TableSnapshot {
            name: "T".to_string(),
            columns: vec![
                col("n", Type::Int),
                col("r", Type::Real),
                col("b", Type::Bool),
                col("blob", Type::Blob),
            ],
            rows: vec![vec![
                CellValue::Integer(42),
                CellValue::Real(std::f64::consts::PI),
                CellValue::Integer(1),
                CellValue::Blob(b"hi".to_vec()),
            ]],
        };
        let body = serialize_table(&table).unwrap();
        let parsed = parse_csv(std::str::from_utf8(&body).unwrap()).unwrap();
        let row = &parsed.rows[0];
        assert!(matches!(decode_cell(Type::Int, &row[0]).unwrap(), CellValue::Integer(42)));
        match decode_cell(Type::Real, &row[1]).unwrap() {
            CellValue::Real(f) => assert!((f - std::f64::consts::PI).abs() < 1e-12),
            other => panic!("got {other:?}"),
        }
        assert!(matches!(decode_cell(Type::Bool, &row[2]).unwrap(), CellValue::Integer(1)));
        assert!(matches!(decode_cell(Type::Blob, &row[3]).unwrap(), CellValue::Blob(b) if b == b"hi"));
    }

    // Input that ends inside a quoted field is an error, not a
    // truncated cell.
    #[test]
    fn parse_rejects_unterminated_quotes() {
        let err = parse_csv("Name\n\"oops").expect_err("must error");
        assert!(matches!(err, CsvError::UnterminatedQuote));
    }

    // The decode error names both the expected kind and the
    // offending text.
    #[test]
    fn decode_cell_reports_friendly_error_for_bad_int() {
        let cell = RawCell { content: "abc".to_string(), was_quoted: false };
        let err = decode_cell(Type::Int, &cell).expect_err("must error");
        assert!(err.contains("integer"));
        assert!(err.contains("abc"));
    }

    // A row shorter than the column list is rejected by the
    // writer.
    #[test]
    fn row_width_mismatch_errors() {
        let err = serialize_table(&TableSnapshot {
            name: "T".to_string(),
            columns: vec![col("a", Type::Int), col("b", Type::Int)],
            rows: vec![vec![CellValue::Integer(1)]],
        })
        .unwrap_err();
        assert!(err.contains("row width"));
    }
}