diff --git a/docs/adr/0006-undo-snapshots-and-replay-log.md b/docs/adr/0006-undo-snapshots-and-replay-log.md index ef00724..fcc3465 100644 --- a/docs/adr/0006-undo-snapshots-and-replay-log.md +++ b/docs/adr/0006-undo-snapshots-and-replay-log.md @@ -229,7 +229,22 @@ during implementation, user-confirmed where they extended the design: - **Snapshot-failure policy** (user-confirmed): staging / finalise / discard failures are **non-fatal** (logged) — the real persistence is the durable state and a backup hiccup must not fail the user's - work. Only *restore* failures surface (as `UndoFailed`). + work. Only *restore* failures surface (as `UndoFailed`). A `/runda` + review found that this policy left a **data-loss edge**: a committed + mutation whose snapshot could not be staged added no undo entry and + did not clear the redo stack (clearing was a side effect of + `finalize`), so a later `redo` could silently discard the new work. + Fixed: any committed user mutation (and any batch with ≥1 committed + mutation) now **clears the redo stack even when its snapshot could + not be staged**, via an explicit `SnapshotStore::clear_redo` + (`src/db.rs` `snapshot_then` / `end_batch`). For the realistic + failure (disk full), `clear_redo` — which deletes redo payloads and + rewrites a tiny index — succeeds even when a full backup couldn't. + **Residual edge** (accepted): if the *entire* `.snapshots/` + directory is unwritable (so `clear_redo` itself fails), a stale redo + can survive; but that state means the whole undo subsystem is + broken, which the user would already observe. Regression-tested in + `tests/undo_snapshots.rs::redo_is_cleared_when_new_work_commits_without_a_snapshot`. - **Batch** uses `BeginBatch`/`EndBatch` worker requests; `replay` wraps its loop so a multi-command replay is one undo step, finalised only if a mutation committed. diff --git a/src/db.rs b/src/db.rs index e7b3052..2f4ca02 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1542,11 +1542,17 @@ fn snapshot_then( if committed { batch.dirty = true; } - } else if let (Some(store), Some(st)) = (snap, staged) { - let outcome = if committed { - store.finalize(st).map(|_| ()) - } else { - store.discard(st) + } else if let Some(store) = snap { + let outcome = match staged { + Some(st) if committed => store.finalize(st).map(|_| ()), + Some(st) => store.discard(st), + // No snapshot was staged. If this is a committed user + // mutation (source present → staging FAILED, not just an + // internal op), the redo stack is now stale and must be + // cleared, or a later `redo` would silently discard this + // work. `finalize` would have done this; it didn't run. + None if committed && source.is_some() => store.clear_redo(), + None => Ok(()), }; if let Err(e) = outcome { warn!(error = %e, "undo snapshot bookkeeping failed (command already applied)"); @@ -1579,11 +1585,15 @@ fn end_batch(snap: Option<&SnapshotStore>, batch: &mut BatchState) { warn!("EndBatch with no active batch; ignoring"); return; } - if let (Some(store), Some(st)) = (snap, batch.staged.take()) { - let outcome = if batch.dirty { - store.finalize(st).map(|_| ()) - } else { - store.discard(st) + if let Some(store) = snap { + let outcome = match batch.staged.take() { + Some(st) if batch.dirty => store.finalize(st).map(|_| ()), + Some(st) => store.discard(st), + // Boundary snapshot failed to stage but mutations + // committed: clear the now-stale redo stack (same + // data-loss guard as the per-command path). + None if batch.dirty => store.clear_redo(), + None => Ok(()), }; if let Err(e) = outcome { warn!(error = %e, "batch undo snapshot bookkeeping failed"); diff --git a/src/undo.rs b/src/undo.rs index 2202e4e..58a5fa8 100644 --- a/src/undo.rs +++ b/src/undo.rs @@ -294,6 +294,27 @@ impl SnapshotStore { Ok(Some(entry.meta())) } + /// Clear the redo stack (deleting its payloads) without touching + /// the undo ring. Called when new work commits but its snapshot + /// could not be staged: `finalize` (which normally clears redo) + /// never runs, so the redo entries are left stale — a later + /// `redo` would restore an outdated state and silently discard + /// the new work. Clearing redo here closes that data-loss hole + /// (ADR-0006 Amendment 1; the snapshot-failure policy is + /// non-fatal, so this keeps it *safe*). + pub fn clear_redo(&self) -> Result<()> { + let mut index = self.load_index()?; + if index.redo.is_empty() { + return Ok(()); + } + for r in std::mem::take(&mut index.redo) { + remove_dir_all_if_exists(&self.payload_dir(r.id))?; + } + self.save_index(&index)?; + tracing::debug!("redo stack cleared (new work committed without a snapshot)"); + Ok(()) + } + /// Remove crash leftovers on project open: the `.staging/` dir and /// any payload dir not referenced by the index. pub fn cleanup(&self) -> Result<()> { @@ -721,6 +742,35 @@ mod tests { assert_eq!(payload_dirs, 2, "ring capped at 2 payloads"); } + #[test] + fn clear_redo_drops_redo_without_touching_undo() { + let fx = Fixture::new(); + let store = fx.store(); + // Two entries so one survives in the undo ring after undo. + stage_finalize(&store, &fx.conn, "cmd 1"); + stage_finalize(&store, &fx.conn, "cmd 2"); + let mut conn = fx.conn; + store.undo(&mut conn).unwrap(); // pops cmd 2, redo gets it + assert!(store.peek_redo().unwrap().is_some(), "redo available"); + assert_eq!(store.peek_undo().unwrap().unwrap().command, "cmd 1"); + + store.clear_redo().unwrap(); + assert!(store.peek_redo().unwrap().is_none(), "redo cleared"); + // The remaining undo entry is untouched. + assert_eq!( + store.peek_undo().unwrap().unwrap().command, + "cmd 1", + "undo ring intact" + ); + // The redo payload is gone; only the surviving undo payload remains. + let payload_dirs = fs::read_dir(&store.root) + .unwrap() + .filter_map(std::result::Result::ok) + .filter(|e| e.file_name().to_str().is_some_and(|n| n.parse::().is_ok())) + .count(); + assert_eq!(payload_dirs, 1, "only the surviving undo payload remains"); + } + #[test] fn empty_stacks_return_none() { let fx = Fixture::new(); diff --git a/tests/undo_snapshots.rs b/tests/undo_snapshots.rs index 3abc8e2..b1389e3 100644 --- a/tests/undo_snapshots.rs +++ b/tests/undo_snapshots.rs @@ -389,6 +389,43 @@ fn undo_restores_db_and_csv_consistently() { }); } +#[test] +fn redo_is_cleared_when_new_work_commits_without_a_snapshot() { + // Regression for a /runda finding: with the non-fatal + // snapshot-failure policy, a committed mutation whose snapshot + // can't be staged left the redo stack stale — a later `redo` + // would silently discard the new work. Any committed user work + // must clear redo, even when no snapshot was recorded. + let data = tempdir(); + let (_p, db, path) = open_project(&data, true); + rt().block_on(async { + make_t(&db).await; + dsl_insert(&db, 1, 10).await; + sql_delete(&db, "delete from T where id = 1").await; // → 0 rows + db.undo().await.unwrap(); // redo now holds the delete; → 1 row + assert!(db.peek_redo().await.unwrap().is_some(), "redo populated"); + }); + + // Force the next staging to fail while the rest of the ring stays + // writable: a plain file where the `.staging` dir is expected makes + // `stage` error, but `clear_redo` (index + payload deletes in the + // ring root) still succeeds. + let staging = path.join(".snapshots").join(".staging"); + std::fs::write(&staging, b"block").unwrap(); + + rt().block_on(async { + dsl_insert(&db, 2, 20).await; // commits; snapshot staging fails + assert_eq!(count_t(&db).await, 2, "new work applied"); + assert!( + db.peek_redo().await.unwrap().is_none(), + "stale redo must be cleared when new work commits without a snapshot" + ); + // Redo is now a no-op — it cannot resurrect the discarded state. + assert!(db.redo().await.unwrap().is_none()); + assert_eq!(count_t(&db).await, 2, "new work preserved"); + }); +} + #[test] fn undo_ring_persists_across_reopen() { let data = tempdir();