fix: ADR-0006 — clear redo when new work commits without a snapshot

/runda found silent data loss: with the non-fatal snapshot-failure
policy, a committed mutation whose snapshot couldn't be staged left
the redo stack stale (redo-clear was only a side effect of finalize),
so a later redo silently discarded the new work. Same gap in batches.

- SnapshotStore::clear_redo() drops the redo stack + payloads
- snapshot_then / end_batch call it when committed user work has no
  staged snapshot; for disk-full it succeeds where a full backup
  couldn't (tiny index write + payload deletes)
- unit test + integration regression (forced staging failure)
- ADR-0006 implementation note records the fix + residual edge

1698 passed / 0 failed / 1 ignored; clippy clean.
This commit is contained in:
claude@clouddev1
2026-05-24 21:10:44 +00:00
parent 5442cfc0b9
commit df6aa69155
4 changed files with 123 additions and 11 deletions
+16 -1
View File
@@ -229,7 +229,22 @@ during implementation, user-confirmed where they extended the design:
- **Snapshot-failure policy** (user-confirmed): staging / finalise /
discard failures are **non-fatal** (logged) — the real persistence
is the durable state and a backup hiccup must not fail the user's
work. Only *restore* failures surface (as `UndoFailed`).
work. Only *restore* failures surface (as `UndoFailed`). A `/runda`
review found that this policy left a **data-loss edge**: a committed
mutation whose snapshot could not be staged added no undo entry and
did not clear the redo stack (clearing was a side effect of
`finalize`), so a later `redo` could silently discard the new work.
Fixed: any committed user mutation (and any batch with ≥1 committed
mutation) now **clears the redo stack even when its snapshot could
not be staged**, via an explicit `SnapshotStore::clear_redo`
(`src/db.rs` `snapshot_then` / `end_batch`). For the realistic
failure (disk full), `clear_redo` — which deletes redo payloads and
rewrites a tiny index — succeeds even when a full backup couldn't.
**Residual edge** (accepted): if the *entire* `.snapshots/`
directory is unwritable (so `clear_redo` itself fails), a stale redo
can survive; but that state means the whole undo subsystem is
broken, which the user would already observe. Regression-tested in
`tests/undo_snapshots.rs::redo_is_cleared_when_new_work_commits_without_a_snapshot`.
- **Batch** uses `BeginBatch`/`EndBatch` worker requests; `replay`
wraps its loop so a multi-command replay is one undo step,
finalised only if a mutation committed.
+20 -10
View File
@@ -1542,11 +1542,17 @@ fn snapshot_then<T>(
if committed {
batch.dirty = true;
}
} else if let (Some(store), Some(st)) = (snap, staged) {
let outcome = if committed {
store.finalize(st).map(|_| ())
} else {
store.discard(st)
} else if let Some(store) = snap {
let outcome = match staged {
Some(st) if committed => store.finalize(st).map(|_| ()),
Some(st) => store.discard(st),
// No snapshot was staged. If this is a committed user
// mutation (source present → staging FAILED, not just an
// internal op), the redo stack is now stale and must be
// cleared, or a later `redo` would silently discard this
// work. `finalize` would have done this; it didn't run.
None if committed && source.is_some() => store.clear_redo(),
None => Ok(()),
};
if let Err(e) = outcome {
warn!(error = %e, "undo snapshot bookkeeping failed (command already applied)");
@@ -1579,11 +1585,15 @@ fn end_batch(snap: Option<&SnapshotStore>, batch: &mut BatchState) {
warn!("EndBatch with no active batch; ignoring");
return;
}
if let (Some(store), Some(st)) = (snap, batch.staged.take()) {
let outcome = if batch.dirty {
store.finalize(st).map(|_| ())
} else {
store.discard(st)
if let Some(store) = snap {
let outcome = match batch.staged.take() {
Some(st) if batch.dirty => store.finalize(st).map(|_| ()),
Some(st) => store.discard(st),
// Boundary snapshot failed to stage but mutations
// committed: clear the now-stale redo stack (same
// data-loss guard as the per-command path).
None if batch.dirty => store.clear_redo(),
None => Ok(()),
};
if let Err(e) = outcome {
warn!(error = %e, "batch undo snapshot bookkeeping failed");
+50
View File
@@ -294,6 +294,27 @@ impl SnapshotStore {
Ok(Some(entry.meta()))
}
/// Clear the redo stack (deleting its payloads) without touching
/// the undo ring. Called when new work commits but its snapshot
/// could not be staged: `finalize` (which normally clears redo)
/// never runs, so the redo entries are left stale — a later
/// `redo` would restore an outdated state and silently discard
/// the new work. Clearing redo here closes that data-loss hole
/// (ADR-0006 Amendment 1; the snapshot-failure policy is
/// non-fatal, so this keeps it *safe*).
pub fn clear_redo(&self) -> Result<()> {
let mut index = self.load_index()?;
if index.redo.is_empty() {
return Ok(());
}
for r in std::mem::take(&mut index.redo) {
remove_dir_all_if_exists(&self.payload_dir(r.id))?;
}
self.save_index(&index)?;
tracing::debug!("redo stack cleared (new work committed without a snapshot)");
Ok(())
}
/// Remove crash leftovers on project open: the `.staging/` dir and
/// any payload dir not referenced by the index.
pub fn cleanup(&self) -> Result<()> {
@@ -721,6 +742,35 @@ mod tests {
assert_eq!(payload_dirs, 2, "ring capped at 2 payloads");
}
#[test]
fn clear_redo_drops_redo_without_touching_undo() {
let fx = Fixture::new();
let store = fx.store();
// Two entries so one survives in the undo ring after undo.
stage_finalize(&store, &fx.conn, "cmd 1");
stage_finalize(&store, &fx.conn, "cmd 2");
let mut conn = fx.conn;
store.undo(&mut conn).unwrap(); // pops cmd 2, redo gets it
assert!(store.peek_redo().unwrap().is_some(), "redo available");
assert_eq!(store.peek_undo().unwrap().unwrap().command, "cmd 1");
store.clear_redo().unwrap();
assert!(store.peek_redo().unwrap().is_none(), "redo cleared");
// The remaining undo entry is untouched.
assert_eq!(
store.peek_undo().unwrap().unwrap().command,
"cmd 1",
"undo ring intact"
);
// The redo payload is gone; only the surviving undo payload remains.
let payload_dirs = fs::read_dir(&store.root)
.unwrap()
.filter_map(std::result::Result::ok)
.filter(|e| e.file_name().to_str().is_some_and(|n| n.parse::<u64>().is_ok()))
.count();
assert_eq!(payload_dirs, 1, "only the surviving undo payload remains");
}
#[test]
fn empty_stacks_return_none() {
let fx = Fixture::new();
+37
View File
@@ -389,6 +389,43 @@ fn undo_restores_db_and_csv_consistently() {
});
}
#[test]
fn redo_is_cleared_when_new_work_commits_without_a_snapshot() {
// Regression for a /runda finding: with the non-fatal
// snapshot-failure policy, a committed mutation whose snapshot
// can't be staged left the redo stack stale — a later `redo`
// would silently discard the new work. Any committed user work
// must clear redo, even when no snapshot was recorded.
let data = tempdir();
let (_p, db, path) = open_project(&data, true);
rt().block_on(async {
make_t(&db).await;
dsl_insert(&db, 1, 10).await;
sql_delete(&db, "delete from T where id = 1").await; // → 0 rows
db.undo().await.unwrap(); // redo now holds the delete; → 1 row
assert!(db.peek_redo().await.unwrap().is_some(), "redo populated");
});
// Force the next staging to fail while the rest of the ring stays
// writable: a plain file where the `.staging` dir is expected makes
// `stage` error, but `clear_redo` (index + payload deletes in the
// ring root) still succeeds.
let staging = path.join(".snapshots").join(".staging");
std::fs::write(&staging, b"block").unwrap();
rt().block_on(async {
dsl_insert(&db, 2, 20).await; // commits; snapshot staging fails
assert_eq!(count_t(&db).await, 2, "new work applied");
assert!(
db.peek_redo().await.unwrap().is_none(),
"stale redo must be cleared when new work commits without a snapshot"
);
// Redo is now a no-op — it cannot resurrect the discarded state.
assert!(db.redo().await.unwrap().is_none());
assert_eq!(count_t(&db).await, 2, "new work preserved");
});
}
#[test]
fn undo_ring_persists_across_reopen() {
let data = tempdir();