fix: ADR-0006 — clear redo when new work commits without a snapshot
/runda found silent data loss: with the non-fatal snapshot-failure policy, a committed mutation whose snapshot couldn't be staged left the redo stack stale (redo-clear was only a side effect of finalize), so a later redo silently discarded the new work. Same gap in batches. - SnapshotStore::clear_redo() drops the redo stack + payloads - snapshot_then / end_batch call it when committed user work has no staged snapshot; for disk-full it succeeds where a full backup couldn't (tiny index write + payload deletes) - unit test + integration regression (forced staging failure) - ADR-0006 implementation note records the fix + residual edge 1698 passed / 0 failed / 1 ignored; clippy clean.
This commit is contained in:
@@ -229,7 +229,22 @@ during implementation, user-confirmed where they extended the design:
|
||||
- **Snapshot-failure policy** (user-confirmed): staging / finalise /
|
||||
discard failures are **non-fatal** (logged) — the real persistence
|
||||
is the durable state and a backup hiccup must not fail the user's
|
||||
work. Only *restore* failures surface (as `UndoFailed`).
|
||||
work. Only *restore* failures surface (as `UndoFailed`). A `/runda`
|
||||
review found that this policy left a **data-loss edge**: a committed
|
||||
mutation whose snapshot could not be staged added no undo entry and
|
||||
did not clear the redo stack (clearing was a side effect of
|
||||
`finalize`), so a later `redo` could silently discard the new work.
|
||||
Fixed: any committed user mutation (and any batch with ≥1 committed
|
||||
mutation) now **clears the redo stack even when its snapshot could
|
||||
not be staged**, via an explicit `SnapshotStore::clear_redo`
|
||||
(`src/db.rs` `snapshot_then` / `end_batch`). For the realistic
|
||||
failure (disk full), `clear_redo` — which deletes redo payloads and
|
||||
rewrites a tiny index — succeeds even when a full backup couldn't.
|
||||
**Residual edge** (accepted): if the *entire* `.snapshots/`
|
||||
directory is unwritable (so `clear_redo` itself fails), a stale redo
|
||||
can survive; but that state means the whole undo subsystem is
|
||||
broken, which the user would already observe. Regression-tested in
|
||||
`tests/undo_snapshots.rs::redo_is_cleared_when_new_work_commits_without_a_snapshot`.
|
||||
- **Batch** uses `BeginBatch`/`EndBatch` worker requests; `replay`
|
||||
wraps its loop so a multi-command replay is one undo step,
|
||||
finalised only if a mutation committed.
|
||||
|
||||
@@ -1542,11 +1542,17 @@ fn snapshot_then<T>(
|
||||
if committed {
|
||||
batch.dirty = true;
|
||||
}
|
||||
} else if let (Some(store), Some(st)) = (snap, staged) {
|
||||
let outcome = if committed {
|
||||
store.finalize(st).map(|_| ())
|
||||
} else {
|
||||
store.discard(st)
|
||||
} else if let Some(store) = snap {
|
||||
let outcome = match staged {
|
||||
Some(st) if committed => store.finalize(st).map(|_| ()),
|
||||
Some(st) => store.discard(st),
|
||||
// No snapshot was staged. If this is a committed user
|
||||
// mutation (source present → staging FAILED, not just an
|
||||
// internal op), the redo stack is now stale and must be
|
||||
// cleared, or a later `redo` would silently discard this
|
||||
// work. `finalize` would have done this; it didn't run.
|
||||
None if committed && source.is_some() => store.clear_redo(),
|
||||
None => Ok(()),
|
||||
};
|
||||
if let Err(e) = outcome {
|
||||
warn!(error = %e, "undo snapshot bookkeeping failed (command already applied)");
|
||||
@@ -1579,11 +1585,15 @@ fn end_batch(snap: Option<&SnapshotStore>, batch: &mut BatchState) {
|
||||
warn!("EndBatch with no active batch; ignoring");
|
||||
return;
|
||||
}
|
||||
if let (Some(store), Some(st)) = (snap, batch.staged.take()) {
|
||||
let outcome = if batch.dirty {
|
||||
store.finalize(st).map(|_| ())
|
||||
} else {
|
||||
store.discard(st)
|
||||
if let Some(store) = snap {
|
||||
let outcome = match batch.staged.take() {
|
||||
Some(st) if batch.dirty => store.finalize(st).map(|_| ()),
|
||||
Some(st) => store.discard(st),
|
||||
// Boundary snapshot failed to stage but mutations
|
||||
// committed: clear the now-stale redo stack (same
|
||||
// data-loss guard as the per-command path).
|
||||
None if batch.dirty => store.clear_redo(),
|
||||
None => Ok(()),
|
||||
};
|
||||
if let Err(e) = outcome {
|
||||
warn!(error = %e, "batch undo snapshot bookkeeping failed");
|
||||
|
||||
+50
@@ -294,6 +294,27 @@ impl SnapshotStore {
|
||||
Ok(Some(entry.meta()))
|
||||
}
|
||||
|
||||
/// Clear the redo stack (deleting its payloads) without touching
|
||||
/// the undo ring. Called when new work commits but its snapshot
|
||||
/// could not be staged: `finalize` (which normally clears redo)
|
||||
/// never runs, so the redo entries are left stale — a later
|
||||
/// `redo` would restore an outdated state and silently discard
|
||||
/// the new work. Clearing redo here closes that data-loss hole
|
||||
/// (ADR-0006 Amendment 1; the snapshot-failure policy is
|
||||
/// non-fatal, so this keeps it *safe*).
|
||||
pub fn clear_redo(&self) -> Result<()> {
|
||||
let mut index = self.load_index()?;
|
||||
if index.redo.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
for r in std::mem::take(&mut index.redo) {
|
||||
remove_dir_all_if_exists(&self.payload_dir(r.id))?;
|
||||
}
|
||||
self.save_index(&index)?;
|
||||
tracing::debug!("redo stack cleared (new work committed without a snapshot)");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remove crash leftovers on project open: the `.staging/` dir and
|
||||
/// any payload dir not referenced by the index.
|
||||
pub fn cleanup(&self) -> Result<()> {
|
||||
@@ -721,6 +742,35 @@ mod tests {
|
||||
assert_eq!(payload_dirs, 2, "ring capped at 2 payloads");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn clear_redo_drops_redo_without_touching_undo() {
|
||||
let fx = Fixture::new();
|
||||
let store = fx.store();
|
||||
// Two entries so one survives in the undo ring after undo.
|
||||
stage_finalize(&store, &fx.conn, "cmd 1");
|
||||
stage_finalize(&store, &fx.conn, "cmd 2");
|
||||
let mut conn = fx.conn;
|
||||
store.undo(&mut conn).unwrap(); // pops cmd 2, redo gets it
|
||||
assert!(store.peek_redo().unwrap().is_some(), "redo available");
|
||||
assert_eq!(store.peek_undo().unwrap().unwrap().command, "cmd 1");
|
||||
|
||||
store.clear_redo().unwrap();
|
||||
assert!(store.peek_redo().unwrap().is_none(), "redo cleared");
|
||||
// The remaining undo entry is untouched.
|
||||
assert_eq!(
|
||||
store.peek_undo().unwrap().unwrap().command,
|
||||
"cmd 1",
|
||||
"undo ring intact"
|
||||
);
|
||||
// The redo payload is gone; only the surviving undo payload remains.
|
||||
let payload_dirs = fs::read_dir(&store.root)
|
||||
.unwrap()
|
||||
.filter_map(std::result::Result::ok)
|
||||
.filter(|e| e.file_name().to_str().is_some_and(|n| n.parse::<u64>().is_ok()))
|
||||
.count();
|
||||
assert_eq!(payload_dirs, 1, "only the surviving undo payload remains");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_stacks_return_none() {
|
||||
let fx = Fixture::new();
|
||||
|
||||
@@ -389,6 +389,43 @@ fn undo_restores_db_and_csv_consistently() {
|
||||
});
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn redo_is_cleared_when_new_work_commits_without_a_snapshot() {
|
||||
// Regression for a /runda finding: with the non-fatal
|
||||
// snapshot-failure policy, a committed mutation whose snapshot
|
||||
// can't be staged left the redo stack stale — a later `redo`
|
||||
// would silently discard the new work. Any committed user work
|
||||
// must clear redo, even when no snapshot was recorded.
|
||||
let data = tempdir();
|
||||
let (_p, db, path) = open_project(&data, true);
|
||||
rt().block_on(async {
|
||||
make_t(&db).await;
|
||||
dsl_insert(&db, 1, 10).await;
|
||||
sql_delete(&db, "delete from T where id = 1").await; // → 0 rows
|
||||
db.undo().await.unwrap(); // redo now holds the delete; → 1 row
|
||||
assert!(db.peek_redo().await.unwrap().is_some(), "redo populated");
|
||||
});
|
||||
|
||||
// Force the next staging to fail while the rest of the ring stays
|
||||
// writable: a plain file where the `.staging` dir is expected makes
|
||||
// `stage` error, but `clear_redo` (index + payload deletes in the
|
||||
// ring root) still succeeds.
|
||||
let staging = path.join(".snapshots").join(".staging");
|
||||
std::fs::write(&staging, b"block").unwrap();
|
||||
|
||||
rt().block_on(async {
|
||||
dsl_insert(&db, 2, 20).await; // commits; snapshot staging fails
|
||||
assert_eq!(count_t(&db).await, 2, "new work applied");
|
||||
assert!(
|
||||
db.peek_redo().await.unwrap().is_none(),
|
||||
"stale redo must be cleared when new work commits without a snapshot"
|
||||
);
|
||||
// Redo is now a no-op — it cannot resurrect the discarded state.
|
||||
assert!(db.redo().await.unwrap().is_none());
|
||||
assert_eq!(count_t(&db).await, 2, "new work preserved");
|
||||
});
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn undo_ring_persists_across_reopen() {
|
||||
let data = tempdir();
|
||||
|
||||
Reference in New Issue
Block a user