fix: ADR-0006 — clear redo when new work commits without a snapshot

/runda found silent data loss: with the non-fatal snapshot-failure policy, a committed mutation whose snapshot couldn't be staged left the redo stack stale (redo-clear was only a side effect of finalize), so a later redo silently discarded the new work. Same gap in batches. - SnapshotStore::clear_redo() drops the redo stack + payloads - snapshot_then / end_batch call it when committed user work has no staged snapshot; for disk-full it succeeds where a full backup couldn't (tiny index write + payload deletes) - unit test + integration regression (forced staging failure) - ADR-0006 implementation note records the fix + residual edge 1698 passed / 0 failed / 1 ignored; clippy clean.
2026-05-24 21:10:44 +00:00
parent 5442cfc0b9
commit df6aa69155
4 changed files with 123 additions and 11 deletions
@@ -229,7 +229,22 @@ during implementation, user-confirmed where they extended the design:
 - **Snapshot-failure policy** (user-confirmed): staging / finalise /
  discard failures are **non-fatal** (logged) — the real persistence
  is the durable state and a backup hiccup must not fail the user's
-  work. Only *restore* failures surface (as `UndoFailed`).
+  work. Only *restore* failures surface (as `UndoFailed`). A `/runda`
+  review found that this policy left a **data-loss edge**: a committed
+  mutation whose snapshot could not be staged added no undo entry and
+  did not clear the redo stack (clearing was a side effect of
+  `finalize`), so a later `redo` could silently discard the new work.
+  Fixed: any committed user mutation (and any batch with ≥1 committed
+  mutation) now **clears the redo stack even when its snapshot could
+  not be staged**, via an explicit `SnapshotStore::clear_redo`
+  (`src/db.rs` `snapshot_then` / `end_batch`). For the realistic
+  failure (disk full), `clear_redo` — which deletes redo payloads and
+  rewrites a tiny index — succeeds even when a full backup couldn't.
+  **Residual edge** (accepted): if the *entire* `.snapshots/`
+  directory is unwritable (so `clear_redo` itself fails), a stale redo
+  can survive; but that state means the whole undo subsystem is
+  broken, which the user would already observe. Regression-tested in
+  `tests/undo_snapshots.rs::redo_is_cleared_when_new_work_commits_without_a_snapshot`.
 - **Batch** uses `BeginBatch`/`EndBatch` worker requests; `replay`
  wraps its loop so a multi-command replay is one undo step,
  finalised only if a mutation committed.
@@ -1542,11 +1542,17 @@ fn snapshot_then<T>(
        if committed {
            batch.dirty = true;
        }
-    } else if let (Some(store), Some(st)) = (snap, staged) {
-        let outcome = if committed {
-            store.finalize(st).map(|_| ())
-        } else {
-            store.discard(st)
+    } else if let Some(store) = snap {
+        let outcome = match staged {
+            Some(st) if committed => store.finalize(st).map(|_| ()),
+            Some(st) => store.discard(st),
+            // No snapshot was staged. If this is a committed user
+            // mutation (source present → staging FAILED, not just an
+            // internal op), the redo stack is now stale and must be
+            // cleared, or a later `redo` would silently discard this
+            // work. `finalize` would have done this; it didn't run.
+            None if committed && source.is_some() => store.clear_redo(),
+            None => Ok(()),
        };
        if let Err(e) = outcome {
            warn!(error = %e, "undo snapshot bookkeeping failed (command already applied)");
@@ -1579,11 +1585,15 @@ fn end_batch(snap: Option<&SnapshotStore>, batch: &mut BatchState) {
        warn!("EndBatch with no active batch; ignoring");
        return;
    }
-    if let (Some(store), Some(st)) = (snap, batch.staged.take()) {
-        let outcome = if batch.dirty {
-            store.finalize(st).map(|_| ())
-        } else {
-            store.discard(st)
+    if let Some(store) = snap {
+        let outcome = match batch.staged.take() {
+            Some(st) if batch.dirty => store.finalize(st).map(|_| ()),
+            Some(st) => store.discard(st),
+            // Boundary snapshot failed to stage but mutations
+            // committed: clear the now-stale redo stack (same
+            // data-loss guard as the per-command path).
+            None if batch.dirty => store.clear_redo(),
+            None => Ok(()),
        };
        if let Err(e) = outcome {
            warn!(error = %e, "batch undo snapshot bookkeeping failed");
@@ -294,6 +294,27 @@ impl SnapshotStore {
        Ok(Some(entry.meta()))
    }

+    /// Clear the redo stack (deleting its payloads) without touching
+    /// the undo ring. Called when new work commits but its snapshot
+    /// could not be staged: `finalize` (which normally clears redo)
+    /// never runs, so the redo entries are left stale — a later
+    /// `redo` would restore an outdated state and silently discard
+    /// the new work. Clearing redo here closes that data-loss hole
+    /// (ADR-0006 Amendment 1; the snapshot-failure policy is
+    /// non-fatal, so this keeps it *safe*).
+    pub fn clear_redo(&self) -> Result<()> {
+        let mut index = self.load_index()?;
+        if index.redo.is_empty() {
+            return Ok(());
+        }
+        for r in std::mem::take(&mut index.redo) {
+            remove_dir_all_if_exists(&self.payload_dir(r.id))?;
+        }
+        self.save_index(&index)?;
+        tracing::debug!("redo stack cleared (new work committed without a snapshot)");
+        Ok(())
+    }
+
    /// Remove crash leftovers on project open: the `.staging/` dir and
    /// any payload dir not referenced by the index.
    pub fn cleanup(&self) -> Result<()> {
@@ -721,6 +742,35 @@ mod tests {
        assert_eq!(payload_dirs, 2, "ring capped at 2 payloads");
    }

+    #[test]
+    fn clear_redo_drops_redo_without_touching_undo() {
+        let fx = Fixture::new();
+        let store = fx.store();
+        // Two entries so one survives in the undo ring after undo.
+        stage_finalize(&store, &fx.conn, "cmd 1");
+        stage_finalize(&store, &fx.conn, "cmd 2");
+        let mut conn = fx.conn;
+        store.undo(&mut conn).unwrap(); // pops cmd 2, redo gets it
+        assert!(store.peek_redo().unwrap().is_some(), "redo available");
+        assert_eq!(store.peek_undo().unwrap().unwrap().command, "cmd 1");
+
+        store.clear_redo().unwrap();
+        assert!(store.peek_redo().unwrap().is_none(), "redo cleared");
+        // The remaining undo entry is untouched.
+        assert_eq!(
+            store.peek_undo().unwrap().unwrap().command,
+            "cmd 1",
+            "undo ring intact"
+        );
+        // The redo payload is gone; only the surviving undo payload remains.
+        let payload_dirs = fs::read_dir(&store.root)
+            .unwrap()
+            .filter_map(std::result::Result::ok)
+            .filter(|e| e.file_name().to_str().is_some_and(|n| n.parse::<u64>().is_ok()))
+            .count();
+        assert_eq!(payload_dirs, 1, "only the surviving undo payload remains");
+    }
+
    #[test]
    fn empty_stacks_return_none() {
        let fx = Fixture::new();
@@ -389,6 +389,43 @@ fn undo_restores_db_and_csv_consistently() {
    });
 }

+#[test]
+fn redo_is_cleared_when_new_work_commits_without_a_snapshot() {
+    // Regression for a /runda finding: with the non-fatal
+    // snapshot-failure policy, a committed mutation whose snapshot
+    // can't be staged left the redo stack stale — a later `redo`
+    // would silently discard the new work. Any committed user work
+    // must clear redo, even when no snapshot was recorded.
+    let data = tempdir();
+    let (_p, db, path) = open_project(&data, true);
+    rt().block_on(async {
+        make_t(&db).await;
+        dsl_insert(&db, 1, 10).await;
+        sql_delete(&db, "delete from T where id = 1").await; // → 0 rows
+        db.undo().await.unwrap(); // redo now holds the delete; → 1 row
+        assert!(db.peek_redo().await.unwrap().is_some(), "redo populated");
+    });
+
+    // Force the next staging to fail while the rest of the ring stays
+    // writable: a plain file where the `.staging` dir is expected makes
+    // `stage` error, but `clear_redo` (index + payload deletes in the
+    // ring root) still succeeds.
+    let staging = path.join(".snapshots").join(".staging");
+    std::fs::write(&staging, b"block").unwrap();
+
+    rt().block_on(async {
+        dsl_insert(&db, 2, 20).await; // commits; snapshot staging fails
+        assert_eq!(count_t(&db).await, 2, "new work applied");
+        assert!(
+            db.peek_redo().await.unwrap().is_none(),
+            "stale redo must be cleared when new work commits without a snapshot"
+        );
+        // Redo is now a no-op — it cannot resurrect the discarded state.
+        assert!(db.redo().await.unwrap().is_none());
+        assert_eq!(count_t(&db).await, 2, "new work preserved");
+    });
+}
+
 #[test]
 fn undo_ring_persists_across_reopen() {
    let data = tempdir();