fix: use incremental diff per iteration instead of cumulative base diff

After each iteration's _commit_iteration, record the new HEAD SHA and use it as the diff anchor for the next iteration. Previously capture_diff always diffed against the initial base commit, causing every iteration to return the same full cumulative diff — reviewers couldn't see what changed between iterations, leading to repeated feedback and stuck FAIL loops. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 10:07:11 +09:00
parent bf64d19123
commit 28efd5bb8f
3 changed files with 42 additions and 41 deletions
--- a/cross_eval/pipeline.py
+++ b/cross_eval/pipeline.py
@@ -62,18 +62,20 @@ def _commit_iteration(
    label: str,
    iteration: int,
    verdict: str | None,
-) -> None:
+) -> str:
    """Intermediate commit after each agentic iteration.
    This resets the diff baseline so the next iteration only captures new changes.
    Returns the new HEAD SHA to use as the base_commit for the next iteration.
    """
-    from cross_eval.worktree import commit_worktree
+    from cross_eval.worktree import commit_worktree, get_current_head
    committed = commit_worktree(
        worktree_path,
        f"cross-eval: {label} v{iteration} ({verdict or 'no-verdict'})",
    )
    if committed:
        logger.debug("  Intermediate commit: v%d (%s)", iteration, verdict)
    return get_current_head(worktree_path)
 def _has_agentic_steps(config: PipelineConfig, steps: list[StepConfig]) -> bool:
@@ -388,7 +390,7 @@ def _run_simple_pipeline(
            # Intermediate commit so next iteration's diff only shows new changes
            if worktree_path is not None:
-                _commit_iteration(worktree_path, config.preset_name, i, verdict)
+                agentic_base_commit = _commit_iteration(worktree_path, config.preset_name, i, verdict)
            iter_result = IterationResult(
                iteration=i,
@@ -588,7 +590,7 @@ def _run_phased_pipeline(
                # Intermediate commit so next iteration's diff only shows new changes
                if worktree_path is not None:
-                    _commit_iteration(
+                    agentic_base_commit = _commit_iteration(
                        worktree_path, f"{config.preset_name}/{phase.name}",
                        global_iter, verdict,
                    )
--- a/cross_eval/worktree.py
+++ b/cross_eval/worktree.py
@@ -101,19 +101,18 @@ def create_worktree(base_cwd: Path, work_dir: Path, branch_name: str) -> tuple[P
 def capture_diff(worktree_path: Path, base_commit: str | None = None) -> str:
-    """Capture all changes made in the worktree as a unified diff.
+    """Capture all changes made in the worktree since ``base_commit``.
-    Includes both tracked modifications, new untracked files, and changes
+    Handles two scenarios:
-    that the agent may have committed on its own.
+    1. Agent left changes uncommitted → ``git add -A && git diff base HEAD``
    2. Agent committed its own changes → HEAD advanced, diff base..HEAD captures them
    Args:
-        base_commit: The commit SHA from when the worktree was created.
+        base_commit: The diff anchor — typically the worktree HEAD *before* this
-                     If provided, diffs against this fixed base instead of HEAD.
+                     iteration started (set by ``get_current_head`` after each
-                     This is critical because agents (e.g. Claude in interactive
+                     ``_commit_iteration``). Falls back to ``HEAD`` if not given.
                     mode) may create their own commits, advancing HEAD and
                     making ``git diff --cached HEAD`` return empty.
    """
-    # Stage any uncommitted changes so they're included in the diff
+    # Stage any uncommitted changes
    subprocess.run(
        ["git", "add", "-A"],
        cwd=worktree_path,
@@ -121,35 +120,33 @@ def capture_diff(worktree_path: Path, base_commit: str | None = None) -> str:
        check=True,
    )
-    if base_commit:
+    # Commit staged changes so everything is reachable via HEAD
-        # Diff everything (committed + staged) against the original base.
+    # (this is a no-op if nothing is staged)
-        # This captures changes regardless of whether the agent committed them.
+    subprocess.run(
-        result = subprocess.run(
+        ["git", "commit", "-m", "cross-eval: capture-diff snapshot", "--allow-empty-message"],
            ["git", "diff", base_commit, "--cached"],
        cwd=worktree_path,
        capture_output=True,
        text=True,
    )
        diff = result.stdout.strip()
        if diff:
            return diff
-        # Also check committed changes (agent may have committed and left
+    ref = base_commit or "HEAD~1"
        # nothing staged)
    result = subprocess.run(
-            ["git", "diff", base_commit, "HEAD"],
+        ["git", "diff", ref, "HEAD"],
        cwd=worktree_path,
        capture_output=True,
        text=True,
    )
    return result.stdout.strip()
-    # Fallback: no base_commit, use original behavior
+
 def get_current_head(worktree_path: Path) -> str:
    """Return the current HEAD SHA of the worktree."""
    result = subprocess.run(
-        ["git", "diff", "--cached", "HEAD"],
+        ["git", "rev-parse", "HEAD"],
        cwd=worktree_path,
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout.strip()
--- a/tests/test_runtime_misc.py
+++ b/tests/test_runtime_misc.py
@@ -413,11 +413,13 @@ class TestInvokeAgenticRuntime(unittest.TestCase):
 class TestPipelineHelpers(unittest.TestCase):
    @patch("cross_eval.worktree.get_current_head", return_value="a" * 40)
    @patch("cross_eval.worktree.commit_worktree", return_value=True)
-    def test_commit_iteration_logs_only_when_committed(self, mock_commit: MagicMock) -> None:
+    def test_commit_iteration_logs_only_when_committed(self, mock_commit: MagicMock, mock_head: MagicMock) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
-            _commit_iteration(Path(tmpdir), "review-fix", 2, "PASS")
+            new_head = _commit_iteration(Path(tmpdir), "review-fix", 2, "PASS")
        mock_commit.assert_called_once()
        self.assertEqual(new_head, "a" * 40)
    def test_snapshot_repo_state_includes_untracked_digest(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir: