fix: capture_diff uses base commit to handle agent self-commits

Claude in agentic mode (interactive, no -p flag) commits its own changes, which advances HEAD. As a result, `git diff --cached HEAD` returned an empty diff and every such run failed with a false EMPTY_DIFF error.

capture_diff now diffs against the base commit SHA recorded at worktree creation, so changes are captured whether or not the agent committed them.

Also adds UX_IMPROVEMENT_PLAN.md for guided message improvements.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 23:59:53 +09:00
parent af05fc1ddb
commit 60c7b07939
6 changed files with 281 additions and 28 deletions
--- a/cross_eval/agent.py
+++ b/cross_eval/agent.py
@@ -414,6 +414,7 @@ def invoke_agent_agentic(
    env: Optional[dict[str, str]] = None,
    timeout: int | None = None,
    quiet: bool = False,
+    base_commit: str | None = None,
 ) -> AgentResult:
    """Invoke an agent in agentic mode using the worktree as the source of truth."""
    from cross_eval.worktree import capture_diff
@@ -506,8 +507,8 @@ def invoke_agent_agentic(
            suggested_action=suggested_action,
        )

-    # Capture git diff as the output (changes since last commit on the branch)
-    diff_output = capture_diff(worktree_path)
+    # Capture git diff as the output (changes since the base commit)
+    diff_output = capture_diff(worktree_path, base_commit=base_commit)

    if not diff_output:
        stdout_excerpt = (result.stdout or "").strip()
--- a/cross_eval/pipeline.py
+++ b/cross_eval/pipeline.py
@@ -84,24 +84,25 @@ def _has_agentic_steps(config: PipelineConfig, steps: list[StepConfig]) -> bool:
    )


-def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, str]:
+def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, str, str]:
    """Create a shared worktree for the entire pipeline run.

    1. Generate branch name (cross-eval/<preset>_<timestamp>)
    2. Create branch from HEAD
    3. Create worktree on that branch

-    Returns (worktree_path, branch_name).
+    Returns (worktree_path, branch_name, base_commit).
    """
    from cross_eval.worktree import create_worktree, make_branch_name, make_worktree_dir
    branch_name = make_branch_name(preset_name)
    worktree_dir = make_worktree_dir(cwd, branch_name)
-    worktree_path = create_worktree(
+    worktree_path, base_commit = create_worktree(
        base_cwd=cwd, work_dir=worktree_dir, branch_name=branch_name,
    )
    (run_dir / "worktree_path.txt").write_text(f"{worktree_path}\n", encoding="utf-8")
    (run_dir / "worktree_branch.txt").write_text(f"{branch_name}\n", encoding="utf-8")
-    return worktree_path, branch_name
+    (run_dir / "worktree_base.txt").write_text(f"{base_commit}\n", encoding="utf-8")
+    return worktree_path, branch_name, base_commit


 def _copy_inputs_to_worktree(
@@ -321,10 +322,11 @@ def _run_simple_pipeline(
    # Setup shared worktree for agentic mode
    worktree_path: Path | None = None
    agentic_branch_name: str | None = None
+    agentic_base_commit: str | None = None
    base_repo_state: dict[str, str] | None = None
    base_repo_status: str | None = None
    if not dry_run and _has_agentic_steps(config, config.pipeline):
-        worktree_path, agentic_branch_name = _setup_worktree(
+        worktree_path, agentic_branch_name, agentic_base_commit = _setup_worktree(
            cwd, run_dir, config.preset_name,
        )
        _copy_inputs_to_worktree(config, worktree_path)
@@ -360,6 +362,7 @@ def _run_simple_pipeline(
                runtime_env=runtime_env,
                base_repo_state=base_repo_state,
                base_repo_status=base_repo_status,
+                base_commit=agentic_base_commit,
            )

            # Intermediate commit so next iteration's diff only shows new changes
@@ -498,10 +501,11 @@ def _run_phased_pipeline(
    all_phase_steps = [s for p in config.phases for s in p.steps]
    worktree_path: Path | None = None
    agentic_branch_name: str | None = None
+    agentic_base_commit: str | None = None
    base_repo_state: dict[str, str] | None = None
    base_repo_status: str | None = None
    if not dry_run and _has_agentic_steps(config, all_phase_steps):
-        worktree_path, agentic_branch_name = _setup_worktree(
+        worktree_path, agentic_branch_name, agentic_base_commit = _setup_worktree(
            cwd, run_dir, config.preset_name,
        )
        _copy_inputs_to_worktree(config, worktree_path)
@@ -558,6 +562,7 @@ def _run_phased_pipeline(
                    runtime_env=runtime_env,
                    base_repo_state=base_repo_state,
                    base_repo_status=base_repo_status,
+                    base_commit=agentic_base_commit,
                )

                # Intermediate commit so next iteration's diff only shows new changes
@@ -903,6 +908,7 @@ def _run_steps(
    runtime_env: dict[str, str] | None = None,
    base_repo_state: dict[str, str] | None = None,
    base_repo_status: str | None = None,
+    base_commit: str | None = None,
 ) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
    """Execute all steps in one iteration, parallelizing where possible."""
    step_outputs: dict[str, str] = {}
@@ -923,6 +929,7 @@ def _run_steps(
                runtime_env=runtime_env,
                base_repo_state=base_repo_state,
                base_repo_status=base_repo_status,
+                base_commit=base_commit,
            )
        else:
            _execute_parallel_batch(
@@ -934,6 +941,7 @@ def _run_steps(
                runtime_env=runtime_env,
                base_repo_state=base_repo_state,
                base_repo_status=base_repo_status,
+                base_commit=base_commit,
            )

    # Extract verdict from all verdict steps (ALL must PASS; ESCALATE wins over all)
@@ -961,6 +969,7 @@ def _invoke_agentic(
    env: dict[str, str] | None = None,
    timeout: int | None = None,
    quiet: bool = False,
+    base_commit: str | None = None,
 ) -> AgentResult:
    """Run an agent in agentic mode using an existing worktree."""
    return invoke_agent_agentic(
@@ -968,6 +977,7 @@ def _invoke_agentic(
        worktree_path=worktree_path,
        env=env,
        timeout=timeout, quiet=quiet,
+        base_commit=base_commit,
    )


@@ -992,6 +1002,7 @@ def _execute_step(
    runtime_env: dict[str, str] | None = None,
    base_repo_state: dict[str, str] | None = None,
    base_repo_status: str | None = None,
+    base_commit: str | None = None,
 ) -> None:
    """Execute a single step, updating step_outputs and step_results in place."""
    if not quiet:
@@ -1035,6 +1046,7 @@ def _execute_step(
                worktree_path=worktree_path,
                env=runtime_env,
                timeout=timeout, quiet=quiet,
+                base_commit=base_commit,
            )
        else:
            # When worktree exists, run non-agentic agents (reviewers) in
@@ -1125,6 +1137,7 @@ def _execute_parallel_batch(
    runtime_env: dict[str, str] | None = None,
    base_repo_state: dict[str, str] | None = None,
    base_repo_status: str | None = None,
+    base_commit: str | None = None,
 ) -> None:
    """Execute multiple steps in parallel using threads."""
    agent_names = ", ".join(s.agent for s in batch)
@@ -1139,6 +1152,7 @@ def _execute_parallel_batch(
                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
                base_repo_state=base_repo_state,
                base_repo_status=base_repo_status,
+                base_commit=base_commit,
            )
        return

@@ -1161,6 +1175,7 @@ def _execute_parallel_batch(
                phase_name=phase_name, worktree_path=worktree_path,
                base_repo_state=base_repo_state,
                base_repo_status=base_repo_status,
+                base_commit=base_commit,
            )
        return

@@ -1204,6 +1219,7 @@ def _execute_parallel_batch(
                worktree_path=worktree_path,
                env=runtime_env,
                timeout=timeout, quiet=True,
+                base_commit=base_commit,
            )
        else:
            effective_cwd = worktree_path if worktree_path else cwd
--- a/cross_eval/worktree.py
+++ b/cross_eval/worktree.py
@@ -37,18 +37,31 @@ def make_worktree_dir(base_cwd: Path, branch_name: str) -> Path:
    )


-def create_worktree(base_cwd: Path, work_dir: Path, branch_name: str) -> Path:
+def create_worktree(base_cwd: Path, work_dir: Path, branch_name: str) -> tuple[Path, str]:
    """Create a git worktree on a new branch from HEAD.

    1. Create branch from HEAD
    2. Create worktree checked out to that branch

    The branch lives in the original repo, so it survives worktree removal.
+    Returns (worktree_path, base_commit_sha).
    """
    work_dir = work_dir.resolve()
    if work_dir.exists():
        shutil.rmtree(work_dir)

+    # Record the base commit SHA before creating the branch.
+    # This is the anchor for all diffs — even if the agent makes its own commits,
+    # we always diff against this base to capture the full set of changes.
+    result = subprocess.run(
+        ["git", "rev-parse", "HEAD"],
+        cwd=base_cwd,
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    base_commit = result.stdout.strip()
+
    # Create the branch at HEAD
    try:
        subprocess.run(
@@ -83,15 +96,24 @@ def create_worktree(base_cwd: Path, work_dir: Path, branch_name: str) -> Path:
            f"Failed to create worktree at {work_dir}: {e.stderr.strip()}"
        ) from e

-    logger.debug("Created worktree on branch '%s': %s", branch_name, work_dir)
-    return work_dir
+    logger.debug("Created worktree on branch '%s': %s (base: %s)", branch_name, work_dir, base_commit[:8])
+    return work_dir, base_commit


-def capture_diff(worktree_path: Path) -> str:
+def capture_diff(worktree_path: Path, base_commit: str | None = None) -> str:
    """Capture all changes made in the worktree as a unified diff.

-    Includes both tracked modifications and new untracked files.
+    Includes tracked modifications, new untracked files, and any changes
+    that the agent may have committed on its own.
+
+    Args:
+        base_commit: The commit SHA from when the worktree was created.
+                     If provided, diffs against this fixed base instead of HEAD.
+                     This is critical because agents (e.g. Claude in interactive
+                     mode) may create their own commits, advancing HEAD and
+                     making ``git diff --cached HEAD`` return empty.
    """
+    # Stage any uncommitted changes so they're included in the diff
    subprocess.run(
        ["git", "add", "-A"],
        cwd=worktree_path,
@@ -99,6 +121,30 @@ def capture_diff(worktree_path: Path) -> str:
        check=True,
    )

+    if base_commit:
+        # Diff everything (committed + staged) against the original base.
+        # This captures changes regardless of whether the agent committed them.
+        result = subprocess.run(
+            ["git", "diff", base_commit, "--cached"],
+            cwd=worktree_path,
+            capture_output=True,
+            text=True,
+        )
+        diff = result.stdout.strip()
+        if diff:
+            return diff
+
+        # Also check committed changes (agent may have committed and left
+        # nothing staged)
+        result = subprocess.run(
+            ["git", "diff", base_commit, "HEAD"],
+            cwd=worktree_path,
+            capture_output=True,
+            text=True,
+        )
+        return result.stdout.strip()
+
+    # Fallback: no base_commit, use original behavior
    result = subprocess.run(
        ["git", "diff", "--cached", "HEAD"],
        cwd=worktree_path,