feat: isolate agentic worktrees and surface execution evidence

2026-03-13 22:50:46 +09:00
parent 3fb19e90c0
commit b19d174c98
7 changed files with 758 additions and 14 deletions
--- a/cross_eval/agent.py
+++ b/cross_eval/agent.py
@@ -32,20 +32,33 @@ _NO_CHANGE_ACK_MARKERS = (
 _CHANGE_CLAIM_MARKERS = (
    "summary of all changes made",
    "here's a summary of all changes made",
+    "here is a summary of all changes",
    "implemented",
    "i implemented",
+    "i've implemented",
    "added",
    "i added",
+    "i've added",
    "updated",
    "i updated",
+    "i've updated",
    "modified",
    "i modified",
+    "i've modified",
    "created",
    "i created",
+    "i've created",
    "fixed",
    "i fixed",
+    "i've fixed",
    "completed the changes",
    "finished the changes",
+    "made the following changes",
+    "applied the fix",
+    "changes have been applied",
+    "wrote the code",
+    "refactored",
+    "i refactored",
 )


@@ -134,6 +147,29 @@ def _classify_agent_failure(detail: str) -> tuple[str, str]:
    )


+# Lower-case substrings that, when found in an agent's stderr, are treated as
+# evidence that the agent was blocked from writing files.  They are matched by
+# plain substring containment against lower-cased stderr, so every entry has
+# to be specific enough not to show up in routine diagnostics.
+_WRITE_FAILURE_MARKERS = (
+    "permission denied",
+    "read-only file system",
+    "read only file system",
+    "operation not permitted",
+    "cannot write",
+    "failed to write",
+    "could not write",
+    "unable to write",
+    # A bare "sandbox" would also match harmless banner/config lines such as
+    # "sandbox: workspace-write", so only denial phrasings are listed here.
+    "sandbox denied",
+    "sandbox violation",
+    "sandbox restriction",
+    "blocked by sandbox",
+    "denied by sandbox",
+    "eacces",
+    "erofs",
+)
+
+
+def _has_write_failure_indicators(stderr: str) -> bool:
+    """Return True when *stderr* mentions a known write-failure marker.
+
+    Blank or whitespace-only input never matches.  Otherwise the text is
+    lower-cased and scanned for any entry of ``_WRITE_FAILURE_MARKERS``.
+    """
+    if stderr.strip() == "":
+        return False
+    haystack = stderr.lower()
+    for marker in _WRITE_FAILURE_MARKERS:
+        if marker in haystack:
+            return True
+    return False
+
+
 def _claims_file_changes(output: str) -> bool:
    """Heuristic for agent text that claims code changes were made."""
    normalized = output.lower()
@@ -406,7 +442,8 @@ def invoke_agent_agentic(
        # (avoids OS arg length limits for large prompts)
        cmd.append(
            f"Read the task file at {task_file} and execute all instructions in it. "
-            f"Work in the current directory."
+            f"Work only inside the current directory and do not modify files "
+            f"outside it."
        )

    cmd_preview = " ".join(cmd[:6])
@@ -467,7 +504,14 @@ def invoke_agent_agentic(
    if not diff_output:
        stdout_excerpt = (result.stdout or "").strip()
        stderr_excerpt = (result.stderr or "").strip()
-        if _claims_file_changes(stdout_excerpt):
+
+        # Detect two failure modes:
+        # 1. Agent claims changes in stdout but produced no diff
+        # 2. Agent stderr contains permission or write-failure indicators
+        claims_changes = _claims_file_changes(stdout_excerpt)
+        has_write_failure = _has_write_failure_indicators(stderr_excerpt)
+
+        if claims_changes or has_write_failure:
            if spinner:
                spinner.stop(f"[{step_name}] FAILED (empty diff)")
            raw_error = stdout_excerpt or "(stdout empty)"
@@ -475,16 +519,27 @@ def invoke_agent_agentic(
                raw_error = f"{raw_error}\n\n[stderr]\n{stderr_excerpt}"
            if len(raw_error) > 2000:
                raw_error = raw_error[:2000] + "..."
+
+            if has_write_failure:
+                failure_type = "WRITE_FAILURE"
+                suggested_action = (
+                    "Agent encountered file write errors (permission denied, read-only, "
+                    "or sandbox restriction). Check agent permissions and worktree state."
+                )
+            else:
+                failure_type = "EMPTY_DIFF"
+                suggested_action = (
+                    "Agent reported code changes but produced no git diff. "
+                    "Treat this run as failed and require a real worktree diff before continuing."
+                )
+
            raise AgentInvocationError(
                agent_name=agent.name,
                step_name=step_name,
                cmd_preview=cmd_preview,
                raw_error=raw_error,
-                failure_type="EMPTY_DIFF",
-                suggested_action=(
-                    "Agent reported code changes but produced no git diff. "
-                    "Treat this run as failed and require a real worktree diff before continuing."
-                ),
+                failure_type=failure_type,
+                suggested_action=suggested_action,
            )

        diff_output = "(no changes)"
--- a/cross_eval/pipeline.py
+++ b/cross_eval/pipeline.py
@@ -6,6 +6,7 @@ import os
 import re
 import subprocess
 import time
+from hashlib import sha256
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 from pathlib import Path
@@ -92,15 +93,110 @@ def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, s

    Returns (worktree_path, branch_name).
    """
-    from cross_eval.worktree import create_worktree, make_branch_name
+    from cross_eval.worktree import create_worktree, make_branch_name, make_worktree_dir
    branch_name = make_branch_name(preset_name)
-    worktree_dir = run_dir / "work"
+    worktree_dir = make_worktree_dir(cwd, branch_name)
    worktree_path = create_worktree(
        base_cwd=cwd, work_dir=worktree_dir, branch_name=branch_name,
    )
+    (run_dir / "worktree_path.txt").write_text(f"{worktree_path}\n", encoding="utf-8")
+    (run_dir / "worktree_branch.txt").write_text(f"{branch_name}\n", encoding="utf-8")
    return worktree_path, branch_name


+def _snapshot_repo_state(cwd: Path) -> str:
+    """Capture a comparable fingerprint of the base repository working tree.
+
+    Two fingerprints taken at different times are compared for equality to
+    detect agentic runs that accidentally modify the original checkout instead
+    of the isolated worktree.
+
+    The fingerprint concatenates the output of ``git status --short``, the
+    diff against ``HEAD``, the staged diff, and one ``UNTRACKED <path> ...``
+    line per untracked, non-ignored path (with a SHA-256 of the content for
+    regular files).
+
+    Args:
+        cwd: Directory of the base repository in which git is run.
+
+    Returns:
+        The fingerprint text, or ``""`` when ``git status`` exits non-zero.
+    """
+
+    def _git_text(*args: str) -> subprocess.CompletedProcess:
+        # Diff output can carry bytes that are not valid in the locale
+        # encoding (e.g. a Latin-1 source file).  Decode leniently so taking a
+        # snapshot never raises UnicodeDecodeError; the replacement is
+        # deterministic, so equality comparison between snapshots still works.
+        return subprocess.run(
+            ["git", *args],
+            cwd=cwd,
+            capture_output=True,
+            text=True,
+            errors="replace",
+        )
+
+    status = _git_text("status", "--short", "--untracked-files=all")
+    if status.returncode != 0:
+        return ""
+
+    diff = _git_text("diff", "--no-ext-diff", "--binary", "HEAD")
+    cached_diff = _git_text("diff", "--no-ext-diff", "--binary", "--cached")
+    # Paths are NUL-separated (-z) and read as bytes so unusual file names
+    # cannot be mangled by quoting or decoding before they are split.
+    untracked = subprocess.run(
+        ["git", "ls-files", "--others", "--exclude-standard", "-z"],
+        cwd=cwd,
+        capture_output=True,
+    )
+
+    parts = [
+        status.stdout,
+        diff.stdout,
+        cached_diff.stdout,
+    ]
+
+    if untracked.returncode == 0 and untracked.stdout:
+        for rel_path in untracked.stdout.decode("utf-8", errors="replace").split("\0"):
+            if not rel_path:
+                continue
+            file_path = cwd / rel_path
+            if not file_path.is_file():
+                parts.append(f"UNTRACKED {rel_path} (non-file)")
+                continue
+            try:
+                digest = sha256(file_path.read_bytes()).hexdigest()
+            except OSError:
+                # The file disappeared or cannot be read between listing and
+                # hashing; record that fact instead of aborting the snapshot.
+                parts.append(f"UNTRACKED {rel_path} (unreadable)")
+                continue
+            parts.append(f"UNTRACKED {rel_path} {digest}")
+
+    return "\n".join(parts)
+
+
+def _snapshot_repo_status(cwd: Path) -> str:
+    """Return ``git status --short`` for *cwd*, for use in error messages.
+
+    Untracked files are listed individually.  The output is stripped of
+    surrounding whitespace; an empty string is returned when git exits with a
+    non-zero status.
+    """
+    proc = subprocess.run(
+        ["git", "status", "--short", "--untracked-files=all"],
+        cwd=cwd,
+        capture_output=True,
+        text=True,
+    )
+    return proc.stdout.strip() if proc.returncode == 0 else ""
+
+
+def _assert_base_repo_isolation(
+    cwd: Path,
+    baseline_state: str,
+    *,
+    step_name: str,
+    agent_name: str,
+    worktree_path: Path,
+    baseline_status: str,
+) -> None:
+    """Raise if the base repository no longer matches its baseline snapshot.
+
+    A fresh ``_snapshot_repo_state(cwd)`` is compared with *baseline_state*.
+    When they are equal the function returns silently.  Otherwise a
+    ``WorktreeError`` is raised whose message names the step, the agent and
+    the worktree, and shows the baseline and current ``git status`` summaries
+    (``(clean)`` stands in for an empty summary).
+    """
+    if _snapshot_repo_state(cwd) == baseline_state:
+        return
+
+    after = _snapshot_repo_status(cwd) or "(clean)"
+    before = baseline_status or "(clean)"
+    message = (
+        "Agent modified the base repository instead of the isolated worktree.\n\n"
+        f"Step: {step_name}\n"
+        f"Agent: {agent_name}\n"
+        f"Worktree: {worktree_path}\n\n"
+        f"Baseline status:\n{before}\n\n"
+        f"Current status:\n{after}"
+    )
+    raise WorktreeError(message)
+
+
 def _finalize_worktree(
    cwd: Path,
    worktree_path: Path,
@@ -172,10 +268,14 @@ def _run_simple_pipeline(
    # Setup shared worktree for agentic mode
    worktree_path: Path | None = None
    agentic_branch_name: str | None = None
+    base_repo_state: str | None = None
+    base_repo_status: str | None = None
    if not dry_run and _has_agentic_steps(config, config.pipeline):
        worktree_path, agentic_branch_name = _setup_worktree(
            cwd, run_dir, config.preset_name,
        )
+        base_repo_state = _snapshot_repo_state(cwd)
+        base_repo_status = _snapshot_repo_status(cwd)

    feedback = "(no feedback — first iteration)"
    iterations: list[IterationResult] = []
@@ -203,6 +303,8 @@ def _run_simple_pipeline(
                run_dir=run_dir, output_iter=i,
                worktree_path=worktree_path,
                runtime_env=runtime_env,
+                base_repo_state=base_repo_state,
+                base_repo_status=base_repo_status,
            )

            # Intermediate commit so next iteration's diff only shows new changes
@@ -332,10 +434,14 @@ def _run_phased_pipeline(
    all_phase_steps = [s for p in config.phases for s in p.steps]
    worktree_path: Path | None = None
    agentic_branch_name: str | None = None
+    base_repo_state: str | None = None
+    base_repo_status: str | None = None
    if not dry_run and _has_agentic_steps(config, all_phase_steps):
        worktree_path, agentic_branch_name = _setup_worktree(
            cwd, run_dir, config.preset_name,
        )
+        base_repo_state = _snapshot_repo_state(cwd)
+        base_repo_status = _snapshot_repo_status(cwd)

    iterations: list[IterationResult] = []
    feedback = "(no feedback — first iteration)"
@@ -384,6 +490,8 @@ def _run_phased_pipeline(
                    run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
                    worktree_path=worktree_path,
                    runtime_env=runtime_env,
+                    base_repo_state=base_repo_state,
+                    base_repo_status=base_repo_status,
                )

                # Intermediate commit so next iteration's diff only shows new changes
@@ -626,6 +734,8 @@ def _run_steps(
    phase_name: str | None = None,
    worktree_path: Path | None = None,
    runtime_env: dict[str, str] | None = None,
+    base_repo_state: str | None = None,
+    base_repo_status: str | None = None,
 ) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
    """Execute all steps in one iteration, parallelizing where possible."""
    step_outputs: dict[str, str] = {}
@@ -644,6 +754,8 @@ def _run_steps(
                run_dir=run_dir, output_iter=output_iter,
                phase_name=phase_name, worktree_path=worktree_path,
                runtime_env=runtime_env,
+                base_repo_state=base_repo_state,
+                base_repo_status=base_repo_status,
            )
        else:
            _execute_parallel_batch(
@@ -653,6 +765,8 @@ def _run_steps(
                run_dir=run_dir, output_iter=output_iter,
                phase_name=phase_name, worktree_path=worktree_path,
                runtime_env=runtime_env,
+                base_repo_state=base_repo_state,
+                base_repo_status=base_repo_status,
            )

    # Extract verdict from all verdict steps (ALL must PASS; ESCALATE wins over all)
@@ -709,6 +823,8 @@ def _execute_step(
    quiet: bool = False,
    worktree_path: Path | None = None,
    runtime_env: dict[str, str] | None = None,
+    base_repo_state: str | None = None,
+    base_repo_status: str | None = None,
 ) -> None:
    """Execute a single step, updating step_outputs and step_results in place."""
    if not quiet:
@@ -717,9 +833,10 @@ def _execute_step(
    # 1. Resolve template
    template = resolve_template(step.prompt_template)

-    # 2. Build context
+    # 2. Build context (include prior step results for evidence)
    context = _build_context(
        input_contents, step_outputs, feedback, iteration, max_iterations,
+        step_results=step_results,
    )

    # 3. Apply context overrides
@@ -794,6 +911,16 @@ def _execute_step(
        raise

    # 7. Store output
+    if worktree_path is not None and base_repo_state is not None:
+        _assert_base_repo_isolation(
+            cwd,
+            base_repo_state,
+            step_name=step.name,
+            agent_name=step.agent,
+            worktree_path=worktree_path,
+            baseline_status=base_repo_status or "",
+        )
+
    step_outputs[step.output_key] = result.output
    step_results[step.output_key] = result

@@ -826,6 +953,8 @@ def _execute_parallel_batch(
    phase_name: str | None = None,
    worktree_path: Path | None = None,
    runtime_env: dict[str, str] | None = None,
+    base_repo_state: str | None = None,
+    base_repo_status: str | None = None,
 ) -> None:
    """Execute multiple steps in parallel using threads."""
    agent_names = ", ".join(s.agent for s in batch)
@@ -838,6 +967,8 @@ def _execute_parallel_batch(
                iteration, max_iterations, cwd, timeout, dry_run,
                step_outputs, step_results,
                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
+                base_repo_state=base_repo_state,
+                base_repo_status=base_repo_status,
            )
        return

@@ -858,12 +989,15 @@ def _execute_parallel_batch(
                step_outputs, step_results,
                run_dir=run_dir, output_iter=output_iter,
                phase_name=phase_name, worktree_path=worktree_path,
+                base_repo_state=base_repo_state,
+                base_repo_status=base_repo_status,
            )
        return

    # Snapshot context before parallel execution (all steps see same state)
    context_snapshot = dict(input_contents)
    context_snapshot.update(step_outputs)
+    results_snapshot = dict(step_results)

    # Collect results from parallel threads
    local_outputs: dict[str, str] = {}
@@ -883,6 +1017,7 @@ def _execute_parallel_batch(
        template = resolve_template(step.prompt_template)
        context = _build_context(
            context_snapshot, {}, feedback, iteration, max_iterations,
+            step_results=results_snapshot,
        )
        if step.context_override:
            context = _apply_context_override(context, step.context_override)
@@ -919,6 +1054,16 @@ def _execute_parallel_batch(
    batch_elapsed = round(time.monotonic() - batch_start, 1)

    # Persist successful outputs even if a sibling step failed.
+    if worktree_path is not None and base_repo_state is not None:
+        _assert_base_repo_isolation(
+            cwd,
+            base_repo_state,
+            step_name=phase_name or "parallel-batch",
+            agent_name=agent_names,
+            worktree_path=worktree_path,
+            baseline_status=base_repo_status or "",
+        )
+
    for step in batch:
        key = step.output_key
        if key not in local_outputs:
@@ -986,6 +1131,7 @@ def _build_context(
    feedback: str,
    iteration: int,
    max_iterations: int,
+    step_results: dict[str, AgentResult] | None = None,
 ) -> dict[str, str]:
    """Build the template context dict."""
    context: dict[str, str] = {}
@@ -994,9 +1140,42 @@ def _build_context(
    context["feedback"] = feedback
    context["iteration"] = str(iteration)
    context["max_iterations"] = str(max_iterations)
+    # Surface execution evidence from prior steps so reviewers can inspect it
+    if step_results:
+        context["execution_evidence"] = _format_execution_evidence(step_results)
    return context


+def _format_execution_evidence(
+    step_results: dict[str, AgentResult],
+    *,
+    transcript_limit: int = 2000,
+) -> str:
+    """Format execution evidence from prior steps for reviewer consumption.
+
+    Produces one Markdown section per completed step with its name, agent,
+    command preview (when present), exit code and duration, followed by a
+    collapsible transcript excerpt when the step has a transcript.  Sections
+    are separated by a horizontal rule.
+
+    Args:
+        step_results: Completed step results; only the values are used.
+        transcript_limit: Maximum number of transcript characters to include
+            per step.  Longer transcripts are cut and marked as truncated.
+            Negative values are treated as ``0``.
+
+    Returns:
+        The formatted evidence, or ``"(no prior execution evidence)"`` when
+        *step_results* is empty.
+    """
+    if not step_results:
+        return "(no prior execution evidence)"
+    limit = max(transcript_limit, 0)
+    parts: list[str] = []
+    for result in step_results.values():
+        section = [f"### Step: {result.step_name} ({result.agent_name})"]
+        if result.command_preview:
+            section.append(f"- Command: `{result.command_preview}`")
+        section.append(f"- Exit code: {result.exit_code}")
+        section.append(f"- Duration: {result.duration_seconds}s")
+        transcript = result.transcript
+        if transcript:
+            # Keep prompts bounded: include only the head of the transcript.
+            excerpt = transcript[:limit]
+            if len(transcript) > limit:
+                excerpt += "\n... (truncated)"
+            section.append(
+                "\n<details>\n<summary>Transcript excerpt</summary>\n\n"
+                f"{excerpt}\n</details>"
+            )
+        parts.append("\n".join(section))
+    return "\n\n---\n\n".join(parts)
+
+
 def _build_runtime_inputs(
    config: PipelineConfig,
    input_contents: dict[str, str],
--- a/cross_eval/prompts.py
+++ b/cross_eval/prompts.py
@@ -59,9 +59,14 @@ You are tasked with reviewing code against a plan and checklist.
 ## Previous Review Feedback
 {feedback}

+## Execution Evidence
+{execution_evidence}
+
 ## Review Instructions
 Explore the project directory to understand the full codebase context, \
-then evaluate the code against ONLY the plan and checklist above.
+then evaluate the code against ONLY the plan and checklist above. \
+Use the execution evidence above to verify agent claims against actual \
+command outputs and exit codes.

 For each issue found, classify it with BOTH severity AND category:

@@ -164,9 +169,13 @@ REVIEW_TEMPLATE_KO = """\
 ## 이전 리뷰 피드백
 {feedback}

+## 실행 증거
+{execution_evidence}
+
 ## 검토 지침
 프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스 맥락을 파악한 뒤, \
-위 기획서와 체크리스트 기준으로만 코드를 평가하세요.
+위 기획서와 체크리스트 기준으로만 코드를 평가하세요. \
+위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요.

 발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:

@@ -525,8 +534,13 @@ You are adjudicating multiple review results and turning them into an actionable
 ## Previous Issue Tracker
 {previous_senior_tracker}

+## Execution Evidence
+{execution_evidence}
+
 ## Instructions
-Explore the project directory to confirm the current codebase state. Then:
+Explore the project directory to confirm the current codebase state. \
+Use the execution evidence above to verify claims against actual command \
+outputs and exit codes. Then:
 1. Deduplicate overlapping issues across reviewers.
 2. Resolve disagreements explicitly.
 3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
@@ -592,8 +606,13 @@ AGGREGATE_REVIEW_TEMPLATE_KO = """\
 ## 이전 이슈 트래커
 {previous_senior_tracker}

+## 실행 증거
+{execution_evidence}
+
 ## 지침
-프로젝트 디렉토리를 탐색하여 현재 코드베이스 상태를 확인한 뒤 다음을 수행하세요.
+프로젝트 디렉토리를 탐색하여 현재 코드베이스 상태를 확인한 뒤, \
+위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요. \
+그런 다음 아래를 수행하세요.
 1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
 2. 의견 충돌은 명시적으로 정리하세요.
 3. 기획서, 체크리스트, 코드, 리뷰 근거로 뒷받침되는 이슈만 남기세요.
--- a/cross_eval/report.py
+++ b/cross_eval/report.py
@@ -386,6 +386,11 @@ def _append_iteration_steps(

        lines.append(f"### {_t(config, 'step')}: {step.name} ({agent_name}){duration}\n")

+        # Show command preview and exit code for execution evidence
+        if agent_result and agent_result.command_preview:
+            lines.append(f"**Command**: `{agent_result.command_preview}`")
+            lines.append(f"**Exit code**: {agent_result.exit_code}\n")
+
        if step.verdict and iter_result.verdict:
            lines.append(f"**{_t(config, 'verdict')}: {iter_result.verdict}**\n")

@@ -400,6 +405,16 @@ def _append_iteration_steps(
            lines.append(output)
            lines.append("")

+        # Include transcript excerpt for execution evidence visibility
+        if agent_result and agent_result.transcript:
+            transcript_preview = agent_result.transcript[:1500]
+            if len(agent_result.transcript) > 1500:
+                transcript_preview += "\n... (truncated)"
+            lines.append("<details>")
+            lines.append("<summary>Execution transcript</summary>\n")
+            lines.append(transcript_preview)
+            lines.append("\n</details>\n")
+
        if not skip_extraction and step.role == "review":
            oos = _extract_out_of_scope(output)
            if oos:
--- a/cross_eval/worktree.py
+++ b/cross_eval/worktree.py
@@ -4,6 +4,7 @@ from __future__ import annotations
 import logging
 import shutil
 import subprocess
+import tempfile
 from datetime import datetime
 from pathlib import Path

@@ -20,6 +21,22 @@ def make_branch_name(preset_name: str) -> str:
    return f"cross-eval/{preset_name}_{ts}"


+def make_worktree_dir(base_cwd: Path, branch_name: str) -> Path:
+    """Return the directory an agentic worktree for *branch_name* should use.
+
+    The path is ``<system temp dir>/cross-eval-worktrees/<repo>/<branch>``,
+    where ``<repo>`` is the last component of the resolved *base_cwd*
+    (``"repo"`` when that component is empty) and ``<branch>`` is
+    *branch_name* with every ``/`` replaced by ``__``.
+
+    Worktrees are placed under the temp directory rather than inside the
+    source checkout to avoid tools that incorrectly walk up to the outer repo
+    and write into the base worktree.
+    """
+    checkout_name = base_cwd.resolve().name
+    if not checkout_name:
+        checkout_name = "repo"
+    slug = "__".join(branch_name.split("/"))
+    root = Path(tempfile.gettempdir()) / "cross-eval-worktrees"
+    return root / checkout_name / slug
+
+
 def create_worktree(base_cwd: Path, work_dir: Path, branch_name: str) -> Path:
    """Create a git worktree on a new branch from HEAD.