fix: Claude reviewer empty output, worktree isolation false positives, and input file access
- Add -p flag to _CLAUDE_REVIEW_ARGS so the reviewer uses print mode (stdin→stdout) instead of interactive mode, which conflicts with plan permission mode
- Copy input files (plan, checklist) into worktree .cross-eval-inputs/ so agents in plan mode can access them without escaping the sandbox
- Simplify _snapshot_repo_state to use only git diff HEAD + untracked hashes, eliminating false positives from staging state changes (git diff --cached) and git status index drift during long-running pipelines

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -104,45 +104,73 @@ def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, s
|
||||
return worktree_path, branch_name
|
||||
|
||||
|
||||
def _snapshot_repo_state(cwd: Path) -> str:
|
||||
def _copy_inputs_to_worktree(
    config: PipelineConfig,
    worktree_path: Path,
) -> None:
    """Copy input files (plan, checklist, etc.) into the worktree.

    This ensures agents running in plan/read-only mode within the worktree
    can access these files, even though the originals live in the base repo.
    Updates config.inputs in-place so subsequent reference refreshes use
    worktree-local paths.

    Entries are left untouched when the key ends with ``_ref``, the value is
    not a ``Path``, or the path is not an existing regular file (missing
    paths and directories are skipped).

    Copies land in ``<worktree_path>/.cross-eval-inputs/`` under the source
    file's own name. If two inputs share a filename, later ones get a
    numeric suffix (``plan.md``, ``plan-2.md``, ...) so that they do not
    overwrite each other.

    Args:
        config: Pipeline configuration; its ``inputs`` mapping is read and
            rewritten in place.
        worktree_path: Root of the worktree; expected to exist already
            (the inputs directory is created without ``parents=True``).
    """
    import shutil

    inputs_dir = worktree_path / ".cross-eval-inputs"
    inputs_dir.mkdir(exist_ok=True)
    # Exclude from git so these don't pollute agentic diffs
    (inputs_dir / ".gitignore").write_text("*\n", encoding="utf-8")

    # Names already claimed during this call. Seeded with the exclusion file
    # so an input that happens to be called ".gitignore" cannot clobber it.
    used_names = {".gitignore"}

    for key, val in list(config.inputs.items()):
        if key.endswith("_ref") or not isinstance(val, Path):
            continue
        # is_file() rather than exists(): copy2 cannot copy a directory.
        if not val.is_file():
            continue

        # Keep the plain filename when it is free; otherwise disambiguate so
        # same-named inputs from different directories stay distinct.
        dest_name = val.name
        attempt = 1
        while dest_name in used_names:
            attempt += 1
            dest_name = f"{val.stem}-{attempt}{val.suffix}"
        used_names.add(dest_name)

        dest = inputs_dir / dest_name
        try:
            shutil.copy2(val, dest)
        except shutil.SameFileError:
            # The input already points at its worktree copy (e.g. this
            # function ran before on the same config); nothing to copy.
            pass
        config.inputs[key] = dest
|
||||
|
||||
|
||||
def _snapshot_repo_state(cwd: Path) -> dict[str, str]:
|
||||
"""Capture the base repository working-tree state.
|
||||
|
||||
This is used to detect agentic runs that accidentally modify the original
|
||||
checkout instead of the isolated worktree.
|
||||
|
||||
We intentionally use only two components:
|
||||
- ``diff``: ``git diff HEAD`` — all tracked changes vs HEAD, combining
|
||||
staged and unstaged so that staging-state changes don't cause false
|
||||
positives.
|
||||
- ``untracked``: SHA-256 hashes of untracked files — detects new or
|
||||
modified untracked files appearing in the base repo.
|
||||
|
||||
``git status --short`` and ``git diff --cached`` are NOT used because
|
||||
external tools (IDEs, git hooks) can change staging state during a
|
||||
long-running pipeline, causing spurious failures.
|
||||
"""
|
||||
status = subprocess.run(
|
||||
["git", "status", "--short", "--untracked-files=all"],
|
||||
# Refresh index stat cache to prevent false positives from mtime drift
|
||||
subprocess.run(
|
||||
["git", "update-index", "--refresh", "-q"],
|
||||
cwd=cwd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
if status.returncode != 0:
|
||||
return ""
|
||||
|
||||
# git diff HEAD: all changes (staged + unstaged) vs HEAD
|
||||
diff = subprocess.run(
|
||||
["git", "diff", "--no-ext-diff", "--binary", "HEAD"],
|
||||
cwd=cwd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
cached_diff = subprocess.run(
|
||||
["git", "diff", "--no-ext-diff", "--binary", "--cached"],
|
||||
cwd=cwd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
if diff.returncode != 0:
|
||||
return {}
|
||||
|
||||
untracked = subprocess.run(
|
||||
["git", "ls-files", "--others", "--exclude-standard", "-z"],
|
||||
cwd=cwd,
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
parts = [
|
||||
status.stdout,
|
||||
diff.stdout,
|
||||
cached_diff.stdout,
|
||||
]
|
||||
|
||||
untracked_parts: list[str] = []
|
||||
if untracked.returncode == 0 and untracked.stdout:
|
||||
for rel_path in untracked.stdout.decode("utf-8", errors="replace").split("\0"):
|
||||
if not rel_path:
|
||||
@@ -150,11 +178,14 @@ def _snapshot_repo_state(cwd: Path) -> str:
|
||||
file_path = cwd / rel_path
|
||||
if file_path.is_file():
|
||||
digest = sha256(file_path.read_bytes()).hexdigest()
|
||||
parts.append(f"UNTRACKED {rel_path} {digest}")
|
||||
untracked_parts.append(f"UNTRACKED {rel_path} {digest}")
|
||||
else:
|
||||
parts.append(f"UNTRACKED {rel_path} (non-file)")
|
||||
untracked_parts.append(f"UNTRACKED {rel_path} (non-file)")
|
||||
|
||||
return "\n".join(parts)
|
||||
return {
|
||||
"diff": diff.stdout,
|
||||
"untracked": "\n".join(untracked_parts),
|
||||
}
|
||||
|
||||
|
||||
def _snapshot_repo_status(cwd: Path) -> str:
|
||||
@@ -172,7 +203,7 @@ def _snapshot_repo_status(cwd: Path) -> str:
|
||||
|
||||
def _assert_base_repo_isolation(
|
||||
cwd: Path,
|
||||
baseline_state: str,
|
||||
baseline_state: dict[str, str],
|
||||
*,
|
||||
step_name: str,
|
||||
agent_name: str,
|
||||
@@ -184,6 +215,18 @@ def _assert_base_repo_isolation(
|
||||
if current_state == baseline_state:
|
||||
return
|
||||
|
||||
# Identify which component(s) actually changed
|
||||
changed: list[str] = []
|
||||
for key in ("diff", "untracked"):
|
||||
if baseline_state.get(key, "") != current_state.get(key, ""):
|
||||
changed.append(key)
|
||||
|
||||
if not changed:
|
||||
# State dicts differ only in keys we no longer track — benign.
|
||||
return
|
||||
|
||||
# untracked-only change: new files appeared — real leak
|
||||
# diff-only change: tracked file content changed — real leak
|
||||
current_status = _snapshot_repo_status(cwd)
|
||||
before = baseline_status or "(clean)"
|
||||
after = current_status or "(clean)"
|
||||
@@ -191,7 +234,8 @@ def _assert_base_repo_isolation(
|
||||
"Agent modified the base repository instead of the isolated worktree.\n\n"
|
||||
f"Step: {step_name}\n"
|
||||
f"Agent: {agent_name}\n"
|
||||
f"Worktree: {worktree_path}\n\n"
|
||||
f"Worktree: {worktree_path}\n"
|
||||
f"Changed components: {', '.join(changed)}\n\n"
|
||||
f"Baseline status:\n{before}\n\n"
|
||||
f"Current status:\n{after}"
|
||||
)
|
||||
@@ -268,12 +312,14 @@ def _run_simple_pipeline(
|
||||
# Setup shared worktree for agentic mode
|
||||
worktree_path: Path | None = None
|
||||
agentic_branch_name: str | None = None
|
||||
base_repo_state: str | None = None
|
||||
base_repo_state: dict[str, str] | None = None
|
||||
base_repo_status: str | None = None
|
||||
if not dry_run and _has_agentic_steps(config, config.pipeline):
|
||||
worktree_path, agentic_branch_name = _setup_worktree(
|
||||
cwd, run_dir, config.preset_name,
|
||||
)
|
||||
_copy_inputs_to_worktree(config, worktree_path)
|
||||
_refresh_input_references(config, input_contents)
|
||||
base_repo_state = _snapshot_repo_state(cwd)
|
||||
base_repo_status = _snapshot_repo_status(cwd)
|
||||
|
||||
@@ -443,12 +489,14 @@ def _run_phased_pipeline(
|
||||
all_phase_steps = [s for p in config.phases for s in p.steps]
|
||||
worktree_path: Path | None = None
|
||||
agentic_branch_name: str | None = None
|
||||
base_repo_state: str | None = None
|
||||
base_repo_state: dict[str, str] | None = None
|
||||
base_repo_status: str | None = None
|
||||
if not dry_run and _has_agentic_steps(config, all_phase_steps):
|
||||
worktree_path, agentic_branch_name = _setup_worktree(
|
||||
cwd, run_dir, config.preset_name,
|
||||
)
|
||||
_copy_inputs_to_worktree(config, worktree_path)
|
||||
_refresh_input_references(config, input_contents)
|
||||
base_repo_state = _snapshot_repo_state(cwd)
|
||||
base_repo_status = _snapshot_repo_status(cwd)
|
||||
|
||||
@@ -844,7 +892,7 @@ def _run_steps(
|
||||
phase_name: str | None = None,
|
||||
worktree_path: Path | None = None,
|
||||
runtime_env: dict[str, str] | None = None,
|
||||
base_repo_state: str | None = None,
|
||||
base_repo_state: dict[str, str] | None = None,
|
||||
base_repo_status: str | None = None,
|
||||
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
|
||||
"""Execute all steps in one iteration, parallelizing where possible."""
|
||||
@@ -933,7 +981,7 @@ def _execute_step(
|
||||
quiet: bool = False,
|
||||
worktree_path: Path | None = None,
|
||||
runtime_env: dict[str, str] | None = None,
|
||||
base_repo_state: str | None = None,
|
||||
base_repo_state: dict[str, str] | None = None,
|
||||
base_repo_status: str | None = None,
|
||||
) -> None:
|
||||
"""Execute a single step, updating step_outputs and step_results in place."""
|
||||
@@ -1066,7 +1114,7 @@ def _execute_parallel_batch(
|
||||
phase_name: str | None = None,
|
||||
worktree_path: Path | None = None,
|
||||
runtime_env: dict[str, str] | None = None,
|
||||
base_repo_state: str | None = None,
|
||||
base_repo_state: dict[str, str] | None = None,
|
||||
base_repo_status: str | None = None,
|
||||
) -> None:
|
||||
"""Execute multiple steps in parallel using threads."""
|
||||
|
||||
Reference in New Issue
Block a user