fix: Claude reviewer empty output, worktree isolation false positives, and input file access

- Add -p flag to _CLAUDE_REVIEW_ARGS so reviewer uses print mode (stdin→stdout)
  instead of interactive mode which conflicts with plan permission mode
- Copy input files (plan, checklist) into worktree .cross-eval-inputs/ so
  agents in plan mode can access them without escaping the sandbox
- Simplify _snapshot_repo_state to use only git diff HEAD + untracked hashes,
  eliminating false positives from staging state changes (git diff --cached)
  and git status index drift during long-running pipelines

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
이충영 에이닷서비스개발
2026-03-14 16:19:57 +09:00
parent 7b95233edf
commit cc8d583914
6 changed files with 158 additions and 41 deletions

View File

@@ -434,6 +434,10 @@ def main(argv: list[str] | None = None) -> int:
"--reviewer-model", default=None, metavar="MODEL", "--reviewer-model", default=None, metavar="MODEL",
help="Reviewer 에이전트 모델만 변경", help="Reviewer 에이전트 모델만 변경",
) )
agent_group.add_argument(
"--senior-model", default=None, metavar="MODEL",
help="Senior 에이전트 모델만 변경",
)
# -- 파이프라인 -- # -- 파이프라인 --
pipe_group = run_parser.add_argument_group("파이프라인") pipe_group = run_parser.add_argument_group("파이프라인")
@@ -474,7 +478,7 @@ def main(argv: list[str] | None = None) -> int:
) )
etc_group.add_argument( etc_group.add_argument(
"--output-dir", type=Path, default=None, "--output-dir", type=Path, default=None,
help="결과 저장 디렉토리 (기본: output/)", help="결과 저장 디렉토리 (기본: .cross-eval/output/)",
) )
etc_group.add_argument( etc_group.add_argument(
"--dry-run", action="store_true", "--dry-run", action="store_true",
@@ -953,13 +957,16 @@ def cmd_run(args: argparse.Namespace) -> int:
if args.model is not None: if args.model is not None:
for agent_name in config.agents: for agent_name in config.agents:
_apply_model_override(config, agent_name, args.model) _apply_model_override(config, agent_name, args.model)
# --coder-model / --reviewer-model: apply by role # --coder-model / --reviewer-model / --senior-model: apply by role
if args.coder_model is not None: if args.coder_model is not None:
for coder_name in config.coders: for coder_name in config.coders:
_apply_model_override(config, coder_name, args.coder_model) _apply_model_override(config, coder_name, args.coder_model)
if args.reviewer_model is not None: if args.reviewer_model is not None:
for reviewer_name in config.reviewers: for reviewer_name in config.reviewers:
_apply_model_override(config, reviewer_name, args.reviewer_model) _apply_model_override(config, reviewer_name, args.reviewer_model)
if args.senior_model is not None:
for senior_name in config.seniors:
_apply_model_override(config, senior_name, args.senior_model)
# --plan / --checklist shortcuts # --plan / --checklist shortcuts
for key, val in [("plan", args.plan), ("checklist", args.checklist)]: for key, val in [("plan", args.plan), ("checklist", args.checklist)]:

View File

@@ -62,12 +62,7 @@ _CLAUDE_CODER_ARGS = list(_CLAUDE_BASE_ARGS) + [
"bypassPermissions", "bypassPermissions",
] ]
_CLAUDE_REVIEW_ARGS = [ _CLAUDE_REVIEW_ARGS = list(_CLAUDE_BASE_ARGS) + [
"--setting-sources",
"user",
"--disable-slash-commands",
"--model",
"opus",
"--permission-mode", "--permission-mode",
"plan", "plan",
] ]
@@ -425,6 +420,8 @@ def load_config(path: Path) -> PipelineConfig:
def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig: def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
"""Parse raw YAML dict into PipelineConfig.""" """Parse raw YAML dict into PipelineConfig."""
project_root = config_path.parent.parent if config_path.parent.name == ".cross-eval" else config_path.parent
# --- agents --- # --- agents ---
agents: dict[str, AgentConfig] = {} agents: dict[str, AgentConfig] = {}
for name, agent_data in raw.get("agents", {}).items(): for name, agent_data in raw.get("agents", {}).items():
@@ -494,8 +491,12 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"): if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
preset_name = pipeline_raw.split(":", 1)[1] preset_name = pipeline_raw.split(":", 1)[1]
output_dir = Path(raw.get("output_dir", ".cross-eval/output"))
if not output_dir.is_absolute():
output_dir = project_root / output_dir
config = PipelineConfig( config = PipelineConfig(
output_dir=Path(raw.get("output_dir", ".cross-eval/output")), output_dir=output_dir,
max_iterations=int(raw.get("max_iterations", 3)), max_iterations=int(raw.get("max_iterations", 3)),
min_iterations=int(raw.get("min_iterations", 1)), min_iterations=int(raw.get("min_iterations", 1)),
verbose=bool(raw.get("verbose", False)), verbose=bool(raw.get("verbose", False)),

View File

@@ -104,45 +104,73 @@ def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, s
return worktree_path, branch_name return worktree_path, branch_name
def _snapshot_repo_state(cwd: Path) -> str: def _copy_inputs_to_worktree(
config: PipelineConfig,
worktree_path: Path,
) -> None:
"""Copy input files (plan, checklist, etc.) into the worktree.
This ensures agents running in plan/read-only mode within the worktree
can access these files, even though the originals live in the base repo.
Updates config.inputs in-place so subsequent reference refreshes use
worktree-local paths.
"""
import shutil
inputs_dir = worktree_path / ".cross-eval-inputs"
inputs_dir.mkdir(exist_ok=True)
# Exclude from git so these don't pollute agentic diffs
(inputs_dir / ".gitignore").write_text("*\n", encoding="utf-8")
for key, val in list(config.inputs.items()):
if key.endswith("_ref") or not isinstance(val, Path):
continue
if not val.exists():
continue
dest = inputs_dir / val.name
shutil.copy2(val, dest)
config.inputs[key] = dest
def _snapshot_repo_state(cwd: Path) -> dict[str, str]:
"""Capture the base repository working-tree state. """Capture the base repository working-tree state.
This is used to detect agentic runs that accidentally modify the original This is used to detect agentic runs that accidentally modify the original
checkout instead of the isolated worktree. checkout instead of the isolated worktree.
We intentionally use only two components:
- ``diff``: ``git diff HEAD`` — all tracked changes vs HEAD, combining
staged and unstaged so that staging-state changes don't cause false
positives.
- ``untracked``: SHA-256 hashes of untracked files — detects new or
modified untracked files appearing in the base repo.
``git status --short`` and ``git diff --cached`` are NOT used because
external tools (IDEs, git hooks) can change staging state during a
long-running pipeline, causing spurious failures.
""" """
status = subprocess.run( # Refresh index stat cache to prevent false positives from mtime drift
["git", "status", "--short", "--untracked-files=all"], subprocess.run(
["git", "update-index", "--refresh", "-q"],
cwd=cwd, cwd=cwd,
capture_output=True, capture_output=True,
text=True,
) )
if status.returncode != 0:
return ""
# git diff HEAD: all changes (staged + unstaged) vs HEAD
diff = subprocess.run( diff = subprocess.run(
["git", "diff", "--no-ext-diff", "--binary", "HEAD"], ["git", "diff", "--no-ext-diff", "--binary", "HEAD"],
cwd=cwd, cwd=cwd,
capture_output=True, capture_output=True,
text=True, text=True,
) )
cached_diff = subprocess.run( if diff.returncode != 0:
["git", "diff", "--no-ext-diff", "--binary", "--cached"], return {}
cwd=cwd,
capture_output=True,
text=True,
)
untracked = subprocess.run( untracked = subprocess.run(
["git", "ls-files", "--others", "--exclude-standard", "-z"], ["git", "ls-files", "--others", "--exclude-standard", "-z"],
cwd=cwd, cwd=cwd,
capture_output=True, capture_output=True,
) )
parts = [ untracked_parts: list[str] = []
status.stdout,
diff.stdout,
cached_diff.stdout,
]
if untracked.returncode == 0 and untracked.stdout: if untracked.returncode == 0 and untracked.stdout:
for rel_path in untracked.stdout.decode("utf-8", errors="replace").split("\0"): for rel_path in untracked.stdout.decode("utf-8", errors="replace").split("\0"):
if not rel_path: if not rel_path:
@@ -150,11 +178,14 @@ def _snapshot_repo_state(cwd: Path) -> str:
file_path = cwd / rel_path file_path = cwd / rel_path
if file_path.is_file(): if file_path.is_file():
digest = sha256(file_path.read_bytes()).hexdigest() digest = sha256(file_path.read_bytes()).hexdigest()
parts.append(f"UNTRACKED {rel_path} {digest}") untracked_parts.append(f"UNTRACKED {rel_path} {digest}")
else: else:
parts.append(f"UNTRACKED {rel_path} (non-file)") untracked_parts.append(f"UNTRACKED {rel_path} (non-file)")
return "\n".join(parts) return {
"diff": diff.stdout,
"untracked": "\n".join(untracked_parts),
}
def _snapshot_repo_status(cwd: Path) -> str: def _snapshot_repo_status(cwd: Path) -> str:
@@ -172,7 +203,7 @@ def _snapshot_repo_status(cwd: Path) -> str:
def _assert_base_repo_isolation( def _assert_base_repo_isolation(
cwd: Path, cwd: Path,
baseline_state: str, baseline_state: dict[str, str],
*, *,
step_name: str, step_name: str,
agent_name: str, agent_name: str,
@@ -184,6 +215,18 @@ def _assert_base_repo_isolation(
if current_state == baseline_state: if current_state == baseline_state:
return return
# Identify which component(s) actually changed
changed: list[str] = []
for key in ("diff", "untracked"):
if baseline_state.get(key, "") != current_state.get(key, ""):
changed.append(key)
if not changed:
# State dicts differ only in keys we no longer track — benign.
return
# untracked-only change: new files appeared — real leak
# diff-only change: tracked file content changed — real leak
current_status = _snapshot_repo_status(cwd) current_status = _snapshot_repo_status(cwd)
before = baseline_status or "(clean)" before = baseline_status or "(clean)"
after = current_status or "(clean)" after = current_status or "(clean)"
@@ -191,7 +234,8 @@ def _assert_base_repo_isolation(
"Agent modified the base repository instead of the isolated worktree.\n\n" "Agent modified the base repository instead of the isolated worktree.\n\n"
f"Step: {step_name}\n" f"Step: {step_name}\n"
f"Agent: {agent_name}\n" f"Agent: {agent_name}\n"
f"Worktree: {worktree_path}\n\n" f"Worktree: {worktree_path}\n"
f"Changed components: {', '.join(changed)}\n\n"
f"Baseline status:\n{before}\n\n" f"Baseline status:\n{before}\n\n"
f"Current status:\n{after}" f"Current status:\n{after}"
) )
@@ -268,12 +312,14 @@ def _run_simple_pipeline(
# Setup shared worktree for agentic mode # Setup shared worktree for agentic mode
worktree_path: Path | None = None worktree_path: Path | None = None
agentic_branch_name: str | None = None agentic_branch_name: str | None = None
base_repo_state: str | None = None base_repo_state: dict[str, str] | None = None
base_repo_status: str | None = None base_repo_status: str | None = None
if not dry_run and _has_agentic_steps(config, config.pipeline): if not dry_run and _has_agentic_steps(config, config.pipeline):
worktree_path, agentic_branch_name = _setup_worktree( worktree_path, agentic_branch_name = _setup_worktree(
cwd, run_dir, config.preset_name, cwd, run_dir, config.preset_name,
) )
_copy_inputs_to_worktree(config, worktree_path)
_refresh_input_references(config, input_contents)
base_repo_state = _snapshot_repo_state(cwd) base_repo_state = _snapshot_repo_state(cwd)
base_repo_status = _snapshot_repo_status(cwd) base_repo_status = _snapshot_repo_status(cwd)
@@ -443,12 +489,14 @@ def _run_phased_pipeline(
all_phase_steps = [s for p in config.phases for s in p.steps] all_phase_steps = [s for p in config.phases for s in p.steps]
worktree_path: Path | None = None worktree_path: Path | None = None
agentic_branch_name: str | None = None agentic_branch_name: str | None = None
base_repo_state: str | None = None base_repo_state: dict[str, str] | None = None
base_repo_status: str | None = None base_repo_status: str | None = None
if not dry_run and _has_agentic_steps(config, all_phase_steps): if not dry_run and _has_agentic_steps(config, all_phase_steps):
worktree_path, agentic_branch_name = _setup_worktree( worktree_path, agentic_branch_name = _setup_worktree(
cwd, run_dir, config.preset_name, cwd, run_dir, config.preset_name,
) )
_copy_inputs_to_worktree(config, worktree_path)
_refresh_input_references(config, input_contents)
base_repo_state = _snapshot_repo_state(cwd) base_repo_state = _snapshot_repo_state(cwd)
base_repo_status = _snapshot_repo_status(cwd) base_repo_status = _snapshot_repo_status(cwd)
@@ -844,7 +892,7 @@ def _run_steps(
phase_name: str | None = None, phase_name: str | None = None,
worktree_path: Path | None = None, worktree_path: Path | None = None,
runtime_env: dict[str, str] | None = None, runtime_env: dict[str, str] | None = None,
base_repo_state: str | None = None, base_repo_state: dict[str, str] | None = None,
base_repo_status: str | None = None, base_repo_status: str | None = None,
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]: ) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
"""Execute all steps in one iteration, parallelizing where possible.""" """Execute all steps in one iteration, parallelizing where possible."""
@@ -933,7 +981,7 @@ def _execute_step(
quiet: bool = False, quiet: bool = False,
worktree_path: Path | None = None, worktree_path: Path | None = None,
runtime_env: dict[str, str] | None = None, runtime_env: dict[str, str] | None = None,
base_repo_state: str | None = None, base_repo_state: dict[str, str] | None = None,
base_repo_status: str | None = None, base_repo_status: str | None = None,
) -> None: ) -> None:
"""Execute a single step, updating step_outputs and step_results in place.""" """Execute a single step, updating step_outputs and step_results in place."""
@@ -1066,7 +1114,7 @@ def _execute_parallel_batch(
phase_name: str | None = None, phase_name: str | None = None,
worktree_path: Path | None = None, worktree_path: Path | None = None,
runtime_env: dict[str, str] | None = None, runtime_env: dict[str, str] | None = None,
base_repo_state: str | None = None, base_repo_state: dict[str, str] | None = None,
base_repo_status: str | None = None, base_repo_status: str | None = None,
) -> None: ) -> None:
"""Execute multiple steps in parallel using threads.""" """Execute multiple steps in parallel using threads."""

View File

@@ -206,21 +206,28 @@ class TestMakeWorktreeDir(unittest.TestCase):
class TestBaseRepoIsolation(unittest.TestCase): class TestBaseRepoIsolation(unittest.TestCase):
"""Base repo mutations should fail fast during agentic execution.""" """Base repo mutations should fail fast during agentic execution."""
def test_raises_when_base_repo_status_changes(self) -> None: def test_raises_when_base_repo_state_changes(self) -> None:
with tempfile.TemporaryDirectory() as td: with tempfile.TemporaryDirectory() as td:
base = Path(td) / "repo" base = Path(td) / "repo"
worktree = Path(td) / "worktree" worktree = Path(td) / "worktree"
base.mkdir() base.mkdir()
worktree.mkdir() worktree.mkdir()
# Baseline has a diff that won't match a non-git directory
# (which returns {}), triggering the isolation error.
baseline_state = {
"diff": "diff --git a/file.py ...\n",
"untracked": "",
}
with self.assertRaises(RuntimeError) as ctx: with self.assertRaises(RuntimeError) as ctx:
_assert_base_repo_isolation( _assert_base_repo_isolation(
base, base,
"M cross_eval/agent.py", baseline_state,
step_name="coding", step_name="coding",
agent_name="claude-coder", agent_name="claude-coder",
worktree_path=worktree, worktree_path=worktree,
baseline_status="M cross_eval/agent.py", baseline_status="M file.py",
) )
self.assertIn("base repository", str(ctx.exception)) self.assertIn("base repository", str(ctx.exception))

View File

@@ -1030,6 +1030,60 @@ class FixPresetBehaviorTest(unittest.TestCase):
self.assertTrue(captured["agentic"]) self.assertTrue(captured["agentic"])
self.assertEqual(captured["phase_max"], 3) self.assertEqual(captured["phase_max"], 3)
def test_run_senior_model_override_applies_only_to_seniors(self) -> None:
captured: dict[str, list[str]] = {}
def _fake_run_pipeline(config, **kwargs):
captured["coder_args"] = list(config.agents[config.coders[0]].args)
captured["reviewer_args"] = list(config.agents[config.reviewers[0]].args)
captured["senior_args"] = list(config.agents[config.seniors[0]].args)
return PipelineResult(
iterations=[],
final_verdict="PASS",
run_dir=Path(".cross-eval/output"),
)
with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline):
exit_code = main([
"run",
"--preset", "review-fix",
"--coder", "claude",
"--reviewer", "claude",
"--senior", "claude",
"--senior-model", "sonnet",
"--dry-run",
])
self.assertEqual(exit_code, 0)
self.assertIn("opus", captured["coder_args"])
self.assertIn("opus", captured["reviewer_args"])
self.assertIn("sonnet", captured["senior_args"])
class OutputDirectoryResolutionTest(unittest.TestCase):
def test_load_config_resolves_output_dir_from_project_root(self) -> None:
with tempfile.TemporaryDirectory() as tmpdir:
root = Path(tmpdir)
ce_dir = root / ".cross-eval"
ce_dir.mkdir()
(ce_dir / "plan.md").write_text("# plan\n", encoding="utf-8")
config_path = ce_dir / "config.yaml"
config_path.write_text(
(
"inputs:\n"
" plan: plan.md\n"
"coders: [claude-coder]\n"
"reviewers: [claude-reviewer]\n"
"pipeline: preset:simple\n"
"output_dir: .cross-eval/output\n"
),
encoding="utf-8",
)
config = load_config(config_path)
self.assertEqual(config.output_dir.resolve(), (root / ".cross-eval" / "output").resolve())
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@@ -390,7 +390,7 @@ class TestPipelineHelpers(unittest.TestCase):
snapshot = _snapshot_repo_state(repo) snapshot = _snapshot_repo_state(repo)
self.assertIn("UNTRACKED scratch.txt", snapshot) self.assertIn("UNTRACKED scratch.txt", snapshot["untracked"])
def test_finalize_worktree_deletes_empty_branch(self) -> None: def test_finalize_worktree_deletes_empty_branch(self) -> None:
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir: