From cc8d583914c9e8780905af6ebc1e1f7805ab248f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=9D=B4=EC=B6=A9=EC=98=81=20=EC=97=90=EC=9D=B4=EB=8B=B7?= =?UTF-8?q?=EC=84=9C=EB=B9=84=EC=8A=A4=EA=B0=9C=EB=B0=9C?= Date: Sat, 14 Mar 2026 16:19:57 +0900 Subject: [PATCH] fix: Claude reviewer empty output, worktree isolation false positives, and input file access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add -p flag to _CLAUDE_REVIEW_ARGS so reviewer uses print mode (stdin→stdout) instead of interactive mode which conflicts with plan permission mode - Copy input files (plan, checklist) into worktree .cross-eval-inputs/ so agents in plan mode can access them without escaping the sandbox - Simplify _snapshot_repo_state to use only git diff HEAD + untracked hashes, eliminating false positives from staging state changes (git diff --cached) and git status index drift during long-running pipelines Co-Authored-By: Claude Opus 4.6 --- cross_eval/cli.py | 11 +++- cross_eval/config.py | 15 +++--- cross_eval/pipeline.py | 104 +++++++++++++++++++++++++++---------- tests/test_agentic.py | 13 +++-- tests/test_config.py | 54 +++++++++++++++++++ tests/test_runtime_misc.py | 2 +- 6 files changed, 158 insertions(+), 41 deletions(-) diff --git a/cross_eval/cli.py b/cross_eval/cli.py index dc6350a..29e3fa7 100644 --- a/cross_eval/cli.py +++ b/cross_eval/cli.py @@ -434,6 +434,10 @@ def main(argv: list[str] | None = None) -> int: "--reviewer-model", default=None, metavar="MODEL", help="Reviewer 에이전트 모델만 변경", ) + agent_group.add_argument( + "--senior-model", default=None, metavar="MODEL", + help="Senior 에이전트 모델만 변경", + ) # -- 파이프라인 -- pipe_group = run_parser.add_argument_group("파이프라인") @@ -474,7 +478,7 @@ def main(argv: list[str] | None = None) -> int: ) etc_group.add_argument( "--output-dir", type=Path, default=None, - help="결과 저장 디렉토리 (기본: output/)", + help="결과 저장 디렉토리 (기본: .cross-eval/output/)", ) etc_group.add_argument( "--dry-run", action="store_true", @@ -953,13 +957,16 @@ def cmd_run(args: argparse.Namespace) -> int: if args.model is not None: for agent_name in config.agents: _apply_model_override(config, agent_name, args.model) - # --coder-model / --reviewer-model: apply by role + # --coder-model / --reviewer-model / --senior-model: apply by role if args.coder_model is not None: for coder_name in config.coders: _apply_model_override(config, coder_name, args.coder_model) if args.reviewer_model is not None: for reviewer_name in config.reviewers: _apply_model_override(config, reviewer_name, args.reviewer_model) + if args.senior_model is not None: + for senior_name in config.seniors: + _apply_model_override(config, senior_name, args.senior_model) # --plan / --checklist shortcuts for key, val in [("plan", args.plan), ("checklist", args.checklist)]: diff --git a/cross_eval/config.py b/cross_eval/config.py index fe3fb66..6bd5015 100644 --- a/cross_eval/config.py +++ b/cross_eval/config.py @@ -62,12 +62,7 @@ _CLAUDE_CODER_ARGS = list(_CLAUDE_BASE_ARGS) + [ "bypassPermissions", ] -_CLAUDE_REVIEW_ARGS = [ - "--setting-sources", - "user", - "--disable-slash-commands", - "--model", - "opus", +_CLAUDE_REVIEW_ARGS = list(_CLAUDE_BASE_ARGS) + [ "--permission-mode", "plan", ] @@ -425,6 +420,8 @@ def load_config(path: Path) -> PipelineConfig: def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig: """Parse raw YAML dict into PipelineConfig.""" + project_root = config_path.parent.parent if config_path.parent.name == ".cross-eval" else config_path.parent + # --- agents --- agents: dict[str, AgentConfig] = {} for name, agent_data in raw.get("agents", {}).items(): @@ -494,8 +491,12 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig: if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"): preset_name = pipeline_raw.split(":", 1)[1] + output_dir = Path(raw.get("output_dir", ".cross-eval/output")) + if not output_dir.is_absolute(): + output_dir = project_root / output_dir + config = PipelineConfig( - output_dir=Path(raw.get("output_dir", ".cross-eval/output")), + output_dir=output_dir, max_iterations=int(raw.get("max_iterations", 3)), min_iterations=int(raw.get("min_iterations", 1)), verbose=bool(raw.get("verbose", False)), diff --git a/cross_eval/pipeline.py b/cross_eval/pipeline.py index 8c3971b..b6e5441 100644 --- a/cross_eval/pipeline.py +++ b/cross_eval/pipeline.py @@ -104,45 +104,73 @@ def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, s return worktree_path, branch_name -def _snapshot_repo_state(cwd: Path) -> str: +def _copy_inputs_to_worktree( + config: PipelineConfig, + worktree_path: Path, +) -> None: + """Copy input files (plan, checklist, etc.) into the worktree. + + This ensures agents running in plan/read-only mode within the worktree + can access these files, even though the originals live in the base repo. + Updates config.inputs in-place so subsequent reference refreshes use + worktree-local paths. + """ + import shutil + inputs_dir = worktree_path / ".cross-eval-inputs" + inputs_dir.mkdir(exist_ok=True) + # Exclude from git so these don't pollute agentic diffs + (inputs_dir / ".gitignore").write_text("*\n", encoding="utf-8") + for key, val in list(config.inputs.items()): + if key.endswith("_ref") or not isinstance(val, Path): + continue + if not val.exists(): + continue + dest = inputs_dir / val.name + shutil.copy2(val, dest) + config.inputs[key] = dest + + +def _snapshot_repo_state(cwd: Path) -> dict[str, str]: """Capture the base repository working-tree state. This is used to detect agentic runs that accidentally modify the original checkout instead of the isolated worktree. + + We intentionally use only two components: + - ``diff``: ``git diff HEAD`` — all tracked changes vs HEAD, combining + staged and unstaged so that staging-state changes don't cause false + positives. + - ``untracked``: SHA-256 hashes of untracked files — detects new or + modified untracked files appearing in the base repo. + + ``git status --short`` and ``git diff --cached`` are NOT used because + external tools (IDEs, git hooks) can change staging state during a + long-running pipeline, causing spurious failures. """ - status = subprocess.run( - ["git", "status", "--short", "--untracked-files=all"], + # Refresh index stat cache to prevent false positives from mtime drift + subprocess.run( + ["git", "update-index", "--refresh", "-q"], cwd=cwd, capture_output=True, - text=True, ) - if status.returncode != 0: - return "" + # git diff HEAD: all changes (staged + unstaged) vs HEAD diff = subprocess.run( ["git", "diff", "--no-ext-diff", "--binary", "HEAD"], cwd=cwd, capture_output=True, text=True, ) - cached_diff = subprocess.run( - ["git", "diff", "--no-ext-diff", "--binary", "--cached"], - cwd=cwd, - capture_output=True, - text=True, - ) + if diff.returncode != 0: + return {} + untracked = subprocess.run( ["git", "ls-files", "--others", "--exclude-standard", "-z"], cwd=cwd, capture_output=True, ) - parts = [ - status.stdout, - diff.stdout, - cached_diff.stdout, - ] - + untracked_parts: list[str] = [] if untracked.returncode == 0 and untracked.stdout: for rel_path in untracked.stdout.decode("utf-8", errors="replace").split("\0"): if not rel_path: @@ -150,11 +178,14 @@ def _snapshot_repo_state(cwd: Path) -> str: file_path = cwd / rel_path if file_path.is_file(): digest = sha256(file_path.read_bytes()).hexdigest() - parts.append(f"UNTRACKED {rel_path} {digest}") + untracked_parts.append(f"UNTRACKED {rel_path} {digest}") else: - parts.append(f"UNTRACKED {rel_path} (non-file)") + untracked_parts.append(f"UNTRACKED {rel_path} (non-file)") - return "\n".join(parts) + return { + "diff": diff.stdout, + "untracked": "\n".join(untracked_parts), + } def _snapshot_repo_status(cwd: Path) -> str: @@ -172,7 +203,7 @@ def _snapshot_repo_status(cwd: Path) -> str: def _assert_base_repo_isolation( cwd: Path, - baseline_state: str, + baseline_state: dict[str, str], *, step_name: str, agent_name: str, @@ -184,6 +215,18 @@ def _assert_base_repo_isolation( if current_state == baseline_state: return + # Identify which component(s) actually changed + changed: list[str] = [] + for key in ("diff", "untracked"): + if baseline_state.get(key, "") != current_state.get(key, ""): + changed.append(key) + + if not changed: + # State dicts differ only in keys we no longer track — benign. + return + + # untracked-only change: new files appeared — real leak + # diff-only change: tracked file content changed — real leak current_status = _snapshot_repo_status(cwd) before = baseline_status or "(clean)" after = current_status or "(clean)" @@ -191,7 +234,8 @@ def _assert_base_repo_isolation( "Agent modified the base repository instead of the isolated worktree.\n\n" f"Step: {step_name}\n" f"Agent: {agent_name}\n" - f"Worktree: {worktree_path}\n\n" + f"Worktree: {worktree_path}\n" + f"Changed components: {', '.join(changed)}\n\n" f"Baseline status:\n{before}\n\n" f"Current status:\n{after}" ) @@ -268,12 +312,14 @@ def _run_simple_pipeline( # Setup shared worktree for agentic mode worktree_path: Path | None = None agentic_branch_name: str | None = None - base_repo_state: str | None = None + base_repo_state: dict[str, str] | None = None base_repo_status: str | None = None if not dry_run and _has_agentic_steps(config, config.pipeline): worktree_path, agentic_branch_name = _setup_worktree( cwd, run_dir, config.preset_name, ) + _copy_inputs_to_worktree(config, worktree_path) + _refresh_input_references(config, input_contents) base_repo_state = _snapshot_repo_state(cwd) base_repo_status = _snapshot_repo_status(cwd) @@ -443,12 +489,14 @@ def _run_phased_pipeline( all_phase_steps = [s for p in config.phases for s in p.steps] worktree_path: Path | None = None agentic_branch_name: str | None = None - base_repo_state: str | None = None + base_repo_state: dict[str, str] | None = None base_repo_status: str | None = None if not dry_run and _has_agentic_steps(config, all_phase_steps): worktree_path, agentic_branch_name = _setup_worktree( cwd, run_dir, config.preset_name, ) + _copy_inputs_to_worktree(config, worktree_path) + _refresh_input_references(config, input_contents) base_repo_state = _snapshot_repo_state(cwd) base_repo_status = _snapshot_repo_status(cwd) @@ -844,7 +892,7 @@ def _run_steps( phase_name: str | None = None, worktree_path: Path | None = None, runtime_env: dict[str, str] | None = None, - base_repo_state: str | None = None, + base_repo_state: dict[str, str] | None = None, base_repo_status: str | None = None, ) -> tuple[dict[str, str], dict[str, AgentResult], str | None]: """Execute all steps in one iteration, parallelizing where possible.""" @@ -933,7 +981,7 @@ def _execute_step( quiet: bool = False, worktree_path: Path | None = None, runtime_env: dict[str, str] | None = None, - base_repo_state: str | None = None, + base_repo_state: dict[str, str] | None = None, base_repo_status: str | None = None, ) -> None: """Execute a single step, updating step_outputs and step_results in place.""" @@ -1066,7 +1114,7 @@ def _execute_parallel_batch( phase_name: str | None = None, worktree_path: Path | None = None, runtime_env: dict[str, str] | None = None, - base_repo_state: str | None = None, + base_repo_state: dict[str, str] | None = None, base_repo_status: str | None = None, ) -> None: """Execute multiple steps in parallel using threads.""" diff --git a/tests/test_agentic.py b/tests/test_agentic.py index 75ad121..0534d55 100644 --- a/tests/test_agentic.py +++ b/tests/test_agentic.py @@ -206,21 +206,28 @@ class TestMakeWorktreeDir(unittest.TestCase): class TestBaseRepoIsolation(unittest.TestCase): """Base repo mutations should fail fast during agentic execution.""" - def test_raises_when_base_repo_status_changes(self) -> None: + def test_raises_when_base_repo_state_changes(self) -> None: with tempfile.TemporaryDirectory() as td: base = Path(td) / "repo" worktree = Path(td) / "worktree" base.mkdir() worktree.mkdir() + # Baseline has a diff that won't match a non-git directory + # (which returns {}), triggering the isolation error. + baseline_state = { + "diff": "diff --git a/file.py ...\n", + "untracked": "", + } + with self.assertRaises(RuntimeError) as ctx: _assert_base_repo_isolation( base, - "M cross_eval/agent.py", + baseline_state, step_name="coding", agent_name="claude-coder", worktree_path=worktree, - baseline_status="M cross_eval/agent.py", + baseline_status="M file.py", ) self.assertIn("base repository", str(ctx.exception)) diff --git a/tests/test_config.py b/tests/test_config.py index 35ddb4a..2e5828a 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1030,6 +1030,60 @@ class FixPresetBehaviorTest(unittest.TestCase): self.assertTrue(captured["agentic"]) self.assertEqual(captured["phase_max"], 3) + def test_run_senior_model_override_applies_only_to_seniors(self) -> None: + captured: dict[str, list[str]] = {} + + def _fake_run_pipeline(config, **kwargs): + captured["coder_args"] = list(config.agents[config.coders[0]].args) + captured["reviewer_args"] = list(config.agents[config.reviewers[0]].args) + captured["senior_args"] = list(config.agents[config.seniors[0]].args) + return PipelineResult( + iterations=[], + final_verdict="PASS", + run_dir=Path(".cross-eval/output"), + ) + + with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline): + exit_code = main([ + "run", + "--preset", "review-fix", + "--coder", "claude", + "--reviewer", "claude", + "--senior", "claude", + "--senior-model", "sonnet", + "--dry-run", + ]) + + self.assertEqual(exit_code, 0) + self.assertIn("opus", captured["coder_args"]) + self.assertIn("opus", captured["reviewer_args"]) + self.assertIn("sonnet", captured["senior_args"]) + + +class OutputDirectoryResolutionTest(unittest.TestCase): + def test_load_config_resolves_output_dir_from_project_root(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + ce_dir = root / ".cross-eval" + ce_dir.mkdir() + (ce_dir / "plan.md").write_text("# plan\n", encoding="utf-8") + config_path = ce_dir / "config.yaml" + config_path.write_text( + ( + "inputs:\n" + " plan: plan.md\n" + "coders: [claude-coder]\n" + "reviewers: [claude-reviewer]\n" + "pipeline: preset:simple\n" + "output_dir: .cross-eval/output\n" + ), + encoding="utf-8", + ) + + config = load_config(config_path) + + self.assertEqual(config.output_dir.resolve(), (root / ".cross-eval" / "output").resolve()) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_runtime_misc.py b/tests/test_runtime_misc.py index 110d331..9aea896 100644 --- a/tests/test_runtime_misc.py +++ b/tests/test_runtime_misc.py @@ -390,7 +390,7 @@ class TestPipelineHelpers(unittest.TestCase): snapshot = _snapshot_repo_state(repo) - self.assertIn("UNTRACKED scratch.txt", snapshot) + self.assertIn("UNTRACKED scratch.txt", snapshot["untracked"]) def test_finalize_worktree_deletes_empty_branch(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: