From cc8d583914c9e8780905af6ebc1e1f7805ab248f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EC=9D=B4=EC=B6=A9=EC=98=81=20=EC=97=90=EC=9D=B4=EB=8B=B7?=
 =?UTF-8?q?=EC=84=9C=EB=B9=84=EC=8A=A4=EA=B0=9C=EB=B0=9C?= <lee.cy@sk.com>
Date: Sat, 14 Mar 2026 16:19:57 +0900
Subject: [PATCH] fix: Claude reviewer empty output, worktree isolation false
 positives, and input file access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add -p flag to _CLAUDE_REVIEW_ARGS so reviewer uses print mode (stdin→stdout)
  instead of interactive mode which conflicts with plan permission mode
- Copy input files (plan, checklist) into worktree .cross-eval-inputs/ so
  agents in plan mode can access them without escaping the sandbox
- Simplify _snapshot_repo_state to use only git diff HEAD + untracked hashes,
  eliminating false positives from staging state changes (git diff --cached)
  and git status index drift during long-running pipelines

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 cross_eval/cli.py          |  11 +++-
 cross_eval/config.py       |  15 +++---
 cross_eval/pipeline.py     | 104 +++++++++++++++++++++++++++----------
 tests/test_agentic.py      |  13 +++--
 tests/test_config.py       |  54 +++++++++++++++++++
 tests/test_runtime_misc.py |   2 +-
 6 files changed, 158 insertions(+), 41 deletions(-)

diff --git a/cross_eval/cli.py b/cross_eval/cli.py
index dc6350a..29e3fa7 100644
--- a/cross_eval/cli.py
+++ b/cross_eval/cli.py
@@ -434,6 +434,10 @@ def main(argv: list[str] | None = None) -> int:
         "--reviewer-model", default=None, metavar="MODEL",
         help="Reviewer 에이전트 모델만 변경",
     )
+    agent_group.add_argument(
+        "--senior-model", default=None, metavar="MODEL",
+        help="Senior 에이전트 모델만 변경",
+    )
 
     # -- 파이프라인 --
     pipe_group = run_parser.add_argument_group("파이프라인")
@@ -474,7 +478,7 @@ def main(argv: list[str] | None = None) -> int:
     )
     etc_group.add_argument(
         "--output-dir", type=Path, default=None,
-        help="결과 저장 디렉토리 (기본: output/)",
+        help="결과 저장 디렉토리 (기본: .cross-eval/output/)",
     )
     etc_group.add_argument(
         "--dry-run", action="store_true",
@@ -953,13 +957,16 @@ def cmd_run(args: argparse.Namespace) -> int:
     if args.model is not None:
         for agent_name in config.agents:
             _apply_model_override(config, agent_name, args.model)
-    # --coder-model / --reviewer-model: apply by role
+    # --coder-model / --reviewer-model / --senior-model: apply by role
     if args.coder_model is not None:
         for coder_name in config.coders:
             _apply_model_override(config, coder_name, args.coder_model)
     if args.reviewer_model is not None:
         for reviewer_name in config.reviewers:
             _apply_model_override(config, reviewer_name, args.reviewer_model)
+    if args.senior_model is not None:
+        for senior_name in config.seniors:
+            _apply_model_override(config, senior_name, args.senior_model)
 
     # --plan / --checklist shortcuts
     for key, val in [("plan", args.plan), ("checklist", args.checklist)]:
diff --git a/cross_eval/config.py b/cross_eval/config.py
index fe3fb66..6bd5015 100644
--- a/cross_eval/config.py
+++ b/cross_eval/config.py
@@ -62,12 +62,7 @@ _CLAUDE_CODER_ARGS = list(_CLAUDE_BASE_ARGS) + [
     "bypassPermissions",
 ]
 
-_CLAUDE_REVIEW_ARGS = [
-    "--setting-sources",
-    "user",
-    "--disable-slash-commands",
-    "--model",
-    "opus",
+_CLAUDE_REVIEW_ARGS = list(_CLAUDE_BASE_ARGS) + [
     "--permission-mode",
     "plan",
 ]
@@ -425,6 +420,8 @@ def load_config(path: Path) -> PipelineConfig:
 
 def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
     """Parse raw YAML dict into PipelineConfig."""
+    project_root = config_path.parent.parent if config_path.parent.name == ".cross-eval" else config_path.parent
+
     # --- agents ---
     agents: dict[str, AgentConfig] = {}
     for name, agent_data in raw.get("agents", {}).items():
@@ -494,8 +491,12 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
     if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
         preset_name = pipeline_raw.split(":", 1)[1]
 
+    output_dir = Path(raw.get("output_dir", ".cross-eval/output"))
+    if not output_dir.is_absolute():
+        output_dir = project_root / output_dir
+
     config = PipelineConfig(
-        output_dir=Path(raw.get("output_dir", ".cross-eval/output")),
+        output_dir=output_dir,
         max_iterations=int(raw.get("max_iterations", 3)),
         min_iterations=int(raw.get("min_iterations", 1)),
         verbose=bool(raw.get("verbose", False)),
diff --git a/cross_eval/pipeline.py b/cross_eval/pipeline.py
index 8c3971b..b6e5441 100644
--- a/cross_eval/pipeline.py
+++ b/cross_eval/pipeline.py
@@ -104,45 +104,73 @@ def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, s
     return worktree_path, branch_name
 
 
-def _snapshot_repo_state(cwd: Path) -> str:
+def _copy_inputs_to_worktree(
+    config: PipelineConfig,
+    worktree_path: Path,
+) -> None:
+    """Copy input files (plan, checklist, etc.) into the worktree.
+
+    This ensures agents running in plan/read-only mode within the worktree
+    can access these files, even though the originals live in the base repo.
+    Updates config.inputs in-place so subsequent reference refreshes use
+    worktree-local paths.
+    """
+    import shutil
+    inputs_dir = worktree_path / ".cross-eval-inputs"
+    inputs_dir.mkdir(exist_ok=True)
+    # Exclude from git so these don't pollute agentic diffs
+    (inputs_dir / ".gitignore").write_text("*\n", encoding="utf-8")
+    for key, val in list(config.inputs.items()):
+        if key.endswith("_ref") or not isinstance(val, Path):
+            continue
+        if not val.exists():
+            continue
+        dest = inputs_dir / val.name
+        shutil.copy2(val, dest)
+        config.inputs[key] = dest
+
+
+def _snapshot_repo_state(cwd: Path) -> dict[str, str]:
     """Capture the base repository working-tree state.
 
     This is used to detect agentic runs that accidentally modify the original
     checkout instead of the isolated worktree.
+
+    We intentionally use only two components:
+      - ``diff``: ``git diff HEAD`` — all tracked changes vs HEAD, combining
+        staged and unstaged so that staging-state changes don't cause false
+        positives.
+      - ``untracked``: SHA-256 hashes of untracked files — detects new or
+        modified untracked files appearing in the base repo.
+
+    ``git status --short`` and ``git diff --cached`` are NOT used because
+    external tools (IDEs, git hooks) can change staging state during a
+    long-running pipeline, causing spurious failures.
     """
-    status = subprocess.run(
-        ["git", "status", "--short", "--untracked-files=all"],
+    # Refresh index stat cache to prevent false positives from mtime drift
+    subprocess.run(
+        ["git", "update-index", "--refresh", "-q"],
         cwd=cwd,
         capture_output=True,
-        text=True,
     )
-    if status.returncode != 0:
-        return ""
 
+    # git diff HEAD: all changes (staged + unstaged) vs HEAD
     diff = subprocess.run(
         ["git", "diff", "--no-ext-diff", "--binary", "HEAD"],
         cwd=cwd,
         capture_output=True,
         text=True,
     )
-    cached_diff = subprocess.run(
-        ["git", "diff", "--no-ext-diff", "--binary", "--cached"],
-        cwd=cwd,
-        capture_output=True,
-        text=True,
-    )
+    if diff.returncode != 0:
+        return {}
+
     untracked = subprocess.run(
         ["git", "ls-files", "--others", "--exclude-standard", "-z"],
         cwd=cwd,
         capture_output=True,
     )
 
-    parts = [
-        status.stdout,
-        diff.stdout,
-        cached_diff.stdout,
-    ]
-
+    untracked_parts: list[str] = []
     if untracked.returncode == 0 and untracked.stdout:
         for rel_path in untracked.stdout.decode("utf-8", errors="replace").split("\0"):
             if not rel_path:
@@ -150,11 +178,14 @@ def _snapshot_repo_state(cwd: Path) -> str:
             file_path = cwd / rel_path
             if file_path.is_file():
                 digest = sha256(file_path.read_bytes()).hexdigest()
-                parts.append(f"UNTRACKED {rel_path} {digest}")
+                untracked_parts.append(f"UNTRACKED {rel_path} {digest}")
             else:
-                parts.append(f"UNTRACKED {rel_path} (non-file)")
+                untracked_parts.append(f"UNTRACKED {rel_path} (non-file)")
 
-    return "\n".join(parts)
+    return {
+        "diff": diff.stdout,
+        "untracked": "\n".join(untracked_parts),
+    }
 
 
 def _snapshot_repo_status(cwd: Path) -> str:
@@ -172,7 +203,7 @@ def _snapshot_repo_status(cwd: Path) -> str:
 
 def _assert_base_repo_isolation(
     cwd: Path,
-    baseline_state: str,
+    baseline_state: dict[str, str],
     *,
     step_name: str,
     agent_name: str,
@@ -184,6 +215,18 @@ def _assert_base_repo_isolation(
     if current_state == baseline_state:
         return
 
+    # Identify which component(s) actually changed
+    changed: list[str] = []
+    for key in ("diff", "untracked"):
+        if baseline_state.get(key, "") != current_state.get(key, ""):
+            changed.append(key)
+
+    if not changed:
+        # State dicts differ only in keys we no longer track — benign.
+        return
+
+    # untracked-only change: new files appeared — real leak
+    # diff-only change: tracked file content changed — real leak
     current_status = _snapshot_repo_status(cwd)
     before = baseline_status or "(clean)"
     after = current_status or "(clean)"
@@ -191,7 +234,8 @@ def _assert_base_repo_isolation(
         "Agent modified the base repository instead of the isolated worktree.\n\n"
         f"Step: {step_name}\n"
         f"Agent: {agent_name}\n"
-        f"Worktree: {worktree_path}\n\n"
+        f"Worktree: {worktree_path}\n"
+        f"Changed components: {', '.join(changed)}\n\n"
         f"Baseline status:\n{before}\n\n"
         f"Current status:\n{after}"
     )
@@ -268,12 +312,14 @@ def _run_simple_pipeline(
     # Setup shared worktree for agentic mode
     worktree_path: Path | None = None
     agentic_branch_name: str | None = None
-    base_repo_state: str | None = None
+    base_repo_state: dict[str, str] | None = None
     base_repo_status: str | None = None
     if not dry_run and _has_agentic_steps(config, config.pipeline):
         worktree_path, agentic_branch_name = _setup_worktree(
             cwd, run_dir, config.preset_name,
         )
+        _copy_inputs_to_worktree(config, worktree_path)
+        _refresh_input_references(config, input_contents)
         base_repo_state = _snapshot_repo_state(cwd)
         base_repo_status = _snapshot_repo_status(cwd)
 
@@ -443,12 +489,14 @@ def _run_phased_pipeline(
     all_phase_steps = [s for p in config.phases for s in p.steps]
     worktree_path: Path | None = None
     agentic_branch_name: str | None = None
-    base_repo_state: str | None = None
+    base_repo_state: dict[str, str] | None = None
     base_repo_status: str | None = None
     if not dry_run and _has_agentic_steps(config, all_phase_steps):
         worktree_path, agentic_branch_name = _setup_worktree(
             cwd, run_dir, config.preset_name,
         )
+        _copy_inputs_to_worktree(config, worktree_path)
+        _refresh_input_references(config, input_contents)
         base_repo_state = _snapshot_repo_state(cwd)
         base_repo_status = _snapshot_repo_status(cwd)
 
@@ -844,7 +892,7 @@ def _run_steps(
     phase_name: str | None = None,
     worktree_path: Path | None = None,
     runtime_env: dict[str, str] | None = None,
-    base_repo_state: str | None = None,
+    base_repo_state: dict[str, str] | None = None,
     base_repo_status: str | None = None,
 ) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
     """Execute all steps in one iteration, parallelizing where possible."""
@@ -933,7 +981,7 @@ def _execute_step(
     quiet: bool = False,
     worktree_path: Path | None = None,
     runtime_env: dict[str, str] | None = None,
-    base_repo_state: str | None = None,
+    base_repo_state: dict[str, str] | None = None,
     base_repo_status: str | None = None,
 ) -> None:
     """Execute a single step, updating step_outputs and step_results in place."""
@@ -1066,7 +1114,7 @@ def _execute_parallel_batch(
     phase_name: str | None = None,
     worktree_path: Path | None = None,
     runtime_env: dict[str, str] | None = None,
-    base_repo_state: str | None = None,
+    base_repo_state: dict[str, str] | None = None,
     base_repo_status: str | None = None,
 ) -> None:
     """Execute multiple steps in parallel using threads."""
diff --git a/tests/test_agentic.py b/tests/test_agentic.py
index 75ad121..0534d55 100644
--- a/tests/test_agentic.py
+++ b/tests/test_agentic.py
@@ -206,21 +206,28 @@ class TestMakeWorktreeDir(unittest.TestCase):
 class TestBaseRepoIsolation(unittest.TestCase):
     """Base repo mutations should fail fast during agentic execution."""
 
-    def test_raises_when_base_repo_status_changes(self) -> None:
+    def test_raises_when_base_repo_state_changes(self) -> None:
         with tempfile.TemporaryDirectory() as td:
             base = Path(td) / "repo"
             worktree = Path(td) / "worktree"
             base.mkdir()
             worktree.mkdir()
 
+            # Baseline has a diff that won't match a non-git directory
+            # (which returns {}), triggering the isolation error.
+            baseline_state = {
+                "diff": "diff --git a/file.py ...\n",
+                "untracked": "",
+            }
+
             with self.assertRaises(RuntimeError) as ctx:
                 _assert_base_repo_isolation(
                     base,
-                    "M cross_eval/agent.py",
+                    baseline_state,
                     step_name="coding",
                     agent_name="claude-coder",
                     worktree_path=worktree,
-                    baseline_status="M cross_eval/agent.py",
+                    baseline_status="M file.py",
                 )
 
         self.assertIn("base repository", str(ctx.exception))
diff --git a/tests/test_config.py b/tests/test_config.py
index 35ddb4a..2e5828a 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1030,6 +1030,60 @@ class FixPresetBehaviorTest(unittest.TestCase):
         self.assertTrue(captured["agentic"])
         self.assertEqual(captured["phase_max"], 3)
 
+    def test_run_senior_model_override_applies_only_to_seniors(self) -> None:
+        captured: dict[str, list[str]] = {}
+
+        def _fake_run_pipeline(config, **kwargs):
+            captured["coder_args"] = list(config.agents[config.coders[0]].args)
+            captured["reviewer_args"] = list(config.agents[config.reviewers[0]].args)
+            captured["senior_args"] = list(config.agents[config.seniors[0]].args)
+            return PipelineResult(
+                iterations=[],
+                final_verdict="PASS",
+                run_dir=Path(".cross-eval/output"),
+            )
+
+        with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline):
+            exit_code = main([
+                "run",
+                "--preset", "review-fix",
+                "--coder", "claude",
+                "--reviewer", "claude",
+                "--senior", "claude",
+                "--senior-model", "sonnet",
+                "--dry-run",
+            ])
+
+        self.assertEqual(exit_code, 0)
+        self.assertIn("opus", captured["coder_args"])
+        self.assertIn("opus", captured["reviewer_args"])
+        self.assertIn("sonnet", captured["senior_args"])
+
+
+class OutputDirectoryResolutionTest(unittest.TestCase):
+    def test_load_config_resolves_output_dir_from_project_root(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            root = Path(tmpdir)
+            ce_dir = root / ".cross-eval"
+            ce_dir.mkdir()
+            (ce_dir / "plan.md").write_text("# plan\n", encoding="utf-8")
+            config_path = ce_dir / "config.yaml"
+            config_path.write_text(
+                (
+                    "inputs:\n"
+                    "  plan: plan.md\n"
+                    "coders: [claude-coder]\n"
+                    "reviewers: [claude-reviewer]\n"
+                    "pipeline: preset:simple\n"
+                    "output_dir: .cross-eval/output\n"
+                ),
+                encoding="utf-8",
+            )
+
+            config = load_config(config_path)
+
+        self.assertEqual(config.output_dir.resolve(), (root / ".cross-eval" / "output").resolve())
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_runtime_misc.py b/tests/test_runtime_misc.py
index 110d331..9aea896 100644
--- a/tests/test_runtime_misc.py
+++ b/tests/test_runtime_misc.py
@@ -390,7 +390,7 @@ class TestPipelineHelpers(unittest.TestCase):
 
             snapshot = _snapshot_repo_state(repo)
 
-            self.assertIn("UNTRACKED scratch.txt", snapshot)
+            self.assertIn("UNTRACKED scratch.txt", snapshot["untracked"])
 
     def test_finalize_worktree_deletes_empty_branch(self) -> None:
         with tempfile.TemporaryDirectory() as tmpdir: