feat: tighten agentic runtime handoffs and quality gates

2026-03-14 10:05:25 +09:00
parent 87bc0ffbfb
commit 7b95233edf
15 changed files with 1148 additions and 167 deletions
--- a/cross_eval/agent.py
+++ b/cross_eval/agent.py
@@ -415,11 +415,7 @@ def invoke_agent_agentic(
    timeout: int | None = None,
    quiet: bool = False,
 ) -> AgentResult:
-    """Invoke an agent in agentic mode (no -p, runs in worktree, captures git diff).
+    """Invoke an agent in agentic mode using the worktree as the source of truth."""
    The agent runs without print mode so it can modify files directly.
    After the agent exits, git diff (since last commit) is captured as the output.
    """
    from cross_eval.worktree import capture_diff
    # Write prompt to a temp file (outside worktree, won't appear in diffs)
@@ -433,10 +429,10 @@ def invoke_agent_agentic(
    if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
        cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
-    # Strip stdin sentinel ("-") from args for agentic mode.
+    # Strip print-mode flags and stdin sentinels for agentic mode.
-    # Keep -p/--print: Claude -p mode still has full tool access (Edit, Write,
+    # Agentic runs should operate on the worktree and return a real git diff,
-    # Bash, etc.) and is the correct mode for non-interactive subprocess use.
+    # not behave as a one-shot text completer.
-    args = [a for a in agent.args if a != "-"]
+    args = [a for a in agent.args if a not in {"-", "-p", "--print"}]
    cmd.extend(args)
    # System prompt via flag if supported
@@ -454,8 +450,8 @@ def invoke_agent_agentic(
        else:
            input_data = prompt
    else:
-        # claude -p: deliver prompt via stdin (same as codex).
+        # claude: deliver the task through stdin and let the worktree be the
-        # -p mode is non-interactive and reads from stdin, then exits.
+        # canonical place where files are read/written.
        input_data = prompt
    cmd_preview = " ".join(cmd[:6])
--- a/cross_eval/cli.py
+++ b/cross_eval/cli.py
@@ -266,7 +266,7 @@ def main(argv: list[str] | None = None) -> int:
        type=int,
        default=None,
        metavar="SEC",
-        help="에이전트 호출 제한 시간 (--live 전용)",
+        help="에이전트 1회 호출 제한 시간(초). 0=무제한 (기본: 무제한, --live 전용)",
    )
    # --- run ---
@@ -981,6 +981,7 @@ def cmd_run(args: argparse.Namespace) -> int:
            print(f"No files found in: {docs_dir}", file=sys.stderr)
            return 1
        config.inputs["docs"] = docs_content
        config.inputs["docs_ref"] = str(docs_dir)
    if args.env_files:
        for env_file in args.env_files:
@@ -1007,7 +1008,6 @@ def cmd_run(args: argparse.Namespace) -> int:
        apply_input_overrides(config, overrides)
    # 3. Validate after all overrides
    from cross_eval.config import validate_config
    errors = validate_config(config)
    if errors:
        print("Config error:\n  " + "\n  ".join(errors), file=sys.stderr)
--- a/cross_eval/config.py
+++ b/cross_eval/config.py
@@ -698,9 +698,9 @@ def _validate_unique_step_fields(
 def _make_agentic(agent: AgentConfig) -> None:
-    """Convert an agent to agentic mode in-place (remove -p, set agentic=True)."""
+    """Convert an agent to agentic mode in-place."""
    agent.agentic = True
-    agent.args = [a for a in agent.args if a != "-p"]
+    agent.args = [a for a in agent.args if a not in {"-p", "--print"}]
 def sync_phased_iterations(
--- a/cross_eval/demo.py
+++ b/cross_eval/demo.py
@@ -217,7 +217,7 @@ def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None:
    if show_escalate:
        print(f"\n{RED}{BOLD}{'=' * 50}")
-        print(f"  Escalation Report")
+        print("  Escalation Report")
        print(f"{'=' * 50}{RESET}")
        print(f"{YELLOW}Human review required.{RESET}")
        print(f"  {RED}•{RESET} Requirements are ambiguous — needs stakeholder clarification")
--- a/cross_eval/doctor.py
+++ b/cross_eval/doctor.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 import shutil
 import subprocess
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
--- a/cross_eval/pipeline.py
+++ b/cross_eval/pipeline.py
@@ -343,6 +343,8 @@ def _run_simple_pipeline(
            if step_results:
                input_contents["execution_evidence"] = _format_execution_evidence(
                    step_results,
                    run_dir=run_dir,
                    iteration=i,
                )
            iterations.append(iter_result)
@@ -543,6 +545,8 @@ def _run_phased_pipeline(
                if step_results:
                    input_contents["execution_evidence"] = _format_execution_evidence(
                        step_results,
                        run_dir=run_dir,
                        iteration=global_iter,
                    )
                iterations.append(iter_result)
@@ -661,10 +665,13 @@ def _load_inputs(config: PipelineConfig) -> dict[str, str]:
    """Load input file contents from config."""
    input_contents: dict[str, str] = {}
    for key, val in config.inputs.items():
-        if isinstance(val, str):
+        if key.endswith("_ref"):
            input_contents[key] = str(val)
        elif isinstance(val, str):
            input_contents[key] = val
        else:
            input_contents[key] = val.read_text(encoding="utf-8")
    _refresh_input_references(config, input_contents)
    return input_contents
@@ -673,10 +680,99 @@ def _refresh_inputs(
 ) -> None:
    """Re-read input files (they may have changed on disk)."""
    for key, val in config.inputs.items():
-        if isinstance(val, str):
+        if key.endswith("_ref"):
            input_contents[key] = str(val)
        elif isinstance(val, str):
            input_contents[key] = val
        elif isinstance(val, Path) and val.exists():
            input_contents[key] = val.read_text(encoding="utf-8")
    _refresh_input_references(config, input_contents)
 def _refresh_input_references(
    config: PipelineConfig,
    input_contents: dict[str, str],
 ) -> None:
    """Expose stable file references for canonical planning inputs."""
    for key, val in config.inputs.items():
        if key.endswith("_ref"):
            input_contents[key] = str(val)
            continue
        ref_key = f"{key}_ref"
        if isinstance(val, Path):
            input_contents[ref_key] = str(val.resolve())
        else:
            input_contents.setdefault(ref_key, f"(inline {key}; no file path available)")
 def _git_ref(cwd: Path, *args: str) -> str:
    """Best-effort git metadata lookup."""
    result = subprocess.run(
        ["git", *args],
        cwd=cwd,
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        return "(unknown)"
    return result.stdout.strip() or "(unknown)"
 def _collect_markdown_refs(run_dir: Path, iteration: int) -> list[Path]:
    """Collect prior markdown artifacts available to the current step."""
    refs: list[Path] = []
    for idx in range(1, iteration + 1):
        iter_dir = run_dir / f"v{idx}"
        if not iter_dir.exists():
            continue
        refs.extend(sorted(iter_dir.glob("*.md")))
    return refs
 def _build_artifact_references(
    context: dict[str, str],
    *,
    cwd: Path,
    run_dir: Path,
    iteration: int,
    worktree_path: Path | None,
    step_results: dict[str, AgentResult] | None = None,
 ) -> str:
    """Render the reference-only handoff text for agentic steps.

    The result is a newline-joined markdown fragment containing:

    * the ``plan_ref`` / ``checklist_ref`` / ``docs_ref`` values from
      ``context`` (with placeholders when absent), the run directory, the
      current iteration directory, and the target repository with its branch
      and HEAD commit as reported by ``_git_ref``;
    * a "Markdown Artifacts" section when ``_collect_markdown_refs`` finds
      any files;
    * a "Current Step Artifacts" section when ``step_results`` is non-empty,
      listing each step's output path and, if the result has a transcript,
      its transcript path.
    """
    # Git metadata is read from the worktree when one is given, else from cwd.
    target_repo = worktree_path or cwd
    current_branch = _git_ref(target_repo, "rev-parse", "--abbrev-ref", "HEAD")
    head_commit = _git_ref(target_repo, "rev-parse", "HEAD")
    iteration_dir = run_dir / f"v{iteration}"

    plan_ref = context.get("plan_ref", "(missing)")
    checklist_ref = context.get("checklist_ref", "(missing)")
    docs_ref = context.get("docs_ref", "(none)")

    sections = [
        "### Canonical References",
        f"- Plan: {plan_ref}",
        f"- Checklist: {checklist_ref}",
        f"- Docs: {docs_ref}",
        f"- Run directory: {run_dir}",
        f"- Current iteration directory: {iteration_dir}",
        f"- Target repository: {target_repo}",
        f"- Git branch: {current_branch}",
        f"- Git commit: {head_commit}",
        "",
        "Use git/cat to inspect the referenced files directly instead of relying on inline summaries.",
        f"Suggested git commands: `git -C {target_repo} show {head_commit}` and `git -C {target_repo} diff HEAD`",
    ]

    prior_markdown = _collect_markdown_refs(run_dir, iteration)
    if prior_markdown:
        sections += ["", "### Markdown Artifacts"]
        sections += [f"- {artifact}" for artifact in prior_markdown]

    if step_results:
        sections += ["", "### Current Step Artifacts"]
        for step_result in step_results.values():
            step = step_result.step_name
            sections.append(f"- Output: {iteration_dir / f'{step}.md'}")
            if step_result.transcript:
                sections.append(
                    f"- Transcript: {iteration_dir / f'{step}_transcript.md'}"
                )

    return "\n".join(sections)
 # ---------------------------------------------------------------------------
@@ -850,6 +946,9 @@ def _execute_step(
    # 2. Build context (include prior step results for evidence)
    context = _build_context(
        input_contents, step_outputs, feedback, iteration, max_iterations,
        cwd=cwd,
        run_dir=run_dir,
        worktree_path=worktree_path,
        step_results=step_results,
    )
@@ -1031,6 +1130,9 @@ def _execute_parallel_batch(
        template = resolve_template(step.prompt_template)
        context = _build_context(
            context_snapshot, {}, feedback, iteration, max_iterations,
            cwd=cwd,
            run_dir=run_dir,
            worktree_path=worktree_path,
            step_results=results_snapshot,
        )
        if step.context_override:
@@ -1145,6 +1247,10 @@ def _build_context(
    feedback: str,
    iteration: int,
    max_iterations: int,
    *,
    cwd: Path | None = None,
    run_dir: Path | None = None,
    worktree_path: Path | None = None,
    step_results: dict[str, AgentResult] | None = None,
 ) -> dict[str, str]:
    """Build the template context dict.
@@ -1160,11 +1266,25 @@ def _build_context(
    context["feedback"] = feedback
    context["iteration"] = str(iteration)
    context["max_iterations"] = str(max_iterations)
    ref_cwd = cwd or Path.cwd()
    ref_run_dir = run_dir or ref_cwd / ".cross-eval" / "output" / "ad-hoc"
    context["artifact_references"] = _build_artifact_references(
        context,
        cwd=ref_cwd,
        run_dir=ref_run_dir,
        iteration=iteration,
        worktree_path=worktree_path,
        step_results=step_results,
    )
    # Surface execution evidence from prior steps so reviewers can inspect it.
    # Prior-iteration evidence may already live in context via input_contents.
    prior_evidence = context.get("execution_evidence", "")
    if step_results:
-        current_evidence = _format_execution_evidence(step_results)
+        current_evidence = _format_execution_evidence(
            step_results,
            run_dir=ref_run_dir,
            iteration=iteration,
        )
        if prior_evidence and prior_evidence != "(no prior execution evidence)":
            context["execution_evidence"] = (
                "# Prior Iteration Evidence\n"
@@ -1179,12 +1299,14 @@ def _build_context(
 def _format_execution_evidence(
    step_results: dict[str, AgentResult],
    *,
    run_dir: Path | None = None,
    iteration: int | None = None,
 ) -> str:
    """Format execution evidence from prior steps for reviewer consumption.
-    Produces a compact summary of command, exit code, duration, and a truncated
+    Produces a compact summary of command, exit code, duration, and artifact
-    transcript excerpt for each completed step so that reviewers and seniors
+    paths so that later agents can read markdown/git state directly.
    can verify claims against real execution data.
    """
    if not step_results:
        return "(no prior execution evidence)"
@@ -1198,12 +1320,12 @@ def _format_execution_evidence(
            f"- Output size: {len(result.output)} chars",
        ]
        section = [line for line in section if line]
        if run_dir is not None and iteration is not None:
            section.append(f"- Output artifact: {run_dir / f'v{iteration}' / f'{result.step_name}.md'}")
            if result.transcript:
-            # Include a truncated transcript excerpt for debugging
+                section.append(
-            excerpt = result.transcript[:2000]
+                    f"- Transcript artifact: {run_dir / f'v{iteration}' / f'{result.step_name}_transcript.md'}"
-            if len(result.transcript) > 2000:
+                )
                excerpt += "\n... (truncated)"
            section.append(f"\n<details>\n<summary>Transcript excerpt</summary>\n\n{excerpt}\n</details>")
        parts.append("\n".join(section))
    return "\n\n---\n\n".join(parts)
@@ -1455,7 +1577,7 @@ def _format_runtime_error_markdown(
                f"- **Suggested Action**: {exc.suggested_action}",
                "",
                "## Command",
-                f"```",
+                "```",
                exc.cmd_preview,
                "```",
                "",
--- a/cross_eval/prompts.py
+++ b/cross_eval/prompts.py
@@ -15,58 +15,39 @@ from cross_eval.models import PhaseConfig, StepConfig
 CODING_TEMPLATE = """\
 You are tasked with implementing code based on a plan and checklist.
-## Plan
+## Artifact References
-{plan}
+{artifact_references}
 ## Checklist
 {checklist}
 ## Reference Documents
 {docs}
 ## Previous Review Feedback
 {feedback}
 ## Iteration
 This is iteration {iteration} of {max_iterations}.
 ## Instructions
-1. Explore the project directory to understand the existing codebase structure.
+1. Read the referenced plan/checklist/docs/review artifacts directly from disk.
-2. Implement ONLY what the plan specifies. Do NOT add extra features, \
+2. Explore the project directory and git state to understand the current codebase structure.
 3. Implement ONLY what the plan specifies. Do NOT add extra features, \
 unnecessary abstractions, or premature optimizations.
-3. Follow every item in the checklist.
+4. Follow every item in the checklist.
-4. If there is previous feedback, address ONLY the specific issues mentioned.
+5. If there is previous feedback in the referenced markdown artifacts, address ONLY those issues.
-5. If previous feedback contains items marked as DISMISSED or false positive, \
+6. If previous feedback contains items marked as DISMISSED or false positive, \
 IGNORE those items — they have been verified as correct.
-6. Output the complete implementation.
+7. Prefer git and markdown artifacts as the source of truth. Use commit hashes, `git show`, `git diff`, and referenced markdown files instead of relying on inline summaries.
 8. Output the complete implementation.
 """
 REVIEW_TEMPLATE = """\
 You are tasked with reviewing code against a plan and checklist.
-## Plan
+## Artifact References
-{plan}
+{artifact_references}
 ## Checklist
 {checklist}
 ## Reference Documents
 {docs}
 ## Coding Output / Previous Step Output
 {coding_output}
 ## Previous Review Feedback
 {feedback}
 ## Execution Evidence
 {execution_evidence}
 ## Review Instructions
-Explore the project directory to understand the full codebase context, \
+Read the referenced plan/checklist/docs/review artifacts directly from disk. \
-then evaluate the code against ONLY the plan and checklist above. \
+Inspect the referenced commit/git state and markdown artifacts, then evaluate \
-Use the execution evidence above to verify agent claims against actual \
+the code against ONLY the plan and checklist. Use the execution evidence above \
-command outputs and exit codes.
+to verify agent claims against actual command outputs, artifact paths, and exit codes.
 For each issue found, classify it with BOTH severity AND category:
@@ -127,55 +108,36 @@ Otherwise output: VERDICT: FAIL
 CODING_TEMPLATE_KO = """\
 당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다.
-## 기획서
+## 참조 아티팩트
-{plan}
+{artifact_references}
 ## 체크리스트
 {checklist}
 ## 참고 문서
 {docs}
 ## 이전 리뷰 피드백
 {feedback}
 ## 반복 정보
 현재 {max_iterations}회 중 {iteration}번째 반복입니다.
 ## 지침
-1. 프로젝트 디렉토리를 탐색하여 기존 코드베이스 구조를 파악하세요.
+1. 참조된 plan/checklist/docs/review markdown를 직접 읽으세요.
-2. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
+2. 프로젝트 디렉토리와 git 상태를 탐색하여 현재 코드베이스 구조를 파악하세요.
-3. 체크리스트의 모든 항목을 충족하세요.
+3. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
-4. 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
+4. 체크리스트의 모든 항목을 충족하세요.
-5. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
+5. 참조된 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
-6. 완전한 구현을 출력하세요.
+6. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
 7. inline 요약보다 git commit hash, `git show`, `git diff`, markdown 아티팩트를 우선 사용하세요.
 8. 완전한 구현을 출력하세요.
 """
 REVIEW_TEMPLATE_KO = """\
 당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다.
-## 기획서
+## 참조 아티팩트
-{plan}
+{artifact_references}
 ## 체크리스트
 {checklist}
 ## 참고 문서
 {docs}
 ## 검토 대상 코드
 {coding_output}
 ## 이전 리뷰 피드백
 {feedback}
 ## 실행 증거
 {execution_evidence}
 ## 검토 지침
-프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스 맥락을 파악한 뒤, \
+참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
-위 기획서와 체크리스트 기준으로만 코드를 평가하세요. \
+그 내용을 기준으로만 코드를 평가하세요. \
-위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요.
+위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
 발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
@@ -307,25 +269,16 @@ Otherwise output: VERDICT: FAIL
 REVIEW_ONLY_TEMPLATE_KO = """\
 당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다.
-## 기획서
+## 참조 아티팩트
-{plan}
+{artifact_references}
 ## 체크리스트
 {checklist}
 ## 참고 문서
 {docs}
 ## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
 {feedback}
 ## 실행 증거
 {execution_evidence}
 ## 검토 지침
-프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \
+참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
-위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요. \
+그 내용을 기준으로 **기존 코드**를 평가하세요. \
-위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요.
+위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
 코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
@@ -522,23 +475,8 @@ PLAN_REVIEW_TEMPLATE_KO = """\
 AGGREGATE_REVIEW_TEMPLATE = """\
 You are adjudicating multiple review results and turning them into an actionable decision.
-## Plan
+## Artifact References
-{plan}
+{artifact_references}
 ## Checklist
 {checklist}
 ## Reference Documents
 {docs}
 ## Candidate Outputs
 {candidate_outputs}
 ## Reviewer Findings
 {reviews_bundle}
 ## Previous Verification Feedback
 {feedback}
 ## Previous Issue Tracker
 {previous_senior_tracker}
@@ -547,9 +485,10 @@ You are adjudicating multiple review results and turning them into an actionable
 {execution_evidence}
 ## Instructions
-Explore the project directory to confirm the current codebase state. \
+Read the referenced plan/checklist/docs/review artifacts directly from disk. \
-Use the execution evidence above to verify claims against actual command \
+Explore the project directory and the referenced git commit/diff to confirm the \
-outputs and exit codes. Then:
+current codebase state. Use the execution evidence above to verify claims against \
 actual command outputs, artifact paths, and exit codes. Then:
 1. Deduplicate overlapping issues across reviewers.
 2. Resolve disagreements explicitly.
 3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
@@ -594,23 +533,8 @@ VERDICT: PASS or VERDICT: FAIL or VERDICT: ESCALATE
 AGGREGATE_REVIEW_TEMPLATE_KO = """\
 당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다.
-## 기획서
+## 참조 아티팩트
-{plan}
+{artifact_references}
 ## 체크리스트
 {checklist}
 ## 참고 문서
 {docs}
 ## 후보 결과물
 {candidate_outputs}
 ## 개별 리뷰 결과
 {reviews_bundle}
 ## 이전 검증 피드백
 {feedback}
 ## 이전 이슈 트래커
 {previous_senior_tracker}
@@ -619,8 +543,8 @@ AGGREGATE_REVIEW_TEMPLATE_KO = """\
 {execution_evidence}
 ## 지침
-프로젝트 디렉토리를 탐색하여 현재 코드베이스 상태를 확인한 뒤, \
+참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽어 현재 코드베이스 상태를 확인한 뒤, \
-위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요. \
+위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요. \
 그런 다음 아래를 수행하세요.
 1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
 2. 의견 충돌은 명시적으로 정리하세요.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,8 +11,58 @@ dependencies = [
    "pyyaml>=6.0",
 ]
 [project.optional-dependencies]
 dev = [
    "coverage[toml]>=7.6",
    "pyright>=1.1.390",
    "pytest-cov>=6.0",
    "ruff>=0.8.0",
 ]
 [project.scripts]
 cross-eval = "cross_eval.cli:main"
 [tool.setuptools.packages.find]
 include = ["cross_eval*"]
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 addopts = "-q"
 [tool.ruff]
 target-version = "py39"
 extend-exclude = [".cross-eval"]
 [tool.ruff.lint]
 select = ["F"]
 [tool.pyright]
 include = ["cross_eval", "tests"]
 exclude = [".cross-eval"]
 typeCheckingMode = "basic"
 pythonVersion = "3.9"
 reportMissingImports = true
 reportMissingTypeStubs = false
 [tool.coverage.run]
 branch = true
 source = ["cross_eval"]
 omit = [
    "cross_eval/config.py",
    "cross_eval/discovery.py",
    "cross_eval/cli.py",
    "cross_eval/demo.py",
    "cross_eval/doctor.py",
    "cross_eval/prompts.py",
    "cross_eval/report.py",
 ]
 [tool.coverage.report]
 skip_empty = true
 show_missing = true
 fail_under = 90
 exclude_lines = [
    "pragma: no cover",
    "if TYPE_CHECKING:",
    "raise NotImplementedError",
 ]
--- a/tests/test_agentic.py
+++ b/tests/test_agentic.py
@@ -12,10 +12,10 @@ import subprocess
 import tempfile
 import unittest
 from pathlib import Path
-from unittest.mock import MagicMock, call, patch
+from unittest.mock import MagicMock, patch
 from cross_eval.agent import AgentInvocationError, invoke_agent_agentic
-from cross_eval.config import BUILTIN_AGENTS, _make_agentic
+from cross_eval.config import _make_agentic
 from cross_eval.models import (
    AgentConfig,
    AgentResult,
@@ -24,8 +24,6 @@ from cross_eval.models import (
 )
 from cross_eval.pipeline import (
    _assert_base_repo_isolation,
    _commit_iteration,
    _finalize_worktree,
    _has_agentic_steps,
    _setup_worktree,
    run_pipeline,
@@ -267,6 +265,7 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase):
                break
        self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'claude'")
        assert agent_call is not None
        cmd = agent_call[0][0]
        # No -p flag
@@ -274,6 +273,7 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase):
        # Prompt is delivered via stdin (input kwarg), not as a positional arg
        input_data = agent_call[1].get("input")
        self.assertIsNotNone(input_data)
        assert input_data is not None
        self.assertIn("implement feature X", input_data)
@@ -311,6 +311,7 @@ class TestInvokeAgentAgenticCodex(unittest.TestCase):
                break
        self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'codex'")
        assert agent_call is not None
        cmd = agent_call[0][0]
        # Should have "-" sentinel at the end for stdin
@@ -318,6 +319,7 @@ class TestInvokeAgentAgenticCodex(unittest.TestCase):
        # Stdin input should contain the prompt
        input_data = agent_call[1].get("input")
        self.assertIsNotNone(input_data)
        assert input_data is not None
        self.assertIn("implement feature Y", input_data)
@@ -435,6 +437,16 @@ class TestMakeAgenticClaude(unittest.TestCase):
        self.assertNotIn("-p", agent.args)
        self.assertIn("--setting-sources", agent.args)
    def test_strips_dash_dash_print_alias(self) -> None:
        """``--print`` is stripped like ``-p``; unrelated args are kept."""
        agent = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--print", "--setting-sources", "user"],
        )
        _make_agentic(agent)
        self.assertTrue(agent.agentic)
        self.assertNotIn("--print", agent.args)
        # Only the print-mode flag may be dropped; everything else must
        # survive in its original order.
        self.assertEqual(agent.args, ["--setting-sources", "user"])
    def test_idempotent_when_no_dash_p(self) -> None:
        agent = AgentConfig(
            name="claude-coder",
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -26,7 +26,6 @@ from cross_eval.models import (
    PhaseConfig,
    PipelineConfig,
    PipelineResult,
    ReviewMetrics,
    StepConfig,
 )
 from cross_eval.pipeline import (
@@ -54,7 +53,7 @@ from cross_eval.prompts import (
    _build_review_only_preset,
    _build_simple_preset,
 )
-from cross_eval.report import build_report, parse_review_metrics, print_escalation_report
+from cross_eval.report import build_report, parse_review_metrics
 class BuiltinAgentConfigTest(unittest.TestCase):
    def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None:
--- a/tests/test_evidence.py
+++ b/tests/test_evidence.py
@@ -26,10 +26,9 @@ from cross_eval.models import (
    IterationResult,
    PipelineConfig,
    PipelineResult,
    ReviewMetrics,
    StepConfig,
 )
-from cross_eval.pipeline import _format_execution_evidence, run_pipeline
+from cross_eval.pipeline import _build_artifact_references, _format_execution_evidence, run_pipeline
 from cross_eval.report import build_report
@@ -59,7 +58,7 @@ class TestFormatExecutionEvidence(unittest.TestCase):
        self.assertIn("Exit code: 0", evidence)
        self.assertIn("12.3s", evidence)
        self.assertIn("claude --setting-sources user", evidence)
-        self.assertIn("Transcript excerpt", evidence)
+        self.assertNotIn("Transcript excerpt", evidence)
    def test_multiple_results_separated(self) -> None:
        r1 = AgentResult(
@@ -88,10 +87,60 @@ class TestFormatExecutionEvidence(unittest.TestCase):
            transcript=long_transcript,
        )
        evidence = _format_execution_evidence({"key": result})
        self.assertIn("truncated", evidence)
        # The full 3000-char transcript should NOT appear
        self.assertNotIn("x" * 3000, evidence)
    def test_artifact_paths_included_when_run_dir_provided(self) -> None:
        """Output and transcript artifact paths are listed for the iteration."""
        with tempfile.TemporaryDirectory() as tmpdir:
            result = AgentResult(
                output="diff",
                exit_code=0,
                agent_name="coder",
                step_name="coding",
                duration_seconds=1.2,
                transcript="stdout",
                command_preview="claude ...",
            )
            evidence = _format_execution_evidence(
                {"coding_output": result},
                run_dir=Path(tmpdir),
                iteration=2,
            )
            # Build the expected paths with pathlib rather than hard-coding
            # "/" so the assertions also hold where the separator differs.
            iteration_dir = Path(tmpdir) / "v2"
            self.assertIn(str(iteration_dir / "coding.md"), evidence)
            self.assertIn(str(iteration_dir / "coding_transcript.md"), evidence)
 class TestArtifactReferences(unittest.TestCase):
    """Artifact references should prefer file paths and git state over inline text."""

    def test_contains_input_refs_and_git_context(self) -> None:
        """The rendered block names the real input paths, repo and HEAD commit."""
        import subprocess

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir) / "repo"
            repo.mkdir()
            (repo / "plan.md").write_text("plan", encoding="utf-8")
            (repo / "checklist.md").write_text("checklist", encoding="utf-8")
            subprocess.run(["git", "init"], cwd=repo, capture_output=True, check=True)
            subprocess.run(["git", "config", "user.email", "test@test.com"], cwd=repo, capture_output=True, check=True)
            subprocess.run(["git", "config", "user.name", "Test"], cwd=repo, capture_output=True, check=True)
            subprocess.run(["git", "add", "."], cwd=repo, capture_output=True, check=True)
            subprocess.run(["git", "commit", "-m", "init"], cwd=repo, capture_output=True, check=True)
            head = subprocess.run(
                ["git", "rev-parse", "HEAD"],
                cwd=repo,
                capture_output=True,
                text=True,
                check=True,
            ).stdout.strip()

            plan_ref = str((repo / "plan.md").resolve())
            checklist_ref = str((repo / "checklist.md").resolve())
            refs = _build_artifact_references(
                {
                    "plan_ref": plan_ref,
                    "checklist_ref": checklist_ref,
                    "docs_ref": "(none)",
                },
                cwd=repo,
                run_dir=repo / ".cross-eval" / "output" / "run",
                iteration=1,
                worktree_path=None,
            )

            # Assert on the actual values, not just the static labels, so the
            # test fails if the references or git lookup are wrong.
            self.assertIn(f"- Plan: {plan_ref}", refs)
            self.assertIn(f"- Checklist: {checklist_ref}", refs)
            self.assertIn(f"- Target repository: {repo}", refs)
            self.assertIn(f"- Git commit: {head}", refs)
            self.assertIn("Suggested git commands", refs)
 # ---------------------------------------------------------------------------
 # 2. Evidence in reviewer prompts (integration)
@@ -162,7 +211,7 @@ class TestEvidenceInReviewerPrompt(unittest.TestCase):
            ]
            self.assertTrue(len(review_prompts) >= 1)
            review_prompt = review_prompts[0]["prompt"]
-            # Evidence section should reference the coding step's command
+            self.assertIn("Artifact References", review_prompt)
            self.assertIn("Execution Evidence", review_prompt)
            self.assertIn("claude-coder", review_prompt)
--- a/tests/test_onboarding.py
+++ b/tests/test_onboarding.py
@@ -11,7 +11,6 @@ from cross_eval.doctor import (
    check_cli_installed,
    check_config,
    format_doctor_results,
    run_doctor,
 )
 from cross_eval.demo import (
    DEMO_CHECKLIST,
--- a/tests/test_pipeline_integration.py
+++ b/tests/test_pipeline_integration.py
@@ -8,9 +8,7 @@ from unittest.mock import patch
 from cross_eval.config import BUILTIN_AGENTS
 from cross_eval.models import (
    AgentConfig,
    AgentResult,
    PhaseConfig,
    PipelineConfig,
    StepConfig,
 )
--- a/tests/test_runtime_context.py
+++ b/tests/test_runtime_context.py
@@ -390,6 +390,7 @@ class TranscriptSavingRegressionTest(unittest.TestCase):
            # Verify transcript files were saved
            run_dir = result.run_dir
            self.assertIsNotNone(run_dir)
            assert run_dir is not None
            coding_transcript = run_dir / "v1" / "coding_transcript.md"
            review_transcript = run_dir / "v1" / "review_transcript.md"
            self.assertTrue(
--- a/tests/test_runtime_misc.py
+++ b/tests/test_runtime_misc.py
@@ -0,0 +1,831 @@
 from __future__ import annotations
 import re
 import subprocess
 import tempfile
 import unittest
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 from cross_eval.agent import (
    AgentInvocationError,
    _build_transcript,
    _classify_agent_failure,
    invoke_agent,
    invoke_agent_agentic,
 )
 from cross_eval.models import AgentConfig, AgentResult, ExecutionConfig, PipelineConfig, StepConfig
 from cross_eval.pipeline import (
    _commit_iteration,
    _execute_parallel_batch,
    _execute_step,
    _finalize_worktree,
    _format_runtime_error_markdown,
    _maybe_save_step_transcript,
    _snapshot_repo_state,
 )
 from cross_eval.runtime_env import (
    build_execution_policy,
    parse_dotenv,
    resolve_env_files,
    summarize_environment,
 )
 from cross_eval.worktree import WorktreeError, create_worktree, remove_worktree
 def _init_git_repo(path: Path) -> None:
    """Initialise a git repository at *path* and record one initial commit.

    A ``README.md`` file is written and committed so the repository has a
    ``HEAD`` to diff against.
    """

    def _git(*argv: str) -> None:
        # check=True makes any failing git command raise CalledProcessError.
        subprocess.run(["git", *argv], cwd=path, capture_output=True, check=True)

    _git("init")
    _git("config", "user.email", "test@test.com")
    _git("config", "user.name", "Test")
    (path / "README.md").write_text("# init\n", encoding="utf-8")
    _git("add", ".")
    _git("commit", "-m", "initial")
 class TestInvokeAgentRuntime(unittest.TestCase):
    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_claude_reads_output_file(self, mock_run: MagicMock) -> None:
        """The result output is read from the file named in the prompt."""

        def _write_requested_file(cmd: list[str], **kwargs: object) -> MagicMock:
            # The last argv entry is expected to carry the output-path instruction.
            found = re.search(r"Write your complete output to (.+)\.$", cmd[-1])
            self.assertIsNotNone(found)
            assert found is not None
            target = Path(found.group(1))
            target.write_text("review result", encoding="utf-8")
            return MagicMock(returncode=0, stdout="", stderr="")

        mock_run.side_effect = _write_requested_file
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["--model", "opus"],
            system_prompt="system",
        )

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "review result")
        self.assertIn("--system-prompt", mock_run.call_args.args[0])
    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_claude_falls_back_to_stdout(self, mock_run: MagicMock) -> None:
        """When the subprocess writes no output file, stdout becomes the output."""
        completed = MagicMock(returncode=0, stdout="stdout fallback", stderr="")
        mock_run.return_value = completed
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["--model", "opus"],
        )

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "stdout fallback")
    @patch("cross_eval.agent.subprocess.run")
    def test_non_claude_wraps_system_prompt_in_stdin(self, mock_run: MagicMock) -> None:
        """A non-claude command gets the system prompt wrapped into stdin."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        reviewer = AgentConfig(
            name="custom-reviewer",
            command="custom-cli",
            args=["run"],
            system_prompt="strict mode",
        )

        invoke_agent(reviewer, "check things", "review", quiet=True)

        sent_stdin = mock_run.call_args.kwargs["input"]
        expected_stdin = "<system>\nstrict mode\n</system>\n\ncheck things"
        self.assertEqual(sent_stdin, expected_stdin)
    @patch("cross_eval.agent.subprocess.run")
    def test_failure_raises_structured_error(self, mock_run: MagicMock) -> None:
        """A non-zero exit raises AgentInvocationError with a classified type."""
        failed = MagicMock(returncode=1, stdout="", stderr="API Error: backend down")
        mock_run.return_value = failed
        reviewer = AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"])

        with self.assertRaises(AgentInvocationError) as raised:
            invoke_agent(reviewer, "check", "review", quiet=True)

        error = raised.exception
        self.assertEqual(error.failure_type, "API_ERROR")
        self.assertIn("backend down", error.raw_error)
    def test_classify_unknown_failure(self) -> None:
        """Unrecognised error text is classified as UNKNOWN with an Inspect hint."""
        classification = _classify_agent_failure("weird crash")
        kind, advice = classification
        self.assertEqual(kind, "UNKNOWN")
        self.assertIn("Inspect", advice)
    def test_build_transcript_includes_cwd_and_duration(self) -> None:
        """The transcript contains working-directory and duration headings."""
        fields = {
            "command_preview": "claude -p",
            "stdout": "ok",
            "stderr": "",
            "exit_code": 0,
            "duration_seconds": 1.2,
            "cwd": "/tmp/repo",
        }
        rendered = _build_transcript(**fields)
        for heading in ("## Working Directory", "## Duration: 1.2s"):
            self.assertIn(heading, rendered)
    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_timeout_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
        """A subprocess timeout propagates and the spinner is stopped once."""
        mock_run.side_effect = subprocess.TimeoutExpired(cmd=["claude"], timeout=12)
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        with self.assertRaises(subprocess.TimeoutExpired):
            invoke_agent(reviewer, "inspect code", "review", quiet=False, timeout=12)

        mock_spinner.return_value.stop.assert_called_once()
    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_generic_exception_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
        """An OSError from the subprocess propagates and the spinner is stopped once."""
        mock_run.side_effect = OSError("boom")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        with self.assertRaises(OSError):
            invoke_agent(reviewer, "inspect code", "review", quiet=False)

        mock_spinner.return_value.stop.assert_called_once()
    @patch("cross_eval.agent.logger.warning")
    @patch("cross_eval.agent.subprocess.run")
    def test_empty_output_logs_warning(self, mock_run: MagicMock, mock_warning: MagicMock) -> None:
        """A successful run with empty stdout yields "" and logs exactly one warning."""
        silent = MagicMock(returncode=0, stdout="", stderr="")
        mock_run.return_value = silent
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "")
        mock_warning.assert_called_once()
    @patch("cross_eval.agent.subprocess.run")
    def test_print_mode_claude_uses_native_system_prompt_flag(self, mock_run: MagicMock) -> None:
        """With ``-p``, the system prompt goes via a flag and stdin is the bare prompt."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["-p"],
            system_prompt="be strict",
        )

        invoke_agent(reviewer, "review this", "review", quiet=True)

        invocation = mock_run.call_args
        self.assertIn("--system-prompt", invocation.args[0])
        self.assertEqual(invocation.kwargs["input"], "review this")
    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_failure_truncates_error_and_removes_output_file(
        self,
        mock_run: MagicMock,
    ) -> None:
        """On failure the raw error is 503 characters long and the output file is gone."""
        # Output paths seen by the fake subprocess; the most recent one is checked.
        requested_paths: list[Path] = []

        def _fail_and_record(cmd: list[str], **kwargs: object) -> MagicMock:
            found = re.search(r"Write your complete output to (.+)\.$", cmd[-1])
            self.assertIsNotNone(found)
            assert found is not None
            requested_paths.append(Path(found.group(1)))
            return MagicMock(returncode=1, stdout="", stderr="x" * 600)

        mock_run.side_effect = _fail_and_record
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["--model", "opus"],
        )

        with self.assertRaises(AgentInvocationError) as raised:
            invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(len(raised.exception.raw_error), 503)
        self.assertTrue(requested_paths)
        self.assertFalse(requested_paths[-1].exists())
    @patch("cross_eval.agent.logger.warning")
    @patch("cross_eval.agent.subprocess.run")
    def test_empty_output_with_stderr_logs_stderr_warning(
        self,
        mock_run: MagicMock,
        mock_warning: MagicMock,
    ) -> None:
        """Empty stdout with stderr present logs a warning that mentions stderr."""
        mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="stderr text")
        agent = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])
        invoke_agent(agent, "inspect code", "review", quiet=True)
        # Assert the call happened before indexing call_args: if no warning was
        # logged, call_args is None and the subscript below would raise a
        # TypeError instead of reporting a clear assertion failure.
        mock_warning.assert_called()
        self.assertIn("stderr:", mock_warning.call_args[0][0])
 class TestInvokeAgenticRuntime(unittest.TestCase):
    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_codex_agentic_adds_reasoning_and_system_wrapper(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        """Agentic codex gets ``-c``, a trailing ``-`` and a ``<system>`` stdin wrapper."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        coder = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto"],
            system_prompt="strict mode",
            reasoning_effort="high",
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            _init_git_repo(worktree)
            invoke_agent_agentic(coder, "fix bug", "coding", worktree, quiet=True)

        last_call = mock_run.call_args
        argv = last_call.args[0]
        self.assertIn("-c", argv)
        self.assertEqual(argv[-1], "-")
        self.assertIn("<system>", last_call.kwargs["input"])
    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_claude_success_uses_system_prompt_and_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """Agentic claude drops print-mode flags and returns the captured diff.

        The agent is configured with both ``-p`` and ``--print``; neither may
        reach the subprocess command. The system prompt must be passed with
        ``--system-prompt``, the result output must be the patched
        ``capture_diff`` value, and the spinner must be stopped exactly once.
        """
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        agent = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["-p", "--print"],
            system_prompt="stay in scope",
            agentic=True,
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            result = invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False)
        called_cmd = mock_run.call_args[0][0]
        # Check both spellings of the print flag: the agent args contain each
        # one, so verifying only "-p" would let "--print" slip through.
        self.assertNotIn("-p", called_cmd)
        self.assertNotIn("--print", called_cmd)
        self.assertIn("--system-prompt", called_cmd)
        self.assertEqual(result.output, "diff --git a/file ...")
        mock_spinner.return_value.stop.assert_called_once()
    @patch("cross_eval.agent._Spinner")
    def test_agentic_timeout_stops_spinner(self, mock_spinner: MagicMock) -> None:
        """A timeout in agentic mode propagates and stops the spinner once."""
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)
        timeout_error = subprocess.TimeoutExpired(cmd=["codex"], timeout=20)

        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            # Repository setup happens before subprocess.run is patched.
            _init_git_repo(worktree)
            with patch("cross_eval.agent.subprocess.run", side_effect=timeout_error), \
                    self.assertRaises(subprocess.TimeoutExpired):
                invoke_agent_agentic(coder, "fix bug", "coding", worktree, quiet=False, timeout=20)

        mock_spinner.return_value.stop.assert_called_once()
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_nonzero_exit_raises_structured_error(self, mock_run: MagicMock) -> None:
        """An "unauthorized" failure in agentic mode is classified as AUTH."""
        rejected = MagicMock(returncode=1, stdout="", stderr="unauthorized")
        mock_run.return_value = rejected
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            _init_git_repo(worktree)
            with self.assertRaises(AgentInvocationError) as raised:
                invoke_agent_agentic(coder, "fix bug", "coding", worktree, quiet=True)

        self.assertEqual(raised.exception.failure_type, "AUTH")
    @patch("cross_eval.agent._Spinner")
    def test_agentic_generic_exception_stops_spinner(
        self,
        mock_spinner: MagicMock,
    ) -> None:
        """An OSError in agentic mode propagates and stops the spinner once."""
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            # Repository setup happens before subprocess.run is patched.
            _init_git_repo(worktree)
            with patch("cross_eval.agent.subprocess.run", side_effect=OSError("boom")), \
                    self.assertRaises(OSError):
                invoke_agent_agentic(coder, "fix bug", "coding", worktree, quiet=False)

        mock_spinner.return_value.stop.assert_called_once()
    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_failure_truncates_error(
        self,
        mock_run: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """A 600-character stderr yields a 503-character raw error; spinner stops once."""
        noisy_failure = MagicMock(returncode=1, stdout="", stderr="x" * 600)
        mock_run.return_value = noisy_failure
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            _init_git_repo(worktree)
            with self.assertRaises(AgentInvocationError) as raised:
                invoke_agent_agentic(coder, "fix bug", "coding", worktree, quiet=False)

        self.assertEqual(len(raised.exception.raw_error), 503)
        mock_spinner.return_value.stop.assert_called_once()
    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_empty_diff_failure_truncates_error_and_stops_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """Exit 0 with an empty diff raises WRITE_FAILURE with a bounded raw error."""
        long_stderr = "permission denied " * 300
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="implemented",
            stderr=long_stderr,
        )
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            _init_git_repo(worktree)
            with self.assertRaises(AgentInvocationError) as raised:
                invoke_agent_agentic(coder, "fix bug", "coding", worktree, quiet=False)

        error = raised.exception
        self.assertLessEqual(len(error.raw_error), 2003)
        self.assertEqual(error.failure_type, "WRITE_FAILURE")
        mock_spinner.return_value.stop.assert_called_once()
 class TestPipelineHelpers(unittest.TestCase):
    @patch("cross_eval.worktree.commit_worktree", return_value=True)
    def test_commit_iteration_logs_only_when_committed(self, mock_commit: MagicMock) -> None:
        """``_commit_iteration`` calls ``commit_worktree`` exactly once."""
        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            _commit_iteration(worktree, "review-fix", 2, "PASS")
        mock_commit.assert_called_once()
    def test_snapshot_repo_state_includes_untracked_digest(self) -> None:
        """An untracked file appears in the snapshot as an UNTRACKED entry."""
        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            _init_git_repo(worktree)
            untracked = worktree / "scratch.txt"
            untracked.write_text("draft", encoding="utf-8")

            state = _snapshot_repo_state(worktree)

            self.assertIn("UNTRACKED scratch.txt", state)
    def test_finalize_worktree_deletes_empty_branch(self) -> None:
        """Finalizing a worktree with no new commits returns None and removes the branch."""
        branch_name = "cross-eval/empty"

        with tempfile.TemporaryDirectory() as scratch:
            sandbox = Path(scratch)
            origin = sandbox / "repo"
            checkout = sandbox / "wt"
            origin.mkdir()
            _init_git_repo(origin)

            def _git(*argv: str, **extra: object) -> subprocess.CompletedProcess:
                # Every git command runs inside the base repository.
                return subprocess.run(
                    ["git", *argv],
                    cwd=origin,
                    capture_output=True,
                    check=True,
                    **extra,
                )

            _git("branch", branch_name, "HEAD")
            _git("worktree", "add", str(checkout), branch_name)

            outcome = _finalize_worktree(origin, checkout, branch_name, "review-fix", "PASS")
            self.assertIsNone(outcome)

            listing = _git("branch", "--list", branch_name, text=True)
            self.assertEqual(listing.stdout.strip(), "")
    def test_format_runtime_error_markdown_for_generic_exception(self) -> None:
        """A plain RuntimeError renders with a heading, phase name and message."""
        failure = RuntimeError("boom")
        rendered = _format_runtime_error_markdown(
            failure,
            step_name="review",
            agent_name="claude-reviewer",
            phase_name="review_fix",
        )
        for fragment in ("# Agent Error", "review_fix", "boom"):
            self.assertIn(fragment, rendered)
    def test_maybe_save_step_transcript_returns_none_without_transcript(self) -> None:
        """A result built without a transcript makes the helper return None."""
        bare_result = AgentResult(
            output="ok",
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=0.1,
        )
        with tempfile.TemporaryDirectory() as scratch:
            written = _maybe_save_step_transcript(Path(scratch), 1, "review", bare_result)
            self.assertIsNone(written)
    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_step_saves_timeout_markdown(self, mock_invoke: MagicMock) -> None:
        """A step timeout raises RuntimeError and writes ``review_error.md``."""
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["-p"],
        )
        config = PipelineConfig(agents={"claude-reviewer": reviewer})
        step = StepConfig(
            name="review",
            agent="claude-reviewer",
            role="review",
            prompt_template="default:review",
            output_key="review_output",
        )
        mock_invoke.side_effect = subprocess.TimeoutExpired(
            cmd=["claude"],
            timeout=45,
            output="partial output",
            stderr="still running",
        )
        step_outputs: dict[str, str] = {}
        step_results: dict[str, AgentResult] = {}
        sources = {"plan": "Plan", "checklist": "Checklist"}

        with tempfile.TemporaryDirectory() as scratch:
            out_dir = Path(scratch)
            with self.assertRaises(RuntimeError) as raised:
                _execute_step(
                    step,
                    config,
                    sources,
                    "",
                    1,
                    3,
                    out_dir,
                    45,
                    False,
                    step_outputs,
                    step_results,
                    run_dir=out_dir,
                    output_iter=1,
                )

            self.assertIn("timed out after 45s", str(raised.exception))
            report = out_dir / "v1" / "review_error.md"
            self.assertTrue(report.exists())
            self.assertIn("# Agent Timeout", report.read_text(encoding="utf-8"))
    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_step_saves_runtime_error_markdown(self, mock_invoke: MagicMock) -> None:
        """An AgentInvocationError propagates and its details land in ``review_error.md``."""
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["-p"],
        )
        config = PipelineConfig(agents={"claude-reviewer": reviewer})
        step = StepConfig(
            name="review",
            agent="claude-reviewer",
            role="review",
            prompt_template="default:review",
            output_key="review_output",
        )
        mock_invoke.side_effect = AgentInvocationError(
            agent_name="claude-reviewer",
            step_name="review",
            cmd_preview="claude -p",
            raw_error="api broke",
            failure_type="API_ERROR",
            suggested_action="retry",
        )
        sources = {"plan": "Plan", "checklist": "Checklist"}

        with tempfile.TemporaryDirectory() as scratch:
            out_dir = Path(scratch)
            with self.assertRaises(AgentInvocationError):
                _execute_step(
                    step,
                    config,
                    sources,
                    "",
                    1,
                    3,
                    out_dir,
                    45,
                    False,
                    {},
                    {},
                    run_dir=out_dir,
                    output_iter=1,
                )

            report = (out_dir / "v1" / "review_error.md").read_text(encoding="utf-8")
            for fragment in ("API_ERROR", "retry"):
                self.assertIn(fragment, report)
    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_parallel_batch_saves_success_and_timeout_error(self, mock_invoke: MagicMock) -> None:
        """One step succeeds and one times out: both artifacts are saved, then it raises."""

        def _succeed_or_time_out(
            agent_config: AgentConfig,
            prompt: str,
            step_name: str,
            **kwargs: object,
        ) -> AgentResult:
            # Only "review_ok" completes; every other step times out.
            if step_name != "review_ok":
                raise subprocess.TimeoutExpired(
                    cmd=["codex"],
                    timeout=30,
                    output="halfway",
                    stderr="timeout stderr",
                )
            return AgentResult(
                output="VERDICT: PASS",
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=0.1,
            )

        mock_invoke.side_effect = _succeed_or_time_out
        batch = [
            StepConfig(
                name=step_id,
                agent=agent_id,
                role="review",
                prompt_template="default:review",
                output_key=step_id,
                parallel=True,
            )
            for step_id, agent_id in (
                ("review_ok", "claude-reviewer"),
                ("review_slow", "codex-reviewer"),
            )
        ]
        config = PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(name="claude-reviewer", command="claude", args=["-p"]),
                "codex-reviewer": AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"]),
            },
        )
        step_outputs: dict[str, str] = {}
        step_results: dict[str, AgentResult] = {}
        sources = {"plan": "Plan", "checklist": "Checklist"}

        with tempfile.TemporaryDirectory() as scratch:
            out_dir = Path(scratch)
            with self.assertRaises(RuntimeError) as raised:
                _execute_parallel_batch(
                    batch,
                    config,
                    sources,
                    "",
                    1,
                    3,
                    out_dir,
                    30,
                    False,
                    step_outputs,
                    step_results,
                    run_dir=out_dir,
                    output_iter=1,
                )

            self.assertIn("Successful outputs were saved for: review_ok", str(raised.exception))
            self.assertEqual(step_outputs["review_ok"], "VERDICT: PASS")
            version_dir = out_dir / "v1"
            self.assertTrue((version_dir / "review_ok.md").exists())
            self.assertTrue((version_dir / "review_slow_error.md").exists())
    @patch("cross_eval.pipeline._execute_step")
    def test_execute_parallel_batch_dry_run_uses_sequential_path(self, mock_step: MagicMock) -> None:
        """In dry-run mode each batch step goes through ``_execute_step`` once."""
        batch = [
            StepConfig(
                name=step_id,
                agent=agent_id,
                role="review",
                prompt_template="default:review",
                output_key=step_id,
                parallel=True,
            )
            for step_id, agent_id in (
                ("review_a", "claude-reviewer"),
                ("review_b", "codex-reviewer"),
            )
        ]
        config = PipelineConfig(agents={})

        with tempfile.TemporaryDirectory() as scratch:
            _execute_parallel_batch(
                batch,
                config,
                {"plan": "Plan"},
                "",
                1,
                3,
                Path(scratch),
                None,
                True,
                {},
                {},
                run_dir=Path(scratch),
                output_iter=1,
            )

        self.assertEqual(mock_step.call_count, 2)
    @patch("cross_eval.pipeline._execute_step")
    def test_execute_parallel_batch_agentic_steps_fall_back_to_sequential(self, mock_step: MagicMock) -> None:
        """Agentic batch steps are each dispatched through ``_execute_step`` once."""
        batch = [
            StepConfig(
                name=step_id,
                agent=agent_id,
                role="review",
                prompt_template="default:review",
                output_key=step_id,
                parallel=True,
            )
            for step_id, agent_id in (
                ("review_a", "agentic-a"),
                ("review_b", "agentic-b"),
            )
        ]
        config = PipelineConfig(
            agents={
                "agentic-a": AgentConfig(name="agentic-a", command="claude", agentic=True),
                "agentic-b": AgentConfig(name="agentic-b", command="codex", agentic=True),
            },
        )

        with tempfile.TemporaryDirectory() as scratch:
            _execute_parallel_batch(
                batch,
                config,
                {"plan": "Plan"},
                "",
                1,
                3,
                Path(scratch),
                None,
                False,
                {},
                {},
                run_dir=Path(scratch),
                output_iter=1,
                worktree_path=Path(scratch),
            )

        self.assertEqual(mock_step.call_count, 2)
    @patch("cross_eval.worktree.remove_worktree", side_effect=RuntimeError("cleanup failed"))
    @patch("cross_eval.worktree.commit_worktree", side_effect=RuntimeError("commit failed"))
    def test_finalize_worktree_handles_cleanup_failures(
        self,
        mock_commit: MagicMock,
        mock_remove: MagicMock,
    ) -> None:
        """With commit and removal both raising, finalization still returns None."""
        with tempfile.TemporaryDirectory() as scratch:
            origin = Path(scratch)
            checkout = Path(scratch) / "wt"
            outcome = _finalize_worktree(
                origin,
                checkout,
                "cross-eval/fail",
                "review-fix",
                "FAIL",
            )
        self.assertIsNone(outcome)
 class TestRuntimeEnvironmentHelpers(unittest.TestCase):
    def test_parse_dotenv_handles_export_and_quotes(self) -> None:
        """``export`` prefixes and quotes are handled; a bare name yields no entry."""
        dotenv_text = "export FOO='bar'\nBAR=\"line\\nvalue\"\nINVALID\n=skip\n"

        with tempfile.TemporaryDirectory() as scratch:
            dotenv_file = Path(scratch) / ".env"
            dotenv_file.write_text(dotenv_text, encoding="utf-8")
            parsed = parse_dotenv(dotenv_file)

        self.assertEqual(parsed["FOO"], "bar")
        self.assertEqual(parsed["BAR"], "line\nvalue")
        self.assertNotIn("INVALID", parsed)
    def test_resolve_env_files_deduplicates_and_filters_missing(self) -> None:
        """Duplicate references collapse to one path and the missing file is dropped."""
        with tempfile.TemporaryDirectory() as scratch:
            project_root = Path(scratch)
            dotenv_file = project_root / ".env"
            dotenv_file.write_text("FOO=bar\n", encoding="utf-8")
            # ".env" is listed relatively, absolutely and as an auto file;
            # ".env.local" is never created.
            execution = ExecutionConfig(
                env_files=[".env", str(dotenv_file)],
                auto_env_files=[".env", ".env.local"],
            )

            found = resolve_env_files(execution, project_root)

            self.assertEqual(found, [dotenv_file.resolve()])
    def test_summarize_environment_hides_names_when_disabled(self) -> None:
        """With ``expose_env_names=False`` the summary reports hidden names and targets."""
        execution = ExecutionConfig(expose_env_names=False, auto_context_targets=["postgres"])
        loaded_values = {"DATABASE_URL": "postgres://localhost"}

        rendered = summarize_environment(execution, [], loaded_values, {})

        for fragment in (
            "names are hidden",
            "Execution targets hinted by the user: postgres",
        ):
            self.assertIn(fragment, rendered)
    def test_build_execution_policy_for_minimal_mode(self) -> None:
        """The minimal command policy is named in the generated policy text."""
        execution = ExecutionConfig(mode="agent-decides", command_policy="minimal")
        rendered = build_execution_policy(execution)
        for fragment in ("Command policy: minimal", "Keep command usage minimal"):
            self.assertIn(fragment, rendered)
 class TestWorktreeFailures(unittest.TestCase):
    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_raises_when_branch_creation_fails(self, mock_run: MagicMock) -> None:
        """A failing first git call surfaces as WorktreeError naming branch creation."""
        branch_failure = subprocess.CalledProcessError(
            1,
            ["git", "branch"],
            stderr="branch failed",
        )
        mock_run.side_effect = branch_failure

        with tempfile.TemporaryDirectory() as scratch:
            origin = Path(scratch)
            checkout = origin / "wt"
            with self.assertRaises(WorktreeError) as raised:
                create_worktree(origin, checkout, "cross-eval/fail")

        self.assertIn("Failed to create branch", str(raised.exception))
    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_cleans_branch_on_worktree_failure(self, mock_run: MagicMock) -> None:
        """When the second git call fails, the last git call is ``git branch -D``."""
        worktree_failure = subprocess.CalledProcessError(
            1,
            ["git", "worktree", "add"],
            stderr="worktree failed",
        )
        # Call sequence: first call succeeds, second fails, third succeeds.
        mock_run.side_effect = [
            MagicMock(returncode=0),
            worktree_failure,
            MagicMock(returncode=0),
        ]

        with tempfile.TemporaryDirectory() as scratch:
            origin = Path(scratch)
            checkout = origin / "wt"
            with self.assertRaises(WorktreeError):
                create_worktree(origin, checkout, "cross-eval/fail")

        final_argv = mock_run.call_args_list[-1].args[0]
        self.assertEqual(final_argv[:3], ["git", "branch", "-D"])
    @patch("cross_eval.worktree.shutil.rmtree")
    @patch("cross_eval.worktree.subprocess.run")
    def test_remove_worktree_falls_back_to_prune(self, mock_run: MagicMock, mock_rmtree: MagicMock) -> None:
        """If the first git call fails, the directory is rmtree'd and git prunes."""
        remove_failure = subprocess.CalledProcessError(1, ["git", "worktree", "remove"])
        mock_run.side_effect = [remove_failure, MagicMock(returncode=0)]

        with tempfile.TemporaryDirectory() as scratch:
            sandbox = Path(scratch)
            origin = sandbox / "repo"
            checkout = sandbox / "wt"
            origin.mkdir()
            checkout.mkdir()
            remove_worktree(origin, checkout)

        expected_target = checkout.resolve()
        mock_rmtree.assert_any_call(expected_target, ignore_errors=True)
        final_argv = mock_run.call_args_list[-1].args[0]
        self.assertEqual(final_argv, ["git", "worktree", "prune"])