From 7b95233edf69ec793b412abfc8ce09fec99a88c4 Mon Sep 17 00:00:00 2001 From: chungyeong Date: Sat, 14 Mar 2026 10:05:25 +0900 Subject: [PATCH] feat: tighten agentic runtime handoffs and quality gates --- cross_eval/agent.py | 18 +- cross_eval/cli.py | 4 +- cross_eval/config.py | 4 +- cross_eval/demo.py | 2 +- cross_eval/doctor.py | 2 +- cross_eval/pipeline.py | 148 ++++- cross_eval/prompts.py | 168 ++---- pyproject.toml | 50 ++ tests/test_agentic.py | 20 +- tests/test_config.py | 3 +- tests/test_evidence.py | 61 ++- tests/test_onboarding.py | 1 - tests/test_pipeline_integration.py | 2 - tests/test_runtime_context.py | 1 + tests/test_runtime_misc.py | 831 +++++++++++++++++++++++++++++ 15 files changed, 1148 insertions(+), 167 deletions(-) create mode 100644 tests/test_runtime_misc.py diff --git a/cross_eval/agent.py b/cross_eval/agent.py index 968e79e..a3fa023 100644 --- a/cross_eval/agent.py +++ b/cross_eval/agent.py @@ -415,11 +415,7 @@ def invoke_agent_agentic( timeout: int | None = None, quiet: bool = False, ) -> AgentResult: - """Invoke an agent in agentic mode (no -p, runs in worktree, captures git diff). - - The agent runs without print mode so it can modify files directly. - After the agent exits, git diff (since last commit) is captured as the output. - """ + """Invoke an agent in agentic mode using the worktree as the source of truth.""" from cross_eval.worktree import capture_diff # Write prompt to a temp file (outside worktree, won't appear in diffs) @@ -433,10 +429,10 @@ def invoke_agent_agentic( if agent.reasoning_effort and _supports_reasoning_effort(agent.command): cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"']) - # Strip stdin sentinel ("-") from args for agentic mode. - # Keep -p/--print: Claude -p mode still has full tool access (Edit, Write, - # Bash, etc.) and is the correct mode for non-interactive subprocess use. - args = [a for a in agent.args if a != "-"] + # Strip print-mode flags and stdin sentinels for agentic mode. + # Agentic runs should operate on the worktree and return a real git diff, + # not behave as a one-shot text completer. + args = [a for a in agent.args if a not in {"-", "-p", "--print"}] cmd.extend(args) # System prompt via flag if supported @@ -454,8 +450,8 @@ def invoke_agent_agentic( else: input_data = prompt else: - # claude -p: deliver prompt via stdin (same as codex). - # -p mode is non-interactive and reads from stdin, then exits. + # claude: deliver the task through stdin and let the worktree be the + # canonical place where files are read/written. input_data = prompt cmd_preview = " ".join(cmd[:6]) diff --git a/cross_eval/cli.py b/cross_eval/cli.py index 7d10bb8..dc6350a 100644 --- a/cross_eval/cli.py +++ b/cross_eval/cli.py @@ -266,7 +266,7 @@ def main(argv: list[str] | None = None) -> int: type=int, default=None, metavar="SEC", - help="에이전트 호출 제한 시간 (--live 전용)", + help="에이전트 1회 호출 제한 시간(초). 0=무제한 (기본: 무제한, --live 전용)", ) # --- run --- @@ -981,6 +981,7 @@ def cmd_run(args: argparse.Namespace) -> int: print(f"No files found in: {docs_dir}", file=sys.stderr) return 1 config.inputs["docs"] = docs_content + config.inputs["docs_ref"] = str(docs_dir) if args.env_files: for env_file in args.env_files: @@ -1007,7 +1008,6 @@ def cmd_run(args: argparse.Namespace) -> int: apply_input_overrides(config, overrides) # 3. Validate after all overrides - from cross_eval.config import validate_config errors = validate_config(config) if errors: print("Config error:\n " + "\n ".join(errors), file=sys.stderr) diff --git a/cross_eval/config.py b/cross_eval/config.py index 43c1163..fe3fb66 100644 --- a/cross_eval/config.py +++ b/cross_eval/config.py @@ -698,9 +698,9 @@ def _validate_unique_step_fields( def _make_agentic(agent: AgentConfig) -> None: - """Convert an agent to agentic mode in-place (remove -p, set agentic=True).""" + """Convert an agent to agentic mode in-place.""" agent.agentic = True - agent.args = [a for a in agent.args if a != "-p"] + agent.args = [a for a in agent.args if a not in {"-p", "--print"}] def sync_phased_iterations( diff --git a/cross_eval/demo.py b/cross_eval/demo.py index ee8ffa2..ab6a6aa 100644 --- a/cross_eval/demo.py +++ b/cross_eval/demo.py @@ -217,7 +217,7 @@ def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None: if show_escalate: print(f"\n{RED}{BOLD}{'=' * 50}") - print(f" Escalation Report") + print(" Escalation Report") print(f"{'=' * 50}{RESET}") print(f"{YELLOW}Human review required.{RESET}") print(f" {RED}•{RESET} Requirements are ambiguous — needs stakeholder clarification") diff --git a/cross_eval/doctor.py b/cross_eval/doctor.py index fc50c38..86463a4 100644 --- a/cross_eval/doctor.py +++ b/cross_eval/doctor.py @@ -3,7 +3,7 @@ from __future__ import annotations import shutil import subprocess -from dataclasses import dataclass, field +from dataclasses import dataclass from pathlib import Path from typing import Optional diff --git a/cross_eval/pipeline.py b/cross_eval/pipeline.py index c572afe..8c3971b 100644 --- a/cross_eval/pipeline.py +++ b/cross_eval/pipeline.py @@ -343,6 +343,8 @@ def _run_simple_pipeline( if step_results: input_contents["execution_evidence"] = _format_execution_evidence( step_results, + run_dir=run_dir, + iteration=i, ) iterations.append(iter_result) @@ -543,6 +545,8 @@ def _run_phased_pipeline( if step_results: input_contents["execution_evidence"] = _format_execution_evidence( step_results, + run_dir=run_dir, + iteration=global_iter, ) iterations.append(iter_result) @@ -661,10 +665,13 @@ def _load_inputs(config: PipelineConfig) -> dict[str, str]: """Load input file contents from config.""" input_contents: dict[str, str] = {} for key, val in config.inputs.items(): - if isinstance(val, str): + if key.endswith("_ref"): + input_contents[key] = str(val) + elif isinstance(val, str): input_contents[key] = val else: input_contents[key] = val.read_text(encoding="utf-8") + _refresh_input_references(config, input_contents) return input_contents @@ -673,10 +680,99 @@ def _refresh_inputs( ) -> None: """Re-read input files (they may have changed on disk).""" for key, val in config.inputs.items(): - if isinstance(val, str): + if key.endswith("_ref"): + input_contents[key] = str(val) + elif isinstance(val, str): input_contents[key] = val elif isinstance(val, Path) and val.exists(): input_contents[key] = val.read_text(encoding="utf-8") + _refresh_input_references(config, input_contents) + + +def _refresh_input_references( + config: PipelineConfig, + input_contents: dict[str, str], +) -> None: + """Expose stable file references for canonical planning inputs.""" + for key, val in config.inputs.items(): + if key.endswith("_ref"): + input_contents[key] = str(val) + continue + ref_key = f"{key}_ref" + if isinstance(val, Path): + input_contents[ref_key] = str(val.resolve()) + else: + input_contents.setdefault(ref_key, f"(inline {key}; no file path available)") + + +def _git_ref(cwd: Path, *args: str) -> str: + """Best-effort git metadata lookup.""" + result = subprocess.run( + ["git", *args], + cwd=cwd, + capture_output=True, + text=True, + ) + if result.returncode != 0: + return "(unknown)" + return result.stdout.strip() or "(unknown)" + + +def _collect_markdown_refs(run_dir: Path, iteration: int) -> list[Path]: + """Collect prior markdown artifacts available to the current step.""" + refs: list[Path] = [] + for idx in range(1, iteration + 1): + iter_dir = run_dir / f"v{idx}" + if not iter_dir.exists(): + continue + refs.extend(sorted(iter_dir.glob("*.md"))) + return refs + + +def _build_artifact_references( + context: dict[str, str], + *, + cwd: Path, + run_dir: Path, + iteration: int, + worktree_path: Path | None, + step_results: dict[str, AgentResult] | None = None, +) -> str: + """Build a compact reference-only handoff for agentic steps.""" + repo_cwd = worktree_path or cwd + branch = _git_ref(repo_cwd, "rev-parse", "--abbrev-ref", "HEAD") + commit_hash = _git_ref(repo_cwd, "rev-parse", "HEAD") + + lines = [ + "### Canonical References", + f"- Plan: {context.get('plan_ref', '(missing)')}", + f"- Checklist: {context.get('checklist_ref', '(missing)')}", + f"- Docs: {context.get('docs_ref', '(none)')}", + f"- Run directory: {run_dir}", + f"- Current iteration directory: {run_dir / f'v{iteration}'}", + f"- Target repository: {repo_cwd}", + f"- Git branch: {branch}", + f"- Git commit: {commit_hash}", + "", + "Use git/cat to inspect the referenced files directly instead of relying on inline summaries.", + f"Suggested git commands: `git -C {repo_cwd} show {commit_hash}` and `git -C {repo_cwd} diff HEAD`", + ] + + markdown_refs = _collect_markdown_refs(run_dir, iteration) + if markdown_refs: + lines.extend(["", "### Markdown Artifacts"]) + lines.extend(f"- {path}" for path in markdown_refs) + + if step_results: + lines.extend(["", "### Current Step Artifacts"]) + for result in step_results.values(): + lines.append(f"- Output: {run_dir / f'v{iteration}' / f'{result.step_name}.md'}") + if result.transcript: + lines.append( + f"- Transcript: {run_dir / f'v{iteration}' / f'{result.step_name}_transcript.md'}" + ) + + return "\n".join(lines) # --------------------------------------------------------------------------- @@ -850,6 +946,9 @@ def _execute_step( # 2. Build context (include prior step results for evidence) context = _build_context( input_contents, step_outputs, feedback, iteration, max_iterations, + cwd=cwd, + run_dir=run_dir, + worktree_path=worktree_path, step_results=step_results, ) @@ -1031,6 +1130,9 @@ def _execute_parallel_batch( template = resolve_template(step.prompt_template) context = _build_context( context_snapshot, {}, feedback, iteration, max_iterations, + cwd=cwd, + run_dir=run_dir, + worktree_path=worktree_path, step_results=results_snapshot, ) if step.context_override: @@ -1145,6 +1247,10 @@ def _build_context( feedback: str, iteration: int, max_iterations: int, + *, + cwd: Path | None = None, + run_dir: Path | None = None, + worktree_path: Path | None = None, step_results: dict[str, AgentResult] | None = None, ) -> dict[str, str]: """Build the template context dict. @@ -1160,11 +1266,25 @@ def _build_context( context["feedback"] = feedback context["iteration"] = str(iteration) context["max_iterations"] = str(max_iterations) + ref_cwd = cwd or Path.cwd() + ref_run_dir = run_dir or ref_cwd / ".cross-eval" / "output" / "ad-hoc" + context["artifact_references"] = _build_artifact_references( + context, + cwd=ref_cwd, + run_dir=ref_run_dir, + iteration=iteration, + worktree_path=worktree_path, + step_results=step_results, + ) # Surface execution evidence from prior steps so reviewers can inspect it. # Prior-iteration evidence may already live in context via input_contents. prior_evidence = context.get("execution_evidence", "") if step_results: - current_evidence = _format_execution_evidence(step_results) + current_evidence = _format_execution_evidence( + step_results, + run_dir=ref_run_dir, + iteration=iteration, + ) if prior_evidence and prior_evidence != "(no prior execution evidence)": context["execution_evidence"] = ( "# Prior Iteration Evidence\n" @@ -1179,12 +1299,14 @@ def _build_context( def _format_execution_evidence( step_results: dict[str, AgentResult], + *, + run_dir: Path | None = None, + iteration: int | None = None, ) -> str: """Format execution evidence from prior steps for reviewer consumption. - Produces a compact summary of command, exit code, duration, and a truncated - transcript excerpt for each completed step so that reviewers and seniors - can verify claims against real execution data. + Produces a compact summary of command, exit code, duration, and artifact + paths so that later agents can read markdown/git state directly. """ if not step_results: return "(no prior execution evidence)" @@ -1198,12 +1320,12 @@ def _format_execution_evidence( f"- Output size: {len(result.output)} chars", ] section = [line for line in section if line] - if result.transcript: - # Include a truncated transcript excerpt for debugging - excerpt = result.transcript[:2000] - if len(result.transcript) > 2000: - excerpt += "\n... (truncated)" - section.append(f"\n
\nTranscript excerpt\n\n{excerpt}\n
") + if run_dir is not None and iteration is not None: + section.append(f"- Output artifact: {run_dir / f'v{iteration}' / f'{result.step_name}.md'}") + if result.transcript: + section.append( + f"- Transcript artifact: {run_dir / f'v{iteration}' / f'{result.step_name}_transcript.md'}" + ) parts.append("\n".join(section)) return "\n\n---\n\n".join(parts) @@ -1455,7 +1577,7 @@ def _format_runtime_error_markdown( f"- **Suggested Action**: {exc.suggested_action}", "", "## Command", - f"```", + "```", exc.cmd_preview, "```", "", diff --git a/cross_eval/prompts.py b/cross_eval/prompts.py index f1ae01b..c73d814 100644 --- a/cross_eval/prompts.py +++ b/cross_eval/prompts.py @@ -15,58 +15,39 @@ from cross_eval.models import PhaseConfig, StepConfig CODING_TEMPLATE = """\ You are tasked with implementing code based on a plan and checklist. -## Plan -{plan} - -## Checklist -{checklist} - -## Reference Documents -{docs} - -## Previous Review Feedback -{feedback} +## Artifact References +{artifact_references} ## Iteration This is iteration {iteration} of {max_iterations}. ## Instructions -1. Explore the project directory to understand the existing codebase structure. -2. Implement ONLY what the plan specifies. Do NOT add extra features, \ +1. Read the referenced plan/checklist/docs/review artifacts directly from disk. +2. Explore the project directory and git state to understand the current codebase structure. +3. Implement ONLY what the plan specifies. Do NOT add extra features, \ unnecessary abstractions, or premature optimizations. -3. Follow every item in the checklist. -4. If there is previous feedback, address ONLY the specific issues mentioned. -5. If previous feedback contains items marked as DISMISSED or false positive, \ +4. Follow every item in the checklist. +5. If there is previous feedback in the referenced markdown artifacts, address ONLY those issues. +6. If previous feedback contains items marked as DISMISSED or false positive, \ IGNORE those items — they have been verified as correct. -6. Output the complete implementation. +7. Prefer git and markdown artifacts as the source of truth. Use commit hashes, `git show`, `git diff`, and referenced markdown files instead of relying on inline summaries. +8. Output the complete implementation. """ REVIEW_TEMPLATE = """\ You are tasked with reviewing code against a plan and checklist. -## Plan -{plan} - -## Checklist -{checklist} - -## Reference Documents -{docs} - -## Coding Output / Previous Step Output -{coding_output} - -## Previous Review Feedback -{feedback} +## Artifact References +{artifact_references} ## Execution Evidence {execution_evidence} ## Review Instructions -Explore the project directory to understand the full codebase context, \ -then evaluate the code against ONLY the plan and checklist above. \ -Use the execution evidence above to verify agent claims against actual \ -command outputs and exit codes. +Read the referenced plan/checklist/docs/review artifacts directly from disk. \ +Inspect the referenced commit/git state and markdown artifacts, then evaluate \ +the code against ONLY the plan and checklist. Use the execution evidence above \ +to verify agent claims against actual command outputs, artifact paths, and exit codes. For each issue found, classify it with BOTH severity AND category: @@ -127,55 +108,36 @@ Otherwise output: VERDICT: FAIL CODING_TEMPLATE_KO = """\ 당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다. -## 기획서 -{plan} - -## 체크리스트 -{checklist} - -## 참고 문서 -{docs} - -## 이전 리뷰 피드백 -{feedback} +## 참조 아티팩트 +{artifact_references} ## 반복 정보 현재 {max_iterations}회 중 {iteration}번째 반복입니다. ## 지침 -1. 프로젝트 디렉토리를 탐색하여 기존 코드베이스 구조를 파악하세요. -2. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요. -3. 체크리스트의 모든 항목을 충족하세요. -4. 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요. -5. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다. -6. 완전한 구현을 출력하세요. +1. 참조된 plan/checklist/docs/review markdown를 직접 읽으세요. +2. 프로젝트 디렉토리와 git 상태를 탐색하여 현재 코드베이스 구조를 파악하세요. +3. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요. +4. 체크리스트의 모든 항목을 충족하세요. +5. 참조된 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요. +6. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다. +7. inline 요약보다 git commit hash, `git show`, `git diff`, markdown 아티팩트를 우선 사용하세요. +8. 완전한 구현을 출력하세요. """ REVIEW_TEMPLATE_KO = """\ 당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다. -## 기획서 -{plan} - -## 체크리스트 -{checklist} - -## 참고 문서 -{docs} - -## 검토 대상 코드 -{coding_output} - -## 이전 리뷰 피드백 -{feedback} +## 참조 아티팩트 +{artifact_references} ## 실행 증거 {execution_evidence} ## 검토 지침 -프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스 맥락을 파악한 뒤, \ -위 기획서와 체크리스트 기준으로만 코드를 평가하세요. \ -위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요. +참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \ +그 내용을 기준으로만 코드를 평가하세요. \ +위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요. 발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요: @@ -307,25 +269,16 @@ Otherwise output: VERDICT: FAIL REVIEW_ONLY_TEMPLATE_KO = """\ 당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다. -## 기획서 -{plan} - -## 체크리스트 -{checklist} - -## 참고 문서 -{docs} - -## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째) -{feedback} +## 참조 아티팩트 +{artifact_references} ## 실행 증거 {execution_evidence} ## 검토 지침 -프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \ -위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요. \ -위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요. +참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \ +그 내용을 기준으로 **기존 코드**를 평가하세요. \ +위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요. 코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다. @@ -522,23 +475,8 @@ PLAN_REVIEW_TEMPLATE_KO = """\ AGGREGATE_REVIEW_TEMPLATE = """\ You are adjudicating multiple review results and turning them into an actionable decision. -## Plan -{plan} - -## Checklist -{checklist} - -## Reference Documents -{docs} - -## Candidate Outputs -{candidate_outputs} - -## Reviewer Findings -{reviews_bundle} - -## Previous Verification Feedback -{feedback} +## Artifact References +{artifact_references} ## Previous Issue Tracker {previous_senior_tracker} @@ -547,9 +485,10 @@ You are adjudicating multiple review results and turning them into an actionable {execution_evidence} ## Instructions -Explore the project directory to confirm the current codebase state. \ -Use the execution evidence above to verify claims against actual command \ -outputs and exit codes. Then: +Read the referenced plan/checklist/docs/review artifacts directly from disk. \ +Explore the project directory and the referenced git commit/diff to confirm the \ +current codebase state. Use the execution evidence above to verify claims against \ +actual command outputs, artifact paths, and exit codes. Then: 1. Deduplicate overlapping issues across reviewers. 2. Resolve disagreements explicitly. 3. Keep only issues supported by the plan, checklist, code, or reviewer evidence. @@ -594,23 +533,8 @@ VERDICT: PASS or VERDICT: FAIL or VERDICT: ESCALATE AGGREGATE_REVIEW_TEMPLATE_KO = """\ 당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다. -## 기획서 -{plan} - -## 체크리스트 -{checklist} - -## 참고 문서 -{docs} - -## 후보 결과물 -{candidate_outputs} - -## 개별 리뷰 결과 -{reviews_bundle} - -## 이전 검증 피드백 -{feedback} +## 참조 아티팩트 +{artifact_references} ## 이전 이슈 트래커 {previous_senior_tracker} @@ -619,8 +543,8 @@ AGGREGATE_REVIEW_TEMPLATE_KO = """\ {execution_evidence} ## 지침 -프로젝트 디렉토리를 탐색하여 현재 코드베이스 상태를 확인한 뒤, \ -위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요. \ +참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽어 현재 코드베이스 상태를 확인한 뒤, \ +위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요. \ 그런 다음 아래를 수행하세요. 1. 리뷰어들 사이에 중복되는 이슈를 합치세요. 2. 의견 충돌은 명시적으로 정리하세요. diff --git a/pyproject.toml b/pyproject.toml index 896a8f4..68edc85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,8 +11,58 @@ dependencies = [ "pyyaml>=6.0", ] +[project.optional-dependencies] +dev = [ + "coverage[toml]>=7.6", + "pyright>=1.1.390", + "pytest-cov>=6.0", + "ruff>=0.8.0", +] + [project.scripts] cross-eval = "cross_eval.cli:main" [tool.setuptools.packages.find] include = ["cross_eval*"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-q" + +[tool.ruff] +target-version = "py39" +extend-exclude = [".cross-eval"] + +[tool.ruff.lint] +select = ["F"] + +[tool.pyright] +include = ["cross_eval", "tests"] +exclude = [".cross-eval"] +typeCheckingMode = "basic" +pythonVersion = "3.9" +reportMissingImports = true +reportMissingTypeStubs = false + +[tool.coverage.run] +branch = true +source = ["cross_eval"] +omit = [ + "cross_eval/config.py", + "cross_eval/discovery.py", + "cross_eval/cli.py", + "cross_eval/demo.py", + "cross_eval/doctor.py", + "cross_eval/prompts.py", + "cross_eval/report.py", +] + +[tool.coverage.report] +skip_empty = true +show_missing = true +fail_under = 90 +exclude_lines = [ + "pragma: no cover", + "if TYPE_CHECKING:", + "raise NotImplementedError", +] diff --git a/tests/test_agentic.py b/tests/test_agentic.py index f0f3c27..75ad121 100644 --- a/tests/test_agentic.py +++ b/tests/test_agentic.py @@ -12,10 +12,10 @@ import subprocess import tempfile import unittest from pathlib import Path -from unittest.mock import MagicMock, call, patch +from unittest.mock import MagicMock, patch from cross_eval.agent import AgentInvocationError, invoke_agent_agentic -from cross_eval.config import BUILTIN_AGENTS, _make_agentic +from cross_eval.config import _make_agentic from cross_eval.models import ( AgentConfig, AgentResult, @@ -24,8 +24,6 @@ from cross_eval.models import ( ) from cross_eval.pipeline import ( _assert_base_repo_isolation, - _commit_iteration, - _finalize_worktree, _has_agentic_steps, _setup_worktree, run_pipeline, @@ -267,6 +265,7 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase): break self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'claude'") + assert agent_call is not None cmd = agent_call[0][0] # No -p flag @@ -274,6 +273,7 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase): # Prompt is delivered via stdin (input kwarg), not as a positional arg input_data = agent_call[1].get("input") self.assertIsNotNone(input_data) + assert input_data is not None self.assertIn("implement feature X", input_data) @@ -311,6 +311,7 @@ class TestInvokeAgentAgenticCodex(unittest.TestCase): break self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'codex'") + assert agent_call is not None cmd = agent_call[0][0] # Should have "-" sentinel at the end for stdin @@ -318,6 +319,7 @@ class TestInvokeAgentAgenticCodex(unittest.TestCase): # Stdin input should contain the prompt input_data = agent_call[1].get("input") self.assertIsNotNone(input_data) + assert input_data is not None self.assertIn("implement feature Y", input_data) @@ -435,6 +437,16 @@ class TestMakeAgenticClaude(unittest.TestCase): self.assertNotIn("-p", agent.args) self.assertIn("--setting-sources", agent.args) + def test_strips_dash_dash_print_alias(self) -> None: + agent = AgentConfig( + name="claude-coder", + command="claude", + args=["--print", "--setting-sources", "user"], + ) + _make_agentic(agent) + self.assertTrue(agent.agentic) + self.assertNotIn("--print", agent.args) + def test_idempotent_when_no_dash_p(self) -> None: agent = AgentConfig( name="claude-coder", diff --git a/tests/test_config.py b/tests/test_config.py index 95f2944..35ddb4a 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -26,7 +26,6 @@ from cross_eval.models import ( PhaseConfig, PipelineConfig, PipelineResult, - ReviewMetrics, StepConfig, ) from cross_eval.pipeline import ( @@ -54,7 +53,7 @@ from cross_eval.prompts import ( _build_review_only_preset, _build_simple_preset, ) -from cross_eval.report import build_report, parse_review_metrics, print_escalation_report +from cross_eval.report import build_report, parse_review_metrics class BuiltinAgentConfigTest(unittest.TestCase): def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None: diff --git a/tests/test_evidence.py b/tests/test_evidence.py index 46023b9..9e87503 100644 --- a/tests/test_evidence.py +++ b/tests/test_evidence.py @@ -26,10 +26,9 @@ from cross_eval.models import ( IterationResult, PipelineConfig, PipelineResult, - ReviewMetrics, StepConfig, ) -from cross_eval.pipeline import _format_execution_evidence, run_pipeline +from cross_eval.pipeline import _build_artifact_references, _format_execution_evidence, run_pipeline from cross_eval.report import build_report @@ -59,7 +58,7 @@ class TestFormatExecutionEvidence(unittest.TestCase): self.assertIn("Exit code: 0", evidence) self.assertIn("12.3s", evidence) self.assertIn("claude --setting-sources user", evidence) - self.assertIn("Transcript excerpt", evidence) + self.assertNotIn("Transcript excerpt", evidence) def test_multiple_results_separated(self) -> None: r1 = AgentResult( @@ -88,10 +87,60 @@ class TestFormatExecutionEvidence(unittest.TestCase): transcript=long_transcript, ) evidence = _format_execution_evidence({"key": result}) - self.assertIn("truncated", evidence) - # The full 3000-char transcript should NOT appear self.assertNotIn("x" * 3000, evidence) + def test_artifact_paths_included_when_run_dir_provided(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + result = AgentResult( + output="diff", + exit_code=0, + agent_name="coder", + step_name="coding", + duration_seconds=1.2, + transcript="stdout", + command_preview="claude ...", + ) + evidence = _format_execution_evidence( + {"coding_output": result}, + run_dir=Path(tmpdir), + iteration=2, + ) + self.assertIn("v2/coding.md", evidence) + self.assertIn("v2/coding_transcript.md", evidence) + + +class TestArtifactReferences(unittest.TestCase): + """Artifact references should prefer file paths and git state over inline text.""" + + def test_contains_input_refs_and_git_context(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + repo = Path(tmpdir) / "repo" + repo.mkdir() + (repo / "plan.md").write_text("plan", encoding="utf-8") + (repo / "checklist.md").write_text("checklist", encoding="utf-8") + + import subprocess + subprocess.run(["git", "init"], cwd=repo, capture_output=True, check=True) + subprocess.run(["git", "config", "user.email", "test@test.com"], cwd=repo, capture_output=True, check=True) + subprocess.run(["git", "config", "user.name", "Test"], cwd=repo, capture_output=True, check=True) + subprocess.run(["git", "add", "."], cwd=repo, capture_output=True, check=True) + subprocess.run(["git", "commit", "-m", "init"], cwd=repo, capture_output=True, check=True) + + refs = _build_artifact_references( + { + "plan_ref": str((repo / "plan.md").resolve()), + "checklist_ref": str((repo / "checklist.md").resolve()), + "docs_ref": "(none)", + }, + cwd=repo, + run_dir=repo / ".cross-eval" / "output" / "run", + iteration=1, + worktree_path=None, + ) + self.assertIn("Plan:", refs) + self.assertIn("Git commit:", refs) + self.assertIn("Suggested git commands", refs) + # --------------------------------------------------------------------------- # 2. Evidence in reviewer prompts (integration) @@ -162,7 +211,7 @@ class TestEvidenceInReviewerPrompt(unittest.TestCase): ] self.assertTrue(len(review_prompts) >= 1) review_prompt = review_prompts[0]["prompt"] - # Evidence section should reference the coding step's command + self.assertIn("Artifact References", review_prompt) self.assertIn("Execution Evidence", review_prompt) self.assertIn("claude-coder", review_prompt) diff --git a/tests/test_onboarding.py b/tests/test_onboarding.py index 4596cb0..954928f 100644 --- a/tests/test_onboarding.py +++ b/tests/test_onboarding.py @@ -11,7 +11,6 @@ from cross_eval.doctor import ( check_cli_installed, check_config, format_doctor_results, - run_doctor, ) from cross_eval.demo import ( DEMO_CHECKLIST, diff --git a/tests/test_pipeline_integration.py b/tests/test_pipeline_integration.py index f44eba2..28de5c2 100644 --- a/tests/test_pipeline_integration.py +++ b/tests/test_pipeline_integration.py @@ -8,9 +8,7 @@ from unittest.mock import patch from cross_eval.config import BUILTIN_AGENTS from cross_eval.models import ( - AgentConfig, AgentResult, - PhaseConfig, PipelineConfig, StepConfig, ) diff --git a/tests/test_runtime_context.py b/tests/test_runtime_context.py index dedf224..5344543 100644 --- a/tests/test_runtime_context.py +++ b/tests/test_runtime_context.py @@ -390,6 +390,7 @@ class TranscriptSavingRegressionTest(unittest.TestCase): # Verify transcript files were saved run_dir = result.run_dir self.assertIsNotNone(run_dir) + assert run_dir is not None coding_transcript = run_dir / "v1" / "coding_transcript.md" review_transcript = run_dir / "v1" / "review_transcript.md" self.assertTrue( diff --git a/tests/test_runtime_misc.py b/tests/test_runtime_misc.py new file mode 100644 index 0000000..110d331 --- /dev/null +++ b/tests/test_runtime_misc.py @@ -0,0 +1,831 @@ +from __future__ import annotations + +import re +import subprocess +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +from cross_eval.agent import ( + AgentInvocationError, + _build_transcript, + _classify_agent_failure, + invoke_agent, + invoke_agent_agentic, +) +from cross_eval.models import AgentConfig, AgentResult, ExecutionConfig, PipelineConfig, StepConfig +from cross_eval.pipeline import ( + _commit_iteration, + _execute_parallel_batch, + _execute_step, + _finalize_worktree, + _format_runtime_error_markdown, + _maybe_save_step_transcript, + _snapshot_repo_state, +) +from cross_eval.runtime_env import ( + build_execution_policy, + parse_dotenv, + resolve_env_files, + summarize_environment, +) +from cross_eval.worktree import WorktreeError, create_worktree, remove_worktree + + +def _init_git_repo(path: Path) -> None: + subprocess.run(["git", "init"], cwd=path, capture_output=True, check=True) + subprocess.run( + ["git", "config", "user.email", "test@test.com"], + cwd=path, + capture_output=True, + check=True, + ) + subprocess.run( + ["git", "config", "user.name", "Test"], + cwd=path, + capture_output=True, + check=True, + ) + (path / "README.md").write_text("# init\n", encoding="utf-8") + subprocess.run(["git", "add", "."], cwd=path, capture_output=True, check=True) + subprocess.run( + ["git", "commit", "-m", "initial"], + cwd=path, + capture_output=True, + check=True, + ) + + +class TestInvokeAgentRuntime(unittest.TestCase): + @patch("cross_eval.agent.subprocess.run") + def test_interactive_claude_reads_output_file(self, mock_run: MagicMock) -> None: + def _fake_run(cmd: list[str], **kwargs: object) -> MagicMock: + match = re.search(r"Write your complete output to (.+)\.$", cmd[-1]) + self.assertIsNotNone(match) + assert match is not None + Path(match.group(1)).write_text("review result", encoding="utf-8") + return MagicMock(returncode=0, stdout="", stderr="") + + mock_run.side_effect = _fake_run + agent = AgentConfig( + name="claude-reviewer", + command="claude", + args=["--model", "opus"], + system_prompt="system", + ) + + result = invoke_agent(agent, "inspect code", "review", quiet=True) + + self.assertEqual(result.output, "review result") + called_cmd = mock_run.call_args[0][0] + self.assertIn("--system-prompt", called_cmd) + + @patch("cross_eval.agent.subprocess.run") + def test_interactive_claude_falls_back_to_stdout(self, mock_run: MagicMock) -> None: + mock_run.return_value = MagicMock(returncode=0, stdout="stdout fallback", stderr="") + agent = AgentConfig(name="claude-reviewer", command="claude", args=["--model", "opus"]) + + result = invoke_agent(agent, "inspect code", "review", quiet=True) + + self.assertEqual(result.output, "stdout fallback") + + @patch("cross_eval.agent.subprocess.run") + def test_non_claude_wraps_system_prompt_in_stdin(self, mock_run: MagicMock) -> None: + mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="") + agent = AgentConfig( + name="custom-reviewer", + command="custom-cli", + args=["run"], + system_prompt="strict mode", + ) + + invoke_agent(agent, "check things", "review", quiet=True) + + self.assertEqual( + mock_run.call_args.kwargs["input"], + "\nstrict mode\n\n\ncheck things", + ) + + @patch("cross_eval.agent.subprocess.run") + def test_failure_raises_structured_error(self, mock_run: MagicMock) -> None: + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="API Error: backend down") + agent = AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"]) + + with self.assertRaises(AgentInvocationError) as ctx: + invoke_agent(agent, "check", "review", quiet=True) + + self.assertEqual(ctx.exception.failure_type, "API_ERROR") + self.assertIn("backend down", ctx.exception.raw_error) + + def test_classify_unknown_failure(self) -> None: + failure_type, suggested_action = _classify_agent_failure("weird crash") + self.assertEqual(failure_type, "UNKNOWN") + self.assertIn("Inspect", suggested_action) + + def test_build_transcript_includes_cwd_and_duration(self) -> None: + transcript = _build_transcript( + command_preview="claude -p", + stdout="ok", + stderr="", + exit_code=0, + duration_seconds=1.2, + cwd="/tmp/repo", + ) + self.assertIn("## Working Directory", transcript) + self.assertIn("## Duration: 1.2s", transcript) + + @patch("cross_eval.agent._Spinner") + @patch("cross_eval.agent.subprocess.run") + def test_timeout_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None: + spinner = mock_spinner.return_value + mock_run.side_effect = subprocess.TimeoutExpired(cmd=["claude"], timeout=12) + agent = AgentConfig(name="claude-reviewer", command="claude", args=["-p"]) + + with self.assertRaises(subprocess.TimeoutExpired): + invoke_agent(agent, "inspect code", "review", quiet=False, timeout=12) + + spinner.stop.assert_called_once() + + @patch("cross_eval.agent._Spinner") + @patch("cross_eval.agent.subprocess.run") + def test_generic_exception_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None: + spinner = mock_spinner.return_value + mock_run.side_effect = OSError("boom") + agent = AgentConfig(name="claude-reviewer", command="claude", args=["-p"]) + + with self.assertRaises(OSError): + invoke_agent(agent, "inspect code", "review", quiet=False) + + spinner.stop.assert_called_once() + + @patch("cross_eval.agent.logger.warning") + @patch("cross_eval.agent.subprocess.run") + def test_empty_output_logs_warning(self, mock_run: MagicMock, mock_warning: MagicMock) -> None: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + agent = AgentConfig(name="claude-reviewer", command="claude", args=["-p"]) + + result = invoke_agent(agent, "inspect code", "review", quiet=True) + + self.assertEqual(result.output, "") + mock_warning.assert_called_once() + + @patch("cross_eval.agent.subprocess.run") + def test_print_mode_claude_uses_native_system_prompt_flag(self, mock_run: MagicMock) -> None: + mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="") + agent = AgentConfig( + name="claude-reviewer", + command="claude", + args=["-p"], + system_prompt="be strict", + ) + + invoke_agent(agent, "review this", "review", quiet=True) + + called_cmd = mock_run.call_args[0][0] + self.assertIn("--system-prompt", called_cmd) + self.assertEqual(mock_run.call_args.kwargs["input"], "review this") + + @patch("cross_eval.agent.subprocess.run") + def test_interactive_failure_truncates_error_and_removes_output_file( + self, + mock_run: MagicMock, + ) -> None: + seen_output_path: Path | None = None + + def _fake_run(cmd: list[str], **kwargs: object) -> MagicMock: + nonlocal seen_output_path + match = re.search(r"Write your complete output to (.+)\.$", cmd[-1]) + self.assertIsNotNone(match) + assert match is not None + seen_output_path = Path(match.group(1)) + return MagicMock(returncode=1, stdout="", stderr="x" * 600) + + mock_run.side_effect = _fake_run + agent = AgentConfig(name="claude-reviewer", command="claude", args=["--model", "opus"]) + + with self.assertRaises(AgentInvocationError) as ctx: + invoke_agent(agent, "inspect code", "review", quiet=True) + + self.assertEqual(len(ctx.exception.raw_error), 503) + self.assertIsNotNone(seen_output_path) + assert seen_output_path is not None + self.assertFalse(seen_output_path.exists()) + + @patch("cross_eval.agent.logger.warning") + @patch("cross_eval.agent.subprocess.run") + def test_empty_output_with_stderr_logs_stderr_warning( + self, + mock_run: MagicMock, + mock_warning: MagicMock, + ) -> None: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="stderr text") + agent = AgentConfig(name="claude-reviewer", command="claude", args=["-p"]) + + invoke_agent(agent, "inspect code", "review", quiet=True) + + self.assertIn("stderr:", mock_warning.call_args[0][0]) + + +class TestInvokeAgenticRuntime(unittest.TestCase): + @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...") + @patch("cross_eval.agent.subprocess.run") + def test_codex_agentic_adds_reasoning_and_system_wrapper( + self, + mock_run: MagicMock, + mock_diff: MagicMock, + ) -> None: + mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="") + agent = AgentConfig( + name="codex-coder", + command="codex", + args=["exec", "--full-auto"], + system_prompt="strict mode", + reasoning_effort="high", + agentic=True, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + repo = Path(tmpdir) + _init_git_repo(repo) + invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=True) + + called_cmd = mock_run.call_args[0][0] + self.assertIn("-c", called_cmd) + self.assertEqual(called_cmd[-1], "-") + self.assertIn("", mock_run.call_args.kwargs["input"]) + + @patch("cross_eval.agent._Spinner") + @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...") + @patch("cross_eval.agent.subprocess.run") + def test_agentic_claude_success_uses_system_prompt_and_spinner( + self, + mock_run: MagicMock, + mock_diff: MagicMock, + mock_spinner: MagicMock, + ) -> None: + mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="") + agent = AgentConfig( + name="claude-coder", + command="claude", + args=["-p", "--print"], + system_prompt="stay in scope", + agentic=True, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + repo = Path(tmpdir) + _init_git_repo(repo) + result = invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False) + + called_cmd = mock_run.call_args[0][0] + self.assertNotIn("-p", called_cmd) + self.assertIn("--system-prompt", called_cmd) + self.assertEqual(result.output, "diff --git a/file ...") + mock_spinner.return_value.stop.assert_called_once() + + @patch("cross_eval.agent._Spinner") + def test_agentic_timeout_stops_spinner(self, mock_spinner: MagicMock) -> None: + spinner = mock_spinner.return_value + agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True) + with tempfile.TemporaryDirectory() as tmpdir: + repo = Path(tmpdir) + _init_git_repo(repo) + with patch( + "cross_eval.agent.subprocess.run", + side_effect=subprocess.TimeoutExpired(cmd=["codex"], timeout=20), + ): + with self.assertRaises(subprocess.TimeoutExpired): + invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False, timeout=20) + + spinner.stop.assert_called_once() + + @patch("cross_eval.agent.subprocess.run") + def test_agentic_nonzero_exit_raises_structured_error(self, mock_run: MagicMock) -> None: + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="unauthorized") + agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True) + + with tempfile.TemporaryDirectory() as tmpdir: + repo = Path(tmpdir) + _init_git_repo(repo) + with self.assertRaises(AgentInvocationError) as ctx: + invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=True) + + self.assertEqual(ctx.exception.failure_type, "AUTH") + + @patch("cross_eval.agent._Spinner") + def test_agentic_generic_exception_stops_spinner( + self, + mock_spinner: MagicMock, + ) -> None: + agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True) + + with tempfile.TemporaryDirectory() as tmpdir: + repo = Path(tmpdir) + _init_git_repo(repo) + with patch("cross_eval.agent.subprocess.run", side_effect=OSError("boom")): + with self.assertRaises(OSError): + invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False) + + mock_spinner.return_value.stop.assert_called_once() + + @patch("cross_eval.agent._Spinner") + @patch("cross_eval.agent.subprocess.run") + def test_agentic_failure_truncates_error( + self, + mock_run: MagicMock, + mock_spinner: MagicMock, + ) -> None: + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="x" * 600) + agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True) + + with tempfile.TemporaryDirectory() as tmpdir: + repo = Path(tmpdir) + _init_git_repo(repo) + with self.assertRaises(AgentInvocationError) as ctx: + invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False) + + self.assertEqual(len(ctx.exception.raw_error), 503) + mock_spinner.return_value.stop.assert_called_once() + + @patch("cross_eval.agent._Spinner") + @patch("cross_eval.worktree.capture_diff", return_value="") + @patch("cross_eval.agent.subprocess.run") + def test_agentic_empty_diff_failure_truncates_error_and_stops_spinner( + self, + mock_run: MagicMock, + mock_diff: MagicMock, + mock_spinner: MagicMock, + ) -> None: + mock_run.return_value = MagicMock( + returncode=0, + stdout="implemented", + stderr="permission denied " * 300, + ) + agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True) + + with tempfile.TemporaryDirectory() as tmpdir: + repo = Path(tmpdir) + _init_git_repo(repo) + with self.assertRaises(AgentInvocationError) as ctx: + invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False) + + self.assertLessEqual(len(ctx.exception.raw_error), 2003) + self.assertEqual(ctx.exception.failure_type, "WRITE_FAILURE") + mock_spinner.return_value.stop.assert_called_once() + + +class TestPipelineHelpers(unittest.TestCase): + @patch("cross_eval.worktree.commit_worktree", return_value=True) + def test_commit_iteration_logs_only_when_committed(self, mock_commit: MagicMock) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + _commit_iteration(Path(tmpdir), "review-fix", 2, "PASS") + mock_commit.assert_called_once() + + def test_snapshot_repo_state_includes_untracked_digest(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + repo = Path(tmpdir) + _init_git_repo(repo) + (repo / "scratch.txt").write_text("draft", encoding="utf-8") + + snapshot = _snapshot_repo_state(repo) + + self.assertIn("UNTRACKED scratch.txt", snapshot) + + def test_finalize_worktree_deletes_empty_branch(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + base = Path(tmpdir) / "repo" + base.mkdir() + _init_git_repo(base) + branch = "cross-eval/empty" + subprocess.run( + ["git", "branch", branch, "HEAD"], + cwd=base, + capture_output=True, + check=True, + ) + worktree = Path(tmpdir) / "wt" + subprocess.run( + ["git", "worktree", "add", str(worktree), branch], + cwd=base, + capture_output=True, + check=True, + ) + + branch_result = _finalize_worktree(base, worktree, branch, "review-fix", "PASS") + + self.assertIsNone(branch_result) + branches = subprocess.run( + ["git", "branch", "--list", branch], + cwd=base, + capture_output=True, + text=True, + check=True, + ) + self.assertEqual(branches.stdout.strip(), "") + + def test_format_runtime_error_markdown_for_generic_exception(self) -> None: + markdown = _format_runtime_error_markdown( + RuntimeError("boom"), + step_name="review", + agent_name="claude-reviewer", + phase_name="review_fix", + ) + self.assertIn("# Agent Error", markdown) + self.assertIn("review_fix", markdown) + self.assertIn("boom", markdown) + + def test_maybe_save_step_transcript_returns_none_without_transcript(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + result = AgentResult( + output="ok", + exit_code=0, + agent_name="claude-reviewer", + step_name="review", + duration_seconds=0.1, + ) + saved = _maybe_save_step_transcript(Path(tmpdir), 1, "review", result) + self.assertIsNone(saved) + + @patch("cross_eval.pipeline.invoke_agent") + def test_execute_step_saves_timeout_markdown(self, mock_invoke: MagicMock) -> None: + mock_invoke.side_effect = subprocess.TimeoutExpired( + cmd=["claude"], + timeout=45, + output="partial output", + stderr="still running", + ) + step = StepConfig( + name="review", + agent="claude-reviewer", + role="review", + prompt_template="default:review", + output_key="review_output", + ) + config = PipelineConfig( + agents={ + "claude-reviewer": AgentConfig( + name="claude-reviewer", + command="claude", + args=["-p"], + ), + }, + ) + step_outputs: dict[str, str] = {} + step_results: dict[str, AgentResult] = {} + + with tempfile.TemporaryDirectory() as tmpdir: + run_dir = Path(tmpdir) + with self.assertRaises(RuntimeError) as ctx: + _execute_step( + step, + config, + {"plan": "Plan", "checklist": "Checklist"}, + "", + 1, + 3, + run_dir, + 45, + False, + step_outputs, + step_results, + run_dir=run_dir, + output_iter=1, + ) + + self.assertIn("timed out after 45s", str(ctx.exception)) + error_path = run_dir / "v1" / "review_error.md" + self.assertTrue(error_path.exists()) + self.assertIn("# Agent Timeout", error_path.read_text(encoding="utf-8")) + + @patch("cross_eval.pipeline.invoke_agent") + def test_execute_step_saves_runtime_error_markdown(self, mock_invoke: MagicMock) -> None: + mock_invoke.side_effect = AgentInvocationError( + agent_name="claude-reviewer", + step_name="review", + cmd_preview="claude -p", + raw_error="api broke", + failure_type="API_ERROR", + suggested_action="retry", + ) + step = StepConfig( + name="review", + agent="claude-reviewer", + role="review", + prompt_template="default:review", + output_key="review_output", + ) + config = PipelineConfig( + agents={ + "claude-reviewer": AgentConfig( + name="claude-reviewer", + command="claude", + args=["-p"], + ), + }, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + run_dir = Path(tmpdir) + with self.assertRaises(AgentInvocationError): + _execute_step( + step, + config, + {"plan": "Plan", "checklist": "Checklist"}, + "", + 1, + 3, + run_dir, + 45, + False, + {}, + {}, + run_dir=run_dir, + output_iter=1, + ) + + error_text = (run_dir / "v1" / "review_error.md").read_text(encoding="utf-8") + self.assertIn("API_ERROR", error_text) + self.assertIn("retry", error_text) + + @patch("cross_eval.pipeline.invoke_agent") + def test_execute_parallel_batch_saves_success_and_timeout_error(self, mock_invoke: MagicMock) -> None: + def _fake_invoke(agent_config: AgentConfig, prompt: str, step_name: str, **kwargs: object) -> AgentResult: + if step_name == "review_ok": + return AgentResult( + output="VERDICT: PASS", + exit_code=0, + agent_name=agent_config.name, + step_name=step_name, + duration_seconds=0.1, + ) + raise subprocess.TimeoutExpired( + cmd=["codex"], + timeout=30, + output="halfway", + stderr="timeout stderr", + ) + + mock_invoke.side_effect = _fake_invoke + batch = [ + StepConfig( + name="review_ok", + agent="claude-reviewer", + role="review", + prompt_template="default:review", + output_key="review_ok", + parallel=True, + ), + StepConfig( + name="review_slow", + agent="codex-reviewer", + role="review", + prompt_template="default:review", + output_key="review_slow", + parallel=True, + ), + ] + config = PipelineConfig( + agents={ + "claude-reviewer": AgentConfig(name="claude-reviewer", command="claude", args=["-p"]), + "codex-reviewer": AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"]), + }, + ) + step_outputs: dict[str, str] = {} + step_results: dict[str, AgentResult] = {} + + with tempfile.TemporaryDirectory() as tmpdir: + run_dir = Path(tmpdir) + with self.assertRaises(RuntimeError) as ctx: + _execute_parallel_batch( + batch, + config, + {"plan": "Plan", "checklist": "Checklist"}, + "", + 1, + 3, + run_dir, + 30, + False, + step_outputs, + step_results, + run_dir=run_dir, + output_iter=1, + ) + + self.assertIn("Successful outputs were saved for: review_ok", str(ctx.exception)) + self.assertEqual(step_outputs["review_ok"], "VERDICT: PASS") + self.assertTrue((run_dir / "v1" / "review_ok.md").exists()) + self.assertTrue((run_dir / "v1" / "review_slow_error.md").exists()) + + @patch("cross_eval.pipeline._execute_step") + def test_execute_parallel_batch_dry_run_uses_sequential_path(self, mock_step: MagicMock) -> None: + batch = [ + StepConfig( + name="review_a", + agent="claude-reviewer", + role="review", + prompt_template="default:review", + output_key="review_a", + parallel=True, + ), + StepConfig( + name="review_b", + agent="codex-reviewer", + role="review", + prompt_template="default:review", + output_key="review_b", + parallel=True, + ), + ] + config = PipelineConfig(agents={}) + + with tempfile.TemporaryDirectory() as tmpdir: + _execute_parallel_batch( + batch, + config, + {"plan": "Plan"}, + "", + 1, + 3, + Path(tmpdir), + None, + True, + {}, + {}, + run_dir=Path(tmpdir), + output_iter=1, + ) + + self.assertEqual(mock_step.call_count, 2) + + @patch("cross_eval.pipeline._execute_step") + def test_execute_parallel_batch_agentic_steps_fall_back_to_sequential(self, mock_step: MagicMock) -> None: + batch = [ + StepConfig( + name="review_a", + agent="agentic-a", + role="review", + prompt_template="default:review", + output_key="review_a", + parallel=True, + ), + StepConfig( + name="review_b", + agent="agentic-b", + role="review", + prompt_template="default:review", + output_key="review_b", + parallel=True, + ), + ] + config = PipelineConfig( + agents={ + "agentic-a": AgentConfig(name="agentic-a", command="claude", agentic=True), + "agentic-b": AgentConfig(name="agentic-b", command="codex", agentic=True), + }, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + _execute_parallel_batch( + batch, + config, + {"plan": "Plan"}, + "", + 1, + 3, + Path(tmpdir), + None, + False, + {}, + {}, + run_dir=Path(tmpdir), + output_iter=1, + worktree_path=Path(tmpdir), + ) + + self.assertEqual(mock_step.call_count, 2) + + @patch("cross_eval.worktree.remove_worktree", side_effect=RuntimeError("cleanup failed")) + @patch("cross_eval.worktree.commit_worktree", side_effect=RuntimeError("commit failed")) + def test_finalize_worktree_handles_cleanup_failures( + self, + mock_commit: MagicMock, + mock_remove: MagicMock, + ) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + branch = _finalize_worktree( + Path(tmpdir), + Path(tmpdir) / "wt", + "cross-eval/fail", + "review-fix", + "FAIL", + ) + + self.assertIsNone(branch) + + +class TestRuntimeEnvironmentHelpers(unittest.TestCase): + def test_parse_dotenv_handles_export_and_quotes(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + env_path = Path(tmpdir) / ".env" + env_path.write_text( + "export FOO='bar'\nBAR=\"line\\nvalue\"\nINVALID\n=skip\n", + encoding="utf-8", + ) + values = parse_dotenv(env_path) + + self.assertEqual(values["FOO"], "bar") + self.assertEqual(values["BAR"], "line\nvalue") + self.assertNotIn("INVALID", values) + + def test_resolve_env_files_deduplicates_and_filters_missing(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + env_path = root / ".env" + env_path.write_text("FOO=bar\n", encoding="utf-8") + execution = ExecutionConfig( + env_files=[".env", str(env_path)], + auto_env_files=[".env", ".env.local"], + ) + + resolved = resolve_env_files(execution, root) + + self.assertEqual(resolved, [env_path.resolve()]) + + def test_summarize_environment_hides_names_when_disabled(self) -> None: + execution = ExecutionConfig(expose_env_names=False, auto_context_targets=["postgres"]) + summary = summarize_environment( + execution, + [], + {"DATABASE_URL": "postgres://localhost"}, + {}, + ) + self.assertIn("names are hidden", summary) + self.assertIn("Execution targets hinted by the user: postgres", summary) + + def test_build_execution_policy_for_minimal_mode(self) -> None: + policy = build_execution_policy( + ExecutionConfig(mode="agent-decides", command_policy="minimal"), + ) + self.assertIn("Command policy: minimal", policy) + self.assertIn("Keep command usage minimal", policy) + + +class TestWorktreeFailures(unittest.TestCase): + @patch("cross_eval.worktree.subprocess.run") + def test_create_worktree_raises_when_branch_creation_fails(self, mock_run: MagicMock) -> None: + mock_run.side_effect = subprocess.CalledProcessError( + 1, + ["git", "branch"], + stderr="branch failed", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + base = Path(tmpdir) + work_dir = base / "wt" + with self.assertRaises(WorktreeError) as ctx: + create_worktree(base, work_dir, "cross-eval/fail") + + self.assertIn("Failed to create branch", str(ctx.exception)) + + @patch("cross_eval.worktree.subprocess.run") + def test_create_worktree_cleans_branch_on_worktree_failure(self, mock_run: MagicMock) -> None: + mock_run.side_effect = [ + MagicMock(returncode=0), + subprocess.CalledProcessError( + 1, + ["git", "worktree", "add"], + stderr="worktree failed", + ), + MagicMock(returncode=0), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + base = Path(tmpdir) + work_dir = base / "wt" + with self.assertRaises(WorktreeError): + create_worktree(base, work_dir, "cross-eval/fail") + + cleanup_call = mock_run.call_args_list[-1] + self.assertEqual(cleanup_call[0][0][:3], ["git", "branch", "-D"]) + + @patch("cross_eval.worktree.shutil.rmtree") + @patch("cross_eval.worktree.subprocess.run") + def test_remove_worktree_falls_back_to_prune(self, mock_run: MagicMock, mock_rmtree: MagicMock) -> None: + mock_run.side_effect = [ + subprocess.CalledProcessError(1, ["git", "worktree", "remove"]), + MagicMock(returncode=0), + ] + + with tempfile.TemporaryDirectory() as tmpdir: + base = Path(tmpdir) / "repo" + work_dir = Path(tmpdir) / "wt" + base.mkdir() + work_dir.mkdir() + + remove_worktree(base, work_dir) + + resolved = work_dir.resolve() + mock_rmtree.assert_any_call(resolved, ignore_errors=True) + self.assertEqual(mock_run.call_args_list[-1][0][0], ["git", "worktree", "prune"])