feat: tighten agentic runtime handoffs and quality gates

This commit is contained in:
chungyeong
2026-03-14 10:05:25 +09:00
parent 87bc0ffbfb
commit 7b95233edf
15 changed files with 1148 additions and 167 deletions

View File

@@ -415,11 +415,7 @@ def invoke_agent_agentic(
timeout: int | None = None,
quiet: bool = False,
) -> AgentResult:
"""Invoke an agent in agentic mode (no -p, runs in worktree, captures git diff).
The agent runs without print mode so it can modify files directly.
After the agent exits, git diff (since last commit) is captured as the output.
"""
"""Invoke an agent in agentic mode using the worktree as the source of truth."""
from cross_eval.worktree import capture_diff
# Write prompt to a temp file (outside worktree, won't appear in diffs)
@@ -433,10 +429,10 @@ def invoke_agent_agentic(
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
# Strip stdin sentinel ("-") from args for agentic mode.
# Keep -p/--print: Claude -p mode still has full tool access (Edit, Write,
# Bash, etc.) and is the correct mode for non-interactive subprocess use.
args = [a for a in agent.args if a != "-"]
# Strip print-mode flags and stdin sentinels for agentic mode.
# Agentic runs should operate on the worktree and return a real git diff,
# not behave as a one-shot text completer.
args = [a for a in agent.args if a not in {"-", "-p", "--print"}]
cmd.extend(args)
# System prompt via flag if supported
@@ -454,8 +450,8 @@ def invoke_agent_agentic(
else:
input_data = prompt
else:
# claude -p: deliver prompt via stdin (same as codex).
# -p mode is non-interactive and reads from stdin, then exits.
# claude: deliver the task through stdin and let the worktree be the
# canonical place where files are read/written.
input_data = prompt
cmd_preview = " ".join(cmd[:6])

View File

@@ -266,7 +266,7 @@ def main(argv: list[str] | None = None) -> int:
type=int,
default=None,
metavar="SEC",
help="에이전트 호출 제한 시간 (--live 전용)",
help="에이전트 1회 호출 제한 시간(초). 0=무제한 (기본: 무제한, --live 전용)",
)
# --- run ---
@@ -981,6 +981,7 @@ def cmd_run(args: argparse.Namespace) -> int:
print(f"No files found in: {docs_dir}", file=sys.stderr)
return 1
config.inputs["docs"] = docs_content
config.inputs["docs_ref"] = str(docs_dir)
if args.env_files:
for env_file in args.env_files:
@@ -1007,7 +1008,6 @@ def cmd_run(args: argparse.Namespace) -> int:
apply_input_overrides(config, overrides)
# 3. Validate after all overrides
from cross_eval.config import validate_config
errors = validate_config(config)
if errors:
print("Config error:\n " + "\n ".join(errors), file=sys.stderr)

View File

@@ -698,9 +698,9 @@ def _validate_unique_step_fields(
def _make_agentic(agent: AgentConfig) -> None:
    """Convert an agent to agentic mode in-place.

    Sets ``agent.agentic`` to ``True`` and drops the print-mode flags
    (``-p`` and its long alias ``--print``) from ``agent.args``. All other
    arguments keep their relative order. Calling this on an agent that has
    no print-mode flag only flips ``agentic``.
    """
    agent.agentic = True
    # A single pass handles both spellings of the print flag.
    agent.args = [arg for arg in agent.args if arg not in {"-p", "--print"}]
def sync_phased_iterations(

View File

@@ -217,7 +217,7 @@ def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None:
if show_escalate:
print(f"\n{RED}{BOLD}{'=' * 50}")
print(f" Escalation Report")
print(" Escalation Report")
print(f"{'=' * 50}{RESET}")
print(f"{YELLOW}Human review required.{RESET}")
print(f" {RED}{RESET} Requirements are ambiguous — needs stakeholder clarification")

View File

@@ -3,7 +3,7 @@ from __future__ import annotations
import shutil
import subprocess
from dataclasses import dataclass, field
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

View File

@@ -343,6 +343,8 @@ def _run_simple_pipeline(
if step_results:
input_contents["execution_evidence"] = _format_execution_evidence(
step_results,
run_dir=run_dir,
iteration=i,
)
iterations.append(iter_result)
@@ -543,6 +545,8 @@ def _run_phased_pipeline(
if step_results:
input_contents["execution_evidence"] = _format_execution_evidence(
step_results,
run_dir=run_dir,
iteration=global_iter,
)
iterations.append(iter_result)
@@ -661,10 +665,13 @@ def _load_inputs(config: PipelineConfig) -> dict[str, str]:
"""Load input file contents from config."""
input_contents: dict[str, str] = {}
for key, val in config.inputs.items():
if isinstance(val, str):
if key.endswith("_ref"):
input_contents[key] = str(val)
elif isinstance(val, str):
input_contents[key] = val
else:
input_contents[key] = val.read_text(encoding="utf-8")
_refresh_input_references(config, input_contents)
return input_contents
@@ -673,10 +680,99 @@ def _refresh_inputs(
) -> None:
"""Re-read input files (they may have changed on disk)."""
for key, val in config.inputs.items():
if isinstance(val, str):
if key.endswith("_ref"):
input_contents[key] = str(val)
elif isinstance(val, str):
input_contents[key] = val
elif isinstance(val, Path) and val.exists():
input_contents[key] = val.read_text(encoding="utf-8")
_refresh_input_references(config, input_contents)
def _refresh_input_references(
    config: PipelineConfig,
    input_contents: dict[str, str],
) -> None:
    """Expose stable file references for canonical planning inputs.

    For every entry in ``config.inputs`` a companion ``<key>_ref`` value is
    written into *input_contents* (which is mutated in place):

    * keys that already end in ``_ref`` are copied through as strings;
    * ``Path`` inputs get their resolved absolute path;
    * any other input gets a placeholder, unless a reference is already
      present in *input_contents*.

    An explicitly configured ``<key>_ref`` always takes precedence over the
    derived one, independent of the ordering of ``config.inputs``.
    """
    inputs = config.inputs
    for key, val in inputs.items():
        if key.endswith("_ref"):
            input_contents[key] = str(val)
            continue
        ref_key = f"{key}_ref"
        if ref_key in inputs:
            # The explicit reference is written by its own loop iteration;
            # deriving one here could overwrite it depending on dict order.
            continue
        if isinstance(val, Path):
            input_contents[ref_key] = str(val.resolve())
        else:
            input_contents.setdefault(ref_key, f"(inline {key}; no file path available)")
def _git_ref(cwd: Path, *args: str) -> str:
    """Best-effort git metadata lookup.

    Runs ``git <args>`` with *cwd* as the working directory and returns the
    stripped standard output.

    Returns the literal ``"(unknown)"`` instead of raising when the lookup
    cannot produce a value: git exits non-zero, prints nothing, or the
    process cannot be started at all (git not installed, *cwd* missing).
    """
    try:
        result = subprocess.run(
            ["git", *args],
            cwd=cwd,
            capture_output=True,
            text=True,
        )
    except OSError:
        # Covers a missing git executable and a non-existent working
        # directory; a best-effort lookup must not abort the pipeline.
        return "(unknown)"
    if result.returncode != 0:
        return "(unknown)"
    return result.stdout.strip() or "(unknown)"
def _collect_markdown_refs(run_dir: Path, iteration: int) -> list[Path]:
    """Return the markdown artifacts of iterations ``1..iteration``.

    Each iteration is expected under ``run_dir / "v<N>"``. Directories that
    do not exist are skipped. Within one iteration the ``*.md`` files are
    sorted; iterations are emitted in ascending order.
    """
    iteration_dirs = (run_dir / f"v{number}" for number in range(1, iteration + 1))
    return [
        markdown_file
        for directory in iteration_dirs
        if directory.exists()
        for markdown_file in sorted(directory.glob("*.md"))
    ]
def _build_artifact_references(
    context: dict[str, str],
    *,
    cwd: Path,
    run_dir: Path,
    iteration: int,
    worktree_path: Path | None,
    step_results: dict[str, AgentResult] | None = None,
) -> str:
    """Build a compact reference-only handoff for agentic steps.

    The returned markdown lists where the canonical inputs live
    (``plan_ref`` / ``checklist_ref`` / ``docs_ref`` taken from *context*),
    the run and iteration directories, and the branch and commit of the
    target repository. The target repository is *worktree_path* when given,
    otherwise *cwd*. Markdown artifacts found for iterations up to
    *iteration* are appended, followed by the output (and, when a transcript
    exists, the transcript) path of every entry in *step_results*.
    """
    target_repo = worktree_path or cwd
    iteration_dir = run_dir / f"v{iteration}"

    # Branch first, then commit: same lookup order as the rendered lines.
    branch_name = _git_ref(target_repo, "rev-parse", "--abbrev-ref", "HEAD")
    head_commit = _git_ref(target_repo, "rev-parse", "HEAD")

    plan_ref = context.get("plan_ref", "(missing)")
    checklist_ref = context.get("checklist_ref", "(missing)")
    docs_ref = context.get("docs_ref", "(none)")

    report: list[str] = [
        "### Canonical References",
        f"- Plan: {plan_ref}",
        f"- Checklist: {checklist_ref}",
        f"- Docs: {docs_ref}",
        f"- Run directory: {run_dir}",
        f"- Current iteration directory: {iteration_dir}",
        f"- Target repository: {target_repo}",
        f"- Git branch: {branch_name}",
        f"- Git commit: {head_commit}",
        "",
        "Use git/cat to inspect the referenced files directly instead of relying on inline summaries.",
        f"Suggested git commands: `git -C {target_repo} show {head_commit}` and `git -C {target_repo} diff HEAD`",
    ]

    markdown_paths = _collect_markdown_refs(run_dir, iteration)
    if markdown_paths:
        report += ["", "### Markdown Artifacts"]
        report += [f"- {markdown_path}" for markdown_path in markdown_paths]

    if step_results:
        report += ["", "### Current Step Artifacts"]
        for step_result in step_results.values():
            output_path = iteration_dir / f"{step_result.step_name}.md"
            report.append(f"- Output: {output_path}")
            if not step_result.transcript:
                continue
            transcript_path = iteration_dir / f"{step_result.step_name}_transcript.md"
            report.append(f"- Transcript: {transcript_path}")

    return "\n".join(report)
# ---------------------------------------------------------------------------
@@ -850,6 +946,9 @@ def _execute_step(
# 2. Build context (include prior step results for evidence)
context = _build_context(
input_contents, step_outputs, feedback, iteration, max_iterations,
cwd=cwd,
run_dir=run_dir,
worktree_path=worktree_path,
step_results=step_results,
)
@@ -1031,6 +1130,9 @@ def _execute_parallel_batch(
template = resolve_template(step.prompt_template)
context = _build_context(
context_snapshot, {}, feedback, iteration, max_iterations,
cwd=cwd,
run_dir=run_dir,
worktree_path=worktree_path,
step_results=results_snapshot,
)
if step.context_override:
@@ -1145,6 +1247,10 @@ def _build_context(
feedback: str,
iteration: int,
max_iterations: int,
*,
cwd: Path | None = None,
run_dir: Path | None = None,
worktree_path: Path | None = None,
step_results: dict[str, AgentResult] | None = None,
) -> dict[str, str]:
"""Build the template context dict.
@@ -1160,11 +1266,25 @@ def _build_context(
context["feedback"] = feedback
context["iteration"] = str(iteration)
context["max_iterations"] = str(max_iterations)
ref_cwd = cwd or Path.cwd()
ref_run_dir = run_dir or ref_cwd / ".cross-eval" / "output" / "ad-hoc"
context["artifact_references"] = _build_artifact_references(
context,
cwd=ref_cwd,
run_dir=ref_run_dir,
iteration=iteration,
worktree_path=worktree_path,
step_results=step_results,
)
# Surface execution evidence from prior steps so reviewers can inspect it.
# Prior-iteration evidence may already live in context via input_contents.
prior_evidence = context.get("execution_evidence", "")
if step_results:
current_evidence = _format_execution_evidence(step_results)
current_evidence = _format_execution_evidence(
step_results,
run_dir=ref_run_dir,
iteration=iteration,
)
if prior_evidence and prior_evidence != "(no prior execution evidence)":
context["execution_evidence"] = (
"# Prior Iteration Evidence\n"
@@ -1179,12 +1299,14 @@ def _build_context(
def _format_execution_evidence(
step_results: dict[str, AgentResult],
*,
run_dir: Path | None = None,
iteration: int | None = None,
) -> str:
"""Format execution evidence from prior steps for reviewer consumption.
Produces a compact summary of command, exit code, duration, and a truncated
transcript excerpt for each completed step so that reviewers and seniors
can verify claims against real execution data.
Produces a compact summary of command, exit code, duration, and artifact
paths so that later agents can read markdown/git state directly.
"""
if not step_results:
return "(no prior execution evidence)"
@@ -1198,12 +1320,12 @@ def _format_execution_evidence(
f"- Output size: {len(result.output)} chars",
]
section = [line for line in section if line]
if result.transcript:
# Include a truncated transcript excerpt for debugging
excerpt = result.transcript[:2000]
if len(result.transcript) > 2000:
excerpt += "\n... (truncated)"
section.append(f"\n<details>\n<summary>Transcript excerpt</summary>\n\n{excerpt}\n</details>")
if run_dir is not None and iteration is not None:
section.append(f"- Output artifact: {run_dir / f'v{iteration}' / f'{result.step_name}.md'}")
if result.transcript:
section.append(
f"- Transcript artifact: {run_dir / f'v{iteration}' / f'{result.step_name}_transcript.md'}"
)
parts.append("\n".join(section))
return "\n\n---\n\n".join(parts)
@@ -1455,7 +1577,7 @@ def _format_runtime_error_markdown(
f"- **Suggested Action**: {exc.suggested_action}",
"",
"## Command",
f"```",
"```",
exc.cmd_preview,
"```",
"",

View File

@@ -15,58 +15,39 @@ from cross_eval.models import PhaseConfig, StepConfig
CODING_TEMPLATE = """\
You are tasked with implementing code based on a plan and checklist.
## Plan
{plan}
## Checklist
{checklist}
## Reference Documents
{docs}
## Previous Review Feedback
{feedback}
## Artifact References
{artifact_references}
## Iteration
This is iteration {iteration} of {max_iterations}.
## Instructions
1. Explore the project directory to understand the existing codebase structure.
2. Implement ONLY what the plan specifies. Do NOT add extra features, \
1. Read the referenced plan/checklist/docs/review artifacts directly from disk.
2. Explore the project directory and git state to understand the current codebase structure.
3. Implement ONLY what the plan specifies. Do NOT add extra features, \
unnecessary abstractions, or premature optimizations.
3. Follow every item in the checklist.
4. If there is previous feedback, address ONLY the specific issues mentioned.
5. If previous feedback contains items marked as DISMISSED or false positive, \
4. Follow every item in the checklist.
5. If there is previous feedback in the referenced markdown artifacts, address ONLY those issues.
6. If previous feedback contains items marked as DISMISSED or false positive, \
IGNORE those items — they have been verified as correct.
6. Output the complete implementation.
7. Prefer git and markdown artifacts as the source of truth. Use commit hashes, `git show`, `git diff`, and referenced markdown files instead of relying on inline summaries.
8. Output the complete implementation.
"""
REVIEW_TEMPLATE = """\
You are tasked with reviewing code against a plan and checklist.
## Plan
{plan}
## Checklist
{checklist}
## Reference Documents
{docs}
## Coding Output / Previous Step Output
{coding_output}
## Previous Review Feedback
{feedback}
## Artifact References
{artifact_references}
## Execution Evidence
{execution_evidence}
## Review Instructions
Explore the project directory to understand the full codebase context, \
then evaluate the code against ONLY the plan and checklist above. \
Use the execution evidence above to verify agent claims against actual \
command outputs and exit codes.
Read the referenced plan/checklist/docs/review artifacts directly from disk. \
Inspect the referenced commit/git state and markdown artifacts, then evaluate \
the code against ONLY the plan and checklist. Use the execution evidence above \
to verify agent claims against actual command outputs, artifact paths, and exit codes.
For each issue found, classify it with BOTH severity AND category:
@@ -127,55 +108,36 @@ Otherwise output: VERDICT: FAIL
CODING_TEMPLATE_KO = """\
당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 이전 리뷰 피드백
{feedback}
## 참조 아티팩트
{artifact_references}
## 반복 정보
현재 {max_iterations}회 중 {iteration}번째 반복입니다.
## 지침
1. 프로젝트 디렉토리를 탐색하여 기존 코드베이스 구조를 파악하세요.
2. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
3. 체크리스트의 모든 항목을 충족하세요.
4. 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
5. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
6. 완전한 구현을 출력하세요.
1. 참조된 plan/checklist/docs/review markdown를 직접 읽으세요.
2. 프로젝트 디렉토리와 git 상태를 탐색하여 현재 코드베이스 구조를 파악하세요.
3. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
4. 체크리스트의 모든 항목을 충족하세요.
5. 참조된 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
6. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
7. inline 요약보다 git commit hash, `git show`, `git diff`, markdown 아티팩트를 우선 사용하세요.
8. 완전한 구현을 출력하세요.
"""
REVIEW_TEMPLATE_KO = """\
당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 검토 대상 코드
{coding_output}
## 이전 리뷰 피드백
{feedback}
## 참조 아티팩트
{artifact_references}
## 실행 증거
{execution_evidence}
## 검토 지침
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스 맥락을 파악한 뒤, \
위 기획서와 체크리스트 기준으로만 코드를 평가하세요. \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력 종료 코드로 검증하세요.
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
그 내용을 기준으로만 코드를 평가하세요. \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
@@ -307,25 +269,16 @@ Otherwise output: VERDICT: FAIL
REVIEW_ONLY_TEMPLATE_KO = """\
당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
{feedback}
## 참조 아티팩트
{artifact_references}
## 실행 증거
{execution_evidence}
## 검토 지침
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \
위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요. \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력 종료 코드로 검증하세요.
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
그 내용을 기준으로 **기존 코드**를 평가하세요. \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
@@ -522,23 +475,8 @@ PLAN_REVIEW_TEMPLATE_KO = """\
AGGREGATE_REVIEW_TEMPLATE = """\
You are adjudicating multiple review results and turning them into an actionable decision.
## Plan
{plan}
## Checklist
{checklist}
## Reference Documents
{docs}
## Candidate Outputs
{candidate_outputs}
## Reviewer Findings
{reviews_bundle}
## Previous Verification Feedback
{feedback}
## Artifact References
{artifact_references}
## Previous Issue Tracker
{previous_senior_tracker}
@@ -547,9 +485,10 @@ You are adjudicating multiple review results and turning them into an actionable
{execution_evidence}
## Instructions
Explore the project directory to confirm the current codebase state. \
Use the execution evidence above to verify claims against actual command \
outputs and exit codes. Then:
Read the referenced plan/checklist/docs/review artifacts directly from disk. \
Explore the project directory and the referenced git commit/diff to confirm the \
current codebase state. Use the execution evidence above to verify claims against \
actual command outputs, artifact paths, and exit codes. Then:
1. Deduplicate overlapping issues across reviewers.
2. Resolve disagreements explicitly.
3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
@@ -594,23 +533,8 @@ VERDICT: PASS or VERDICT: FAIL or VERDICT: ESCALATE
AGGREGATE_REVIEW_TEMPLATE_KO = """\
당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 후보 결과물
{candidate_outputs}
## 개별 리뷰 결과
{reviews_bundle}
## 이전 검증 피드백
{feedback}
## 참조 아티팩트
{artifact_references}
## 이전 이슈 트래커
{previous_senior_tracker}
@@ -619,8 +543,8 @@ AGGREGATE_REVIEW_TEMPLATE_KO = """\
{execution_evidence}
## 지침
프로젝트 디렉토리를 탐색하여 현재 코드베이스 상태를 확인한 뒤, \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력 종료 코드로 검증하세요. \
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽어 현재 코드베이스 상태를 확인한 뒤, \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요. \
그런 다음 아래를 수행하세요.
1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
2. 의견 충돌은 명시적으로 정리하세요.

View File

@@ -11,8 +11,58 @@ dependencies = [
"pyyaml>=6.0",
]
[project.optional-dependencies]
dev = [
"coverage[toml]>=7.6",
"pyright>=1.1.390",
"pytest-cov>=6.0",
"ruff>=0.8.0",
]
[project.scripts]
cross-eval = "cross_eval.cli:main"
[tool.setuptools.packages.find]
include = ["cross_eval*"]
[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = "-q"
[tool.ruff]
target-version = "py39"
extend-exclude = [".cross-eval"]
[tool.ruff.lint]
select = ["F"]
[tool.pyright]
include = ["cross_eval", "tests"]
exclude = [".cross-eval"]
typeCheckingMode = "basic"
pythonVersion = "3.9"
reportMissingImports = true
reportMissingTypeStubs = false
[tool.coverage.run]
branch = true
source = ["cross_eval"]
omit = [
"cross_eval/config.py",
"cross_eval/discovery.py",
"cross_eval/cli.py",
"cross_eval/demo.py",
"cross_eval/doctor.py",
"cross_eval/prompts.py",
"cross_eval/report.py",
]
[tool.coverage.report]
skip_empty = true
show_missing = true
fail_under = 90
exclude_lines = [
"pragma: no cover",
"if TYPE_CHECKING:",
"raise NotImplementedError",
]

View File

@@ -12,10 +12,10 @@ import subprocess
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, call, patch
from unittest.mock import MagicMock, patch
from cross_eval.agent import AgentInvocationError, invoke_agent_agentic
from cross_eval.config import BUILTIN_AGENTS, _make_agentic
from cross_eval.config import _make_agentic
from cross_eval.models import (
AgentConfig,
AgentResult,
@@ -24,8 +24,6 @@ from cross_eval.models import (
)
from cross_eval.pipeline import (
_assert_base_repo_isolation,
_commit_iteration,
_finalize_worktree,
_has_agentic_steps,
_setup_worktree,
run_pipeline,
@@ -267,6 +265,7 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase):
break
self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'claude'")
assert agent_call is not None
cmd = agent_call[0][0]
# No -p flag
@@ -274,6 +273,7 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase):
# Prompt is delivered via stdin (input kwarg), not as a positional arg
input_data = agent_call[1].get("input")
self.assertIsNotNone(input_data)
assert input_data is not None
self.assertIn("implement feature X", input_data)
@@ -311,6 +311,7 @@ class TestInvokeAgentAgenticCodex(unittest.TestCase):
break
self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'codex'")
assert agent_call is not None
cmd = agent_call[0][0]
# Should have "-" sentinel at the end for stdin
@@ -318,6 +319,7 @@ class TestInvokeAgentAgenticCodex(unittest.TestCase):
# Stdin input should contain the prompt
input_data = agent_call[1].get("input")
self.assertIsNotNone(input_data)
assert input_data is not None
self.assertIn("implement feature Y", input_data)
@@ -435,6 +437,16 @@ class TestMakeAgenticClaude(unittest.TestCase):
self.assertNotIn("-p", agent.args)
self.assertIn("--setting-sources", agent.args)
def test_strips_dash_dash_print_alias(self) -> None:
    """The long ``--print`` spelling is removed when an agent is made agentic."""
    configured_args = ["--print", "--setting-sources", "user"]
    agent = AgentConfig(name="claude-coder", command="claude", args=configured_args)

    _make_agentic(agent)

    self.assertTrue(agent.agentic)
    self.assertNotIn("--print", agent.args)
def test_idempotent_when_no_dash_p(self) -> None:
agent = AgentConfig(
name="claude-coder",

View File

@@ -26,7 +26,6 @@ from cross_eval.models import (
PhaseConfig,
PipelineConfig,
PipelineResult,
ReviewMetrics,
StepConfig,
)
from cross_eval.pipeline import (
@@ -54,7 +53,7 @@ from cross_eval.prompts import (
_build_review_only_preset,
_build_simple_preset,
)
from cross_eval.report import build_report, parse_review_metrics, print_escalation_report
from cross_eval.report import build_report, parse_review_metrics
class BuiltinAgentConfigTest(unittest.TestCase):
def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None:

View File

@@ -26,10 +26,9 @@ from cross_eval.models import (
IterationResult,
PipelineConfig,
PipelineResult,
ReviewMetrics,
StepConfig,
)
from cross_eval.pipeline import _format_execution_evidence, run_pipeline
from cross_eval.pipeline import _build_artifact_references, _format_execution_evidence, run_pipeline
from cross_eval.report import build_report
@@ -59,7 +58,7 @@ class TestFormatExecutionEvidence(unittest.TestCase):
self.assertIn("Exit code: 0", evidence)
self.assertIn("12.3s", evidence)
self.assertIn("claude --setting-sources user", evidence)
self.assertIn("Transcript excerpt", evidence)
self.assertNotIn("Transcript excerpt", evidence)
def test_multiple_results_separated(self) -> None:
r1 = AgentResult(
@@ -88,10 +87,60 @@ class TestFormatExecutionEvidence(unittest.TestCase):
transcript=long_transcript,
)
evidence = _format_execution_evidence({"key": result})
self.assertIn("truncated", evidence)
# The full 3000-char transcript should NOT appear
self.assertNotIn("x" * 3000, evidence)
def test_artifact_paths_included_when_run_dir_provided(self) -> None:
    """Evidence lists output and transcript artifact paths for the iteration.

    The expected paths are built with ``pathlib`` rather than hard-coded
    with "/" so the check does not depend on the platform path separator,
    and so it pins the full path under the run directory.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        run_dir = Path(tmpdir)
        result = AgentResult(
            output="diff",
            exit_code=0,
            agent_name="coder",
            step_name="coding",
            duration_seconds=1.2,
            transcript="stdout",
            command_preview="claude ...",
        )
        evidence = _format_execution_evidence(
            {"coding_output": result},
            run_dir=run_dir,
            iteration=2,
        )
    iteration_dir = run_dir / "v2"
    self.assertIn(str(iteration_dir / "coding.md"), evidence)
    self.assertIn(str(iteration_dir / "coding_transcript.md"), evidence)
class TestArtifactReferences(unittest.TestCase):
    """Artifact references should prefer file paths and git state over inline text."""

    def test_contains_input_refs_and_git_context(self) -> None:
        """A committed repo yields plan, commit and git-command lines."""
        import subprocess

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir) / "repo"
            repo.mkdir()
            for filename, body in (("plan.md", "plan"), ("checklist.md", "checklist")):
                (repo / filename).write_text(body, encoding="utf-8")

            # Build a one-commit repository so branch/commit lookups succeed.
            git_steps = (
                ["git", "init"],
                ["git", "config", "user.email", "test@test.com"],
                ["git", "config", "user.name", "Test"],
                ["git", "add", "."],
                ["git", "commit", "-m", "init"],
            )
            for argv in git_steps:
                subprocess.run(argv, cwd=repo, capture_output=True, check=True)

            context = {
                "plan_ref": str((repo / "plan.md").resolve()),
                "checklist_ref": str((repo / "checklist.md").resolve()),
                "docs_ref": "(none)",
            }
            refs = _build_artifact_references(
                context,
                cwd=repo,
                run_dir=repo / ".cross-eval" / "output" / "run",
                iteration=1,
                worktree_path=None,
            )

        for expected in ("Plan:", "Git commit:", "Suggested git commands"):
            self.assertIn(expected, refs)
# ---------------------------------------------------------------------------
# 2. Evidence in reviewer prompts (integration)
@@ -162,7 +211,7 @@ class TestEvidenceInReviewerPrompt(unittest.TestCase):
]
self.assertTrue(len(review_prompts) >= 1)
review_prompt = review_prompts[0]["prompt"]
# Evidence section should reference the coding step's command
self.assertIn("Artifact References", review_prompt)
self.assertIn("Execution Evidence", review_prompt)
self.assertIn("claude-coder", review_prompt)

View File

@@ -11,7 +11,6 @@ from cross_eval.doctor import (
check_cli_installed,
check_config,
format_doctor_results,
run_doctor,
)
from cross_eval.demo import (
DEMO_CHECKLIST,

View File

@@ -8,9 +8,7 @@ from unittest.mock import patch
from cross_eval.config import BUILTIN_AGENTS
from cross_eval.models import (
AgentConfig,
AgentResult,
PhaseConfig,
PipelineConfig,
StepConfig,
)

View File

@@ -390,6 +390,7 @@ class TranscriptSavingRegressionTest(unittest.TestCase):
# Verify transcript files were saved
run_dir = result.run_dir
self.assertIsNotNone(run_dir)
assert run_dir is not None
coding_transcript = run_dir / "v1" / "coding_transcript.md"
review_transcript = run_dir / "v1" / "review_transcript.md"
self.assertTrue(

831
tests/test_runtime_misc.py Normal file
View File

@@ -0,0 +1,831 @@
from __future__ import annotations
import re
import subprocess
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, patch
from cross_eval.agent import (
AgentInvocationError,
_build_transcript,
_classify_agent_failure,
invoke_agent,
invoke_agent_agentic,
)
from cross_eval.models import AgentConfig, AgentResult, ExecutionConfig, PipelineConfig, StepConfig
from cross_eval.pipeline import (
_commit_iteration,
_execute_parallel_batch,
_execute_step,
_finalize_worktree,
_format_runtime_error_markdown,
_maybe_save_step_transcript,
_snapshot_repo_state,
)
from cross_eval.runtime_env import (
build_execution_policy,
parse_dotenv,
resolve_env_files,
summarize_environment,
)
from cross_eval.worktree import WorktreeError, create_worktree, remove_worktree
def _init_git_repo(path: Path) -> None:
    """Turn *path* into a git repository holding one initial commit.

    Configures a throwaway committer identity, writes ``README.md`` and
    commits it. Any failing git command raises ``CalledProcessError``
    because every call runs with ``check=True``.
    """

    def _git(*argv: str) -> None:
        subprocess.run(["git", *argv], cwd=path, capture_output=True, check=True)

    _git("init")
    _git("config", "user.email", "test@test.com")
    _git("config", "user.name", "Test")
    (path / "README.md").write_text("# init\n", encoding="utf-8")
    _git("add", ".")
    _git("commit", "-m", "initial")
class TestInvokeAgentRuntime(unittest.TestCase):
@patch("cross_eval.agent.subprocess.run")
def test_interactive_claude_reads_output_file(self, mock_run: MagicMock) -> None:
def _fake_run(cmd: list[str], **kwargs: object) -> MagicMock:
match = re.search(r"Write your complete output to (.+)\.$", cmd[-1])
self.assertIsNotNone(match)
assert match is not None
Path(match.group(1)).write_text("review result", encoding="utf-8")
return MagicMock(returncode=0, stdout="", stderr="")
mock_run.side_effect = _fake_run
agent = AgentConfig(
name="claude-reviewer",
command="claude",
args=["--model", "opus"],
system_prompt="system",
)
result = invoke_agent(agent, "inspect code", "review", quiet=True)
self.assertEqual(result.output, "review result")
called_cmd = mock_run.call_args[0][0]
self.assertIn("--system-prompt", called_cmd)
@patch("cross_eval.agent.subprocess.run")
def test_interactive_claude_falls_back_to_stdout(self, mock_run: MagicMock) -> None:
mock_run.return_value = MagicMock(returncode=0, stdout="stdout fallback", stderr="")
agent = AgentConfig(name="claude-reviewer", command="claude", args=["--model", "opus"])
result = invoke_agent(agent, "inspect code", "review", quiet=True)
self.assertEqual(result.output, "stdout fallback")
@patch("cross_eval.agent.subprocess.run")
def test_non_claude_wraps_system_prompt_in_stdin(self, mock_run: MagicMock) -> None:
mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
agent = AgentConfig(
name="custom-reviewer",
command="custom-cli",
args=["run"],
system_prompt="strict mode",
)
invoke_agent(agent, "check things", "review", quiet=True)
self.assertEqual(
mock_run.call_args.kwargs["input"],
"<system>\nstrict mode\n</system>\n\ncheck things",
)
@patch("cross_eval.agent.subprocess.run")
def test_failure_raises_structured_error(self, mock_run: MagicMock) -> None:
mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="API Error: backend down")
agent = AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"])
with self.assertRaises(AgentInvocationError) as ctx:
invoke_agent(agent, "check", "review", quiet=True)
self.assertEqual(ctx.exception.failure_type, "API_ERROR")
self.assertIn("backend down", ctx.exception.raw_error)
def test_classify_unknown_failure(self) -> None:
failure_type, suggested_action = _classify_agent_failure("weird crash")
self.assertEqual(failure_type, "UNKNOWN")
self.assertIn("Inspect", suggested_action)
def test_build_transcript_includes_cwd_and_duration(self) -> None:
transcript = _build_transcript(
command_preview="claude -p",
stdout="ok",
stderr="",
exit_code=0,
duration_seconds=1.2,
cwd="/tmp/repo",
)
self.assertIn("## Working Directory", transcript)
self.assertIn("## Duration: 1.2s", transcript)
@patch("cross_eval.agent._Spinner")
@patch("cross_eval.agent.subprocess.run")
def test_timeout_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
spinner = mock_spinner.return_value
mock_run.side_effect = subprocess.TimeoutExpired(cmd=["claude"], timeout=12)
agent = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])
with self.assertRaises(subprocess.TimeoutExpired):
invoke_agent(agent, "inspect code", "review", quiet=False, timeout=12)
spinner.stop.assert_called_once()
@patch("cross_eval.agent._Spinner")
@patch("cross_eval.agent.subprocess.run")
def test_generic_exception_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
spinner = mock_spinner.return_value
mock_run.side_effect = OSError("boom")
agent = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])
with self.assertRaises(OSError):
invoke_agent(agent, "inspect code", "review", quiet=False)
spinner.stop.assert_called_once()
@patch("cross_eval.agent.logger.warning")
@patch("cross_eval.agent.subprocess.run")
def test_empty_output_logs_warning(self, mock_run: MagicMock, mock_warning: MagicMock) -> None:
mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
agent = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])
result = invoke_agent(agent, "inspect code", "review", quiet=True)
self.assertEqual(result.output, "")
mock_warning.assert_called_once()
@patch("cross_eval.agent.subprocess.run")
def test_print_mode_claude_uses_native_system_prompt_flag(self, mock_run: MagicMock) -> None:
mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
agent = AgentConfig(
name="claude-reviewer",
command="claude",
args=["-p"],
system_prompt="be strict",
)
invoke_agent(agent, "review this", "review", quiet=True)
called_cmd = mock_run.call_args[0][0]
self.assertIn("--system-prompt", called_cmd)
self.assertEqual(mock_run.call_args.kwargs["input"], "review this")
@patch("cross_eval.agent.subprocess.run")
def test_interactive_failure_truncates_error_and_removes_output_file(
    self,
    mock_run: MagicMock,
) -> None:
    # The agent has no ``-p`` flag here. The fake runner pulls the output
    # file path out of the final command argument so the test can verify
    # that the file is gone once the invocation has failed.
    captured_path: Path | None = None

    def _fake_run(cmd: list[str], **kwargs: object) -> MagicMock:
        nonlocal captured_path
        found = re.search(r"Write your complete output to (.+)\.$", cmd[-1])
        self.assertIsNotNone(found)
        assert found is not None  # narrows the Optional for type checkers
        captured_path = Path(found.group(1))
        return MagicMock(returncode=1, stdout="", stderr="x" * 600)

    mock_run.side_effect = _fake_run
    reviewer = AgentConfig(
        name="claude-reviewer",
        command="claude",
        args=["--model", "opus"],
    )

    with self.assertRaises(AgentInvocationError) as raised:
        invoke_agent(reviewer, "inspect code", "review", quiet=True)

    # 600 characters of stderr are expected to shrink to 503; presumably a
    # 500-character cap plus a three-character marker (confirm in the agent module).
    self.assertEqual(len(raised.exception.raw_error), 503)
    self.assertIsNotNone(captured_path)
    assert captured_path is not None  # narrows the Optional for type checkers
    self.assertFalse(captured_path.exists())
@patch("cross_eval.agent.logger.warning")
@patch("cross_eval.agent.subprocess.run")
def test_empty_output_with_stderr_logs_stderr_warning(
    self,
    mock_run: MagicMock,
    mock_warning: MagicMock,
) -> None:
    # Empty stdout but non-empty stderr: the first positional argument of
    # the last warning call must mention "stderr:".
    stderr_only = MagicMock(returncode=0, stdout="", stderr="stderr text")
    mock_run.return_value = stderr_only
    reviewer = AgentConfig(
        name="claude-reviewer",
        command="claude",
        args=["-p"],
    )

    invoke_agent(reviewer, "inspect code", "review", quiet=True)

    logged_message = mock_warning.call_args[0][0]
    self.assertIn("stderr:", logged_message)
class TestInvokeAgenticRuntime(unittest.TestCase):
    """Tests for ``invoke_agent_agentic``: command assembly, spinner cleanup, error mapping.

    ``cross_eval.agent.subprocess.run`` is patched in every test, and each
    test calls ``_init_git_repo`` on a fresh temporary directory before
    invoking the agent.
    """

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_codex_agentic_adds_reasoning_and_system_wrapper(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        # Codex with a reasoning effort and a system prompt: the command must
        # contain ``-c``, end with the ``-`` stdin sentinel, and the stdin
        # payload must carry a ``<system>`` wrapper.
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        agent = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto"],
            system_prompt="strict mode",
            reasoning_effort="high",
            agentic=True,
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=True)
        called_cmd = mock_run.call_args[0][0]
        self.assertIn("-c", called_cmd)
        self.assertEqual(called_cmd[-1], "-")
        self.assertIn("<system>", mock_run.call_args.kwargs["input"])

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_claude_success_uses_system_prompt_and_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        # Claude configured with both print-mode spellings: neither may reach
        # the command line, the system prompt goes through its native flag,
        # and the result output is the (mocked) captured diff.
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        agent = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["-p", "--print"],
            system_prompt="stay in scope",
            agentic=True,
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            result = invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False)
        called_cmd = mock_run.call_args[0][0]
        self.assertNotIn("-p", called_cmd)
        # The long spelling is configured too, so verify it is stripped as
        # well; checking only ``-p`` would let a surviving ``--print`` pass.
        self.assertNotIn("--print", called_cmd)
        self.assertIn("--system-prompt", called_cmd)
        self.assertEqual(result.output, "diff --git a/file ...")
        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    def test_agentic_timeout_stops_spinner(self, mock_spinner: MagicMock) -> None:
        # A timeout from the subprocess call propagates and the spinner is
        # stopped exactly once.
        spinner = mock_spinner.return_value
        agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with patch(
                "cross_eval.agent.subprocess.run",
                side_effect=subprocess.TimeoutExpired(cmd=["codex"], timeout=20),
            ):
                with self.assertRaises(subprocess.TimeoutExpired):
                    invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False, timeout=20)
        spinner.stop.assert_called_once()

    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_nonzero_exit_raises_structured_error(self, mock_run: MagicMock) -> None:
        # Exit code 1 with "unauthorized" on stderr must surface as an
        # AgentInvocationError classified as "AUTH".
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="unauthorized")
        agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=True)
        self.assertEqual(ctx.exception.failure_type, "AUTH")

    @patch("cross_eval.agent._Spinner")
    def test_agentic_generic_exception_stops_spinner(
        self,
        mock_spinner: MagicMock,
    ) -> None:
        # An OSError from the subprocess call propagates and the spinner is
        # stopped exactly once.
        agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with patch("cross_eval.agent.subprocess.run", side_effect=OSError("boom")):
                with self.assertRaises(OSError):
                    invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False)
        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_failure_truncates_error(
        self,
        mock_run: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        # 600 characters of stderr on a failing run are expected to shrink to
        # 503; presumably a 500-character cap plus a three-character marker
        # (confirm against the agent module).
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="x" * 600)
        agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False)
        self.assertEqual(len(ctx.exception.raw_error), 503)
        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_empty_diff_failure_truncates_error_and_stops_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        # The process exits 0 but the captured diff is empty while stderr
        # reports permission problems: this must raise a WRITE_FAILURE whose
        # raw error is bounded at 2003 characters.
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="implemented",
            stderr="permission denied " * 300,
        )
        agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False)
        self.assertLessEqual(len(ctx.exception.raw_error), 2003)
        self.assertEqual(ctx.exception.failure_type, "WRITE_FAILURE")
        mock_spinner.return_value.stop.assert_called_once()
class TestPipelineHelpers(unittest.TestCase):
    """Tests for pipeline helpers: commits, snapshots, error reports, and step execution."""

    @staticmethod
    def _review_step(name: str, agent: str, output_key: str, **extra: object) -> StepConfig:
        # Build a review-role step on the default review template; ``extra``
        # carries optional StepConfig fields such as ``parallel``.
        return StepConfig(
            name=name,
            agent=agent,
            role="review",
            prompt_template="default:review",
            output_key=output_key,
            **extra,
        )

    @patch("cross_eval.worktree.commit_worktree", return_value=True)
    def test_commit_iteration_logs_only_when_committed(self, mock_commit: MagicMock) -> None:
        # commit_worktree is stubbed to return True; it must be called once.
        with tempfile.TemporaryDirectory() as scratch:
            _commit_iteration(Path(scratch), "review-fix", 2, "PASS")
        mock_commit.assert_called_once()

    def test_snapshot_repo_state_includes_untracked_digest(self) -> None:
        # A file written after repository setup must show up in the snapshot
        # under an "UNTRACKED" entry.
        with tempfile.TemporaryDirectory() as scratch:
            repo_dir = Path(scratch)
            _init_git_repo(repo_dir)
            untracked_file = repo_dir / "scratch.txt"
            untracked_file.write_text("draft", encoding="utf-8")
            state = _snapshot_repo_state(repo_dir)
        self.assertIn("UNTRACKED scratch.txt", state)

    def test_finalize_worktree_deletes_empty_branch(self) -> None:
        # A branch created at HEAD and checked out as a worktree receives no
        # changes; finalizing must return None and the branch must no longer
        # be listed.
        branch_name = "cross-eval/empty"
        with tempfile.TemporaryDirectory() as scratch:
            scratch_root = Path(scratch)
            repo_dir = scratch_root / "repo"
            repo_dir.mkdir()
            _init_git_repo(repo_dir)
            subprocess.run(
                ["git", "branch", branch_name, "HEAD"],
                cwd=repo_dir,
                capture_output=True,
                check=True,
            )
            worktree_dir = scratch_root / "wt"
            subprocess.run(
                ["git", "worktree", "add", str(worktree_dir), branch_name],
                cwd=repo_dir,
                capture_output=True,
                check=True,
            )

            kept_branch = _finalize_worktree(repo_dir, worktree_dir, branch_name, "review-fix", "PASS")
            self.assertIsNone(kept_branch)

            listing = subprocess.run(
                ["git", "branch", "--list", branch_name],
                cwd=repo_dir,
                capture_output=True,
                text=True,
                check=True,
            )
            self.assertEqual(listing.stdout.strip(), "")

    def test_format_runtime_error_markdown_for_generic_exception(self) -> None:
        # A plain RuntimeError is rendered with the generic heading, the
        # phase name, and the exception message.
        rendered = _format_runtime_error_markdown(
            RuntimeError("boom"),
            step_name="review",
            agent_name="claude-reviewer",
            phase_name="review_fix",
        )
        for expected_fragment in ("# Agent Error", "review_fix", "boom"):
            self.assertIn(expected_fragment, rendered)

    def test_maybe_save_step_transcript_returns_none_without_transcript(self) -> None:
        # The result below is built without any transcript argument, so
        # nothing should be saved.
        transcriptless = AgentResult(
            output="ok",
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=0.1,
        )
        with tempfile.TemporaryDirectory() as scratch:
            saved_path = _maybe_save_step_transcript(Path(scratch), 1, "review", transcriptless)
        self.assertIsNone(saved_path)

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_step_saves_timeout_markdown(self, mock_invoke: MagicMock) -> None:
        # A timeout from the agent becomes a RuntimeError mentioning the
        # timeout, and an "# Agent Timeout" report is written under v1/.
        mock_invoke.side_effect = subprocess.TimeoutExpired(
            cmd=["claude"],
            timeout=45,
            output="partial output",
            stderr="still running",
        )
        review = self._review_step("review", "claude-reviewer", "review_output")
        pipeline = PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(
                    name="claude-reviewer",
                    command="claude",
                    args=["-p"],
                ),
            },
        )
        outputs: dict[str, str] = {}
        results: dict[str, AgentResult] = {}
        with tempfile.TemporaryDirectory() as scratch:
            run_root = Path(scratch)
            with self.assertRaises(RuntimeError) as raised:
                _execute_step(
                    review,
                    pipeline,
                    {"plan": "Plan", "checklist": "Checklist"},
                    "",
                    1,
                    3,
                    run_root,
                    45,
                    False,
                    outputs,
                    results,
                    run_dir=run_root,
                    output_iter=1,
                )
            self.assertIn("timed out after 45s", str(raised.exception))
            report = run_root / "v1" / "review_error.md"
            self.assertTrue(report.exists())
            self.assertIn("# Agent Timeout", report.read_text(encoding="utf-8"))

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_step_saves_runtime_error_markdown(self, mock_invoke: MagicMock) -> None:
        # A structured invocation error is re-raised unchanged, and its
        # failure type and suggested action appear in the saved report.
        mock_invoke.side_effect = AgentInvocationError(
            agent_name="claude-reviewer",
            step_name="review",
            cmd_preview="claude -p",
            raw_error="api broke",
            failure_type="API_ERROR",
            suggested_action="retry",
        )
        review = self._review_step("review", "claude-reviewer", "review_output")
        pipeline = PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(
                    name="claude-reviewer",
                    command="claude",
                    args=["-p"],
                ),
            },
        )
        with tempfile.TemporaryDirectory() as scratch:
            run_root = Path(scratch)
            with self.assertRaises(AgentInvocationError):
                _execute_step(
                    review,
                    pipeline,
                    {"plan": "Plan", "checklist": "Checklist"},
                    "",
                    1,
                    3,
                    run_root,
                    45,
                    False,
                    {},
                    {},
                    run_dir=run_root,
                    output_iter=1,
                )
            report_text = (run_root / "v1" / "review_error.md").read_text(encoding="utf-8")
            self.assertIn("API_ERROR", report_text)
            self.assertIn("retry", report_text)

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_parallel_batch_saves_success_and_timeout_error(self, mock_invoke: MagicMock) -> None:
        # One step succeeds and the other times out: the batch raises, but
        # the successful output and the failure report are both persisted.
        def _fake_invoke(agent_config: AgentConfig, prompt: str, step_name: str, **kwargs: object) -> AgentResult:
            if step_name != "review_ok":
                raise subprocess.TimeoutExpired(
                    cmd=["codex"],
                    timeout=30,
                    output="halfway",
                    stderr="timeout stderr",
                )
            return AgentResult(
                output="VERDICT: PASS",
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=0.1,
            )

        mock_invoke.side_effect = _fake_invoke
        steps = [
            self._review_step("review_ok", "claude-reviewer", "review_ok", parallel=True),
            self._review_step("review_slow", "codex-reviewer", "review_slow", parallel=True),
        ]
        pipeline = PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(name="claude-reviewer", command="claude", args=["-p"]),
                "codex-reviewer": AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"]),
            },
        )
        outputs: dict[str, str] = {}
        results: dict[str, AgentResult] = {}
        with tempfile.TemporaryDirectory() as scratch:
            run_root = Path(scratch)
            with self.assertRaises(RuntimeError) as raised:
                _execute_parallel_batch(
                    steps,
                    pipeline,
                    {"plan": "Plan", "checklist": "Checklist"},
                    "",
                    1,
                    3,
                    run_root,
                    30,
                    False,
                    outputs,
                    results,
                    run_dir=run_root,
                    output_iter=1,
                )
            self.assertIn("Successful outputs were saved for: review_ok", str(raised.exception))
            self.assertEqual(outputs["review_ok"], "VERDICT: PASS")
            version_dir = run_root / "v1"
            self.assertTrue((version_dir / "review_ok.md").exists())
            self.assertTrue((version_dir / "review_slow_error.md").exists())

    @patch("cross_eval.pipeline._execute_step")
    def test_execute_parallel_batch_dry_run_uses_sequential_path(self, mock_step: MagicMock) -> None:
        # With the ninth positional argument set to True, the patched
        # _execute_step must be called once per step in the batch.
        steps = [
            self._review_step("review_a", "claude-reviewer", "review_a", parallel=True),
            self._review_step("review_b", "codex-reviewer", "review_b", parallel=True),
        ]
        pipeline = PipelineConfig(agents={})
        with tempfile.TemporaryDirectory() as scratch:
            _execute_parallel_batch(
                steps,
                pipeline,
                {"plan": "Plan"},
                "",
                1,
                3,
                Path(scratch),
                None,
                True,
                {},
                {},
                run_dir=Path(scratch),
                output_iter=1,
            )
        self.assertEqual(mock_step.call_count, 2)

    @patch("cross_eval.pipeline._execute_step")
    def test_execute_parallel_batch_agentic_steps_fall_back_to_sequential(self, mock_step: MagicMock) -> None:
        # Both agents are agentic and a worktree path is supplied; the
        # patched _execute_step must be called once per step.
        steps = [
            self._review_step("review_a", "agentic-a", "review_a", parallel=True),
            self._review_step("review_b", "agentic-b", "review_b", parallel=True),
        ]
        pipeline = PipelineConfig(
            agents={
                "agentic-a": AgentConfig(name="agentic-a", command="claude", agentic=True),
                "agentic-b": AgentConfig(name="agentic-b", command="codex", agentic=True),
            },
        )
        with tempfile.TemporaryDirectory() as scratch:
            _execute_parallel_batch(
                steps,
                pipeline,
                {"plan": "Plan"},
                "",
                1,
                3,
                Path(scratch),
                None,
                False,
                {},
                {},
                run_dir=Path(scratch),
                output_iter=1,
                worktree_path=Path(scratch),
            )
        self.assertEqual(mock_step.call_count, 2)

    @patch("cross_eval.worktree.remove_worktree", side_effect=RuntimeError("cleanup failed"))
    @patch("cross_eval.worktree.commit_worktree", side_effect=RuntimeError("commit failed"))
    def test_finalize_worktree_handles_cleanup_failures(
        self,
        mock_commit: MagicMock,
        mock_remove: MagicMock,
    ) -> None:
        # Commit and removal both raise; finalization must not propagate the
        # errors and must return None.
        with tempfile.TemporaryDirectory() as scratch:
            scratch_root = Path(scratch)
            kept_branch = _finalize_worktree(
                scratch_root,
                scratch_root / "wt",
                "cross-eval/fail",
                "review-fix",
                "FAIL",
            )
        self.assertIsNone(kept_branch)
class TestRuntimeEnvironmentHelpers(unittest.TestCase):
    """Tests for dotenv parsing, env-file resolution, and environment/policy summaries."""

    def test_parse_dotenv_handles_export_and_quotes(self) -> None:
        # The fixture mixes an ``export`` prefix, single and double quotes,
        # an escaped newline, a line without ``=``, and a line with no key.
        with tempfile.TemporaryDirectory() as scratch:
            dotenv_file = Path(scratch) / ".env"
            dotenv_file.write_text(
                "export FOO='bar'\nBAR=\"line\\nvalue\"\nINVALID\n=skip\n",
                encoding="utf-8",
            )
            parsed = parse_dotenv(dotenv_file)
        self.assertEqual(parsed["FOO"], "bar")
        self.assertEqual(parsed["BAR"], "line\nvalue")
        self.assertNotIn("INVALID", parsed)

    def test_resolve_env_files_deduplicates_and_filters_missing(self) -> None:
        # The same file is referenced relatively, absolutely, and through the
        # auto list; ``.env.local`` is never created. Exactly one resolved
        # path is expected.
        with tempfile.TemporaryDirectory() as scratch:
            project_root = Path(scratch)
            dotenv_file = project_root / ".env"
            dotenv_file.write_text("FOO=bar\n", encoding="utf-8")
            settings = ExecutionConfig(
                env_files=[".env", str(dotenv_file)],
                auto_env_files=[".env", ".env.local"],
            )
            found = resolve_env_files(settings, project_root)
            self.assertEqual(found, [dotenv_file.resolve()])

    def test_summarize_environment_hides_names_when_disabled(self) -> None:
        # With name exposure switched off, the summary must say names are
        # hidden while still reporting the hinted execution targets.
        settings = ExecutionConfig(expose_env_names=False, auto_context_targets=["postgres"])
        report = summarize_environment(
            settings,
            [],
            {"DATABASE_URL": "postgres://localhost"},
            {},
        )
        self.assertIn("names are hidden", report)
        self.assertIn("Execution targets hinted by the user: postgres", report)

    def test_build_execution_policy_for_minimal_mode(self) -> None:
        # The rendered policy must name the minimal command policy and
        # include its guidance sentence.
        settings = ExecutionConfig(mode="agent-decides", command_policy="minimal")
        rendered = build_execution_policy(settings)
        self.assertIn("Command policy: minimal", rendered)
        self.assertIn("Keep command usage minimal", rendered)
class TestWorktreeFailures(unittest.TestCase):
    """Failure-path tests for ``create_worktree`` and ``remove_worktree`` with ``subprocess.run`` patched."""

    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_raises_when_branch_creation_fails(self, mock_run: MagicMock) -> None:
        # Every patched subprocess call raises, so the first git command
        # fails and a WorktreeError naming the branch step is expected.
        branch_failure = subprocess.CalledProcessError(
            1,
            ["git", "branch"],
            stderr="branch failed",
        )
        mock_run.side_effect = branch_failure
        with tempfile.TemporaryDirectory() as scratch:
            repo_root = Path(scratch)
            target_dir = repo_root / "wt"
            with self.assertRaises(WorktreeError) as raised:
                create_worktree(repo_root, target_dir, "cross-eval/fail")
        self.assertIn("Failed to create branch", str(raised.exception))

    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_cleans_branch_on_worktree_failure(self, mock_run: MagicMock) -> None:
        # Call sequence: first call succeeds, second raises, third succeeds.
        # The final recorded call must be a ``git branch -D`` cleanup.
        worktree_failure = subprocess.CalledProcessError(
            1,
            ["git", "worktree", "add"],
            stderr="worktree failed",
        )
        mock_run.side_effect = [
            MagicMock(returncode=0),
            worktree_failure,
            MagicMock(returncode=0),
        ]
        with tempfile.TemporaryDirectory() as scratch:
            repo_root = Path(scratch)
            target_dir = repo_root / "wt"
            with self.assertRaises(WorktreeError):
                create_worktree(repo_root, target_dir, "cross-eval/fail")
        final_argv = mock_run.call_args_list[-1][0][0]
        self.assertEqual(final_argv[:3], ["git", "branch", "-D"])

    @patch("cross_eval.worktree.shutil.rmtree")
    @patch("cross_eval.worktree.subprocess.run")
    def test_remove_worktree_falls_back_to_prune(self, mock_run: MagicMock, mock_rmtree: MagicMock) -> None:
        # The first subprocess call raises and the second succeeds; the
        # directory must then be handed to rmtree and the last command run
        # must be ``git worktree prune``.
        mock_run.side_effect = [
            subprocess.CalledProcessError(1, ["git", "worktree", "remove"]),
            MagicMock(returncode=0),
        ]
        with tempfile.TemporaryDirectory() as scratch:
            scratch_root = Path(scratch)
            repo_root = scratch_root / "repo"
            target_dir = scratch_root / "wt"
            repo_root.mkdir()
            target_dir.mkdir()
            remove_worktree(repo_root, target_dir)
            mock_rmtree.assert_any_call(target_dir.resolve(), ignore_errors=True)
            last_argv = mock_run.call_args_list[-1][0][0]
            self.assertEqual(last_argv, ["git", "worktree", "prune"])