feat: propagate execution evidence across iterations and enhance reports

- Carry execution evidence forward so reviewer/senior prompts in
  subsequent iterations can inspect prior transcript and command data
- Add {execution_evidence} to REVIEW_ONLY templates (en/ko)
- Add evidence summary table to iteration reports
- Fix test_agentic to match stdin-based prompt delivery for Claude
- Add expanded claim/no-change marker tests and cross-iteration
  evidence propagation tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
chungyeong
2026-03-13 23:36:28 +09:00
parent c467222a2a
commit 87bc0ffbfb
5 changed files with 591 additions and 10 deletions

View File

@@ -338,6 +338,13 @@ def _run_simple_pipeline(
if tracker: if tracker:
input_contents["previous_senior_tracker"] = tracker input_contents["previous_senior_tracker"] = tracker
# Carry execution evidence forward so subsequent iterations'
# reviewer/senior prompts can inspect prior transcript data.
if step_results:
input_contents["execution_evidence"] = _format_execution_evidence(
step_results,
)
iterations.append(iter_result) iterations.append(iter_result)
# ESCALATE check (highest priority) # ESCALATE check (highest priority)
@@ -531,6 +538,13 @@ def _run_phased_pipeline(
if tracker: if tracker:
input_contents["previous_senior_tracker"] = tracker input_contents["previous_senior_tracker"] = tracker
# Carry execution evidence forward so subsequent iterations'
# reviewer/senior prompts can inspect prior transcript data.
if step_results:
input_contents["execution_evidence"] = _format_execution_evidence(
step_results,
)
iterations.append(iter_result) iterations.append(iter_result)
# ESCALATE check # ESCALATE check
@@ -1133,16 +1147,33 @@ def _build_context(
max_iterations: int, max_iterations: int,
step_results: dict[str, AgentResult] | None = None, step_results: dict[str, AgentResult] | None = None,
) -> dict[str, str]: ) -> dict[str, str]:
"""Build the template context dict.""" """Build the template context dict.
Execution evidence from prior iterations is carried forward in
``input_contents["execution_evidence"]``. When the current iteration
has its own step results, the evidence is merged so reviewers/seniors
see both prior and current data.
"""
context: dict[str, str] = {} context: dict[str, str] = {}
context.update(input_contents) context.update(input_contents)
context.update(step_outputs) context.update(step_outputs)
context["feedback"] = feedback context["feedback"] = feedback
context["iteration"] = str(iteration) context["iteration"] = str(iteration)
context["max_iterations"] = str(max_iterations) context["max_iterations"] = str(max_iterations)
# Surface execution evidence from prior steps so reviewers can inspect it # Surface execution evidence from prior steps so reviewers can inspect it.
# Prior-iteration evidence may already live in context via input_contents.
prior_evidence = context.get("execution_evidence", "")
if step_results: if step_results:
context["execution_evidence"] = _format_execution_evidence(step_results) current_evidence = _format_execution_evidence(step_results)
if prior_evidence and prior_evidence != "(no prior execution evidence)":
context["execution_evidence"] = (
"# Prior Iteration Evidence\n"
+ prior_evidence
+ "\n\n# Current Iteration Evidence\n"
+ current_evidence
)
else:
context["execution_evidence"] = current_evidence
return context return context
@@ -1164,6 +1195,7 @@ def _format_execution_evidence(
f"- Command: `{result.command_preview}`" if result.command_preview else "", f"- Command: `{result.command_preview}`" if result.command_preview else "",
f"- Exit code: {result.exit_code}", f"- Exit code: {result.exit_code}",
f"- Duration: {result.duration_seconds}s", f"- Duration: {result.duration_seconds}s",
f"- Output size: {len(result.output)} chars",
] ]
section = [line for line in section if line] section = [line for line in section if line]
if result.transcript: if result.transcript:

View File

@@ -243,9 +243,14 @@ You are tasked with reviewing existing code against a plan and checklist.
## Previous Review (iteration {iteration} of {max_iterations}) ## Previous Review (iteration {iteration} of {max_iterations})
{feedback} {feedback}
## Execution Evidence
{execution_evidence}
## Review Instructions ## Review Instructions
Explore the project directory thoroughly to understand the full codebase, \ Explore the project directory thoroughly to understand the full codebase, \
then evaluate the EXISTING code against ONLY the plan and checklist above. then evaluate the EXISTING code against ONLY the plan and checklist above. \
Use the execution evidence above to verify agent claims against actual \
command outputs and exit codes.
You are NOT generating or modifying code. You are auditing what already exists. You are NOT generating or modifying code. You are auditing what already exists.
@@ -314,9 +319,13 @@ REVIEW_ONLY_TEMPLATE_KO = """\
## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째) ## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
{feedback} {feedback}
## 실행 증거
{execution_evidence}
## 검토 지침 ## 검토 지침
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \ 프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \
위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요. 위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요. \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요.
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다. 코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.

View File

@@ -58,6 +58,12 @@ _STRINGS: dict[str, dict[str, str]] = {
"metrics_total_issues": "Total Issues", "metrics_total_issues": "Total Issues",
"metrics_na": "N/A", "metrics_na": "N/A",
"iteration_details": "Iteration Details", "iteration_details": "Iteration Details",
"evidence_summary": "Evidence Summary",
"evidence_agent": "Agent",
"evidence_exit_code": "Exit Code",
"evidence_duration": "Duration",
"evidence_output_size": "Output Size",
"evidence_transcript": "Execution transcript",
}, },
"ko": { "ko": {
"title": "교차 검증 리포트", "title": "교차 검증 리포트",
@@ -99,6 +105,12 @@ _STRINGS: dict[str, dict[str, str]] = {
"metrics_total_issues": "총 이슈", "metrics_total_issues": "총 이슈",
"metrics_na": "해당 없음", "metrics_na": "해당 없음",
"iteration_details": "반복 상세", "iteration_details": "반복 상세",
"evidence_summary": "실행 증거 요약",
"evidence_agent": "에이전트",
"evidence_exit_code": "종료 코드",
"evidence_duration": "소요 시간",
"evidence_output_size": "출력 크기",
"evidence_transcript": "실행 트랜스크립트",
}, },
} }
@@ -377,6 +389,30 @@ def _append_iteration_steps(
If *skip_extraction* is True, out-of-scope and review-metrics parsing If *skip_extraction* is True, out-of-scope and review-metrics parsing
is skipped (useful when a pre-scan already collected that data). is skipped (useful when a pre-scan already collected that data).
""" """
# Evidence summary table — quick overview of all steps' execution data
has_evidence = any(
iter_result.step_results.get(s.output_key) for s in steps
)
if has_evidence:
s_step = _t(config, "step")
s_agent = _t(config, "evidence_agent")
s_exit = _t(config, "evidence_exit_code")
s_dur = _t(config, "evidence_duration")
s_size = _t(config, "evidence_output_size")
lines.append(f"**{_t(config, 'evidence_summary')}**\n")
lines.append(f"| {s_step} | {s_agent} | {s_exit} | {s_dur} | {s_size} |")
lines.append("|------|-------|-----------|----------|-------------|")
for step in steps:
ar = iter_result.step_results.get(step.output_key)
out = iter_result.step_outputs.get(step.output_key, "")
if ar:
lines.append(
f"| {step.name} | {ar.agent_name} "
f"| {ar.exit_code} | {ar.duration_seconds}s "
f"| {len(out)} chars |"
)
lines.append("")
for step in steps: for step in steps:
agent_result = iter_result.step_results.get(step.output_key) agent_result = iter_result.step_results.get(step.output_key)
output = iter_result.step_outputs.get(step.output_key, "") output = iter_result.step_outputs.get(step.output_key, "")
@@ -410,8 +446,9 @@ def _append_iteration_steps(
transcript_preview = agent_result.transcript[:1500] transcript_preview = agent_result.transcript[:1500]
if len(agent_result.transcript) > 1500: if len(agent_result.transcript) > 1500:
transcript_preview += "\n... (truncated)" transcript_preview += "\n... (truncated)"
transcript_label = _t(config, "evidence_transcript")
lines.append("<details>") lines.append("<details>")
lines.append("<summary>Execution transcript</summary>\n") lines.append(f"<summary>{transcript_label}</summary>\n")
lines.append(transcript_preview) lines.append(transcript_preview)
lines.append("\n</details>\n") lines.append("\n</details>\n")

View File

@@ -233,11 +233,11 @@ class TestBaseRepoIsolation(unittest.TestCase):
# =================================================================== # ===================================================================
class TestInvokeAgentAgenticClaude(unittest.TestCase): class TestInvokeAgentAgenticClaude(unittest.TestCase):
"""invoke_agent_agentic builds correct cmd for claude (no -p, prompt as positional arg).""" """invoke_agent_agentic builds correct cmd for claude (no -p, prompt via stdin)."""
@patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...") @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
@patch("subprocess.run") @patch("subprocess.run")
def test_claude_cmd_has_no_dash_p_and_prompt_as_positional( def test_claude_cmd_has_no_dash_p_and_prompt_via_stdin(
self, mock_run: MagicMock, mock_diff: MagicMock, self, mock_run: MagicMock, mock_diff: MagicMock,
) -> None: ) -> None:
mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="") mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
@@ -271,8 +271,10 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase):
# No -p flag # No -p flag
self.assertNotIn("-p", cmd) self.assertNotIn("-p", cmd)
# Last arg is a task file reference (not raw prompt — avoids arg length limits) # Prompt is delivered via stdin (input kwarg), not as a positional arg
self.assertIn("task file", cmd[-1].lower()) input_data = agent_call[1].get("input")
self.assertIsNotNone(input_data)
self.assertIn("implement feature X", input_data)
class TestInvokeAgentAgenticCodex(unittest.TestCase): class TestInvokeAgentAgenticCodex(unittest.TestCase):

View File

@@ -391,5 +391,506 @@ class TestAgenticExpandedClaimMarkers(unittest.TestCase):
self.assertEqual(ctx.exception.failure_type, "EMPTY_DIFF") self.assertEqual(ctx.exception.failure_type, "EMPTY_DIFF")
# ---------------------------------------------------------------------------
# 5. Expanded claim/no-change markers
# ---------------------------------------------------------------------------
class TestExpandedClaimMarkers(unittest.TestCase):
    """Completion-style phrasings must be classified as file-change claims."""

    def _expect_claim(self, agent_text: str) -> None:
        """Assert that ``_claims_file_changes`` is truthy for *agent_text*."""
        claimed = _claims_file_changes(agent_text)
        self.assertTrue(claimed)

    def test_completed_all_the_changes(self) -> None:
        self._expect_claim("I completed all the changes")

    def test_finished_implementing(self) -> None:
        self._expect_claim("Finished implementing the feature")

    def test_all_tasks_completed(self) -> None:
        self._expect_claim("All tasks completed successfully")

    def test_done_with_the_implementation(self) -> None:
        self._expect_claim("Done with the implementation")

    def test_successfully_implemented(self) -> None:
        self._expect_claim("Successfully implemented the changes")

    def test_changes_are_complete(self) -> None:
        self._expect_claim("All changes are complete")
class TestExpandedNoChangeMarkers(unittest.TestCase):
    """Explicit "nothing to do" phrasings must not count as change claims."""

    def _expect_no_claim(self, agent_text: str) -> None:
        """Assert that ``_claims_file_changes`` is falsy for *agent_text*."""
        claimed = _claims_file_changes(agent_text)
        self.assertFalse(claimed)

    def test_no_changes_needed(self) -> None:
        self._expect_no_claim("No changes needed")

    def test_no_fixes_needed(self) -> None:
        self._expect_no_claim("No fixes needed for this code")

    def test_code_is_correct_as_is(self) -> None:
        self._expect_no_claim("The code is correct as-is")

    def test_already_correct(self) -> None:
        self._expect_no_claim("Implementation is already correct")

    def test_no_action_required(self) -> None:
        self._expect_no_claim("No action required")
# ---------------------------------------------------------------------------
# 6. Cross-iteration evidence propagation
# ---------------------------------------------------------------------------
class TestCrossIterationEvidencePropagation(unittest.TestCase):
    """Execution evidence from prior iterations is available to subsequent iterations."""

    def test_prior_evidence_available_in_iteration_2(self) -> None:
        """Review step in iteration 2 should see coding evidence from iteration 1.

        The pipeline is driven through two iterations (first review FAILs,
        second PASSes) with ``invoke_agent`` replaced by a recording mock, and
        the captured review prompts are inspected.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            steps = [
                StepConfig(
                    name="coding", agent="claude-coder", role="coding",
                    prompt_template="default:coding", output_key="coding_output",
                ),
                StepConfig(
                    name="review", agent="claude-reviewer", role="review",
                    prompt_template="default:review", output_key="review_result",
                    verdict=True,
                ),
            ]
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=2,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=steps,
                preset_name="simple",
            )
            captured_prompts: list[dict] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                # Record every prompt so the assertions below can inspect
                # exactly what each step was shown.
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                })
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=5.0,
                        transcript="# Transcript\nclaude ran the task",
                        command_preview="claude --setting-sources user",
                    )
                # First review: FAIL, second review: PASS.  The current call
                # has already been recorded above, so a count of 1 means
                # "this is the first review".
                review_calls = [
                    p for p in captured_prompts if p["step_name"] == "review"
                ]
                if len(review_calls) <= 1:
                    return AgentResult(
                        output="Issues found\n\nVERDICT: FAIL",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=2.0,
                        transcript="# Transcript\nreview ran",
                        command_preview="claude -p --setting-sources user",
                    )
                return AgentResult(
                    output="All good\n\nVERDICT: PASS",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)

            # One review prompt per iteration, in execution order.
            review_prompts = [
                p["prompt"] for p in captured_prompts
                if p["step_name"] == "review"
            ]
            self.assertEqual(len(review_prompts), 2)
            iter1_review, iter2_review = review_prompts

            # Evidence of the coding step must be present in iteration 2.
            self.assertIn("Exit code: 0", iter2_review)
            self.assertIn("claude-coder", iter2_review)

            # The assertions above are also satisfied by the *current*
            # iteration's coding step, so on their own they do not prove
            # anything was carried forward.  _build_context only emits the
            # "Prior/Current Iteration Evidence" headers when carried-forward
            # evidence is merged with current step results, so require them
            # in iteration 2 and require their absence in iteration 1.
            self.assertNotIn("Prior Iteration Evidence", iter1_review)
            self.assertIn("Prior Iteration Evidence", iter2_review)
            self.assertIn("Current Iteration Evidence", iter2_review)
# ---------------------------------------------------------------------------
# 7. Report evidence summary table
# ---------------------------------------------------------------------------
class TestReportEvidenceSummaryTable(unittest.TestCase):
    """The generated report carries a per-iteration evidence summary."""

    def test_report_contains_evidence_summary(self) -> None:
        """Build a one-iteration report and look for the summary's contents."""
        coding_step = StepConfig(
            name="coding", agent="claude-coder", role="coding",
            prompt_template="default:coding", output_key="coding_output",
        )
        review_step = StepConfig(
            name="review", agent="claude-reviewer", role="review",
            prompt_template="default:review", output_key="review_result",
            verdict=True,
        )
        config = PipelineConfig(
            max_iterations=1,
            language="en",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[coding_step, review_step],
            preset_name="simple",
        )

        # Hand-built agent results: no agent is actually invoked.
        results_by_key = {
            "coding_output": AgentResult(
                output="diff --git a/file ...",
                exit_code=0,
                agent_name="claude-coder",
                step_name="coding",
                duration_seconds=10.0,
                transcript="# Transcript",
                command_preview="claude --setting-sources user",
            ),
            "review_result": AgentResult(
                output="VERDICT: PASS",
                exit_code=0,
                agent_name="claude-reviewer",
                step_name="review",
                duration_seconds=5.0,
                transcript="# Transcript",
                command_preview="claude -p",
            ),
        }
        outputs_by_key = {
            "coding_output": "diff --git a/file ...",
            "review_result": "VERDICT: PASS",
        }
        only_iteration = IterationResult(
            iteration=1,
            step_results=results_by_key,
            step_outputs=outputs_by_key,
            verdict="PASS",
        )
        pipeline_result = PipelineResult(
            iterations=[only_iteration],
            final_verdict="PASS",
            total_duration=15.0,
        )

        report = build_report(config, pipeline_result)

        # Section title, both agent names, and both formatted durations.
        expected_fragments = (
            "Evidence Summary",
            "claude-coder",
            "claude-reviewer",
            "10.0s",
            "5.0s",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, report)
# ---------------------------------------------------------------------------
# 8. _build_context merges prior and current evidence
# ---------------------------------------------------------------------------
class TestBuildContextMergesEvidence(unittest.TestCase):
    """``_build_context`` keeps prior evidence and merges it with current results."""

    def test_prior_evidence_used_when_no_current_results(self) -> None:
        """With ``step_results=None`` the carried-forward evidence is kept."""
        from cross_eval.pipeline import _build_context

        carried = {
            "plan": "test",
            "execution_evidence": "### Step: coding (coder)\n- Exit code: 0",
        }
        ctx = _build_context(carried, {}, "feedback", 2, 5, step_results=None)

        # Prior evidence should survive when there are no current results
        self.assertIn("coding (coder)", ctx["execution_evidence"])

    def test_current_and_prior_merged(self) -> None:
        """With current step results, both evidence sections are present."""
        from cross_eval.pipeline import _build_context

        carried = {
            "plan": "test",
            "execution_evidence": "### Step: coding (coder)\n- Exit code: 0",
        }
        review_run = AgentResult(
            output="review text", exit_code=0, agent_name="reviewer",
            step_name="review", duration_seconds=3.0,
            command_preview="cmd",
        )
        ctx = _build_context(
            carried, {}, "feedback", 2, 5,
            step_results={"review_result": review_run},
        )
        merged = ctx["execution_evidence"]

        # Both section headers plus a marker from each side of the merge.
        for marker in (
            "Prior Iteration Evidence",
            "Current Iteration Evidence",
            "coding (coder)",
            "reviewer",
        ):
            self.assertIn(marker, merged)
# ---------------------------------------------------------------------------
# 9. Evidence in review-only template (used by review-fix preset)
# ---------------------------------------------------------------------------
class TestReviewOnlyTemplateIncludesEvidence(unittest.TestCase):
    """Both review-only templates expose the ``{execution_evidence}`` slot."""

    def test_review_only_template_has_evidence_placeholder(self) -> None:
        """The English and Korean templates both contain the placeholder."""
        from cross_eval.prompts import REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO

        for template in (REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO):
            self.assertIn("{execution_evidence}", template)

    def test_review_only_renders_evidence(self) -> None:
        """Rendering the English template includes the evidence text."""
        from cross_eval.prompts import render_template, REVIEW_ONLY_TEMPLATE

        evidence_text = "### Step: coding (coder)\n- Exit code: 0\n- Duration: 5.0s"
        values = {
            "plan": "Test plan",
            "checklist": "Test checklist",
            "docs": "Test docs",
            "feedback": "No feedback",
            "execution_evidence": evidence_text,
            "iteration": "1",
            "max_iterations": "3",
        }
        prompt_text = render_template(REVIEW_ONLY_TEMPLATE, values)

        self.assertIn("Exit code: 0", prompt_text)
        self.assertIn("Duration: 5.0s", prompt_text)
# ---------------------------------------------------------------------------
# 10. Evidence propagation in phased pipeline (coding-review-fix)
# ---------------------------------------------------------------------------
class TestPhasedPipelineEvidencePropagation(unittest.TestCase):
    """Evidence propagates correctly in coding-review-fix phased pipeline."""

    def test_reviewer_receives_coding_evidence_in_phased_pipeline(self) -> None:
        """In coding-review-fix, review-phase reviewers see coding-phase evidence.

        Runs the phased preset with ``invoke_agent`` replaced by a recording
        mock and inspects the first prompt sent to a ``review_*`` step.
        """
        from cross_eval.prompts import _build_coding_review_fix_preset

        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["claude-reviewer"]
            seniors = ["claude-senior"]
            phases = _build_coding_review_fix_preset(coders, reviewers, seniors)
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=phases,
                preset_name="coding-review-fix",
            )
            captured_prompts: list[dict] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                # Record every prompt so the assertions can inspect what
                # each step was shown.
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                    "agent_name": agent_config.name,
                })
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=10.0,
                        transcript="# Transcript\nclaude executed coding task",
                        command_preview="claude --setting-sources user",
                    )
                if step_name == "verify":
                    # A passing verdict lets the run finish with PASS.
                    return AgentResult(
                        output="All good\n\nVERDICT: PASS",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=3.0,
                    )
                return AgentResult(
                    output=f"Output for {step_name}",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript=f"# Transcript for {step_name}",
                    command_preview=f"cmd-{step_name}",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # Check that review-phase reviewers received evidence
            review_prompts = [
                p for p in captured_prompts
                if p["step_name"].startswith("review_")
            ]
            self.assertGreaterEqual(len(review_prompts), 1)

            # The review prompt should contain evidence from the coding phase
            review_prompt = review_prompts[0]["prompt"]
            self.assertIn("Execution Evidence", review_prompt)
            # The heading text alone may come from static template wording,
            # so also require a field that only formatted execution evidence
            # produces (every mocked step exits with code 0).
            self.assertIn("Exit code: 0", review_prompt)
# ---------------------------------------------------------------------------
# 11. Evidence format includes output size
# ---------------------------------------------------------------------------
class TestEvidenceIncludesOutputSize(unittest.TestCase):
    """Formatted execution evidence reports the size of the agent output."""

    def test_output_size_in_evidence(self) -> None:
        """A 500-character output is reported as ``Output size: 500 chars``."""
        payload = "x" * 500
        coding_run = AgentResult(
            output=payload,
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=5.0,
            command_preview="claude --setting-sources user",
        )
        formatted = _format_execution_evidence({"coding_output": coding_run})
        self.assertIn("Output size: 500 chars", formatted)
# ---------------------------------------------------------------------------
# 12. Report transcript label i18n
# ---------------------------------------------------------------------------
class TestReportTranscriptLabelI18n(unittest.TestCase):
    """The transcript label in the report follows the configured language."""

    def test_korean_transcript_label(self) -> None:
        """With ``language="ko"`` the Korean transcript label is rendered."""
        coding_step = StepConfig(
            name="coding", agent="claude-coder", role="coding",
            prompt_template="default:coding", output_key="coding_output",
        )
        config = PipelineConfig(
            max_iterations=1,
            language="ko",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[coding_step],
            preset_name="simple",
        )

        # The step result carries a transcript for the report to render.
        coding_run = AgentResult(
            output="diff --git a/file ...",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude ...",
            command_preview="claude --setting-sources user",
        )
        only_iteration = IterationResult(
            iteration=1,
            step_results={"coding_output": coding_run},
            step_outputs={"coding_output": "diff --git a/file ..."},
        )
        pipeline_result = PipelineResult(
            iterations=[only_iteration],
            final_verdict="MAX_ITERATIONS_REACHED",
            total_duration=10.0,
        )

        rendered = build_report(config, pipeline_result)
        self.assertIn("실행 트랜스크립트", rendered)
# ---------------------------------------------------------------------------
# 13. Claude coder + Codex reviewer/senior combination
# ---------------------------------------------------------------------------
class TestCodingReviewFixClaudeCodexCombination(unittest.TestCase):
    """The coding-review-fix preset runs with a Claude coder and Codex reviewer/senior."""

    def test_claude_coder_codex_reviewer_completes(self) -> None:
        """The mixed-agent run ends in PASS and records both agent families."""
        from cross_eval.prompts import _build_coding_review_fix_preset

        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["codex-reviewer"]
            seniors = ["codex-senior"]
            phases = _build_coding_review_fix_preset(coders, reviewers, seniors)
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=phases,
                preset_name="coding-review-fix",
            )

            def _mock(agent_config, prompt, step_name, **kwargs):
                # Only the "verify" step returns a verdict; every other step
                # returns a generic result named after the step.
                if step_name == "verify":
                    text, seconds = "All good\n\nVERDICT: PASS", 2.0
                    log, preview = "# Transcript", "codex exec"
                else:
                    text, seconds = f"Output for {step_name}", 3.0
                    log, preview = f"# Transcript for {step_name}", f"cmd-{step_name}"
                return AgentResult(
                    output=text,
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=seconds,
                    transcript=log,
                    command_preview=preview,
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # Verify both Claude and Codex agents were used
            agents_seen = {
                step_result.agent_name
                for iteration in result.iterations
                for step_result in iteration.step_results.values()
            }
            self.assertIn("claude-coder", agents_seen)
            self.assertIn("codex-reviewer", agents_seen)
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()