feat: propagate execution evidence across iterations and enhance reports

- Carry execution evidence forward so reviewer/senior prompts in
  subsequent iterations can inspect prior transcript and command data
- Add {execution_evidence} to REVIEW_ONLY templates (en/ko)
- Add evidence summary table to iteration reports
- Fix test_agentic to match stdin-based prompt delivery for Claude
- Add expanded claim/no-change marker tests and cross-iteration
  evidence propagation tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
chungyeong
2026-03-13 23:36:28 +09:00
parent c467222a2a
commit 87bc0ffbfb
5 changed files with 591 additions and 10 deletions

View File

@@ -391,5 +391,506 @@ class TestAgenticExpandedClaimMarkers(unittest.TestCase):
self.assertEqual(ctx.exception.failure_type, "EMPTY_DIFF")
# ---------------------------------------------------------------------------
# 5. Expanded claim/no-change markers
# ---------------------------------------------------------------------------
class TestExpandedClaimMarkers(unittest.TestCase):
    """Completion-style phrases must be recognised as file-change claims.

    Each test feeds a single phrase to ``_claims_file_changes`` and expects a
    truthy result.
    """

    def _expect_claim(self, phrase: str) -> None:
        """Assert that *phrase* is reported as claiming file changes."""
        detected = _claims_file_changes(phrase)
        self.assertTrue(detected)

    def test_completed_all_the_changes(self) -> None:
        self._expect_claim("I completed all the changes")

    def test_finished_implementing(self) -> None:
        self._expect_claim("Finished implementing the feature")

    def test_all_tasks_completed(self) -> None:
        self._expect_claim("All tasks completed successfully")

    def test_done_with_the_implementation(self) -> None:
        self._expect_claim("Done with the implementation")

    def test_successfully_implemented(self) -> None:
        self._expect_claim("Successfully implemented the changes")

    def test_changes_are_complete(self) -> None:
        self._expect_claim("All changes are complete")
class TestExpandedNoChangeMarkers(unittest.TestCase):
    """Phrases stating that nothing had to change must not count as claims.

    Each test feeds a single phrase to ``_claims_file_changes`` and expects a
    falsy result.
    """

    def _expect_no_claim(self, phrase: str) -> None:
        """Assert that *phrase* is not reported as claiming file changes."""
        detected = _claims_file_changes(phrase)
        self.assertFalse(detected)

    def test_no_changes_needed(self) -> None:
        self._expect_no_claim("No changes needed")

    def test_no_fixes_needed(self) -> None:
        self._expect_no_claim("No fixes needed for this code")

    def test_code_is_correct_as_is(self) -> None:
        self._expect_no_claim("The code is correct as-is")

    def test_already_correct(self) -> None:
        self._expect_no_claim("Implementation is already correct")

    def test_no_action_required(self) -> None:
        self._expect_no_claim("No action required")
# ---------------------------------------------------------------------------
# 6. Cross-iteration evidence propagation
# ---------------------------------------------------------------------------
class TestCrossIterationEvidencePropagation(unittest.TestCase):
    """Execution evidence must still be visible to prompts in a later iteration."""

    def test_prior_evidence_available_in_iteration_2(self) -> None:
        """The second review prompt carries the coder's execution evidence.

        A two-step pipeline (coding, then review) is run with a stubbed
        ``invoke_agent``: the first review fails and the second passes, which
        forces exactly two iterations.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            coding_step = StepConfig(
                name="coding",
                agent="claude-coder",
                role="coding",
                prompt_template="default:coding",
                output_key="coding_output",
            )
            review_step = StepConfig(
                name="review",
                agent="claude-reviewer",
                role="review",
                prompt_template="default:review",
                output_key="review_result",
                verdict=True,
            )
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=2,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=[coding_step, review_step],
                preset_name="simple",
            )

            captured_prompts: list[dict] = []

            def _fake_invoke(agent_config, prompt, step_name, **kwargs):
                # Record the call first so the review counter below already
                # includes the call currently being served.
                captured_prompts.append({"step_name": step_name, "prompt": prompt})

                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=5.0,
                        transcript="# Transcript\nclaude ran the task",
                        command_preview="claude --setting-sources user",
                    )

                reviews_seen = sum(
                    1 for call in captured_prompts if call["step_name"] == "review"
                )
                if reviews_seen > 1:
                    # Second (and any later) review: PASS.
                    return AgentResult(
                        output="All good\n\nVERDICT: PASS",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=2.0,
                    )
                # First review: FAIL, so the pipeline starts another iteration.
                return AgentResult(
                    output="Issues found\n\nVERDICT: FAIL",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript="# Transcript\nreview ran",
                    command_preview="claude -p --setting-sources user",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)

            # One review prompt is expected per iteration.
            review_prompts = [
                call["prompt"]
                for call in captured_prompts
                if call["step_name"] == "review"
            ]
            self.assertEqual(len(review_prompts), 2)

            # The review step runs after coding, so it also sees the current
            # iteration's coding evidence; the point checked here is that
            # evidence IS present in the second iteration's prompt.
            second_review_prompt = review_prompts[1]
            self.assertIn("Exit code: 0", second_review_prompt)
            self.assertIn("claude-coder", second_review_prompt)
# ---------------------------------------------------------------------------
# 7. Report evidence summary table
# ---------------------------------------------------------------------------
class TestReportEvidenceSummaryTable(unittest.TestCase):
    """The generated report must contain an evidence summary for an iteration."""

    def test_report_contains_evidence_summary(self) -> None:
        """Agent names and durations of both steps appear in the report text."""
        config = PipelineConfig(
            max_iterations=1,
            language="en",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[
                StepConfig(
                    name="coding",
                    agent="claude-coder",
                    role="coding",
                    prompt_template="default:coding",
                    output_key="coding_output",
                ),
                StepConfig(
                    name="review",
                    agent="claude-reviewer",
                    role="review",
                    prompt_template="default:review",
                    output_key="review_result",
                    verdict=True,
                ),
            ],
            preset_name="simple",
        )

        # Hand-built results: no agent is invoked in this test.
        step_results = {
            "coding_output": AgentResult(
                output="diff --git a/file ...",
                exit_code=0,
                agent_name="claude-coder",
                step_name="coding",
                duration_seconds=10.0,
                transcript="# Transcript",
                command_preview="claude --setting-sources user",
            ),
            "review_result": AgentResult(
                output="VERDICT: PASS",
                exit_code=0,
                agent_name="claude-reviewer",
                step_name="review",
                duration_seconds=5.0,
                transcript="# Transcript",
                command_preview="claude -p",
            ),
        }
        single_iteration = IterationResult(
            iteration=1,
            step_results=step_results,
            step_outputs={
                "coding_output": "diff --git a/file ...",
                "review_result": "VERDICT: PASS",
            },
            verdict="PASS",
        )
        pipeline_result = PipelineResult(
            iterations=[single_iteration],
            final_verdict="PASS",
            total_duration=15.0,
        )

        report = build_report(config, pipeline_result)

        expected_fragments = (
            "Evidence Summary",
            "claude-coder",
            "claude-reviewer",
            "10.0s",
            "5.0s",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, report)
# ---------------------------------------------------------------------------
# 8. _build_context merges prior and current evidence
# ---------------------------------------------------------------------------
class TestBuildContextMergesEvidence(unittest.TestCase):
    """``_build_context`` keeps prior evidence and merges it with current results."""

    @staticmethod
    def _inputs_with_prior_evidence() -> dict:
        """Return fresh input contents that already carry prior evidence."""
        return {
            "plan": "test",
            "execution_evidence": "### Step: coding (coder)\n- Exit code: 0",
        }

    def test_prior_evidence_used_when_no_current_results(self) -> None:
        """Prior evidence survives when there are no current step results."""
        from cross_eval.pipeline import _build_context

        context = _build_context(
            self._inputs_with_prior_evidence(),
            {},
            "feedback",
            2,
            5,
            step_results=None,
        )

        self.assertIn("coding (coder)", context["execution_evidence"])

    def test_current_and_prior_merged(self) -> None:
        """Prior and current evidence both appear, each under its own heading."""
        from cross_eval.pipeline import _build_context

        review_outcome = AgentResult(
            output="review text",
            exit_code=0,
            agent_name="reviewer",
            step_name="review",
            duration_seconds=3.0,
            command_preview="cmd",
        )

        context = _build_context(
            self._inputs_with_prior_evidence(),
            {},
            "feedback",
            2,
            5,
            step_results={"review_result": review_outcome},
        )

        merged = context["execution_evidence"]
        for fragment in (
            "Prior Iteration Evidence",
            "Current Iteration Evidence",
            "coding (coder)",
            "reviewer",
        ):
            self.assertIn(fragment, merged)
# ---------------------------------------------------------------------------
# 9. Evidence in review-only template (used by review-fix preset)
# ---------------------------------------------------------------------------
class TestReviewOnlyTemplateIncludesEvidence(unittest.TestCase):
    """The review-only templates expose and render ``{execution_evidence}``."""

    def test_review_only_template_has_evidence_placeholder(self) -> None:
        """Both the English and the Korean template contain the placeholder."""
        from cross_eval.prompts import REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO

        for template in (REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO):
            self.assertIn("{execution_evidence}", template)

    def test_review_only_renders_evidence(self) -> None:
        """Rendering the English template emits the supplied evidence text."""
        from cross_eval.prompts import REVIEW_ONLY_TEMPLATE, render_template

        evidence_text = "### Step: coding (coder)\n- Exit code: 0\n- Duration: 5.0s"
        rendered = render_template(
            REVIEW_ONLY_TEMPLATE,
            {
                "plan": "Test plan",
                "checklist": "Test checklist",
                "docs": "Test docs",
                "feedback": "No feedback",
                "execution_evidence": evidence_text,
                "iteration": "1",
                "max_iterations": "3",
            },
        )

        self.assertIn("Exit code: 0", rendered)
        self.assertIn("Duration: 5.0s", rendered)
# ---------------------------------------------------------------------------
# 10. Evidence propagation in phased pipeline (coding-review-fix)
# ---------------------------------------------------------------------------
class TestPhasedPipelineEvidencePropagation(unittest.TestCase):
    """Evidence must reach review prompts in the coding-review-fix pipeline."""

    def test_reviewer_receives_coding_evidence_in_phased_pipeline(self) -> None:
        """Review-phase prompts of coding-review-fix include execution evidence.

        ``invoke_agent`` is replaced by a stub that records every prompt. The
        ``verify`` step returns a PASS verdict so the pipeline can finish.
        """
        from cross_eval.prompts import _build_coding_review_fix_preset

        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["claude-reviewer"]
            seniors = ["claude-senior"]
            phases = _build_coding_review_fix_preset(coders, reviewers, seniors)
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=phases,
                preset_name="coding-review-fix",
            )

            captured_prompts: list[dict] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                    "agent_name": agent_config.name,
                })
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=10.0,
                        transcript="# Transcript\nclaude executed coding task",
                        command_preview="claude --setting-sources user",
                    )
                if step_name == "verify":
                    return AgentResult(
                        output="All good\n\nVERDICT: PASS",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=3.0,
                    )
                # Every other step gets a generic successful result.
                return AgentResult(
                    output=f"Output for {step_name}",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript=f"# Transcript for {step_name}",
                    command_preview=f"cmd-{step_name}",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # Check that review-phase reviewers received evidence.
            review_prompts = [
                p for p in captured_prompts
                if p["step_name"].startswith("review_")
            ]
            # assertGreaterEqual reports the actual count on failure, unlike
            # assertTrue(len(...) >= 1), which only says "False is not true".
            self.assertGreaterEqual(len(review_prompts), 1)

            # The review prompt should contain evidence from the coding phase.
            review_prompt = review_prompts[0]["prompt"]
            self.assertIn("Execution Evidence", review_prompt)
# ---------------------------------------------------------------------------
# 11. Evidence format includes output size
# ---------------------------------------------------------------------------
class TestEvidenceIncludesOutputSize(unittest.TestCase):
    """Formatted execution evidence reports the size of the agent output."""

    def test_output_size_in_evidence(self) -> None:
        """A 500-character output is reported as ``Output size: 500 chars``."""
        step_results = {
            "coding_output": AgentResult(
                output="x" * 500,
                exit_code=0,
                agent_name="claude-coder",
                step_name="coding",
                duration_seconds=5.0,
                command_preview="claude --setting-sources user",
            ),
        }

        formatted = _format_execution_evidence(step_results)

        self.assertIn("Output size: 500 chars", formatted)
# ---------------------------------------------------------------------------
# 12. Report transcript label i18n
# ---------------------------------------------------------------------------
class TestReportTranscriptLabelI18n(unittest.TestCase):
    """The report's transcript label follows the configured language."""

    def test_korean_transcript_label(self) -> None:
        """A report built with ``language="ko"`` contains the Korean label."""
        config = PipelineConfig(
            max_iterations=1,
            language="ko",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[
                StepConfig(
                    name="coding",
                    agent="claude-coder",
                    role="coding",
                    prompt_template="default:coding",
                    output_key="coding_output",
                ),
            ],
            preset_name="simple",
        )

        # Hand-built result with a transcript; no agent is invoked here.
        coding_outcome = AgentResult(
            output="diff --git a/file ...",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude ...",
            command_preview="claude --setting-sources user",
        )
        pipeline_result = PipelineResult(
            iterations=[
                IterationResult(
                    iteration=1,
                    step_results={"coding_output": coding_outcome},
                    step_outputs={"coding_output": "diff --git a/file ..."},
                ),
            ],
            final_verdict="MAX_ITERATIONS_REACHED",
            total_duration=10.0,
        )

        report = build_report(config, pipeline_result)

        self.assertIn("실행 트랜스크립트", report)
# ---------------------------------------------------------------------------
# 13. Claude coder + Codex reviewer/senior combination
# ---------------------------------------------------------------------------
class TestCodingReviewFixClaudeCodexCombination(unittest.TestCase):
    """coding-review-fix runs with a Claude coder and Codex reviewer/senior."""

    def test_claude_coder_codex_reviewer_completes(self) -> None:
        """The preset reaches PASS with mixed Claude/Codex agents.

        ``invoke_agent`` is stubbed: the ``verify`` step returns a PASS
        verdict and every other step returns a generic successful result.
        """
        from cross_eval.prompts import _build_coding_review_fix_preset

        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["codex-reviewer"]
            seniors = ["codex-senior"]
            phases = _build_coding_review_fix_preset(coders, reviewers, seniors)
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=phases,
                preset_name="coding-review-fix",
            )

            def _mock(agent_config, prompt, step_name, **kwargs):
                if step_name == "verify":
                    return AgentResult(
                        output="All good\n\nVERDICT: PASS",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=2.0,
                        transcript="# Transcript",
                        command_preview="codex exec",
                    )
                return AgentResult(
                    output=f"Output for {step_name}",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=3.0,
                    transcript=f"# Transcript for {step_name}",
                    command_preview=f"cmd-{step_name}",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # Verify both Claude and Codex agents were used: collect every
            # agent name recorded across all iterations' step results.
            all_agents = {
                step_result.agent_name
                for iteration in result.iterations
                for step_result in iteration.step_results.values()
            }
            self.assertIn("claude-coder", all_agents)
            self.assertIn("codex-reviewer", all_agents)
# Run the test cases via unittest when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()