952 lines
38 KiB
Python
952 lines
38 KiB
Python
"""Regression tests for runtime evidence propagation and report visibility.

Covers:
1. Execution evidence is surfaced in reviewer/senior prompt context.
2. Reports include command preview and transcript excerpts.
3. Claude agentic failure detection (empty diff, write failure, expanded markers).
4. _format_execution_evidence produces expected output.
"""
|
|
from __future__ import annotations
|
|
|
|
import tempfile
|
|
import unittest
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
from cross_eval.agent import (
|
|
AgentInvocationError,
|
|
_claims_file_changes,
|
|
_has_write_failure_indicators,
|
|
invoke_agent_agentic,
|
|
)
|
|
from cross_eval.config import BUILTIN_AGENTS
|
|
from cross_eval.models import (
|
|
AgentConfig,
|
|
AgentResult,
|
|
IterationResult,
|
|
PipelineConfig,
|
|
PipelineResult,
|
|
StepConfig,
|
|
)
|
|
from cross_eval.pipeline import _build_artifact_references, _format_execution_evidence, run_pipeline
|
|
from cross_eval.report import build_report
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 1. Execution evidence formatting
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestFormatExecutionEvidence(unittest.TestCase):
    """Checks on the text summary built by ``_format_execution_evidence``."""

    def test_empty_results_returns_placeholder(self) -> None:
        """An empty result mapping yields the placeholder text."""
        summary = _format_execution_evidence({})
        self.assertIn("no prior execution evidence", summary)

    def test_single_result_includes_key_fields(self) -> None:
        """Agent, step, exit code, duration and command preview all appear."""
        coding = AgentResult(
            output="some diff",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=12.3,
            transcript="# Agent Execution Transcript\n\n## Command\nclaude ...",
            command_preview="claude --setting-sources user",
        )
        summary = _format_execution_evidence({"coding_output": coding})

        expected_fragments = (
            "claude-coder",
            "coding",
            "Exit code: 0",
            "12.3s",
            "claude --setting-sources user",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, summary)
        # No "Transcript excerpt" section is expected in the summary.
        self.assertNotIn("Transcript excerpt", summary)

    def test_multiple_results_separated(self) -> None:
        """Two results both show up, with a ``---`` marker in the output."""
        coder_run = AgentResult(
            output="diff1",
            exit_code=0,
            agent_name="coder",
            step_name="coding",
            duration_seconds=1.0,
            command_preview="cmd1",
        )
        reviewer_run = AgentResult(
            output="review text",
            exit_code=0,
            agent_name="reviewer",
            step_name="review",
            duration_seconds=2.0,
            command_preview="cmd2",
        )
        by_key = {"coding_output": coder_run, "review_result": reviewer_run}
        summary = _format_execution_evidence(by_key)

        for fragment in ("coder", "reviewer", "---"):
            self.assertIn(fragment, summary)

    def test_transcript_truncated_at_2000_chars(self) -> None:
        """A 3000-character transcript must not appear verbatim in the summary."""
        oversized = "x" * 3000
        run = AgentResult(
            output="out",
            exit_code=0,
            agent_name="agent",
            step_name="step",
            duration_seconds=1.0,
            transcript=oversized,
        )
        summary = _format_execution_evidence({"key": run})
        self.assertNotIn("x" * 3000, summary)

    def test_artifact_paths_included_when_run_dir_provided(self) -> None:
        """With ``run_dir`` and ``iteration`` given, per-iteration paths are listed."""
        coding = AgentResult(
            output="diff",
            exit_code=0,
            agent_name="coder",
            step_name="coding",
            duration_seconds=1.2,
            transcript="stdout",
            command_preview="claude ...",
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            summary = _format_execution_evidence(
                {"coding_output": coding},
                run_dir=Path(tmpdir),
                iteration=2,
            )
            for artifact in ("v2/coding.md", "v2/coding_transcript.md"):
                self.assertIn(artifact, summary)
|
|
|
|
|
|
class TestArtifactReferences(unittest.TestCase):
    """Artifact references should point at file paths and git state."""

    def test_contains_input_refs_and_git_context(self) -> None:
        """References built for a one-commit repo mention plan, commit and git hints."""
        import subprocess

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir) / "repo"
            repo.mkdir()
            plan_file = repo / "plan.md"
            checklist_file = repo / "checklist.md"
            plan_file.write_text("plan", encoding="utf-8")
            checklist_file.write_text("checklist", encoding="utf-8")

            # Build a minimal repository with a single commit.
            git_steps = (
                ["init"],
                ["config", "user.email", "test@test.com"],
                ["config", "user.name", "Test"],
                ["add", "."],
                ["commit", "-m", "init"],
            )
            for git_args in git_steps:
                subprocess.run(
                    ["git", *git_args], cwd=repo, capture_output=True, check=True,
                )

            input_refs = {
                "plan_ref": str(plan_file.resolve()),
                "checklist_ref": str(checklist_file.resolve()),
                "docs_ref": "(none)",
            }
            refs = _build_artifact_references(
                input_refs,
                cwd=repo,
                run_dir=repo / ".cross-eval" / "output" / "run",
                iteration=1,
                worktree_path=None,
            )

            for marker in ("Plan:", "Git commit:", "Suggested git commands"):
                self.assertIn(marker, refs)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 2. Evidence in reviewer prompts (integration)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestEvidenceInReviewerPrompt(unittest.TestCase):
    """Reviewer prompts include execution evidence from prior coding step."""

    def test_reviewer_receives_evidence(self) -> None:
        """Run a coding -> review pipeline and inspect the prompt sent to review.

        ``cross_eval.pipeline.invoke_agent`` is replaced by a recorder that
        returns canned results, so no real agent process is started.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            steps = [
                StepConfig(
                    name="coding", agent="claude-coder", role="coding",
                    prompt_template="default:coding", output_key="coding_output",
                ),
                StepConfig(
                    name="review", agent="claude-reviewer", role="review",
                    prompt_template="default:review", output_key="review_result",
                    verdict=True,
                ),
            ]
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=1,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=steps,
                preset_name="simple",
            )

            captured_prompts: list[dict] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                # Record every prompt so the review prompt can be inspected later.
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                })
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=5.0,
                        transcript="# Transcript\nclaude ran...",
                        command_preview="claude --setting-sources user",
                    )
                return AgentResult(
                    output="VERDICT: PASS",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # The reviewer prompt should contain execution evidence
            review_prompts = [
                p for p in captured_prompts if p["step_name"] == "review"
            ]
            # assertGreaterEqual reports the actual count on failure, unlike
            # assertTrue(len(...) >= 1) which only says "False is not true".
            self.assertGreaterEqual(len(review_prompts), 1)
            review_prompt = review_prompts[0]["prompt"]
            self.assertIn("Artifact References", review_prompt)
            self.assertIn("Execution Evidence", review_prompt)
            self.assertIn("claude-coder", review_prompt)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 3. Report includes evidence
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestReportIncludesEvidence(unittest.TestCase):
    """The generated report exposes command, transcript and exit-code details."""

    def _make_pipeline_result(self) -> tuple[PipelineConfig, PipelineResult]:
        """Return a config and a one-iteration PASS result for report rendering."""
        coding_step = StepConfig(
            name="coding", agent="claude-coder", role="coding",
            prompt_template="default:coding", output_key="coding_output",
        )
        review_step = StepConfig(
            name="review", agent="claude-reviewer", role="review",
            prompt_template="default:review", output_key="review_result",
            verdict=True,
        )
        config = PipelineConfig(
            max_iterations=1,
            language="en",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[coding_step, review_step],
            preset_name="simple",
        )

        coder_run = AgentResult(
            output="diff --git a/file ...",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude ...\n## Stdout\nok",
            command_preview="claude --setting-sources user",
        )
        reviewer_run = AgentResult(
            output="All good.\n\nVERDICT: PASS",
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=5.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude -p ...\n## Stdout\nAll good.",
            command_preview="claude -p --setting-sources user",
        )

        only_iteration = IterationResult(
            iteration=1,
            step_results={
                "coding_output": coder_run,
                "review_result": reviewer_run,
            },
            step_outputs={
                "coding_output": "diff --git a/file ...",
                "review_result": "All good.\n\nVERDICT: PASS",
            },
            verdict="PASS",
        )
        outcome = PipelineResult(
            iterations=[only_iteration],
            final_verdict="PASS",
            total_duration=15.0,
        )
        return config, outcome

    def test_report_contains_command_preview(self) -> None:
        """The coder's command preview and a bold Command label are rendered."""
        report = build_report(*self._make_pipeline_result())
        self.assertIn("claude --setting-sources user", report)
        self.assertIn("**Command**", report)

    def test_report_contains_transcript_excerpt(self) -> None:
        """The transcript label and transcript text are rendered."""
        report = build_report(*self._make_pipeline_result())
        self.assertIn("Execution transcript", report)
        self.assertIn("Agent Execution Transcript", report)

    def test_report_contains_exit_code(self) -> None:
        """The exit code is rendered with a bold label."""
        report = build_report(*self._make_pipeline_result())
        self.assertIn("**Exit code**: 0", report)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 4. Claude agentic hardened failure detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestClaimsFileChangesExpanded(unittest.TestCase):
    """``_claims_file_changes`` recognises a wider set of change-claim phrases."""

    def test_ive_implemented(self) -> None:
        """"I've implemented ..." counts as a change claim."""
        claimed = _claims_file_changes("I've implemented the feature")
        self.assertTrue(claimed)

    def test_ive_updated(self) -> None:
        """"I've updated ..." counts as a change claim."""
        claimed = _claims_file_changes("I've updated the config")
        self.assertTrue(claimed)

    def test_made_the_following_changes(self) -> None:
        """"made the following changes" counts as a change claim."""
        claimed = _claims_file_changes("I made the following changes to the file")
        self.assertTrue(claimed)

    def test_applied_the_fix(self) -> None:
        """"Applied the fix" counts as a change claim."""
        claimed = _claims_file_changes("Applied the fix for the bug")
        self.assertTrue(claimed)

    def test_changes_have_been_applied(self) -> None:
        """"Changes have been applied" counts as a change claim."""
        claimed = _claims_file_changes("Changes have been applied successfully")
        self.assertTrue(claimed)

    def test_wrote_the_code(self) -> None:
        """"Wrote the code" counts as a change claim."""
        claimed = _claims_file_changes("Wrote the code for the new module")
        self.assertTrue(claimed)

    def test_refactored(self) -> None:
        """"I refactored" counts as a change claim."""
        claimed = _claims_file_changes("I refactored the pipeline")
        self.assertTrue(claimed)

    def test_no_changes_still_returns_false(self) -> None:
        """An explicit "no changes" statement is not a change claim."""
        claimed = _claims_file_changes("No changes were necessary")
        self.assertFalse(claimed)

    def test_empty_string_returns_false(self) -> None:
        """Empty output is not a change claim."""
        claimed = _claims_file_changes("")
        self.assertFalse(claimed)
|
|
|
|
|
|
class TestWriteFailureIndicators(unittest.TestCase):
    """``_has_write_failure_indicators`` flags write-failure text in stderr."""

    def test_permission_denied(self) -> None:
        """"Permission denied" is flagged."""
        flagged = _has_write_failure_indicators("Error: Permission denied")
        self.assertTrue(flagged)

    def test_read_only_filesystem(self) -> None:
        """"read-only file system" is flagged."""
        flagged = _has_write_failure_indicators("read-only file system")
        self.assertTrue(flagged)

    def test_sandbox_restriction(self) -> None:
        """A sandbox-policy block message is flagged."""
        flagged = _has_write_failure_indicators("Blocked by sandbox policy")
        self.assertTrue(flagged)

    def test_eacces(self) -> None:
        """An EACCES message is flagged."""
        flagged = _has_write_failure_indicators("EACCES: operation not permitted")
        self.assertTrue(flagged)

    def test_empty_stderr_returns_false(self) -> None:
        """Empty stderr is not flagged."""
        flagged = _has_write_failure_indicators("")
        self.assertFalse(flagged)

    def test_normal_stderr_returns_false(self) -> None:
        """Ordinary progress output is not flagged."""
        flagged = _has_write_failure_indicators("Downloading model...")
        self.assertFalse(flagged)
|
|
|
|
|
|
class TestAgenticWriteFailureRaisesError(unittest.TestCase):
    """Agentic mode raises AgentInvocationError on stderr write-failure indicators."""

    @staticmethod
    def _init_git_repo(worktree: Path) -> None:
        """Create a one-commit git repository at *worktree* using the real git.

        Must be called while ``subprocess.run`` is NOT patched; otherwise the
        git commands would be swallowed by the mock and no repository would
        exist.
        """
        import subprocess

        subprocess.run(["git", "init"], cwd=worktree, capture_output=True, check=True)
        subprocess.run(
            ["git", "config", "user.email", "t@t.com"], cwd=worktree, capture_output=True,
        )
        subprocess.run(
            ["git", "config", "user.name", "T"], cwd=worktree, capture_output=True,
        )
        (worktree / "README.md").write_text("# init\n", encoding="utf-8")
        subprocess.run(["git", "add", "."], cwd=worktree, capture_output=True, check=True)
        subprocess.run(
            ["git", "commit", "-m", "init"], cwd=worktree, capture_output=True, check=True,
        )

    def test_write_failure_detected_from_stderr(self) -> None:
        """A zero exit code with "Permission denied" on stderr is a WRITE_FAILURE."""
        # Canned process result: success exit code, but stderr reports a
        # failed write.
        fake_completed = MagicMock(
            returncode=0,
            stdout="Done.",
            stderr="Error: Permission denied writing to /src/main.py",
        )

        agent = AgentConfig(
            name="claude-coder", command="claude",
            args=["--setting-sources", "user"], agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            # Build the fixture repo first, then patch: patching
            # "subprocess.run" replaces the attribute on the shared module,
            # so any git call made while the patch is active hits the mock.
            self._init_git_repo(wt)

            with patch("cross_eval.worktree.capture_diff", return_value=""), \
                    patch("subprocess.run", return_value=fake_completed):
                with self.assertRaises(AgentInvocationError) as ctx:
                    invoke_agent_agentic(
                        agent, "implement feature", "coding",
                        worktree_path=wt, quiet=True,
                    )

        self.assertEqual(ctx.exception.failure_type, "WRITE_FAILURE")
        self.assertIn("Permission denied", ctx.exception.raw_error)
|
|
|
|
|
|
class TestAgenticExpandedClaimMarkers(unittest.TestCase):
    """Agentic mode detects expanded claim markers in empty diff scenarios."""

    @staticmethod
    def _init_git_repo(worktree: Path) -> None:
        """Create a one-commit git repository at *worktree* using the real git.

        Must be called while ``subprocess.run`` is NOT patched; otherwise the
        git commands would be swallowed by the mock and no repository would
        exist.
        """
        import subprocess

        subprocess.run(["git", "init"], cwd=worktree, capture_output=True, check=True)
        subprocess.run(
            ["git", "config", "user.email", "t@t.com"], cwd=worktree, capture_output=True,
        )
        subprocess.run(
            ["git", "config", "user.name", "T"], cwd=worktree, capture_output=True,
        )
        (worktree / "README.md").write_text("# init\n", encoding="utf-8")
        subprocess.run(["git", "add", "."], cwd=worktree, capture_output=True, check=True)
        subprocess.run(
            ["git", "commit", "-m", "init"], cwd=worktree, capture_output=True, check=True,
        )

    def test_ive_implemented_triggers_empty_diff_error(self) -> None:
        """Claiming changes in stdout while the diff is empty is an EMPTY_DIFF."""
        # Canned process result: the agent claims work was done, stderr is clean.
        fake_completed = MagicMock(
            returncode=0,
            stdout="I've implemented the requested changes to the pipeline.",
            stderr="",
        )

        agent = AgentConfig(
            name="claude-coder", command="claude",
            args=["--setting-sources", "user"], agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            # Build the fixture repo first, then patch: patching
            # "subprocess.run" replaces the attribute on the shared module,
            # so any git call made while the patch is active hits the mock.
            self._init_git_repo(wt)

            with patch("cross_eval.worktree.capture_diff", return_value=""), \
                    patch("subprocess.run", return_value=fake_completed):
                with self.assertRaises(AgentInvocationError) as ctx:
                    invoke_agent_agentic(
                        agent, "implement feature", "coding",
                        worktree_path=wt, quiet=True,
                    )

        self.assertEqual(ctx.exception.failure_type, "EMPTY_DIFF")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 5. Expanded claim/no-change markers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestExpandedClaimMarkers(unittest.TestCase):
    """Additional completion phrases are treated as file-change claims."""

    def test_completed_all_the_changes(self) -> None:
        """"completed all the changes" is a change claim."""
        claimed = _claims_file_changes("I completed all the changes")
        self.assertTrue(claimed)

    def test_finished_implementing(self) -> None:
        """"Finished implementing" is a change claim."""
        claimed = _claims_file_changes("Finished implementing the feature")
        self.assertTrue(claimed)

    def test_all_tasks_completed(self) -> None:
        """"All tasks completed" is a change claim."""
        claimed = _claims_file_changes("All tasks completed successfully")
        self.assertTrue(claimed)

    def test_done_with_the_implementation(self) -> None:
        """"Done with the implementation" is a change claim."""
        claimed = _claims_file_changes("Done with the implementation")
        self.assertTrue(claimed)

    def test_successfully_implemented(self) -> None:
        """"Successfully implemented" is a change claim."""
        claimed = _claims_file_changes("Successfully implemented the changes")
        self.assertTrue(claimed)

    def test_changes_are_complete(self) -> None:
        """"changes are complete" is a change claim."""
        claimed = _claims_file_changes("All changes are complete")
        self.assertTrue(claimed)

    def test_korean_change_summary_triggers(self) -> None:
        """A Korean completion message with a change summary is a change claim."""
        claimed = _claims_file_changes("모든 수정이 완료되었습니다. 아래는 변경 요약입니다.")
        self.assertTrue(claimed)
|
|
|
|
|
|
class TestExpandedNoChangeMarkers(unittest.TestCase):
    """Explicit no-change phrases must not be reported as file-change claims."""

    def test_no_changes_needed(self) -> None:
        """"No changes needed" is not a change claim."""
        claimed = _claims_file_changes("No changes needed")
        self.assertFalse(claimed)

    def test_no_fixes_needed(self) -> None:
        """"No fixes needed" is not a change claim."""
        claimed = _claims_file_changes("No fixes needed for this code")
        self.assertFalse(claimed)

    def test_code_is_correct_as_is(self) -> None:
        """"correct as-is" is not a change claim."""
        claimed = _claims_file_changes("The code is correct as-is")
        self.assertFalse(claimed)

    def test_already_correct(self) -> None:
        """"already correct" is not a change claim."""
        claimed = _claims_file_changes("Implementation is already correct")
        self.assertFalse(claimed)

    def test_no_action_required(self) -> None:
        """"No action required" is not a change claim."""
        claimed = _claims_file_changes("No action required")
        self.assertFalse(claimed)

    def test_korean_no_change_marker(self) -> None:
        """A Korean no-change message is not a change claim."""
        claimed = _claims_file_changes("변경할 필요 없음")
        self.assertFalse(claimed)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 6. Cross-iteration evidence propagation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestCrossIterationEvidencePropagation(unittest.TestCase):
    """Evidence is still present in prompts once the pipeline reaches iteration 2."""

    def test_prior_evidence_available_in_iteration_2(self) -> None:
        """The second review prompt carries exit-code and coder evidence.

        The stubbed reviewer fails the first iteration and passes the second,
        forcing the pipeline through two iterations.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            coding_step = StepConfig(
                name="coding", agent="claude-coder", role="coding",
                prompt_template="default:coding", output_key="coding_output",
            )
            review_step = StepConfig(
                name="review", agent="claude-reviewer", role="review",
                prompt_template="default:review", output_key="review_result",
                verdict=True,
            )
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=2,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=[coding_step, review_step],
                preset_name="simple",
            )

            captured_prompts: list[dict] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                captured_prompts.append({"step_name": step_name, "prompt": prompt})

                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=5.0,
                        transcript="# Transcript\nclaude ran the task",
                        command_preview="claude --setting-sources user",
                    )

                # The current call is already recorded above, so the first
                # review sees a count of 1 (FAIL) and the second sees 2 (PASS).
                reviews_so_far = sum(
                    1 for entry in captured_prompts if entry["step_name"] == "review"
                )
                if reviews_so_far > 1:
                    return AgentResult(
                        output="All good\n\nVERDICT: PASS",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=2.0,
                    )
                return AgentResult(
                    output="Issues found\n\nVERDICT: FAIL",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript="# Transcript\nreview ran",
                    command_preview="claude -p --setting-sources user",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)

            # One review prompt is expected per iteration.
            all_review_prompts = [
                entry for entry in captured_prompts
                if entry["step_name"] == "review"
            ]
            self.assertEqual(len(all_review_prompts), 2)

            # Review runs after coding, so the evidence seen here may come
            # from the current iteration's coding step; the check is only
            # that evidence is present in the second review prompt.
            second_review_prompt = all_review_prompts[1]["prompt"]
            self.assertIn("Exit code: 0", second_review_prompt)
            self.assertIn("claude-coder", second_review_prompt)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 7. Report evidence summary table
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestReportEvidenceSummaryTable(unittest.TestCase):
    """The report carries an evidence summary for each iteration."""

    def test_report_contains_evidence_summary(self) -> None:
        """Summary heading, both agent names and both durations are rendered."""
        coding_step = StepConfig(
            name="coding", agent="claude-coder", role="coding",
            prompt_template="default:coding", output_key="coding_output",
        )
        review_step = StepConfig(
            name="review", agent="claude-reviewer", role="review",
            prompt_template="default:review", output_key="review_result",
            verdict=True,
        )
        config = PipelineConfig(
            max_iterations=1,
            language="en",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[coding_step, review_step],
            preset_name="simple",
        )

        coder_run = AgentResult(
            output="diff --git a/file ...",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Transcript",
            command_preview="claude --setting-sources user",
        )
        reviewer_run = AgentResult(
            output="VERDICT: PASS",
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=5.0,
            transcript="# Transcript",
            command_preview="claude -p",
        )

        only_iteration = IterationResult(
            iteration=1,
            step_results={
                "coding_output": coder_run,
                "review_result": reviewer_run,
            },
            step_outputs={
                "coding_output": "diff --git a/file ...",
                "review_result": "VERDICT: PASS",
            },
            verdict="PASS",
        )
        outcome = PipelineResult(
            iterations=[only_iteration],
            final_verdict="PASS",
            total_duration=15.0,
        )

        report = build_report(config, outcome)

        expected_fragments = (
            "Evidence Summary",
            "claude-coder",
            "claude-reviewer",
            "10.0s",
            "5.0s",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, report)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 8. _build_context merges prior and current evidence
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestBuildContextMergesEvidence(unittest.TestCase):
    """``_build_context`` combines carried-over evidence with current results."""

    def test_prior_evidence_used_when_no_current_results(self) -> None:
        """With ``step_results=None`` the carried-over evidence text is kept."""
        from cross_eval.pipeline import _build_context

        carried_over = "### Step: coding (coder)\n- Exit code: 0"
        inputs = {"plan": "test", "execution_evidence": carried_over}

        context = _build_context(inputs, {}, "feedback", 2, 5, step_results=None)

        self.assertIn("coding (coder)", context["execution_evidence"])

    def test_current_and_prior_merged(self) -> None:
        """Prior and current evidence appear under their own headings."""
        from cross_eval.pipeline import _build_context

        carried_over = "### Step: coding (coder)\n- Exit code: 0"
        inputs = {"plan": "test", "execution_evidence": carried_over}
        review_run = AgentResult(
            output="review text",
            exit_code=0,
            agent_name="reviewer",
            step_name="review",
            duration_seconds=3.0,
            command_preview="cmd",
        )

        context = _build_context(
            inputs, {}, "feedback", 2, 5,
            step_results={"review_result": review_run},
        )
        merged = context["execution_evidence"]

        expected_fragments = (
            "Prior Iteration Evidence",
            "Current Iteration Evidence",
            "coding (coder)",
            "reviewer",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, merged)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 9. Evidence in review-only template (used by review-fix preset)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestReviewOnlyTemplateIncludesEvidence(unittest.TestCase):
    """The review-only prompt templates expose an ``{execution_evidence}`` slot."""

    def test_review_only_template_has_evidence_placeholder(self) -> None:
        """Both the default and the ``_KO`` template contain the placeholder."""
        from cross_eval.prompts import REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO

        for template in (REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO):
            self.assertIn("{execution_evidence}", template)

    def test_review_only_renders_evidence(self) -> None:
        """Rendering the template carries the evidence text into the output."""
        from cross_eval.prompts import render_template, REVIEW_ONLY_TEMPLATE

        evidence_text = "### Step: coding (coder)\n- Exit code: 0\n- Duration: 5.0s"
        context = {
            "plan": "Test plan",
            "checklist": "Test checklist",
            "docs": "Test docs",
            "feedback": "No feedback",
            "execution_evidence": evidence_text,
            "iteration": "1",
            "max_iterations": "3",
        }

        rendered = render_template(REVIEW_ONLY_TEMPLATE, context)

        for fragment in ("Exit code: 0", "Duration: 5.0s"):
            self.assertIn(fragment, rendered)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 10. Evidence propagation in phased pipeline (coding-review-fix)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestPhasedPipelineEvidencePropagation(unittest.TestCase):
    """Evidence propagates correctly in coding-review-fix phased pipeline."""

    def test_reviewer_receives_coding_evidence_in_phased_pipeline(self) -> None:
        """In coding-review-fix, review-phase prompts include an evidence section.

        ``cross_eval.pipeline.invoke_agent`` is replaced by a recorder that
        returns canned results, so no real agent process is started.
        """
        from cross_eval.prompts import _build_coding_review_fix_preset

        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["claude-reviewer"]
            seniors = ["claude-senior"]
            phases = _build_coding_review_fix_preset(coders, reviewers, seniors)

            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=phases,
                preset_name="coding-review-fix",
            )

            captured_prompts: list[dict] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                # Record every prompt so review-phase prompts can be inspected.
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                    "agent_name": agent_config.name,
                })
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=10.0,
                        transcript="# Transcript\nclaude executed coding task",
                        command_preview="claude --setting-sources user",
                    )
                if step_name == "verify":
                    return AgentResult(
                        output="All good\n\nVERDICT: PASS",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=3.0,
                    )
                return AgentResult(
                    output=f"Output for {step_name}",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript=f"# Transcript for {step_name}",
                    command_preview=f"cmd-{step_name}",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # Check that review-phase reviewers received evidence
            review_prompts = [
                p for p in captured_prompts
                if p["step_name"].startswith("review_")
            ]
            # assertGreaterEqual reports the actual count on failure, unlike
            # assertTrue(len(...) >= 1) which only says "False is not true".
            self.assertGreaterEqual(len(review_prompts), 1)
            # The review prompt should contain evidence from the coding phase
            review_prompt = review_prompts[0]["prompt"]
            self.assertIn("Execution Evidence", review_prompt)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 11. Evidence format includes output size
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestEvidenceIncludesOutputSize(unittest.TestCase):
    """The evidence summary reports how large the agent output was."""

    def test_output_size_in_evidence(self) -> None:
        """A 500-character output is reported as "Output size: 500 chars"."""
        payload = "x" * 500
        coding = AgentResult(
            output=payload,
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=5.0,
            command_preview="claude --setting-sources user",
        )

        summary = _format_execution_evidence({"coding_output": coding})

        self.assertIn("Output size: 500 chars", summary)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 12. Report transcript label i18n
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestReportTranscriptLabelI18n(unittest.TestCase):
    """The transcript label in the report follows the configured language."""

    def test_korean_transcript_label(self) -> None:
        """With ``language="ko"`` the report contains the Korean transcript label."""
        coding_step = StepConfig(
            name="coding", agent="claude-coder", role="coding",
            prompt_template="default:coding", output_key="coding_output",
        )
        config = PipelineConfig(
            max_iterations=1,
            language="ko",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[coding_step],
            preset_name="simple",
        )

        coder_run = AgentResult(
            output="diff --git a/file ...",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude ...",
            command_preview="claude --setting-sources user",
        )
        only_iteration = IterationResult(
            iteration=1,
            step_results={"coding_output": coder_run},
            step_outputs={"coding_output": "diff --git a/file ..."},
        )
        outcome = PipelineResult(
            iterations=[only_iteration],
            final_verdict="MAX_ITERATIONS_REACHED",
            total_duration=10.0,
        )

        report = build_report(config, outcome)

        self.assertIn("실행 트랜스크립트", report)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 13. Claude coder + Codex reviewer/senior combination
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestCodingReviewFixClaudeCodexCombination(unittest.TestCase):
    """The coding-review-fix preset runs with a Claude coder and Codex reviewers."""

    def test_claude_coder_codex_reviewer_completes(self) -> None:
        """The mixed-agent pipeline passes and records both agent families."""
        from cross_eval.prompts import _build_coding_review_fix_preset

        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["codex-reviewer"]
            seniors = ["codex-senior"]

            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=_build_coding_review_fix_preset(coders, reviewers, seniors),
                preset_name="coding-review-fix",
            )

            def _mock(agent_config, prompt, step_name, **kwargs):
                # Only the "verify" step returns a verdict; all other steps
                # return generic output derived from the step name.
                if step_name != "verify":
                    return AgentResult(
                        output=f"Output for {step_name}",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=3.0,
                        transcript=f"# Transcript for {step_name}",
                        command_preview=f"cmd-{step_name}",
                    )
                return AgentResult(
                    output="All good\n\nVERDICT: PASS",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript="# Transcript",
                    command_preview="codex exec",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # Collect the name of every agent that produced a step result.
            agents_used = {
                step_result.agent_name
                for iteration_result in result.iterations
                for step_result in iteration_result.step_results.values()
            }
            self.assertIn("claude-coder", agents_used)
            self.assertIn("codex-reviewer", agents_used)
|
|
|
|
|
|
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|