"""Regression tests for runtime evidence propagation and report visibility. Covers: 1. Execution evidence is surfaced in reviewer/senior prompt context. 2. Reports include command preview and transcript excerpts. 3. Claude agentic failure detection (empty diff, write failure, expanded markers). 4. _format_execution_evidence produces expected output. """ from __future__ import annotations import tempfile import unittest from pathlib import Path from unittest.mock import MagicMock, patch from cross_eval.agent import ( AgentInvocationError, _claims_file_changes, _has_write_failure_indicators, invoke_agent_agentic, ) from cross_eval.config import BUILTIN_AGENTS from cross_eval.models import ( AgentConfig, AgentResult, IterationResult, PipelineConfig, PipelineResult, StepConfig, ) from cross_eval.pipeline import _build_artifact_references, _format_execution_evidence, run_pipeline from cross_eval.report import build_report # --------------------------------------------------------------------------- # 1. 
# Execution evidence formatting
# ---------------------------------------------------------------------------


class TestFormatExecutionEvidence(unittest.TestCase):
    """Checks the reviewer-facing summary built by _format_execution_evidence."""

    def test_empty_results_returns_placeholder(self) -> None:
        """An empty mapping produces the "no prior execution evidence" text."""
        summary = _format_execution_evidence({})
        self.assertIn("no prior execution evidence", summary)

    def test_single_result_includes_key_fields(self) -> None:
        """Agent, step, exit code, duration and command preview all show up."""
        coder_run = AgentResult(
            output="some diff",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=12.3,
            transcript="# Agent Execution Transcript\n\n## Command\nclaude ...",
            command_preview="claude --setting-sources user",
        )
        summary = _format_execution_evidence({"coding_output": coder_run})

        expected_fragments = (
            "claude-coder",
            "coding",
            "Exit code: 0",
            "12.3s",
            "claude --setting-sources user",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, summary)
        # No "Transcript excerpt" section is expected even though a transcript is set.
        self.assertNotIn("Transcript excerpt", summary)

    def test_multiple_results_separated(self) -> None:
        """Two results are both rendered, with a '---' separator present."""
        results = {
            "coding_output": AgentResult(
                output="diff1",
                exit_code=0,
                agent_name="coder",
                step_name="coding",
                duration_seconds=1.0,
                command_preview="cmd1",
            ),
            "review_result": AgentResult(
                output="review text",
                exit_code=0,
                agent_name="reviewer",
                step_name="review",
                duration_seconds=2.0,
                command_preview="cmd2",
            ),
        }
        summary = _format_execution_evidence(results)
        for fragment in ("coder", "reviewer", "---"):
            self.assertIn(fragment, summary)

    def test_transcript_truncated_at_2000_chars(self) -> None:
        """A 3000-character transcript must not appear verbatim in the summary."""
        oversized_transcript = "x" * 3000
        long_run = AgentResult(
            output="out",
            exit_code=0,
            agent_name="agent",
            step_name="step",
            duration_seconds=1.0,
            transcript=oversized_transcript,
        )
        summary = _format_execution_evidence({"key": long_run})
        self.assertNotIn(oversized_transcript, summary)

    def test_artifact_paths_included_when_run_dir_provided(self) -> None:
        """With run_dir and iteration=2 the summary names the v2 artifact files."""
        with tempfile.TemporaryDirectory() as tmpdir:
            coder_run = AgentResult(
                output="diff",
                exit_code=0,
                agent_name="coder",
                step_name="coding",
                duration_seconds=1.2,
                transcript="stdout",
                command_preview="claude ...",
            )
            summary = _format_execution_evidence(
                {"coding_output": coder_run},
                run_dir=Path(tmpdir),
                iteration=2,
            )
            for artifact in ("v2/coding.md", "v2/coding_transcript.md"):
                self.assertIn(artifact, summary)


class TestArtifactReferences(unittest.TestCase):
    """Artifact references should prefer file paths and git state over inline text."""

    def test_contains_input_refs_and_git_context(self) -> None:
        """Build references for a freshly committed repo and check the headings."""
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir) / "repo"
            repo.mkdir()
            plan_file = repo / "plan.md"
            checklist_file = repo / "checklist.md"
            plan_file.write_text("plan", encoding="utf-8")
            checklist_file.write_text("checklist", encoding="utf-8")

            import subprocess

            # A real repository with one commit, so git state exists to report.
            git_setup = (
                ["git", "init"],
                ["git", "config", "user.email", "test@test.com"],
                ["git", "config", "user.name", "Test"],
                ["git", "add", "."],
                ["git", "commit", "-m", "init"],
            )
            for argv in git_setup:
                subprocess.run(argv, cwd=repo, capture_output=True, check=True)

            input_refs = {
                "plan_ref": str(plan_file.resolve()),
                "checklist_ref": str(checklist_file.resolve()),
                "docs_ref": "(none)",
            }
            refs = _build_artifact_references(
                input_refs,
                cwd=repo,
                run_dir=repo / ".cross-eval" / "output" / "run",
                iteration=1,
                worktree_path=None,
            )

            for heading in ("Plan:", "Git commit:", "Suggested git commands"):
                self.assertIn(heading, refs)


# ---------------------------------------------------------------------------
# 2.
# Evidence in reviewer prompts (integration)
# ---------------------------------------------------------------------------


class TestEvidenceInReviewerPrompt(unittest.TestCase):
    """Reviewer prompts include execution evidence from prior coding step."""

    def test_reviewer_receives_evidence(self) -> None:
        """Run a one-iteration coding -> review pipeline and inspect the review prompt.

        ``cross_eval.pipeline.invoke_agent`` is replaced by a recorder that
        stores every prompt it receives and returns canned results, so no real
        agent process is started.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            steps = [
                StepConfig(
                    name="coding",
                    agent="claude-coder",
                    role="coding",
                    prompt_template="default:coding",
                    output_key="coding_output",
                ),
                StepConfig(
                    name="review",
                    agent="claude-reviewer",
                    role="review",
                    prompt_template="default:review",
                    output_key="review_result",
                    verdict=True,
                ),
            ]
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=1,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=steps,
                preset_name="simple",
            )

            captured_prompts: list[dict[str, str]] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                # Record the prompt first so the assertions below can inspect it.
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                })
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=5.0,
                        transcript="# Transcript\nclaude ran...",
                        command_preview="claude --setting-sources user",
                    )
                return AgentResult(
                    output="VERDICT: PASS",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # The reviewer prompt should contain execution evidence.
            review_prompts = [
                p for p in captured_prompts if p["step_name"] == "review"
            ]
            # assertGreaterEqual reports the actual count on failure, unlike
            # assertTrue(len(...) >= 1) which only says "False is not true".
            self.assertGreaterEqual(len(review_prompts), 1)

            review_prompt = review_prompts[0]["prompt"]
            self.assertIn("Artifact References", review_prompt)
            self.assertIn("Execution Evidence", review_prompt)
            self.assertIn("claude-coder", review_prompt)
# ---------------------------------------------------------------------------
# 3. Report includes evidence
# ---------------------------------------------------------------------------


class TestReportIncludesEvidence(unittest.TestCase):
    """Report generation includes command preview and transcript excerpts."""

    def _make_pipeline_result(self) -> tuple[PipelineConfig, PipelineResult]:
        """Return a config and a one-iteration PASS result for build_report.

        The iteration holds one coding result and one review result, each
        with a transcript and a command preview.
        """
        steps = [
            StepConfig(
                name="coding",
                agent="claude-coder",
                role="coding",
                prompt_template="default:coding",
                output_key="coding_output",
            ),
            StepConfig(
                name="review",
                agent="claude-reviewer",
                role="review",
                prompt_template="default:review",
                output_key="review_result",
                verdict=True,
            ),
        ]
        config = PipelineConfig(
            max_iterations=1,
            language="en",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=steps,
            preset_name="simple",
        )
        coding_result = AgentResult(
            output="diff --git a/file ...",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude ...\n## Stdout\nok",
            command_preview="claude --setting-sources user",
        )
        review_result = AgentResult(
            output="All good.\n\nVERDICT: PASS",
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=5.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude -p ...\n## Stdout\nAll good.",
            command_preview="claude -p --setting-sources user",
        )
        iteration = IterationResult(
            iteration=1,
            step_results={
                "coding_output": coding_result,
                "review_result": review_result,
            },
            step_outputs={
                "coding_output": "diff --git a/file ...",
                "review_result": "All good.\n\nVERDICT: PASS",
            },
            verdict="PASS",
        )
        pipeline_result = PipelineResult(
            iterations=[iteration],
            final_verdict="PASS",
            total_duration=15.0,
        )
        return config, pipeline_result

    def test_report_contains_command_preview(self) -> None:
        """The coder's command preview and a bold Command label are rendered."""
        config, result = self._make_pipeline_result()
        report = build_report(config, result)
        self.assertIn("claude --setting-sources user", report)
        self.assertIn("**Command**", report)

    def test_report_contains_transcript_excerpt(self) -> None:
        """The transcript section label and transcript text are rendered."""
        config, result = self._make_pipeline_result()
        report = build_report(config, result)
        self.assertIn("Execution transcript", report)
        self.assertIn("Agent Execution Transcript", report)

    def test_report_contains_exit_code(self) -> None:
        """The exit code is rendered with a bold label."""
        config, result = self._make_pipeline_result()
        report = build_report(config, result)
        self.assertIn("**Exit code**: 0", report)


# ---------------------------------------------------------------------------
# 4. Claude agentic hardened failure detection
# ---------------------------------------------------------------------------


class TestClaimsFileChangesExpanded(unittest.TestCase):
    """Expanded change-claim markers detect more Claude output patterns."""

    def test_ive_implemented(self) -> None:
        self.assertTrue(_claims_file_changes("I've implemented the feature"))

    def test_ive_updated(self) -> None:
        self.assertTrue(_claims_file_changes("I've updated the config"))

    def test_made_the_following_changes(self) -> None:
        self.assertTrue(_claims_file_changes("I made the following changes to the file"))

    def test_applied_the_fix(self) -> None:
        self.assertTrue(_claims_file_changes("Applied the fix for the bug"))

    def test_changes_have_been_applied(self) -> None:
        self.assertTrue(_claims_file_changes("Changes have been applied successfully"))

    def test_wrote_the_code(self) -> None:
        self.assertTrue(_claims_file_changes("Wrote the code for the new module"))

    def test_refactored(self) -> None:
        self.assertTrue(_claims_file_changes("I refactored the pipeline"))

    def test_no_changes_still_returns_false(self) -> None:
        self.assertFalse(_claims_file_changes("No changes were necessary"))

    def test_empty_string_returns_false(self) -> None:
        self.assertFalse(_claims_file_changes(""))


class TestWriteFailureIndicators(unittest.TestCase):
    """_has_write_failure_indicators detects stderr patterns."""

    def test_permission_denied(self) -> None:
        self.assertTrue(_has_write_failure_indicators("Error: Permission denied"))

    def test_read_only_filesystem(self) -> None:
        self.assertTrue(_has_write_failure_indicators("read-only file system"))

    def test_sandbox_restriction(self) -> None:
        self.assertTrue(_has_write_failure_indicators("Blocked by sandbox policy"))

    def test_eacces(self) -> None:
        self.assertTrue(_has_write_failure_indicators("EACCES: operation not permitted"))

    def test_empty_stderr_returns_false(self) -> None:
        self.assertFalse(_has_write_failure_indicators(""))

    def test_normal_stderr_returns_false(self) -> None:
        self.assertFalse(_has_write_failure_indicators("Downloading model..."))


def _run_agentic_coder_in_scratch_dir() -> None:
    """Invoke the agentic ``claude-coder`` against a throwaway directory.

    Shared fixture for the agentic failure tests below. The directory holds a
    single README.md and is deliberately NOT initialised as a git repository:
    every caller patches ``subprocess.run``, so any ``git`` command issued
    from the test would hit the mock and do nothing.

    Raises:
        AgentInvocationError: propagated unchanged from ``invoke_agent_agentic``.
    """
    agent = AgentConfig(
        name="claude-coder",
        command="claude",
        args=["--setting-sources", "user"],
        agentic=True,
    )
    with tempfile.TemporaryDirectory() as td:
        wt = Path(td)
        (wt / "README.md").write_text("# init\n", encoding="utf-8")
        invoke_agent_agentic(
            agent,
            "implement feature",
            "coding",
            worktree_path=wt,
            quiet=True,
        )


class TestAgenticWriteFailureRaisesError(unittest.TestCase):
    """Agentic mode raises AgentInvocationError on stderr write-failure indicators."""

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_write_failure_detected_from_stderr(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        """Exit code 0 with "Permission denied" on stderr becomes WRITE_FAILURE."""
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="Done.",
            stderr="Error: Permission denied writing to /src/main.py",
        )

        with self.assertRaises(AgentInvocationError) as ctx:
            _run_agentic_coder_in_scratch_dir()

        self.assertEqual(ctx.exception.failure_type, "WRITE_FAILURE")
        self.assertIn("Permission denied", ctx.exception.raw_error)


class TestAgenticExpandedClaimMarkers(unittest.TestCase):
    """Agentic mode detects expanded claim markers in empty diff scenarios."""

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_ive_implemented_triggers_empty_diff_error(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        """Claiming changes on stdout while the diff is empty becomes EMPTY_DIFF."""
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="I've implemented the requested changes to the pipeline.",
            stderr="",
        )

        with self.assertRaises(AgentInvocationError) as ctx:
            _run_agentic_coder_in_scratch_dir()

        self.assertEqual(ctx.exception.failure_type, "EMPTY_DIFF")


# ---------------------------------------------------------------------------
# 5.
# Expanded claim/no-change markers
# ---------------------------------------------------------------------------


class TestExpandedClaimMarkers(unittest.TestCase):
    """Each phrase below must be recognised by _claims_file_changes as a claim."""

    def _expect_claim(self, agent_text: str) -> None:
        """Assert that *agent_text* is treated as claiming file changes."""
        self.assertTrue(_claims_file_changes(agent_text))

    def test_completed_all_the_changes(self) -> None:
        self._expect_claim("I completed all the changes")

    def test_finished_implementing(self) -> None:
        self._expect_claim("Finished implementing the feature")

    def test_all_tasks_completed(self) -> None:
        self._expect_claim("All tasks completed successfully")

    def test_done_with_the_implementation(self) -> None:
        self._expect_claim("Done with the implementation")

    def test_successfully_implemented(self) -> None:
        self._expect_claim("Successfully implemented the changes")

    def test_changes_are_complete(self) -> None:
        self._expect_claim("All changes are complete")


class TestExpandedNoChangeMarkers(unittest.TestCase):
    """Each phrase below must NOT be reported by _claims_file_changes as a claim."""

    def _expect_no_claim(self, agent_text: str) -> None:
        """Assert that *agent_text* is not treated as claiming file changes."""
        self.assertFalse(_claims_file_changes(agent_text))

    def test_no_changes_needed(self) -> None:
        self._expect_no_claim("No changes needed")

    def test_no_fixes_needed(self) -> None:
        self._expect_no_claim("No fixes needed for this code")

    def test_code_is_correct_as_is(self) -> None:
        self._expect_no_claim("The code is correct as-is")

    def test_already_correct(self) -> None:
        self._expect_no_claim("Implementation is already correct")

    def test_no_action_required(self) -> None:
        self._expect_no_claim("No action required")


# ---------------------------------------------------------------------------
# 6.
# Cross-iteration evidence propagation
# ---------------------------------------------------------------------------


class TestCrossIterationEvidencePropagation(unittest.TestCase):
    """Execution evidence from prior iterations is available to subsequent iterations."""

    def test_prior_evidence_available_in_iteration_2(self) -> None:
        """The second iteration's review prompt must carry coding evidence."""
        with tempfile.TemporaryDirectory() as tmpdir:
            coding_step = StepConfig(
                name="coding",
                agent="claude-coder",
                role="coding",
                prompt_template="default:coding",
                output_key="coding_output",
            )
            review_step = StepConfig(
                name="review",
                agent="claude-reviewer",
                role="review",
                prompt_template="default:review",
                output_key="review_result",
                verdict=True,
            )
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=2,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=[coding_step, review_step],
                preset_name="simple",
            )

            captured_prompts: list[dict] = []

            def _fake_invoke(agent_config, prompt, step_name, **kwargs):
                captured_prompts.append({"step_name": step_name, "prompt": prompt})

                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=5.0,
                        transcript="# Transcript\nclaude ran the task",
                        command_preview="claude --setting-sources user",
                    )

                # The current call is already recorded, so a count of 1 means
                # "first review" (FAIL) and anything above means PASS.
                reviews_seen = sum(
                    1 for entry in captured_prompts if entry["step_name"] == "review"
                )
                if reviews_seen > 1:
                    return AgentResult(
                        output="All good\n\nVERDICT: PASS",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=2.0,
                    )
                return AgentResult(
                    output="Issues found\n\nVERDICT: FAIL",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript="# Transcript\nreview ran",
                    command_preview="claude -p --setting-sources user",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)

            # One review prompt per iteration; index 1 is iteration 2's.
            all_review_prompts = [
                entry for entry in captured_prompts if entry["step_name"] == "review"
            ]
            self.assertEqual(len(all_review_prompts), 2)

            # Review runs after coding within an iteration, so this only
            # checks that coding evidence IS present in the second prompt.
            second_review_prompt = all_review_prompts[1]["prompt"]
            self.assertIn("Exit code: 0", second_review_prompt)
            self.assertIn("claude-coder", second_review_prompt)


# ---------------------------------------------------------------------------
# 7. Report evidence summary table
# ---------------------------------------------------------------------------


class TestReportEvidenceSummaryTable(unittest.TestCase):
    """Report includes evidence summary table per iteration."""

    def test_report_contains_evidence_summary(self) -> None:
        """The report names the summary section, both agents and both durations."""
        config = PipelineConfig(
            max_iterations=1,
            language="en",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[
                StepConfig(
                    name="coding",
                    agent="claude-coder",
                    role="coding",
                    prompt_template="default:coding",
                    output_key="coding_output",
                ),
                StepConfig(
                    name="review",
                    agent="claude-reviewer",
                    role="review",
                    prompt_template="default:review",
                    output_key="review_result",
                    verdict=True,
                ),
            ],
            preset_name="simple",
        )
        results_by_key = {
            "coding_output": AgentResult(
                output="diff --git a/file ...",
                exit_code=0,
                agent_name="claude-coder",
                step_name="coding",
                duration_seconds=10.0,
                transcript="# Transcript",
                command_preview="claude --setting-sources user",
            ),
            "review_result": AgentResult(
                output="VERDICT: PASS",
                exit_code=0,
                agent_name="claude-reviewer",
                step_name="review",
                duration_seconds=5.0,
                transcript="# Transcript",
                command_preview="claude -p",
            ),
        }
        outputs_by_key = {
            "coding_output": "diff --git a/file ...",
            "review_result": "VERDICT: PASS",
        }
        pipeline_result = PipelineResult(
            iterations=[
                IterationResult(
                    iteration=1,
                    step_results=results_by_key,
                    step_outputs=outputs_by_key,
                    verdict="PASS",
                ),
            ],
            final_verdict="PASS",
            total_duration=15.0,
        )

        report = build_report(config, pipeline_result)

        expected_fragments = (
            "Evidence Summary",
            "claude-coder",
            "claude-reviewer",
            "10.0s",
            "5.0s",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, report)


# ---------------------------------------------------------------------------
# 8. _build_context merges prior and current evidence
# ---------------------------------------------------------------------------


class TestBuildContextMergesEvidence(unittest.TestCase):
    """_build_context merges prior iteration evidence with current step evidence."""

    def test_prior_evidence_used_when_no_current_results(self) -> None:
        """With step_results=None the pre-existing evidence text is kept."""
        from cross_eval.pipeline import _build_context

        prior_inputs = {
            "plan": "test",
            "execution_evidence": "### Step: coding (coder)\n- Exit code: 0",
        }
        context = _build_context(
            prior_inputs,
            {},
            "feedback",
            2,
            5,
            step_results=None,
        )
        self.assertIn("coding (coder)", context["execution_evidence"])

    def test_current_and_prior_merged(self) -> None:
        """Prior and current evidence both appear, each under its own heading."""
        from cross_eval.pipeline import _build_context

        prior_inputs = {
            "plan": "test",
            "execution_evidence": "### Step: coding (coder)\n- Exit code: 0",
        }
        reviewer_run = AgentResult(
            output="review text",
            exit_code=0,
            agent_name="reviewer",
            step_name="review",
            duration_seconds=3.0,
            command_preview="cmd",
        )
        context = _build_context(
            prior_inputs,
            {},
            "feedback",
            2,
            5,
            step_results={"review_result": reviewer_run},
        )

        merged = context["execution_evidence"]
        expected_fragments = (
            "Prior Iteration Evidence",
            "Current Iteration Evidence",
            "coding (coder)",
            "reviewer",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, merged)


# ---------------------------------------------------------------------------
# 9. Evidence in review-only template (used by review-fix preset)
# ---------------------------------------------------------------------------


class TestReviewOnlyTemplateIncludesEvidence(unittest.TestCase):
    """review-only template includes {execution_evidence} placeholder."""

    def test_review_only_template_has_evidence_placeholder(self) -> None:
        """Both the default and the KO template contain the placeholder."""
        from cross_eval.prompts import REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO

        for template in (REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO):
            self.assertIn("{execution_evidence}", template)

    def test_review_only_renders_evidence(self) -> None:
        """Rendering substitutes the evidence text into the output."""
        from cross_eval.prompts import render_template, REVIEW_ONLY_TEMPLATE

        evidence_text = "### Step: coding (coder)\n- Exit code: 0\n- Duration: 5.0s"
        rendered = render_template(
            REVIEW_ONLY_TEMPLATE,
            {
                "plan": "Test plan",
                "checklist": "Test checklist",
                "docs": "Test docs",
                "feedback": "No feedback",
                "execution_evidence": evidence_text,
                "iteration": "1",
                "max_iterations": "3",
            },
        )
        self.assertIn("Exit code: 0", rendered)
        self.assertIn("Duration: 5.0s", rendered)


# ---------------------------------------------------------------------------
# 10.
# Evidence propagation in phased pipeline (coding-review-fix)
# ---------------------------------------------------------------------------


class TestPhasedPipelineEvidencePropagation(unittest.TestCase):
    """Evidence propagates correctly in coding-review-fix phased pipeline."""

    def test_reviewer_receives_coding_evidence_in_phased_pipeline(self) -> None:
        """In coding-review-fix, review-phase reviewers see coding-phase evidence.

        ``cross_eval.pipeline.invoke_agent`` is patched with a recorder that
        returns PASS for the ``verify`` step and generic output otherwise.
        """
        from cross_eval.prompts import _build_coding_review_fix_preset

        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["claude-reviewer"]
            seniors = ["claude-senior"]
            phases = _build_coding_review_fix_preset(coders, reviewers, seniors)
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=phases,
                preset_name="coding-review-fix",
            )

            captured_prompts: list[dict[str, str]] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                # Record every call so the review-phase prompts can be inspected.
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                    "agent_name": agent_config.name,
                })
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=10.0,
                        transcript="# Transcript\nclaude executed coding task",
                        command_preview="claude --setting-sources user",
                    )
                if step_name == "verify":
                    return AgentResult(
                        output="All good\n\nVERDICT: PASS",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=3.0,
                    )
                return AgentResult(
                    output=f"Output for {step_name}",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript=f"# Transcript for {step_name}",
                    command_preview=f"cmd-{step_name}",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # Check that review-phase reviewers received evidence.
            review_prompts = [
                p for p in captured_prompts if p["step_name"].startswith("review_")
            ]
            # assertGreaterEqual reports the actual count on failure, unlike
            # assertTrue(len(...) >= 1) which only says "False is not true".
            self.assertGreaterEqual(len(review_prompts), 1)

            # The review prompt should contain evidence from the coding phase.
            review_prompt = review_prompts[0]["prompt"]
            self.assertIn("Execution Evidence", review_prompt)


# ---------------------------------------------------------------------------
# 11. Evidence format includes output size
# ---------------------------------------------------------------------------


class TestEvidenceIncludesOutputSize(unittest.TestCase):
    """_format_execution_evidence includes output size for debugging."""

    def test_output_size_in_evidence(self) -> None:
        """A 500-character output is reported as "Output size: 500 chars"."""
        result = AgentResult(
            output="x" * 500,
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=5.0,
            command_preview="claude --setting-sources user",
        )
        evidence = _format_execution_evidence({"coding_output": result})
        self.assertIn("Output size: 500 chars", evidence)


# ---------------------------------------------------------------------------
# 12.
# Report transcript label i18n
# ---------------------------------------------------------------------------


class TestReportTranscriptLabelI18n(unittest.TestCase):
    """Report uses translated transcript label."""

    def test_korean_transcript_label(self) -> None:
        """With language="ko" the report contains the Korean transcript label."""
        config = PipelineConfig(
            max_iterations=1,
            language="ko",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[
                StepConfig(
                    name="coding",
                    agent="claude-coder",
                    role="coding",
                    prompt_template="default:coding",
                    output_key="coding_output",
                ),
            ],
            preset_name="simple",
        )
        coder_run = AgentResult(
            output="diff --git a/file ...",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude ...",
            command_preview="claude --setting-sources user",
        )
        only_iteration = IterationResult(
            iteration=1,
            step_results={"coding_output": coder_run},
            step_outputs={"coding_output": "diff --git a/file ..."},
        )
        pipeline_result = PipelineResult(
            iterations=[only_iteration],
            final_verdict="MAX_ITERATIONS_REACHED",
            total_duration=10.0,
        )

        rendered_report = build_report(config, pipeline_result)

        self.assertIn("실행 트랜스크립트", rendered_report)


# ---------------------------------------------------------------------------
# 13.
# Claude coder + Codex reviewer/senior combination
# ---------------------------------------------------------------------------


class TestCodingReviewFixClaudeCodexCombination(unittest.TestCase):
    """coding-review-fix works with Claude as coder and Codex as reviewer/senior."""

    def test_claude_coder_codex_reviewer_completes(self) -> None:
        """The preset reaches PASS and records both a Claude and a Codex agent."""
        from cross_eval.prompts import _build_coding_review_fix_preset

        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["codex-reviewer"]
            seniors = ["codex-senior"]
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=_build_coding_review_fix_preset(coders, reviewers, seniors),
                preset_name="coding-review-fix",
            )

            def _fake_invoke(agent_config, prompt, step_name, **kwargs):
                # Only the "verify" step issues a verdict; all other steps
                # return generic output derived from the step name.
                if step_name != "verify":
                    return AgentResult(
                        output=f"Output for {step_name}",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=3.0,
                        transcript=f"# Transcript for {step_name}",
                        command_preview=f"cmd-{step_name}",
                    )
                return AgentResult(
                    output="All good\n\nVERDICT: PASS",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript="# Transcript",
                    command_preview="codex exec",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # Every agent name that produced a step result in any iteration.
            agents_used = {
                step_result.agent_name
                for iteration in result.iterations
                for step_result in iteration.step_results.values()
            }
            self.assertIn("claude-coder", agents_used)
            self.assertIn("codex-reviewer", agents_used)


if __name__ == "__main__":
    unittest.main()