"""Integration tests for cross-eval pipeline with mocked agents."""

from __future__ import annotations

import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from cross_eval.config import BUILTIN_AGENTS
from cross_eval.models import (
    AgentConfig,
    AgentResult,
    PhaseConfig,
    PipelineConfig,
    StepConfig,
)
from cross_eval.pipeline import run_pipeline
from cross_eval.prompts import _build_review_fix_preset, _build_simple_preset

# Dotted path patched in every test so that no real agent is ever invoked.
_INVOKE_AGENT = "cross_eval.pipeline.invoke_agent"


def _agent_result(
    agent_config: AgentConfig,
    step_name: str,
    output: str,
) -> AgentResult:
    """Build a successful (exit code 0) ``AgentResult`` carrying *output*."""
    return AgentResult(
        output=output,
        exit_code=0,
        agent_name=agent_config.name,
        step_name=step_name,
        duration_seconds=0.1,
    )


def _pick_output(outputs: list[str], call_index: int) -> str:
    """Return the scripted output for the zero-based *call_index*.

    Calls past the end of *outputs* keep returning the last entry. An empty
    *outputs* list yields an empty string instead of raising ``IndexError``.
    """
    if not outputs:
        return ""
    return outputs[min(call_index, len(outputs) - 1)]


def _make_mock_agent(outputs: list[str]):
    """Return a side_effect function that replays *outputs* in call order.

    The n-th call (regardless of step) returns ``outputs[n]``; once the list
    is exhausted the last entry is repeated. An empty list produces empty
    output on every call.
    """
    calls = 0

    def _mock(agent_config, prompt, step_name, **kwargs):
        nonlocal calls
        output = _pick_output(outputs, calls)
        calls += 1
        return _agent_result(agent_config, step_name, output)

    return _mock


def _make_step_mock(step_outputs: dict[str, list[str]]):
    """Return a side_effect function that dispatches on ``step_name``.

    Each step keeps its own call counter. The n-th call for a step returns
    the n-th entry of ``step_outputs[step_name]``; further calls repeat the
    last entry (the outputs are clamped, not cycled). Steps that are missing
    from *step_outputs*, or mapped to an empty list, produce empty output.
    """
    counters: dict[str, int] = {}

    def _mock(agent_config, prompt, step_name, **kwargs):
        call_index = counters.get(step_name, 0)
        counters[step_name] = call_index + 1
        output = _pick_output(step_outputs.get(step_name, []), call_index)
        return _agent_result(agent_config, step_name, output)

    return _mock


def _minimal_simple_config(
    run_dir: Path,
    max_iterations: int = 3,
    seniors: list[str] | None = None,
) -> PipelineConfig:
    """Build a minimal "simple" preset pipeline config for testing.

    Args:
        run_dir: Directory used as the pipeline's ``output_dir``.
        max_iterations: Value passed through as ``max_iterations``.
        seniors: Senior agent names; ``None`` is treated as an empty list.

    Returns:
        A ``PipelineConfig`` with one coder (``claude-coder``), one reviewer
        (``claude-reviewer``) and the steps built by ``_build_simple_preset``.
    """
    coders = ["claude-coder"]
    reviewers = ["claude-reviewer"]
    senior_list = seniors if seniors is not None else []
    steps = _build_simple_preset(coders, reviewers, senior_list)
    # Copy so that the shared builtin mapping is never mutated by a test run.
    agents = dict(BUILTIN_AGENTS)
    return PipelineConfig(
        output_dir=run_dir,
        max_iterations=max_iterations,
        min_iterations=1,
        language="en",
        inputs={"plan": "Test plan", "checklist": "Test checklist"},
        agents=agents,
        coders=coders,
        reviewers=reviewers,
        seniors=senior_list,
        pipeline=steps,
        preset_name="simple",
    )


class TestSimplePipelinePassStopsLoop(unittest.TestCase):
    """Test 1: mock agent returns VERDICT: PASS on first review -> stops at iteration 1."""

    def test_simple_pipeline_pass_stops_loop(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(Path(tmpdir))
            mock = _make_mock_agent([
                "Coding output here",  # coding step
                "All good\n\nVERDICT: PASS",  # review step
            ])

            with patch(_INVOKE_AGENT, side_effect=mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 1)


class TestSimplePipelineFailThenPass(unittest.TestCase):
    """Test 2: FAIL on first review, PASS on second -> 2 iterations."""

    def test_simple_pipeline_fail_then_pass(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(Path(tmpdir), max_iterations=5)
            mock = _make_step_mock({
                "coding": ["Coding output v1", "Coding output v2"],
                "review": [
                    "Issues found\n\nVERDICT: FAIL",
                    "All good\n\nVERDICT: PASS",
                ],
            })

            with patch(_INVOKE_AGENT, side_effect=mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)


class TestSimplePipelineEscalateBreaksLoop(unittest.TestCase):
    """Test 3: ESCALATE on review -> stops immediately, final_verdict=ESCALATE."""

    def test_simple_pipeline_escalate_breaks_loop(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(
                Path(tmpdir),
                max_iterations=5,
                seniors=["claude-senior"],
            )
            escalate_output = (
                "### Confirmed Issues\n"
                "- [Critical] Requirements are ambiguous\n\n"
                "### Escalated Issues\n"
                "Requirements need stakeholder clarification\n\n"
                "### Verdict\n"
                "VERDICT: ESCALATE\n"
            )
            mock = _make_step_mock({
                "coding": ["Coding output"],
                "review": ["Issues found\n\nVERDICT: FAIL"],
                "senior_review": [escalate_output],
            })

            with patch(_INVOKE_AGENT, side_effect=mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            self.assertEqual(len(result.iterations), 1)
            self.assertGreater(len(result.escalated_issues), 0)


class TestSimplePipelineEscalatePriorityOverPass(unittest.TestCase):
    """Test 4: one verdict step returns PASS, another returns ESCALATE -> ESCALATE wins."""

    def test_simple_pipeline_escalate_priority_over_pass(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Build a custom pipeline with 2 verdict steps (no senior)
            steps = [
                StepConfig(
                    name="coding",
                    agent="claude-coder",
                    role="coding",
                    prompt_template="default:coding",
                    output_key="coding_output",
                ),
                StepConfig(
                    name="review_a",
                    agent="claude-reviewer",
                    role="review",
                    prompt_template="default:review",
                    output_key="review_a_result",
                    verdict=True,
                ),
                StepConfig(
                    name="review_b",
                    agent="claude-reviewer",
                    role="review",
                    prompt_template="default:review",
                    output_key="review_b_result",
                    verdict=True,
                ),
            ]
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=3,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=steps,
                preset_name="custom",
            )
            escalate_output = (
                "### Escalated Issues\n"
                "Ambiguous requirements need clarification\n\n"
                "VERDICT: ESCALATE\n"
            )
            mock = _make_step_mock({
                "coding": ["Coding output"],
                "review_a": ["All good\n\nVERDICT: PASS"],
                "review_b": [escalate_output],
            })

            with patch(_INVOKE_AGENT, side_effect=mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            self.assertGreater(len(result.escalated_issues), 0)


class TestPhasedPipelineEscalateBreaksPhase(unittest.TestCase):
    """Test 5: phased pipeline (review-fix), verify step returns ESCALATE -> phase stops."""

    def test_phased_pipeline_escalate_breaks_phase(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["claude-reviewer"]
            seniors = ["claude-senior"]
            phases = _build_review_fix_preset(coders, reviewers, seniors)
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=phases,
                preset_name="review-fix",
            )
            escalate_output = (
                "### Escalated Issues\n"
                "Architecture decisions needed beyond plan scope\n\n"
                "### Verdict\n"
                "VERDICT: ESCALATE\n"
            )
            mock = _make_step_mock({
                "review_claude_reviewer": ["Review findings here"],
                "aggregate_review": ["Aggregated review\n\nAction items: fix X"],
                "coding": ["Fixed code"],
                "verify": [escalate_output],
            })

            with patch(_INVOKE_AGENT, side_effect=mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            self.assertGreater(len(result.escalated_issues), 0)


class TestAutoEscalateFiresWithoutSenior(unittest.TestCase):
    """Test 6: simple pipeline without senior, same FAIL feedback 3 times -> auto-escalate."""

    def test_auto_escalate_fires_without_senior(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            # No seniors -> review step has verdict=True
            config = _minimal_simple_config(
                Path(tmpdir),
                max_iterations=5,
                seniors=None,
            )
            # Same feedback mentioning the same file paths across all iterations
            repeated_fail = (
                "Issues found in src/auth.py: missing validation check.\n"
                "The file src/auth.py still has the same problem.\n\n"
                "VERDICT: FAIL"
            )
            mock = _make_step_mock({
                "coding": ["Coding output v1", "Coding output v2", "Coding output v3"],
                "review": [repeated_fail, repeated_fail, repeated_fail],
            })

            with patch(_INVOKE_AGENT, side_effect=mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            self.assertTrue(
                any("Auto-escalated" in iss for iss in result.escalated_issues),
            )


class TestAutoEscalateDoesNotFireWithSenior(unittest.TestCase):
    """Test 7: same repeated FAIL but WITH senior/aggregate step -> no auto-escalate."""

    def test_auto_escalate_does_not_fire_with_senior(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            # With seniors -> senior_review step has verdict=True, review does not
            config = _minimal_simple_config(
                Path(tmpdir),
                max_iterations=5,
                seniors=["claude-senior"],
            )
            repeated_fail_review = (
                "Issues found in src/auth.py: missing validation check.\n"
                "VERDICT: FAIL"
            )
            # Senior also returns FAIL but the auto-escalate should NOT fire
            # because has_aggregator is True (seniors list is populated)
            senior_fail = (
                "### Confirmed Issues\n"
                "- Missing validation in src/auth.py\n\n"
                "### Action Items\n"
                "1. Add validation in src/auth.py\n\n"
                "VERDICT: FAIL"
            )
            # One scripted output per iteration (max_iterations=5).
            mock = _make_step_mock({
                "coding": [f"Coding output v{n}" for n in range(1, 6)],
                "review": [repeated_fail_review] * 5,
                "senior_review": [senior_fail] * 5,
            })

            with patch(_INVOKE_AGENT, side_effect=mock):
                result = run_pipeline(config)

            # Should NOT auto-escalate; should reach max iterations
            self.assertNotEqual(result.final_verdict, "ESCALATE")
            self.assertEqual(result.final_verdict, "MAX_ITERATIONS_REACHED")
            self.assertEqual(len(result.iterations), 5)


class TestTrackerExtractionAcrossIterations(unittest.TestCase):
    """Test 8: senior review output with Issue Tracker table -> passed to next iteration."""

    def test_tracker_extraction_across_iterations(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(
                Path(tmpdir),
                max_iterations=3,
                seniors=["claude-senior"],
            )
            tracker_table = (
                "## Issue Tracker\n"
                "| ISS-ID | Severity | Description | Status | Since |\n"
                "|--------|----------|-------------|--------|-------|\n"
                "| ISS-001 | Critical | Missing auth check | Open | v1 |\n"
                "| ISS-002 | Major | No validation | Open | v1 |\n"
            )
            senior_output_v1 = (
                "### Confirmed Issues\n"
                "- Missing auth\n\n"
                f"{tracker_table}\n"
                "### Verdict\n"
                "VERDICT: FAIL"
            )
            senior_output_v2 = (
                "### Confirmed Issues\n"
                "- None remaining\n\n"
                "## Issue Tracker\n"
                "| ISS-ID | Severity | Description | Status | Since |\n"
                "|--------|----------|-------------|--------|-------|\n"
                "| ISS-001 | Critical | Missing auth check | Fixed | v1 |\n"
                "| ISS-002 | Major | No validation | Fixed | v1 |\n"
                "\n### Verdict\n"
                "VERDICT: PASS"
            )

            # Every agent invocation is recorded here, in call order.
            captured_prompts: list[dict[str, str]] = []

            def _tracking_mock(agent_config, prompt, step_name, **kwargs):
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                    "agent_name": agent_config.name,
                })
                if step_name == "coding":
                    output = "Coding output"
                elif step_name == "review":
                    output = "Review findings\n\nVERDICT: FAIL"
                elif step_name == "senior_review":
                    # The current call is already recorded above, so a count
                    # of 1 means this is the first senior review.
                    # First call: FAIL with tracker, later calls: PASS
                    senior_call_count = sum(
                        1
                        for p in captured_prompts
                        if p["step_name"] == "senior_review"
                    )
                    if senior_call_count <= 1:
                        output = senior_output_v1
                    else:
                        output = senior_output_v2
                else:
                    output = ""
                return _agent_result(agent_config, step_name, output)

            with patch(_INVOKE_AGENT, side_effect=_tracking_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)

            # Verify that a senior_review prompt issued AFTER the first senior
            # review contains the tracker table produced in iteration 1. The
            # first prompt is excluded: the tracker did not exist yet, so a
            # match there would not prove it was carried across iterations.
            senior_prompts = [
                p["prompt"]
                for p in captured_prompts
                if p["step_name"] == "senior_review"
            ]
            self.assertTrue(
                any(
                    "ISS-001" in senior_prompt
                    and "Missing auth check" in senior_prompt
                    for senior_prompt in senior_prompts[1:]
                ),
                "Expected previous_senior_tracker content (ISS-001) to appear "
                "in a senior_review prompt after the first iteration",
            )


if __name__ == "__main__":
    unittest.main()