feat: ESCALATE verdict, issue tracker, onboarding commands

Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across simple and phased pipelines. Senior reviewers can now escalate issues requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 18:19:05 +09:00
parent ee4f1a07ef
commit 204e071b74
15 changed files with 3032 additions and 156 deletions
--- a/tests/test_pipeline_integration.py
+++ b/tests/test_pipeline_integration.py
@@ -0,0 +1,461 @@
+"""Integration tests for cross-eval pipeline with mocked agents."""
+from __future__ import annotations
+
+import tempfile
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+
+from cross_eval.config import BUILTIN_AGENTS
+from cross_eval.models import (
+    AgentConfig,
+    AgentResult,
+    PhaseConfig,
+    PipelineConfig,
+    StepConfig,
+)
+from cross_eval.pipeline import run_pipeline
+from cross_eval.prompts import _build_review_fix_preset, _build_simple_preset
+
+
+def _make_mock_agent(outputs: list[str]):
+    """Build a side_effect callable that replays ``outputs`` in call order.
+
+    Each call returns the next entry of ``outputs`` wrapped in an
+    ``AgentResult`` with ``exit_code=0``.  Once every entry has been used,
+    the last entry is returned for all further calls.
+    """
+    calls_made = 0
+
+    def _mock(agent_config, prompt, step_name, **kwargs):
+        nonlocal calls_made
+        final_index = len(outputs) - 1
+        # Clamp to the last scripted output instead of running off the end.
+        position = calls_made if calls_made < final_index else final_index
+        calls_made += 1
+        return AgentResult(
+            output=outputs[position],
+            exit_code=0,
+            agent_name=agent_config.name,
+            step_name=step_name,
+            duration_seconds=0.1,
+        )
+
+    return _mock
+
+
+def _make_step_mock(step_outputs: dict[str, list[str]]):
+    """Return a side_effect that replays scripted outputs per ``step_name``.
+
+    Every step name keeps its own call counter.  The n-th call for a step
+    returns that step's n-th scripted output; once a step's outputs are
+    exhausted its last output is repeated (outputs are clamped, not cycled).
+    A step with no scripted outputs -- absent from ``step_outputs`` or mapped
+    to an empty list -- yields an empty string.
+    """
+    counters: dict[str, int] = {}
+
+    def _mock(agent_config, prompt, step_name, **kwargs):
+        # ``or`` also covers an explicitly empty list, which would otherwise
+        # clamp the index to -1 and raise IndexError.
+        outputs = step_outputs.get(step_name) or [""]
+        calls_so_far = counters.get(step_name, 0)
+        counters[step_name] = calls_so_far + 1
+        return AgentResult(
+            output=outputs[min(calls_so_far, len(outputs) - 1)],
+            exit_code=0,
+            agent_name=agent_config.name,
+            step_name=step_name,
+            duration_seconds=0.1,
+        )
+
+    return _mock
+
+
+def _minimal_simple_config(
+    run_dir: Path,
+    max_iterations: int = 3,
+    seniors: list[str] | None = None,
+) -> PipelineConfig:
+    """Assemble the smallest "simple" preset configuration used by these tests.
+
+    The config has one coder (``claude-coder``), one reviewer
+    (``claude-reviewer``) and the optional ``seniors`` (none when omitted).
+    Output goes to ``run_dir``; plan and checklist inputs are fixed
+    placeholder strings.
+    """
+    coder_names = ["claude-coder"]
+    reviewer_names = ["claude-reviewer"]
+    senior_names = [] if seniors is None else seniors
+    preset_steps = _build_simple_preset(coder_names, reviewer_names, senior_names)
+    # Copy the built-in agent table so the config does not share the module-level dict.
+    agent_table = dict(BUILTIN_AGENTS)
+    return PipelineConfig(
+        output_dir=run_dir,
+        max_iterations=max_iterations,
+        min_iterations=1,
+        language="en",
+        inputs={"plan": "Test plan", "checklist": "Test checklist"},
+        agents=agent_table,
+        coders=coder_names,
+        reviewers=reviewer_names,
+        seniors=senior_names,
+        pipeline=preset_steps,
+        preset_name="simple",
+    )
+
+
+class TestSimplePipelinePassStopsLoop(unittest.TestCase):
+    """Test 1: a PASS verdict on the first review ends the run after one iteration."""
+
+    def test_simple_pipeline_pass_stops_loop(self) -> None:
+        # Outputs are consumed in call order: coding first, then review.
+        scripted_outputs = [
+            "Coding output here",
+            "All good\n\nVERDICT: PASS",
+        ]
+
+        with tempfile.TemporaryDirectory() as workdir:
+            pipeline_config = _minimal_simple_config(Path(workdir))
+            fake_agent = _make_mock_agent(scripted_outputs)
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
+                outcome = run_pipeline(pipeline_config)
+
+            self.assertEqual(outcome.final_verdict, "PASS")
+            self.assertEqual(len(outcome.iterations), 1)
+
+
+class TestSimplePipelineFailThenPass(unittest.TestCase):
+    """Test 2: a FAIL review followed by a PASS review takes exactly two iterations."""
+
+    def test_simple_pipeline_fail_then_pass(self) -> None:
+        # Per-step scripts: the review fails once, then passes.
+        scripted_steps = {
+            "coding": ["Coding output v1", "Coding output v2"],
+            "review": [
+                "Issues found\n\nVERDICT: FAIL",
+                "All good\n\nVERDICT: PASS",
+            ],
+        }
+
+        with tempfile.TemporaryDirectory() as workdir:
+            pipeline_config = _minimal_simple_config(Path(workdir), max_iterations=5)
+            fake_agent = _make_step_mock(scripted_steps)
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
+                outcome = run_pipeline(pipeline_config)
+
+            self.assertEqual(outcome.final_verdict, "PASS")
+            self.assertEqual(len(outcome.iterations), 2)
+
+
+class TestSimplePipelineEscalateBreaksLoop(unittest.TestCase):
+    """Test 3: ESCALATE on review -> stops immediately, final_verdict=ESCALATE.
+
+    A senior is configured, and it is the senior_review step that returns
+    the ESCALATE verdict; the plain review step only reports FAIL.
+    """
+
+    def test_simple_pipeline_escalate_breaks_loop(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = _minimal_simple_config(
+                Path(tmpdir), max_iterations=5, seniors=["claude-senior"],
+            )
+
+            escalate_output = (
+                "### Confirmed Issues\n"
+                "- [Critical] Requirements are ambiguous\n\n"
+                "### Escalated Issues\n"
+                "Requirements need stakeholder clarification\n\n"
+                "### Verdict\n"
+                "VERDICT: ESCALATE\n"
+            )
+
+            mock = _make_step_mock({
+                "coding": ["Coding output"],
+                "review": ["Issues found\n\nVERDICT: FAIL"],
+                "senior_review": [escalate_output],
+            })
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
+                result = run_pipeline(config)
+
+            self.assertEqual(result.final_verdict, "ESCALATE")
+            # Up to 5 iterations are allowed; ESCALATE must stop after the first.
+            self.assertEqual(len(result.iterations), 1)
+            # assertGreater reports the actual count on failure, unlike
+            # assertTrue(len(...) > 0), which only says "False is not true".
+            self.assertGreater(len(result.escalated_issues), 0)
+
+
+class TestSimplePipelineEscalatePriorityOverPass(unittest.TestCase):
+    """Test 4: one verdict step returns PASS, another returns ESCALATE -> ESCALATE wins."""
+
+    def test_simple_pipeline_escalate_priority_over_pass(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Build a custom pipeline with 2 verdict steps (no senior), so
+            # both verdicts are produced within the same iteration.
+            steps = [
+                StepConfig(
+                    name="coding",
+                    agent="claude-coder",
+                    role="coding",
+                    prompt_template="default:coding",
+                    output_key="coding_output",
+                ),
+                StepConfig(
+                    name="review_a",
+                    agent="claude-reviewer",
+                    role="review",
+                    prompt_template="default:review",
+                    output_key="review_a_result",
+                    verdict=True,
+                ),
+                StepConfig(
+                    name="review_b",
+                    agent="claude-reviewer",
+                    role="review",
+                    prompt_template="default:review",
+                    output_key="review_b_result",
+                    verdict=True,
+                ),
+            ]
+            config = PipelineConfig(
+                output_dir=Path(tmpdir),
+                max_iterations=3,
+                min_iterations=1,
+                language="en",
+                inputs={"plan": "Test plan", "checklist": "Test checklist"},
+                agents=dict(BUILTIN_AGENTS),
+                coders=["claude-coder"],
+                reviewers=["claude-reviewer"],
+                pipeline=steps,
+                preset_name="custom",
+            )
+
+            escalate_output = (
+                "### Escalated Issues\n"
+                "Ambiguous requirements need clarification\n\n"
+                "VERDICT: ESCALATE\n"
+            )
+
+            # review_a passes while review_b escalates.
+            mock = _make_step_mock({
+                "coding": ["Coding output"],
+                "review_a": ["All good\n\nVERDICT: PASS"],
+                "review_b": [escalate_output],
+            })
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
+                result = run_pipeline(config)
+
+            self.assertEqual(result.final_verdict, "ESCALATE")
+            # assertGreater reports the actual count on failure, unlike
+            # assertTrue(len(...) > 0), which only says "False is not true".
+            self.assertGreater(len(result.escalated_issues), 0)
+
+
+class TestPhasedPipelineEscalateBreaksPhase(unittest.TestCase):
+    """Test 5: phased pipeline (review-fix), verify step returns ESCALATE -> phase stops."""
+
+    def test_phased_pipeline_escalate_breaks_phase(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            coders = ["claude-coder"]
+            reviewers = ["claude-reviewer"]
+            seniors = ["claude-senior"]
+            phases = _build_review_fix_preset(coders, reviewers, seniors)
+
+            # Phased configs pass ``phases`` instead of a flat ``pipeline`` list.
+            config = PipelineConfig(
+                output_dir=Path(tmpdir),
+                max_iterations=5,
+                min_iterations=1,
+                language="en",
+                inputs={"plan": "Test plan", "checklist": "Test checklist"},
+                agents=dict(BUILTIN_AGENTS),
+                coders=coders,
+                reviewers=reviewers,
+                seniors=seniors,
+                phases=phases,
+                preset_name="review-fix",
+            )
+
+            escalate_output = (
+                "### Escalated Issues\n"
+                "Architecture decisions needed beyond plan scope\n\n"
+                "### Verdict\n"
+                "VERDICT: ESCALATE\n"
+            )
+
+            # Only the verify step escalates; step names not listed here get
+            # an empty output from the step mock.
+            mock = _make_step_mock({
+                "review_claude_reviewer": ["Review findings here"],
+                "aggregate_review": ["Aggregated review\n\nAction items: fix X"],
+                "coding": ["Fixed code"],
+                "verify": [escalate_output],
+            })
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
+                result = run_pipeline(config)
+
+            self.assertEqual(result.final_verdict, "ESCALATE")
+            # assertGreater reports the actual count on failure, unlike
+            # assertTrue(len(...) > 0), which only says "False is not true".
+            self.assertGreater(len(result.escalated_issues), 0)
+
+
+class TestAutoEscalateFiresWithoutSenior(unittest.TestCase):
+    """Test 6: with no senior, identical FAIL feedback repeated 3 times auto-escalates."""
+
+    def test_auto_escalate_fires_without_senior(self) -> None:
+        # Identical feedback, naming the same file path, for every review call.
+        stuck_feedback = (
+            "Issues found in src/auth.py: missing validation check.\n"
+            "The file src/auth.py still has the same problem.\n\n"
+            "VERDICT: FAIL"
+        )
+        scripted_steps = {
+            "coding": ["Coding output v1", "Coding output v2", "Coding output v3"],
+            "review": [stuck_feedback] * 3,
+        }
+
+        with tempfile.TemporaryDirectory() as workdir:
+            # seniors=None leaves the senior list empty (no senior step).
+            pipeline_config = _minimal_simple_config(
+                Path(workdir), max_iterations=5, seniors=None,
+            )
+            fake_agent = _make_step_mock(scripted_steps)
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
+                outcome = run_pipeline(pipeline_config)
+
+            self.assertEqual(outcome.final_verdict, "ESCALATE")
+            auto_escalated = any(
+                "Auto-escalated" in issue for issue in outcome.escalated_issues
+            )
+            self.assertTrue(auto_escalated)
+
+
+class TestAutoEscalateDoesNotFireWithSenior(unittest.TestCase):
+    """Test 7: repeated FAIL with a senior step configured must not auto-escalate."""
+
+    def test_auto_escalate_does_not_fire_with_senior(self) -> None:
+        reviewer_feedback = (
+            "Issues found in src/auth.py: missing validation check.\n"
+            "VERDICT: FAIL"
+        )
+        # The senior keeps failing too; the run is still expected to end by
+        # exhausting its iterations rather than by auto-escalation.
+        senior_feedback = (
+            "### Confirmed Issues\n"
+            "- Missing validation in src/auth.py\n\n"
+            "### Action Items\n"
+            "1. Add validation in src/auth.py\n\n"
+            "VERDICT: FAIL"
+        )
+        iteration_cap = 5
+        scripted_steps = {
+            "coding": [
+                "Coding output v1",
+                "Coding output v2",
+                "Coding output v3",
+                "Coding output v4",
+                "Coding output v5",
+            ],
+            "review": [reviewer_feedback] * iteration_cap,
+            "senior_review": [senior_feedback] * iteration_cap,
+        }
+
+        with tempfile.TemporaryDirectory() as workdir:
+            pipeline_config = _minimal_simple_config(
+                Path(workdir), max_iterations=5, seniors=["claude-senior"],
+            )
+            fake_agent = _make_step_mock(scripted_steps)
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
+                outcome = run_pipeline(pipeline_config)
+
+            # No auto-escalation: the loop runs until the iteration cap.
+            self.assertNotEqual(outcome.final_verdict, "ESCALATE")
+            self.assertEqual(outcome.final_verdict, "MAX_ITERATIONS_REACHED")
+            self.assertEqual(len(outcome.iterations), 5)
+
+
+class TestTrackerExtractionAcrossIterations(unittest.TestCase):
+    """Test 8: senior review output with Issue Tracker table -> passed to next iteration.
+
+    The first senior review fails and emits a tracker table; the second
+    passes.  The test checks that the prompt of the second senior_review
+    call contains the tracker rows produced by the first.
+    """
+
+    def test_tracker_extraction_across_iterations(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = _minimal_simple_config(
+                Path(tmpdir), max_iterations=3, seniors=["claude-senior"],
+            )
+
+            tracker_table = (
+                "## Issue Tracker\n"
+                "| ISS-ID | Severity | Description | Status | Since |\n"
+                "|--------|----------|-------------|--------|-------|\n"
+                "| ISS-001 | Critical | Missing auth check | Open | v1 |\n"
+                "| ISS-002 | Major | No validation | Open | v1 |\n"
+            )
+            senior_output_v1 = (
+                "### Confirmed Issues\n"
+                "- Missing auth\n\n"
+                f"{tracker_table}\n"
+                "### Verdict\n"
+                "VERDICT: FAIL"
+            )
+            senior_output_v2 = (
+                "### Confirmed Issues\n"
+                "- None remaining\n\n"
+                "## Issue Tracker\n"
+                "| ISS-ID | Severity | Description | Status | Since |\n"
+                "|--------|----------|-------------|--------|-------|\n"
+                "| ISS-001 | Critical | Missing auth check | Fixed | v1 |\n"
+                "| ISS-002 | Major | No validation | Fixed | v1 |\n"
+                "\n### Verdict\n"
+                "VERDICT: PASS"
+            )
+
+            # Every agent invocation is recorded so prompts can be inspected.
+            captured_prompts: list[dict[str, str]] = []
+
+            def _tracking_mock(agent_config, prompt, step_name, **kwargs):
+                captured_prompts.append({
+                    "step_name": step_name,
+                    "prompt": prompt,
+                    "agent_name": agent_config.name,
+                })
+                if step_name == "coding":
+                    output = "Coding output"
+                elif step_name == "review":
+                    output = "Review findings\n\nVERDICT: FAIL"
+                elif step_name == "senior_review":
+                    # The current call is already recorded above, so a count
+                    # of 1 means this is the first senior review.
+                    senior_calls = sum(
+                        1 for p in captured_prompts
+                        if p["step_name"] == "senior_review"
+                    )
+                    # First call: FAIL with tracker, later calls: PASS
+                    output = senior_output_v1 if senior_calls <= 1 else senior_output_v2
+                else:
+                    output = ""
+                return AgentResult(
+                    output=output,
+                    exit_code=0,
+                    agent_name=agent_config.name,
+                    step_name=step_name,
+                    duration_seconds=0.1,
+                )
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=_tracking_mock):
+                result = run_pipeline(config)
+
+            self.assertEqual(result.final_verdict, "PASS")
+            self.assertEqual(len(result.iterations), 2)
+
+            # Verify that the second iteration's senior_review prompt contains
+            # the tracker table from iteration 1.  Checking that specific
+            # prompt (rather than "any senior_review prompt") pins down which
+            # call actually received the tracker.
+            senior_prompts = [
+                p["prompt"] for p in captured_prompts
+                if p["step_name"] == "senior_review"
+            ]
+            self.assertGreaterEqual(
+                len(senior_prompts), 2,
+                "Expected a senior_review call in each of the two iterations",
+            )
+            for tracker_fragment in ("ISS-001", "Missing auth check"):
+                self.assertIn(
+                    tracker_fragment,
+                    senior_prompts[1],
+                    "Expected previous_senior_tracker content (ISS-001) to appear "
+                    "in the second senior_review prompt",
+                )
+
+
+# Run the tests in this module when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()