feat: ESCALATE verdict, issue tracker, onboarding commands

Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across simple and phased pipelines. Senior reviewers can now escalate issues requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 18:19:05 +09:00
parent ee4f1a07ef
commit 204e071b74
15 changed files with 3032 additions and 156 deletions
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1,19 +1,25 @@
 from __future__ import annotations

+import tempfile
 import unittest
+from pathlib import Path
 from unittest.mock import patch

-from cross_eval.agent import _supports_reasoning_effort
+from cross_eval.agent import AgentInvocationError, _supports_reasoning_effort
+from cross_eval.cli import _apply_phased_iteration_override
 from cross_eval.agent import invoke_agent
 from cross_eval.config import (
    BUILTIN_AGENTS,
    _default_seniors_for_preset,
    apply_reasoning_effort_settings,
    normalize_reasoning_effort,
+    normalize_prompt_template,
+    normalize_step_role,
    validate_config,
 )
 from cross_eval.models import (
    AgentConfig,
+    AgentResult,
    IterationResult,
    PhaseConfig,
    PipelineConfig,
@@ -21,25 +27,53 @@ from cross_eval.models import (
    ReviewMetrics,
    StepConfig,
 )
-from cross_eval.pipeline import _detect_repeated_aggregate
+from cross_eval.pipeline import (
+    _detect_auto_escalate,
+    _detect_repeated_aggregate,
+    _execute_parallel_batch,
+    _extract_senior_tracker,
+    _extract_verdict,
+)
 from cross_eval.prompts import (
-    GENERATE_TEMPLATE,
-    GENERATE_TEMPLATE_KO,
+    CODING_TEMPLATE,
+    CODING_TEMPLATE_KO,
    REVIEW_TEMPLATE,
    REVIEW_TEMPLATE_KO,
+    PLAN_REVIEW_TEMPLATE,
+    PLAN_REVIEW_TEMPLATE_KO,
    REVIEW_ONLY_TEMPLATE,
    REVIEW_ONLY_TEMPLATE_KO,
    AGGREGATE_REVIEW_TEMPLATE,
    AGGREGATE_REVIEW_TEMPLATE_KO,
    _build_cross_review_preset,
+    _build_coding_review_fix_preset,
+    _build_plan_review_preset,
    _build_review_fix_preset,
    _build_review_only_preset,
    _build_simple_preset,
 )
-from cross_eval.report import build_report, parse_review_metrics
-
+from cross_eval.config import _SENIOR_SYSTEM_PROMPT
+from cross_eval.report import build_report, parse_review_metrics, print_escalation_report

 class BuiltinAgentConfigTest(unittest.TestCase):
+    def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None:
+        for agent_name in ("claude-coder", "claude-reviewer", "claude-senior"):
+            with self.subTest(agent=agent_name):
+                args = BUILTIN_AGENTS[agent_name].args
+                self.assertIn("--setting-sources", args)
+                self.assertIn("user", args)
+                self.assertIn("--disable-slash-commands", args)
+
+    def test_claude_builtin_agents_use_role_specific_permission_modes(self) -> None:
+        coder_args = BUILTIN_AGENTS["claude-coder"].args
+        reviewer_args = BUILTIN_AGENTS["claude-reviewer"].args
+        senior_args = BUILTIN_AGENTS["claude-senior"].args
+
+        self.assertIn("--dangerously-skip-permissions", coder_args)
+        self.assertIn("bypassPermissions", coder_args)
+        self.assertIn("plan", reviewer_args)
+        self.assertIn("plan", senior_args)
+
    def test_codex_builtin_agents_skip_git_repo_check(self) -> None:
        for agent_name in ("codex-coder", "codex-reviewer", "codex-senior"):
            with self.subTest(agent=agent_name):
@@ -62,6 +96,10 @@ class BuiltinAgentConfigTest(unittest.TestCase):
        self.assertEqual(normalize_reasoning_effort("extra_high"), "xhigh")
        self.assertEqual(normalize_reasoning_effort("x-high"), "xhigh")

+    def test_normalize_step_role_and_template_aliases(self) -> None:
+        self.assertEqual(normalize_step_role("coding"), "coding")
+        self.assertEqual(normalize_prompt_template("default:coding"), "default:coding")
+
    def test_apply_reasoning_effort_settings_uses_defaults_and_role_overrides(self) -> None:
        config = PipelineConfig(
            agents={
@@ -116,6 +154,123 @@ class BuiltinAgentConfigTest(unittest.TestCase):
            ["codex", "-c", 'model_reasoning_effort="high"'],
        )

+    def test_invoke_agent_classifies_auth_failures(self) -> None:
+        def _fake_run(cmd, **kwargs):
+            class _Result:
+                returncode = 1
+                stdout = ""
+                stderr = "Not logged in · Please run /login"
+
+            return _Result()
+
+        agent = AgentConfig(
+            name="claude-reviewer",
+            command="claude",
+            args=["-p", "--model", "opus"],
+        )
+
+        with patch("subprocess.run", side_effect=_fake_run):
+            with self.assertRaises(AgentInvocationError) as ctx:
+                invoke_agent(agent, "prompt", "review", quiet=True)
+
+        self.assertEqual(ctx.exception.failure_type, "AUTH")
+        self.assertIn("Re-authenticate", ctx.exception.suggested_action)
+
+    def test_invoke_agent_classifies_usage_limit_failures(self) -> None:
+        def _fake_run(cmd, **kwargs):
+            class _Result:
+                returncode = 1
+                stdout = ""
+                stderr = "API Error: 429 rate limit exceeded for current quota"
+
+            return _Result()
+
+        agent = AgentConfig(
+            name="codex-reviewer",
+            command="codex",
+            args=["exec", "--model", "gpt-5.4", "-"],
+        )
+
+        with patch("subprocess.run", side_effect=_fake_run):
+            with self.assertRaises(AgentInvocationError) as ctx:
+                invoke_agent(agent, "prompt", "review", quiet=True)
+
+        self.assertEqual(ctx.exception.failure_type, "USAGE_LIMIT")
+        self.assertIn("quota", ctx.exception.suggested_action)
+
+    def test_parallel_batch_saves_successes_before_failure(self) -> None:
+        config = PipelineConfig(
+            agents={
+                "ok-reviewer": AgentConfig(name="ok-reviewer", command="codex"),
+                "bad-reviewer": AgentConfig(name="bad-reviewer", command="claude"),
+            },
+        )
+        steps = [
+            StepConfig(
+                name="review_ok",
+                agent="ok-reviewer",
+                role="review",
+                prompt_template="default:review-only",
+                output_key="review_ok",
+                parallel=True,
+            ),
+            StepConfig(
+                name="review_bad",
+                agent="bad-reviewer",
+                role="review",
+                prompt_template="default:review-only",
+                output_key="review_bad",
+                parallel=True,
+            ),
+        ]
+        step_outputs: dict[str, str] = {}
+        step_results: dict[str, AgentResult] = {}
+
+        def _fake_invoke(agent, prompt, step_name, **kwargs):
+            if step_name == "review_ok":
+                return AgentResult(
+                    output="VERDICT: PASS",
+                    exit_code=0,
+                    agent_name=agent.name,
+                    step_name=step_name,
+                    duration_seconds=1.0,
+                )
+            raise AgentInvocationError(
+                agent_name=agent.name,
+                step_name=step_name,
+                cmd_preview="claude -p ...",
+                raw_error="API Error: 429 rate limit exceeded for current quota",
+                failure_type="USAGE_LIMIT",
+                suggested_action="Agent CLI hit a quota, billing, or token budget limit. Refill or raise the limit, then rerun.",
+            )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
+                with self.assertRaises(RuntimeError) as ctx:
+                    _execute_parallel_batch(
+                        steps,
+                        config,
+                        input_contents={},
+                        feedback="",
+                        iteration=1,
+                        max_iterations=3,
+                        cwd=Path(tmpdir),
+                        timeout=None,
+                        dry_run=False,
+                        step_outputs=step_outputs,
+                        step_results=step_results,
+                        run_dir=Path(tmpdir),
+                        output_iter=1,
+                    )
+
+            self.assertIn("Successful outputs were saved for: review_ok", str(ctx.exception))
+            self.assertEqual(step_outputs["review_ok"], "VERDICT: PASS")
+            self.assertTrue((Path(tmpdir) / "v1" / "review_ok.md").exists())
+            error_path = Path(tmpdir) / "v1" / "review_bad_error.md"
+            self.assertTrue(error_path.exists())
+            self.assertIn("Failure Type", error_path.read_text(encoding="utf-8"))
+            self.assertIn("USAGE_LIMIT", error_path.read_text(encoding="utf-8"))
+
    def test_detect_repeated_aggregate_warns_on_same_output(self) -> None:
        steps = [
            StepConfig(
@@ -169,6 +324,14 @@ class BuiltinAgentConfigTest(unittest.TestCase):
            ),
            ["claude-senior"],
        )
+        self.assertEqual(
+            _default_seniors_for_preset(
+                "preset:coding-review-fix",
+                ["codex-reviewer"],
+                BUILTIN_AGENTS,
+            ),
+            ["codex-senior"],
+        )
        self.assertEqual(
            _default_seniors_for_preset(
                "preset:simple",
@@ -204,9 +367,37 @@ class BuiltinAgentConfigTest(unittest.TestCase):
        )
        self.assertEqual(
            [step.name for step in converge.steps[3:]],
-            ["aggregate_review", "generate", "verify"],
+            ["aggregate_review", "coding", "verify"],
        )

+    def test_coding_review_fix_starts_with_single_coding_phase(self) -> None:
+        phases = _build_coding_review_fix_preset(
+            ["codex-coder"],
+            ["claude-reviewer", "codex-reviewer"],
+            ["codex-senior"],
+        )
+
+        self.assertEqual([phase.name for phase in phases], ["initial_coding", "review_fix"])
+        self.assertEqual(phases[0].max_iterations, 1)
+        self.assertEqual([step.name for step in phases[0].steps], ["coding"])
+        self.assertEqual([step.name for step in phases[1].steps[2:]], ["aggregate_review", "coding", "verify"])
+
+    def test_apply_phased_iteration_override_updates_only_verdict_phases(self) -> None:
+        config = PipelineConfig(
+            phases=_build_coding_review_fix_preset(
+                ["codex-coder"],
+                ["codex-reviewer"],
+                ["codex-senior"],
+            ),
+        )
+
+        _apply_phased_iteration_override(config, 10)
+
+        self.assertEqual(config.phases[0].name, "initial_coding")
+        self.assertEqual(config.phases[0].max_iterations, 1)
+        self.assertEqual(config.phases[1].name, "review_fix")
+        self.assertEqual(config.phases[1].max_iterations, 10)
+
    def test_review_only_duplicate_reviewers_get_unique_step_keys(self) -> None:
        steps = _build_review_only_preset(
            ["codex-coder"],
@@ -219,6 +410,31 @@ class BuiltinAgentConfigTest(unittest.TestCase):
            ["review_codex_reviewer", "review_codex_reviewer_2"],
        )

+    def test_plan_review_duplicate_reviewers_get_unique_step_keys(self) -> None:
+        steps = _build_plan_review_preset(
+            ["codex-coder"],
+            ["codex-reviewer", "codex-reviewer"],
+            [],
+        )
+
+        self.assertEqual(
+            [step.output_key for step in steps],
+            ["plan_review_codex_reviewer", "plan_review_codex_reviewer_2"],
+        )
+
+    def test_plan_review_with_senior_adds_aggregate_step(self) -> None:
+        steps = _build_plan_review_preset(
+            ["codex-coder"],
+            ["claude-reviewer", "codex-reviewer"],
+            ["claude-senior"],
+        )
+
+        self.assertEqual(steps[-1].name, "senior_review")
+        self.assertEqual(steps[-1].agent, "claude-senior")
+        self.assertTrue(steps[-1].verdict)
+        self.assertFalse(steps[0].verdict)
+        self.assertFalse(steps[1].verdict)
+
    def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None:
        steps = _build_cross_review_preset(
            ["codex-coder", "codex-coder"],
@@ -246,7 +462,7 @@ class BuiltinAgentConfigTest(unittest.TestCase):
        steps = phases[0].steps
        self.assertEqual(steps[2].name, "aggregate_review")
        self.assertEqual(steps[2].agent, "codex-senior")
-        self.assertEqual(steps[3].name, "generate")
+        self.assertEqual(steps[3].name, "coding")
        self.assertEqual(steps[4].name, "verify")
        self.assertEqual(steps[4].agent, "codex-senior")
        self.assertTrue(steps[4].verdict)
@@ -273,7 +489,7 @@ class BuiltinAgentConfigTest(unittest.TestCase):

        self.assertEqual(
            [step.name for step in steps],
-            ["generate", "review", "senior_review"],
+            ["coding", "review", "senior_review"],
        )
        self.assertFalse(steps[1].verdict)
        self.assertTrue(steps[2].verdict)
@@ -325,6 +541,8 @@ class PromptTemplateTest(unittest.TestCase):
        for tmpl, label in [
            (REVIEW_TEMPLATE, "REVIEW_TEMPLATE"),
            (REVIEW_TEMPLATE_KO, "REVIEW_TEMPLATE_KO"),
+            (PLAN_REVIEW_TEMPLATE, "PLAN_REVIEW_TEMPLATE"),
+            (PLAN_REVIEW_TEMPLATE_KO, "PLAN_REVIEW_TEMPLATE_KO"),
            (REVIEW_ONLY_TEMPLATE, "REVIEW_ONLY_TEMPLATE"),
            (REVIEW_ONLY_TEMPLATE_KO, "REVIEW_ONLY_TEMPLATE_KO"),
        ]:
@@ -351,10 +569,10 @@ class PromptTemplateTest(unittest.TestCase):
                self.assertIn("CONFIRMED", tmpl)
                self.assertIn("DISMISSED", tmpl)

-    def test_generate_templates_ignore_dismissed(self) -> None:
-        """Generate templates should tell coder to ignore DISMISSED items."""
-        self.assertIn("DISMISSED", GENERATE_TEMPLATE)
-        self.assertIn("DISMISSED", GENERATE_TEMPLATE_KO)
+    def test_coding_templates_ignore_dismissed(self) -> None:
+        """Coding templates should tell coder to ignore DISMISSED items."""
+        self.assertIn("DISMISSED", CODING_TEMPLATE)
+        self.assertIn("DISMISSED", CODING_TEMPLATE_KO)

    def test_aggregate_templates_dismissed_structure(self) -> None:
        """Aggregate templates should use [False positive] / [Already fixed] tags."""
@@ -487,11 +705,11 @@ class ReviewMetricsParsingTest(unittest.TestCase):
            language="en",
            pipeline=[
                StepConfig(
-                    name="generate",
+                    name="coding",
                    agent="claude-coder",
-                    role="generate",
-                    prompt_template="default:generate",
-                    output_key="generated_code",
+                    role="coding",
+                    prompt_template="default:coding",
+                    output_key="coding_output",
                    verdict=True,
                ),
            ],
@@ -500,7 +718,7 @@ class ReviewMetricsParsingTest(unittest.TestCase):
            iterations=[
                IterationResult(
                    iteration=1,
-                    step_outputs={"generated_code": "some code"},
+                    step_outputs={"coding_output": "some code"},
                    verdict="PASS",
                ),
            ],
@@ -511,5 +729,230 @@ class ReviewMetricsParsingTest(unittest.TestCase):
        self.assertNotIn("Review Metrics", report)


+class EscalateVerdictTest(unittest.TestCase):
+    """Test ESCALATE verdict functionality."""
+
+    def test_extract_verdict_escalate(self) -> None:
+        output = "Some review content\n\nVERDICT: ESCALATE\n"
+        result = _extract_verdict(output, r"VERDICT:\s*PASS")
+        self.assertEqual(result, "ESCALATE")
+
+    def test_extract_verdict_escalate_priority(self) -> None:
+        """ESCALATE should take priority even if PASS pattern also matches."""
+        output = "VERDICT: PASS\n\nVERDICT: ESCALATE\n"
+        result = _extract_verdict(output, r"VERDICT:\s*PASS")
+        self.assertEqual(result, "ESCALATE")
+
+    def test_extract_verdict_pass_still_works(self) -> None:
+        output = "All good\n\nVERDICT: PASS\n"
+        result = _extract_verdict(output, r"VERDICT:\s*PASS")
+        self.assertEqual(result, "PASS")
+
+    def test_extract_verdict_fail_still_works(self) -> None:
+        output = "Issues found\n\nVERDICT: FAIL\n"
+        result = _extract_verdict(output, r"VERDICT:\s*PASS")
+        self.assertEqual(result, "FAIL")
+
+    def test_extract_senior_tracker(self) -> None:
+        output = (
+            "Some text\n\n"
+            "## Issue Tracker\n"
+            "| ISS-ID | Severity | Description | Status | Since |\n"
+            "|--------|----------|-------------|--------|-------|\n"
+            "| ISS-001 | Critical | Missing auth | Open | v1 |\n"
+            "| ISS-002 | Major | Bad naming | Fixed | v1 |\n"
+            "\nMore text"
+        )
+        tracker = _extract_senior_tracker(output)
+        self.assertIn("Issue Tracker", tracker)
+        self.assertIn("ISS-001", tracker)
+        self.assertIn("ISS-002", tracker)
+
+    def test_extract_senior_tracker_empty(self) -> None:
+        output = "No tracker table here"
+        tracker = _extract_senior_tracker(output)
+        self.assertEqual(tracker, "")
+
+    def test_auto_escalate_heuristic(self) -> None:
+        prev1 = "Issue in src/auth.py: missing validation"
+        prev2 = "Issue in src/auth.py: validation still missing"
+        current = "Issue in src/auth.py: validation not implemented"
+
+        # Should detect repeated issue
+        self.assertTrue(_detect_auto_escalate([prev1, prev2], current, threshold=2))
+
+    def test_auto_escalate_no_repeat(self) -> None:
+        prev1 = "Issue in src/auth.py: missing validation"
+        current = "Issue in src/database.py: connection pool"
+
+        self.assertFalse(_detect_auto_escalate([prev1], current, threshold=2))
+
+    def test_auto_escalate_different_issues_same_file(self) -> None:
+        """Same file path but different issues should NOT trigger escalation."""
+        prev1 = "Issue in src/utils.py: missing validation on input"
+        prev2 = "Issue in src/utils.py: unused import at top of file"
+        current = "Issue in src/utils.py: error handling not implemented"
+
+        # All mention src/utils.py, but the issue keywords differ across
+        # iterations, so this should NOT escalate.
+        self.assertFalse(_detect_auto_escalate([prev1, prev2], current, threshold=2))
+
+    def test_report_escalate_verdict(self) -> None:
+        config = PipelineConfig(language="en")
+        result = PipelineResult(
+            final_verdict="ESCALATE",
+            escalated_issues=["Requirements are ambiguous — need stakeholder input"],
+        )
+
+        report = build_report(config, result)
+
+        self.assertIn("ESCALATE", report)
+        self.assertIn("Human review required", report)
+        self.assertIn("ambiguous", report)
+
+    def test_report_escalate_verdict_ko(self) -> None:
+        config = PipelineConfig(language="ko")
+        result = PipelineResult(
+            final_verdict="ESCALATE",
+            escalated_issues=["요구사항이 모호함"],
+        )
+
+        report = build_report(config, result)
+
+        self.assertIn("ESCALATE", report)
+        self.assertIn("사람의 확인이 필요합니다", report)
+
+    def test_exit_code_escalate(self) -> None:
+        from cross_eval.cli import main
+
+        mock_result = PipelineResult(
+            final_verdict="ESCALATE",
+            escalated_issues=["Needs human review"],
+        )
+
+        with patch("cross_eval.config.load_config") as mock_load, \
+             patch("cross_eval.config.validate_config", return_value=[]), \
+             patch("cross_eval.pipeline.run_pipeline", return_value=mock_result), \
+             patch("cross_eval.report.print_escalation_report"):
+            mock_config = PipelineConfig(
+                pipeline=[
+                    StepConfig(
+                        name="review",
+                        agent="claude-reviewer",
+                        role="review",
+                        prompt_template="default:review",
+                        output_key="review_result",
+                        verdict=True,
+                    ),
+                ],
+                agents=dict(BUILTIN_AGENTS),
+                coders=["claude-coder"],
+                reviewers=["claude-reviewer"],
+                inputs={"plan": Path("/tmp/plan.md")},
+                language="en",
+                max_iterations=3,
+                preset_name="simple",
+            )
+            mock_load.return_value = mock_config
+
+            with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w") as f:
+                f.write("inputs:\n  plan: /tmp/plan.md\n")
+                f.flush()
+                exit_code = main(["run", "-c", f.name])
+
+            self.assertEqual(exit_code, 2)
+
+    def test_senior_prompt_includes_escalate(self) -> None:
+        self.assertIn("ESCALATE", _SENIOR_SYSTEM_PROMPT)
+        self.assertIn("ambiguous", _SENIOR_SYSTEM_PROMPT.lower())
+
+    def test_aggregate_template_has_tracker(self) -> None:
+        self.assertIn("{previous_senior_tracker}", AGGREGATE_REVIEW_TEMPLATE)
+        self.assertIn("Issue Tracker", AGGREGATE_REVIEW_TEMPLATE)
+        self.assertIn("VERDICT: ESCALATE", AGGREGATE_REVIEW_TEMPLATE)
+
+    def test_report_includes_issue_tracker_summary(self) -> None:
+        config = PipelineConfig(
+            language="en",
+            pipeline=[
+                StepConfig(
+                    name="review",
+                    agent="claude-reviewer",
+                    role="review",
+                    prompt_template="default:review",
+                    output_key="review_result",
+                    verdict=True,
+                ),
+            ],
+        )
+        result = PipelineResult(
+            iterations=[
+                IterationResult(
+                    iteration=1,
+                    step_outputs={
+                        "review_result": (
+                            "### Issues Found\n"
+                            "- ISS-001 [Critical][Omission] Missing auth check\n"
+                            "- ISS-002 [Major][Omission] No input validation\n"
+                            "### Verdict\nVERDICT: FAIL"
+                        ),
+                    },
+                    verdict="FAIL",
+                ),
+            ],
+            final_verdict="FAIL",
+        )
+
+        report = build_report(config, result)
+        self.assertIn("Issue Tracker Summary", report)
+        self.assertIn("ISS-001", report)
+        self.assertIn("ISS-002", report)
+
+    def test_report_includes_senior_tracker_table(self) -> None:
+        config = PipelineConfig(
+            language="en",
+            pipeline=[
+                StepConfig(
+                    name="senior_review",
+                    agent="claude-senior",
+                    role="review",
+                    prompt_template="default:aggregate-review",
+                    output_key="senior_review_result",
+                    verdict=True,
+                ),
+            ],
+        )
+        result = PipelineResult(
+            iterations=[
+                IterationResult(
+                    iteration=1,
+                    step_outputs={
+                        "senior_review_result": (
+                            "### Confirmed Issues\n- Missing auth\n\n"
+                            "## Issue Tracker\n"
+                            "| ISS-ID | Severity | Description | Status | Since |\n"
+                            "|--------|----------|-------------|--------|-------|\n"
+                            "| ISS-001 | Critical | Missing auth check | Open | v1 |\n"
+                            "| ISS-002 | Major | No validation | Fixed | v1 |\n"
+                            "\n### Verdict\nVERDICT: FAIL"
+                        ),
+                    },
+                    verdict="FAIL",
+                ),
+            ],
+            final_verdict="FAIL",
+        )
+
+        report = build_report(config, result)
+        self.assertIn("Issue Tracker Summary", report)
+        self.assertIn("ISS-001", report)
+        self.assertIn("Fixed", report)
+
+    def test_aggregate_template_ko_has_tracker(self) -> None:
+        self.assertIn("{previous_senior_tracker}", AGGREGATE_REVIEW_TEMPLATE_KO)
+        self.assertIn("이슈 트래커", AGGREGATE_REVIEW_TEMPLATE_KO)
+        self.assertIn("VERDICT: ESCALATE", AGGREGATE_REVIEW_TEMPLATE_KO)
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_onboarding.py
+++ b/tests/test_onboarding.py
@@ -0,0 +1,267 @@
+"""Tests for doctor, demo, and guided init features."""
+from __future__ import annotations
+
+import tempfile
+import unittest
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+from cross_eval.doctor import (
+    DoctorCheck,
+    check_cli_installed,
+    check_config,
+    format_doctor_results,
+    run_doctor,
+)
+from cross_eval.demo import (
+    DEMO_CHECKLIST,
+    DEMO_PLAN,
+    run_mock_demo,
+)
+from cross_eval.cli import (
+    _generate_guided_config,
+    _prompt_choice,
+    _prompt_text,
+    main,
+)
+
+
+# ---------------------------------------------------------------------------
+# Doctor tests
+# ---------------------------------------------------------------------------
+
+class DoctorCheckInstalledTest(unittest.TestCase):
+    def test_check_cli_installed_found(self) -> None:
+        with patch("cross_eval.doctor.shutil.which", return_value="/usr/bin/python3"):
+            with patch("cross_eval.doctor.subprocess.run") as mock_run:
+                mock_run.return_value = MagicMock(
+                    stdout="Python 3.12.0", stderr=""
+                )
+                found, version = check_cli_installed("python3")
+
+        self.assertTrue(found)
+        self.assertIn("Python", version)
+
+    def test_check_cli_installed_not_found(self) -> None:
+        with patch("cross_eval.doctor.shutil.which", return_value=None):
+            found, msg = check_cli_installed("nonexistent-tool")
+
+        self.assertFalse(found)
+        self.assertIn("not found", msg)
+
+    def test_check_config_exists_valid(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            ce_dir = Path(tmpdir) / ".cross-eval"
+            ce_dir.mkdir()
+            config_path = ce_dir / "config.yaml"
+            config_path.write_text(
+                "inputs:\n  plan: plan.md\ncoders: [claude-coder]\n"
+                "reviewers: [claude-reviewer]\npipeline: preset:simple\n",
+                encoding="utf-8",
+            )
+            # Also create plan.md so validation passes
+            (ce_dir / "plan.md").write_text("# Plan", encoding="utf-8")
+
+            ok, path, errors = check_config(Path(tmpdir))
+
+        self.assertTrue(ok)
+        self.assertIsNotNone(path)
+        self.assertEqual(errors, [])
+
+    def test_check_config_not_exists(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            ok, path, errors = check_config(Path(tmpdir))
+
+        self.assertFalse(ok)
+        self.assertIsNone(path)
+
+    def test_check_config_invalid(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            ce_dir = Path(tmpdir) / ".cross-eval"
+            ce_dir.mkdir()
+            # Valid YAML but missing required fields → validation fails
+            (ce_dir / "config.yaml").write_text(
+                "inputs:\n  plan: /nonexistent/plan.md\n",
+                encoding="utf-8",
+            )
+
+            ok, path, errors = check_config(Path(tmpdir))
+
+        self.assertFalse(ok)
+        self.assertIsNotNone(path)
+
+    def test_format_doctor_results_all_pass(self) -> None:
+        checks = [
+            DoctorCheck("test", True, True, "ok"),
+            DoctorCheck("test2", True, False, "ok"),
+        ]
+        output = format_doctor_results(checks)
+        self.assertIn("✓", output)
+        self.assertIn("All checks passed", output)
+
+    def test_format_doctor_results_critical_fail(self) -> None:
+        checks = [
+            DoctorCheck("claude CLI", False, True, "not found"),
+        ]
+        output = format_doctor_results(checks)
+        self.assertIn("✗", output)
+        self.assertIn("critical", output.lower())
+
+    def test_cmd_doctor_returns_0_all_pass(self) -> None:
+        with patch("cross_eval.doctor.run_doctor") as mock:
+            mock.return_value = [
+                DoctorCheck("test", True, True, "ok"),
+            ]
+            exit_code = main(["doctor"])
+        self.assertEqual(exit_code, 0)
+
+    def test_cmd_doctor_returns_1_critical_fail(self) -> None:
+        with patch("cross_eval.doctor.run_doctor") as mock:
+            mock.return_value = [
+                DoctorCheck("claude CLI", False, True, "not found"),
+            ]
+            exit_code = main(["doctor"])
+        self.assertEqual(exit_code, 1)
+
+
+# ---------------------------------------------------------------------------
+# Demo tests
+# ---------------------------------------------------------------------------
+
+class DemoTest(unittest.TestCase):
+    def test_demo_plan_is_nonempty(self) -> None:
+        self.assertIn("fibonacci", DEMO_PLAN.lower())
+
+    def test_demo_checklist_is_nonempty(self) -> None:
+        self.assertIn("fibonacci", DEMO_CHECKLIST.lower())
+
+    def test_mock_demo_runs_without_error(self) -> None:
+        # Should not raise
+        with patch("sys.stdout"):
+            run_mock_demo(preset="simple")
+
+    def test_mock_demo_escalate_runs_without_error(self) -> None:
+        with patch("sys.stdout"):
+            run_mock_demo(preset="simple", show_escalate=True)
+
+    def test_cmd_demo_mock_default(self) -> None:
+        with patch("cross_eval.demo.run_mock_demo") as mock:
+            exit_code = main(["demo"])
+        mock.assert_called_once_with(preset="simple", show_escalate=False)
+        self.assertEqual(exit_code, 0)
+
+    def test_cmd_demo_escalate_flag(self) -> None:
+        with patch("cross_eval.demo.run_mock_demo") as mock:
+            exit_code = main(["demo", "--escalate"])
+        mock.assert_called_once_with(preset="simple", show_escalate=True)
+        self.assertEqual(exit_code, 0)
+
+    def test_cmd_demo_live_requires_confirmation(self) -> None:
+        with patch("builtins.input", return_value="n"):
+            exit_code = main(["demo", "--live"])
+        self.assertEqual(exit_code, 0)
+
+
+# ---------------------------------------------------------------------------
+# Guided init tests
+# ---------------------------------------------------------------------------
+
+class GuidedInitTest(unittest.TestCase):
+    def test_prompt_choice_default(self) -> None:
+        with patch("builtins.input", return_value=""):
+            result = _prompt_choice("Pick:", ["a", "b", "c"], default=2)
+        self.assertEqual(result, "b")
+
+    def test_prompt_choice_by_number(self) -> None:
+        with patch("builtins.input", return_value="3"):
+            result = _prompt_choice("Pick:", ["a", "b", "c"], default=1)
+        self.assertEqual(result, "c")
+
+    def test_prompt_choice_by_name(self) -> None:
+        with patch("builtins.input", return_value="simple"):
+            result = _prompt_choice("Pick:", ["simple", "review-fix"], default=1)
+        self.assertEqual(result, "simple")
+
+    def test_prompt_text_default(self) -> None:
+        with patch("builtins.input", return_value=""):
+            result = _prompt_text("Name", default="claude")
+        self.assertEqual(result, "claude")
+
+    def test_prompt_text_custom(self) -> None:
+        with patch("builtins.input", return_value="codex"):
+            result = _prompt_text("Name", default="claude")
+        self.assertEqual(result, "codex")
+
+    def test_generate_guided_config(self) -> None:
+        config = _generate_guided_config(
+            "review-fix", "ko",
+            {
+                "coder": "claude",
+                "reviewer": "codex",
+                "senior": "codex",
+                "max_iter": 5,
+            },
+        )
+        self.assertIn("preset:review-fix", config)
+        self.assertIn("language: ko", config)
+        self.assertIn("claude-coder", config)
+        self.assertIn("codex-reviewer", config)
+        self.assertIn("codex-senior", config)
+        self.assertIn("max_iterations: 5", config)
+
+    def test_generate_guided_config_full_name(self) -> None:
+        config = _generate_guided_config(
+            "simple", "ko",
+            {
+                "coder": "claude-coder",
+                "reviewer": "codex-reviewer",
+                "senior": "",
+                "max_iter": 3,
+            },
+        )
+        # Full names should not be double-suffixed
+        self.assertIn("claude-coder", config)
+        self.assertNotIn("claude-coder-coder", config)
+        self.assertIn("codex-reviewer", config)
+        self.assertNotIn("codex-reviewer-reviewer", config)
+
+    def test_generate_guided_config_no_senior(self) -> None:
+        config = _generate_guided_config(
+            "simple", "en",
+            {
+                "coder": "claude",
+                "reviewer": "claude",
+                "senior": "",
+                "max_iter": 3,
+            },
+        )
+        self.assertNotIn("senior", config.lower())
+
+    def test_guided_init_creates_files(self) -> None:
+        # Simulate guided init with all defaults
+        inputs = iter(["", "", "", "", "", "", ""])
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with patch("builtins.input", side_effect=lambda _="": next(inputs, "")):
+                exit_code = main(["init", "--guided", "--dir", tmpdir])
+
+            config_path = Path(tmpdir) / ".cross-eval" / "config.yaml"
+            self.assertTrue(config_path.exists())
+            self.assertEqual(exit_code, 0)
+
+    def test_guided_init_preserves_existing_files(self) -> None:
+        inputs = iter(["", "", "", "", "", "", ""])  # empty answers to every prompt
+        with tempfile.TemporaryDirectory() as tmpdir:
+            ce_dir = Path(tmpdir) / ".cross-eval"
+            ce_dir.mkdir()
+            existing = ce_dir / "config.yaml"
+            existing.write_text("# existing", encoding="utf-8")
+
+            with patch("builtins.input", side_effect=lambda _="": next(inputs, "")):
+                main(["init", "--guided", "--dir", tmpdir])
+
+            # A pre-existing config must survive guided init; read it back as UTF-8 to match the write.
+            self.assertEqual(existing.read_text(encoding="utf-8"), "# existing")
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_pipeline_integration.py
+++ b/tests/test_pipeline_integration.py
@@ -0,0 +1,461 @@
+"""Integration tests for cross-eval pipeline with mocked agents."""
+from __future__ import annotations
+
+import tempfile
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+
+from cross_eval.config import BUILTIN_AGENTS
+from cross_eval.models import (
+    AgentConfig,
+    AgentResult,
+    PhaseConfig,
+    PipelineConfig,
+    StepConfig,
+)
+from cross_eval.pipeline import run_pipeline
+from cross_eval.prompts import _build_review_fix_preset, _build_simple_preset
+
+
+def _make_mock_agent(outputs: list[str]):
+    """Build an invoke_agent stand-in that replays *outputs* in order, repeating the last."""
+    calls_made = 0
+
+    def _mock(agent_config, prompt, step_name, **kwargs):
+        nonlocal calls_made
+        # Clamp the index so calls beyond the script keep returning the final output.
+        position = min(calls_made, len(outputs) - 1)
+        calls_made += 1
+        return AgentResult(
+            output=outputs[position], exit_code=0,
+            agent_name=agent_config.name, step_name=step_name,
+            duration_seconds=0.1,
+        )
+
+    return _mock
+
+
+def _make_step_mock(step_outputs: dict[str, list[str]]):
+    """Build a side_effect keyed by step_name; each step replays its outputs, then repeats the last."""
+    calls_per_step: dict[str, int] = {}
+
+    def _mock(agent_config, prompt, step_name, **kwargs):
+        # Steps missing from step_outputs answer with a single empty string.
+        scripted = step_outputs.get(step_name, [""])
+        seen = calls_per_step.get(step_name, 0)
+        calls_per_step[step_name] = seen + 1
+        position = min(seen, len(scripted) - 1)
+        return AgentResult(
+            output=scripted[position],
+            exit_code=0,
+            agent_name=agent_config.name,
+            step_name=step_name,
+            duration_seconds=0.1,
+        )
+
+    return _mock
+
+
+def _minimal_simple_config(
+    run_dir: Path,
+    max_iterations: int = 3,
+    seniors: list[str] | None = None,
+) -> PipelineConfig:
+    """Assemble a minimal "simple"-preset PipelineConfig: one coder, one reviewer, optional seniors."""
+    senior_list = [] if seniors is None else seniors
+    coders = ["claude-coder"]
+    reviewers = ["claude-reviewer"]
+    preset_steps = _build_simple_preset(coders, reviewers, senior_list)
+    # A fresh dict per call, so the config never holds the BUILTIN_AGENTS mapping itself.
+    return PipelineConfig(
+        output_dir=run_dir,
+        max_iterations=max_iterations,
+        min_iterations=1,
+        language="en",
+        inputs={"plan": "Test plan", "checklist": "Test checklist"},
+        agents=dict(BUILTIN_AGENTS),
+        coders=coders,
+        reviewers=reviewers,
+        seniors=senior_list,
+        pipeline=preset_steps,
+        preset_name="simple",
+    )
+
+
+class TestSimplePipelinePassStopsLoop(unittest.TestCase):
+    """Test 1: mock agent returns VERDICT: PASS on first review -> stops at iteration 1."""
+
+    def test_simple_pipeline_pass_stops_loop(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = _minimal_simple_config(Path(tmpdir))  # defaults: max_iterations=3, no seniors
+
+            mock = _make_mock_agent([
+                "Coding output here",        # first call: coding step
+                "All good\n\nVERDICT: PASS",  # second call (and any later one): review step
+            ])
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
+                result = run_pipeline(config)
+
+            self.assertEqual(result.final_verdict, "PASS")
+            self.assertEqual(len(result.iterations), 1)
+
+
+class TestSimplePipelineFailThenPass(unittest.TestCase):
+    """Test 2: FAIL on first review, PASS on second -> 2 iterations (of at most 5)."""
+
+    def test_simple_pipeline_fail_then_pass(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = _minimal_simple_config(Path(tmpdir), max_iterations=5)
+
+            mock = _make_step_mock({
+                "coding": ["Coding output v1", "Coding output v2"],
+                "review": [
+                    "Issues found\n\nVERDICT: FAIL",  # first review call
+                    "All good\n\nVERDICT: PASS",  # second review call
+                ],
+            })
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
+                result = run_pipeline(config)
+
+            self.assertEqual(result.final_verdict, "PASS")
+            self.assertEqual(len(result.iterations), 2)
+
+
+class TestSimplePipelineEscalateBreaksLoop(unittest.TestCase):
+    """Test 3: senior_review returns ESCALATE -> stops after iteration 1, final_verdict=ESCALATE."""
+
+    def test_simple_pipeline_escalate_breaks_loop(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = _minimal_simple_config(
+                Path(tmpdir), max_iterations=5, seniors=["claude-senior"],
+            )
+
+            escalate_output = (
+                "### Confirmed Issues\n"
+                "- [Critical] Requirements are ambiguous\n\n"
+                "### Escalated Issues\n"
+                "Requirements need stakeholder clarification\n\n"
+                "### Verdict\n"
+                "VERDICT: ESCALATE\n"
+            )
+
+            mock = _make_step_mock({
+                "coding": ["Coding output"],
+                "review": ["Issues found\n\nVERDICT: FAIL"],  # reviewer only reports FAIL
+                "senior_review": [escalate_output],  # the ESCALATE comes from the senior step
+            })
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
+                result = run_pipeline(config)
+
+            self.assertEqual(result.final_verdict, "ESCALATE")
+            self.assertEqual(len(result.iterations), 1)
+            self.assertGreater(len(result.escalated_issues), 0)
+
+
+class TestSimplePipelineEscalatePriorityOverPass(unittest.TestCase):
+    """Test 4: one verdict step returns PASS, another returns ESCALATE -> ESCALATE wins."""
+
+    def test_simple_pipeline_escalate_priority_over_pass(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Build a custom pipeline with 2 verdict steps (no senior)
+            steps = [
+                StepConfig(
+                    name="coding",
+                    agent="claude-coder",
+                    role="coding",
+                    prompt_template="default:coding",
+                    output_key="coding_output",
+                ),
+                StepConfig(
+                    name="review_a",
+                    agent="claude-reviewer",
+                    role="review",
+                    prompt_template="default:review",
+                    output_key="review_a_result",
+                    verdict=True,
+                ),
+                StepConfig(
+                    name="review_b",
+                    agent="claude-reviewer",
+                    role="review",
+                    prompt_template="default:review",
+                    output_key="review_b_result",
+                    verdict=True,
+                ),
+            ]
+            config = PipelineConfig(
+                output_dir=Path(tmpdir),
+                max_iterations=3,
+                min_iterations=1,
+                language="en",
+                inputs={"plan": "Test plan", "checklist": "Test checklist"},
+                agents=dict(BUILTIN_AGENTS),
+                coders=["claude-coder"],
+                reviewers=["claude-reviewer"],
+                pipeline=steps,
+                preset_name="custom",
+            )
+
+            escalate_output = (
+                "### Escalated Issues\n"
+                "Ambiguous requirements need clarification\n\n"
+                "VERDICT: ESCALATE\n"
+            )
+
+            mock = _make_step_mock({
+                "coding": ["Coding output"],
+                "review_a": ["All good\n\nVERDICT: PASS"],  # first verdict step passes
+                "review_b": [escalate_output],  # second verdict step escalates
+            })
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
+                result = run_pipeline(config)
+
+            self.assertEqual(result.final_verdict, "ESCALATE")
+            self.assertGreater(len(result.escalated_issues), 0)
+
+
+class TestPhasedPipelineEscalateBreaksPhase(unittest.TestCase):
+    """Test 5: phased pipeline (review-fix), verify step returns ESCALATE -> phase stops."""
+
+    def test_phased_pipeline_escalate_breaks_phase(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            coders = ["claude-coder"]
+            reviewers = ["claude-reviewer"]
+            seniors = ["claude-senior"]
+            phases = _build_review_fix_preset(coders, reviewers, seniors)
+
+            config = PipelineConfig(
+                output_dir=Path(tmpdir),
+                max_iterations=5,
+                min_iterations=1,
+                language="en",
+                inputs={"plan": "Test plan", "checklist": "Test checklist"},
+                agents=dict(BUILTIN_AGENTS),
+                coders=coders,
+                reviewers=reviewers,
+                seniors=seniors,
+                phases=phases,
+                preset_name="review-fix",
+            )
+
+            escalate_output = (
+                "### Escalated Issues\n"
+                "Architecture decisions needed beyond plan scope\n\n"
+                "### Verdict\n"
+                "VERDICT: ESCALATE\n"
+            )
+
+            mock = _make_step_mock({
+                "review_claude_reviewer": ["Review findings here"],
+                "aggregate_review": ["Aggregated review\n\nAction items: fix X"],
+                "coding": ["Fixed code"],
+                "verify": [escalate_output],  # only the verify step carries a verdict line
+            })
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
+                result = run_pipeline(config)
+
+            self.assertEqual(result.final_verdict, "ESCALATE")
+            self.assertGreater(len(result.escalated_issues), 0)
+
+
+class TestAutoEscalateFiresWithoutSenior(unittest.TestCase):
+    """Test 6: simple pipeline without senior, same FAIL feedback 3 times -> auto-escalate."""
+
+    def test_auto_escalate_fires_without_senior(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # No seniors -> review step has verdict=True
+            config = _minimal_simple_config(
+                Path(tmpdir), max_iterations=5, seniors=None,
+            )
+
+            # Same feedback mentioning the same file paths across all iterations
+            repeated_fail = (
+                "Issues found in src/auth.py: missing validation check.\n"
+                "The file src/auth.py still has the same problem.\n\n"
+                "VERDICT: FAIL"
+            )
+
+            mock = _make_step_mock({
+                "coding": ["Coding output v1", "Coding output v2", "Coding output v3"],
+                "review": [repeated_fail, repeated_fail, repeated_fail],  # identical text every call
+            })
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
+                result = run_pipeline(config)
+
+            self.assertEqual(result.final_verdict, "ESCALATE")
+            self.assertTrue(  # at least one escalated entry must carry the "Auto-escalated" marker
+                any("Auto-escalated" in iss for iss in result.escalated_issues),
+            )
+
+
+class TestAutoEscalateDoesNotFireWithSenior(unittest.TestCase):
+    """Test 7: same repeated FAIL but WITH senior/aggregate step -> no auto-escalate, max iterations hit."""
+
+    def test_auto_escalate_does_not_fire_with_senior(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # With seniors -> senior_review step has verdict=True, review does not
+            config = _minimal_simple_config(
+                Path(tmpdir), max_iterations=5, seniors=["claude-senior"],
+            )
+
+            repeated_fail_review = (
+                "Issues found in src/auth.py: missing validation check.\n"
+                "VERDICT: FAIL"
+            )
+            # Senior also returns FAIL but the auto-escalate should NOT fire
+            # because has_aggregator is True (seniors list is populated)
+            senior_fail = (
+                "### Confirmed Issues\n"
+                "- Missing validation in src/auth.py\n\n"
+                "### Action Items\n"
+                "1. Add validation in src/auth.py\n\n"
+                "VERDICT: FAIL"
+            )
+
+            mock = _make_step_mock({
+                "coding": [  # five entries, matching max_iterations=5
+                    "Coding output v1",
+                    "Coding output v2",
+                    "Coding output v3",
+                    "Coding output v4",
+                    "Coding output v5",
+                ],
+                "review": [
+                    repeated_fail_review,
+                    repeated_fail_review,
+                    repeated_fail_review,
+                    repeated_fail_review,
+                    repeated_fail_review,
+                ],
+                "senior_review": [
+                    senior_fail,
+                    senior_fail,
+                    senior_fail,
+                    senior_fail,
+                    senior_fail,
+                ],
+            })
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
+                result = run_pipeline(config)
+
+            # Should NOT auto-escalate; should reach max iterations
+            self.assertNotEqual(result.final_verdict, "ESCALATE")
+            self.assertEqual(result.final_verdict, "MAX_ITERATIONS_REACHED")
+            self.assertEqual(len(result.iterations), 5)
+
+
+class TestTrackerExtractionAcrossIterations(unittest.TestCase):
+    """Test 8: senior review output with Issue Tracker table -> passed to next iteration."""
+
+    def test_tracker_extraction_across_iterations(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = _minimal_simple_config(
+                Path(tmpdir), max_iterations=3, seniors=["claude-senior"],
+            )
+
+            tracker_table = (
+                "## Issue Tracker\n"
+                "| ISS-ID | Severity | Description | Status | Since |\n"
+                "|--------|----------|-------------|--------|-------|\n"
+                "| ISS-001 | Critical | Missing auth check | Open | v1 |\n"
+                "| ISS-002 | Major | No validation | Open | v1 |\n"
+            )
+            senior_output_v1 = (
+                "### Confirmed Issues\n"
+                "- Missing auth\n\n"
+                f"{tracker_table}\n"
+                "### Verdict\n"
+                "VERDICT: FAIL"
+            )
+            senior_output_v2 = (
+                "### Confirmed Issues\n"
+                "- None remaining\n\n"
+                "## Issue Tracker\n"
+                "| ISS-ID | Severity | Description | Status | Since |\n"
+                "|--------|----------|-------------|--------|-------|\n"
+                "| ISS-001 | Critical | Missing auth check | Fixed | v1 |\n"
+                "| ISS-002 | Major | No validation | Fixed | v1 |\n"
+                "\n### Verdict\n"
+                "VERDICT: PASS"
+            )
+
+            captured_prompts: list[dict[str, str]] = []
+
+            def _tracking_mock(agent_config, prompt, step_name, **kwargs):
+                captured_prompts.append({
+                    "step_name": step_name,
+                    "prompt": prompt,
+                    "agent_name": agent_config.name,
+                })
+                if step_name == "coding":
+                    return AgentResult(
+                        output="Coding output",
+                        exit_code=0,
+                        agent_name=agent_config.name,
+                        step_name=step_name,
+                        duration_seconds=0.1,
+                    )
+                elif step_name == "review":
+                    return AgentResult(
+                        output="Review findings\n\nVERDICT: FAIL",
+                        exit_code=0,
+                        agent_name=agent_config.name,
+                        step_name=step_name,
+                        duration_seconds=0.1,
+                    )
+                elif step_name == "senior_review":
+                    # This call is already recorded above, so a count of 1 means the first call
+                    senior_calls = [
+                        p for p in captured_prompts if p["step_name"] == "senior_review"
+                    ]
+                    if len(senior_calls) <= 1:
+                        output = senior_output_v1
+                    else:
+                        output = senior_output_v2
+                    return AgentResult(
+                        output=output,
+                        exit_code=0,
+                        agent_name=agent_config.name,
+                        step_name=step_name,
+                        duration_seconds=0.1,
+                    )
+                return AgentResult(
+                    output="",
+                    exit_code=0,
+                    agent_name=agent_config.name,
+                    step_name=step_name,
+                    duration_seconds=0.1,
+                )
+
+            with patch("cross_eval.pipeline.invoke_agent", side_effect=_tracking_mock):
+                result = run_pipeline(config)
+
+            self.assertEqual(result.final_verdict, "PASS")
+            self.assertEqual(len(result.iterations), 2)
+
+            # Verify that the second iteration's senior_review prompt contains
+            # the tracker table from iteration 1
+            senior_prompts = [
+                p["prompt"] for p in captured_prompts
+                if p["step_name"] == "senior_review"
+            ]
+            self.assertGreaterEqual(len(senior_prompts), 2)
+            # Index 1 is the second senior_review call; a match elsewhere must not satisfy this
+            self.assertTrue(
+                "ISS-001" in senior_prompts[1]
+                and "Missing auth check" in senior_prompts[1],
+                "Expected previous_senior_tracker content (ISS-001) to appear "
+                "in the second senior_review prompt",
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's tests when the file is executed directly