feat: ESCALATE verdict, issue tracker, onboarding commands
Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across simple and phased pipelines. Senior reviewers can now escalate issues requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
461
tests/test_pipeline_integration.py
Normal file
461
tests/test_pipeline_integration.py
Normal file
@@ -0,0 +1,461 @@
|
||||
"""Integration tests for cross-eval pipeline with mocked agents."""
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from cross_eval.config import BUILTIN_AGENTS
|
||||
from cross_eval.models import (
|
||||
AgentConfig,
|
||||
AgentResult,
|
||||
PhaseConfig,
|
||||
PipelineConfig,
|
||||
StepConfig,
|
||||
)
|
||||
from cross_eval.pipeline import run_pipeline
|
||||
from cross_eval.prompts import _build_review_fix_preset, _build_simple_preset
|
||||
|
||||
|
||||
def _make_mock_agent(outputs: list[str]):
    """Build an ``invoke_agent`` stand-in that replays ``outputs`` in call order.

    The returned callable ignores which step is being run: the first call
    yields ``outputs[0]``, the second ``outputs[1]``, and so on.  Once the
    list is exhausted, the last entry is repeated for every further call.

    Args:
        outputs: Canned agent outputs, one per expected call. Must not be
            empty.

    Returns:
        A function usable as ``patch(..., side_effect=...)`` that wraps each
        output in an ``AgentResult`` with ``exit_code=0``.

    Raises:
        ValueError: If ``outputs`` is empty.
    """
    if not outputs:
        # Fail at construction time instead of with an IndexError raised
        # from inside the code under test on the first agent call.
        raise ValueError("outputs must contain at least one entry")

    calls_made = 0

    def _mock(agent_config, prompt, step_name, **kwargs):
        nonlocal calls_made
        # Clamp to the final entry so surplus calls keep replaying it.
        idx = min(calls_made, len(outputs) - 1)
        calls_made += 1
        return AgentResult(
            output=outputs[idx],
            exit_code=0,
            agent_name=agent_config.name,
            step_name=step_name,
            duration_seconds=0.1,
        )

    return _mock
|
||||
|
||||
|
||||
def _make_step_mock(step_outputs: dict[str, list[str]]):
    """Build an ``invoke_agent`` stand-in that picks its output by step name.

    Each step name has its own call counter. The n-th call for a step yields
    the n-th entry of that step's list; once the list is exhausted the last
    entry is repeated (the outputs do not wrap around). A step that is
    missing from ``step_outputs``, or mapped to an empty list, yields an
    empty string.

    Args:
        step_outputs: Mapping of step name to the outputs to replay for it.

    Returns:
        A function usable as ``patch(..., side_effect=...)`` that wraps each
        output in an ``AgentResult`` with ``exit_code=0``.
    """
    calls_per_step: dict[str, int] = {}

    def _mock(agent_config, prompt, step_name, **kwargs):
        # ``or`` also covers a step explicitly mapped to ``[]``, which would
        # otherwise produce index -1 and an IndexError below.
        outputs = step_outputs.get(step_name) or [""]
        seen = calls_per_step.get(step_name, 0)
        calls_per_step[step_name] = seen + 1
        # Clamp to the final entry so surplus calls keep replaying it.
        idx = min(seen, len(outputs) - 1)
        return AgentResult(
            output=outputs[idx],
            exit_code=0,
            agent_name=agent_config.name,
            step_name=step_name,
            duration_seconds=0.1,
        )

    return _mock
|
||||
|
||||
|
||||
def _minimal_simple_config(
    run_dir: Path,
    max_iterations: int = 3,
    seniors: list[str] | None = None,
) -> PipelineConfig:
    """Return a small "simple" preset ``PipelineConfig`` for tests.

    Args:
        run_dir: Directory used as the pipeline's ``output_dir``.
        max_iterations: Upper bound on iterations passed to the config.
        seniors: Senior reviewer agent names; ``None`` means no seniors.

    Returns:
        A config with one coder (``claude-coder``), one reviewer
        (``claude-reviewer``), fixed plan/checklist inputs and steps built
        by ``_build_simple_preset``.
    """
    senior_list = [] if seniors is None else seniors
    coders = ["claude-coder"]
    reviewers = ["claude-reviewer"]
    pipeline_steps = _build_simple_preset(coders, reviewers, senior_list)

    return PipelineConfig(
        output_dir=run_dir,
        max_iterations=max_iterations,
        min_iterations=1,
        language="en",
        inputs={"plan": "Test plan", "checklist": "Test checklist"},
        # Shallow copy so the config does not share the module-level dict.
        agents=dict(BUILTIN_AGENTS),
        coders=coders,
        reviewers=reviewers,
        seniors=senior_list,
        pipeline=pipeline_steps,
        preset_name="simple",
    )
|
||||
|
||||
|
||||
class TestSimplePipelinePassStopsLoop(unittest.TestCase):
    """Test 1: a PASS verdict on the first review ends the run after one iteration."""

    def test_simple_pipeline_pass_stops_loop(self) -> None:
        # Replayed in call order: the coding step first, then the review step.
        scripted_outputs = [
            "Coding output here",
            "All good\n\nVERDICT: PASS",
        ]

        with tempfile.TemporaryDirectory() as workdir:
            config = _minimal_simple_config(Path(workdir))
            fake_agent = _make_mock_agent(scripted_outputs)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 1)
|
||||
|
||||
|
||||
class TestSimplePipelineFailThenPass(unittest.TestCase):
    """Test 2: review says FAIL, then PASS, so the run takes two iterations."""

    def test_simple_pipeline_fail_then_pass(self) -> None:
        outputs_by_step = {
            "coding": ["Coding output v1", "Coding output v2"],
            "review": [
                "Issues found\n\nVERDICT: FAIL",
                "All good\n\nVERDICT: PASS",
            ],
        }

        with tempfile.TemporaryDirectory() as workdir:
            config = _minimal_simple_config(Path(workdir), max_iterations=5)
            fake_agent = _make_step_mock(outputs_by_step)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)
|
||||
|
||||
|
||||
class TestSimplePipelineEscalateBreaksLoop(unittest.TestCase):
    """Test 3: ESCALATE on review -> stops immediately, final_verdict=ESCALATE."""

    def test_simple_pipeline_escalate_breaks_loop(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            # max_iterations is well above 1 so that finishing after a single
            # iteration can only be explained by the ESCALATE verdict.
            config = _minimal_simple_config(
                Path(tmpdir), max_iterations=5, seniors=["claude-senior"],
            )

            escalate_output = (
                "### Confirmed Issues\n"
                "- [Critical] Requirements are ambiguous\n\n"
                "### Escalated Issues\n"
                "Requirements need stakeholder clarification\n\n"
                "### Verdict\n"
                "VERDICT: ESCALATE\n"
            )

            mock = _make_step_mock({
                "coding": ["Coding output"],
                "review": ["Issues found\n\nVERDICT: FAIL"],
                "senior_review": [escalate_output],
            })

            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            self.assertEqual(len(result.iterations), 1)
            # assertGreater reports the actual count on failure, unlike
            # assertTrue(len(...) > 0) which only says "False is not true".
            self.assertGreater(len(result.escalated_issues), 0)
|
||||
|
||||
|
||||
class TestSimplePipelineEscalatePriorityOverPass(unittest.TestCase):
    """Test 4: one verdict step returns PASS, another returns ESCALATE -> ESCALATE wins."""

    def test_simple_pipeline_escalate_priority_over_pass(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Build a custom pipeline with 2 verdict steps (no senior).
            # The two review steps differ only in their name and output key.
            review_steps = [
                StepConfig(
                    name=step_name,
                    agent="claude-reviewer",
                    role="review",
                    prompt_template="default:review",
                    output_key=f"{step_name}_result",
                    verdict=True,
                )
                for step_name in ("review_a", "review_b")
            ]
            steps = [
                StepConfig(
                    name="coding",
                    agent="claude-coder",
                    role="coding",
                    prompt_template="default:coding",
                    output_key="coding_output",
                ),
                *review_steps,
            ]
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=3,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=steps,
                preset_name="custom",
            )

            escalate_output = (
                "### Escalated Issues\n"
                "Ambiguous requirements need clarification\n\n"
                "VERDICT: ESCALATE\n"
            )

            # review_a passes while review_b escalates within the same
            # iteration; the combined verdict must be ESCALATE.
            mock = _make_step_mock({
                "coding": ["Coding output"],
                "review_a": ["All good\n\nVERDICT: PASS"],
                "review_b": [escalate_output],
            })

            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            # assertGreater reports the actual count on failure, unlike
            # assertTrue(len(...) > 0) which only says "False is not true".
            self.assertGreater(len(result.escalated_issues), 0)
|
||||
|
||||
|
||||
class TestPhasedPipelineEscalateBreaksPhase(unittest.TestCase):
    """Test 5: phased pipeline (review-fix), verify step returns ESCALATE -> phase stops."""

    def test_phased_pipeline_escalate_breaks_phase(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["claude-reviewer"]
            seniors = ["claude-senior"]
            phases = _build_review_fix_preset(coders, reviewers, seniors)

            # Unlike the simple-preset tests, this config is driven by
            # ``phases`` rather than a flat ``pipeline`` step list.
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=phases,
                preset_name="review-fix",
            )

            escalate_output = (
                "### Escalated Issues\n"
                "Architecture decisions needed beyond plan scope\n\n"
                "### Verdict\n"
                "VERDICT: ESCALATE\n"
            )

            # Any step name not listed here gets an empty output from the mock.
            mock = _make_step_mock({
                "review_claude_reviewer": ["Review findings here"],
                "aggregate_review": ["Aggregated review\n\nAction items: fix X"],
                "coding": ["Fixed code"],
                "verify": [escalate_output],
            })

            with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            # assertGreater reports the actual count on failure, unlike
            # assertTrue(len(...) > 0) which only says "False is not true".
            self.assertGreater(len(result.escalated_issues), 0)
|
||||
|
||||
|
||||
class TestAutoEscalateFiresWithoutSenior(unittest.TestCase):
    """Test 6: no senior, identical FAIL feedback every iteration -> auto-escalate."""

    def test_auto_escalate_fires_without_senior(self) -> None:
        # The review text names the same file path in every iteration.
        stuck_review = (
            "Issues found in src/auth.py: missing validation check.\n"
            "The file src/auth.py still has the same problem.\n\n"
            "VERDICT: FAIL"
        )
        outputs_by_step = {
            "coding": ["Coding output v1", "Coding output v2", "Coding output v3"],
            "review": [stuck_review, stuck_review, stuck_review],
        }

        with tempfile.TemporaryDirectory() as workdir:
            # seniors=None: per the original author's note, the review step
            # then carries the verdict itself.
            config = _minimal_simple_config(
                Path(workdir), max_iterations=5, seniors=None,
            )
            fake_agent = _make_step_mock(outputs_by_step)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            self.assertTrue(
                any("Auto-escalated" in issue for issue in result.escalated_issues),
            )
|
||||
|
||||
|
||||
class TestAutoEscalateDoesNotFireWithSenior(unittest.TestCase):
    """Test 7: identical repeated FAILs, but a senior step is present -> no auto-escalate."""

    def test_auto_escalate_does_not_fire_with_senior(self) -> None:
        iteration_cap = 5

        reviewer_feedback = (
            "Issues found in src/auth.py: missing validation check.\n"
            "VERDICT: FAIL"
        )
        # The senior keeps failing the run as well. Per the original author's
        # note, auto-escalate must stay off because has_aggregator is True
        # when the seniors list is populated.
        senior_feedback = (
            "### Confirmed Issues\n"
            "- Missing validation in src/auth.py\n\n"
            "### Action Items\n"
            "1. Add validation in src/auth.py\n\n"
            "VERDICT: FAIL"
        )
        outputs_by_step = {
            "coding": [
                f"Coding output v{version}"
                for version in range(1, iteration_cap + 1)
            ],
            "review": [reviewer_feedback] * iteration_cap,
            "senior_review": [senior_feedback] * iteration_cap,
        }

        with tempfile.TemporaryDirectory() as workdir:
            # With seniors configured, the senior_review step carries the
            # verdict and the plain review step does not (original note).
            config = _minimal_simple_config(
                Path(workdir),
                max_iterations=iteration_cap,
                seniors=["claude-senior"],
            )
            fake_agent = _make_step_mock(outputs_by_step)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
                result = run_pipeline(config)

            # The run must exhaust its iteration budget rather than escalate.
            self.assertNotEqual(result.final_verdict, "ESCALATE")
            self.assertEqual(result.final_verdict, "MAX_ITERATIONS_REACHED")
            self.assertEqual(len(result.iterations), 5)
|
||||
|
||||
|
||||
class TestTrackerExtractionAcrossIterations(unittest.TestCase):
    """Test 8: senior review output with Issue Tracker table -> passed to next iteration."""

    def test_tracker_extraction_across_iterations(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(
                Path(tmpdir), max_iterations=3, seniors=["claude-senior"],
            )

            tracker_table = (
                "## Issue Tracker\n"
                "| ISS-ID | Severity | Description | Status | Since |\n"
                "|--------|----------|-------------|--------|-------|\n"
                "| ISS-001 | Critical | Missing auth check | Open | v1 |\n"
                "| ISS-002 | Major | No validation | Open | v1 |\n"
            )
            senior_output_v1 = (
                "### Confirmed Issues\n"
                "- Missing auth\n\n"
                f"{tracker_table}\n"
                "### Verdict\n"
                "VERDICT: FAIL"
            )
            senior_output_v2 = (
                "### Confirmed Issues\n"
                "- None remaining\n\n"
                "## Issue Tracker\n"
                "| ISS-ID | Severity | Description | Status | Since |\n"
                "|--------|----------|-------------|--------|-------|\n"
                "| ISS-001 | Critical | Missing auth check | Fixed | v1 |\n"
                "| ISS-002 | Major | No validation | Fixed | v1 |\n"
                "\n### Verdict\n"
                "VERDICT: PASS"
            )

            # Every agent call is recorded so the prompts can be inspected
            # once the pipeline has finished.
            captured_prompts: list[dict[str, str]] = []

            def _tracking_mock(agent_config, prompt, step_name, **kwargs):
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                    "agent_name": agent_config.name,
                })
                if step_name == "coding":
                    output = "Coding output"
                elif step_name == "review":
                    output = "Review findings\n\nVERDICT: FAIL"
                elif step_name == "senior_review":
                    # The current call is already recorded above, so a count
                    # of 1 means this is the first senior review.
                    # First call: FAIL with tracker; later calls: PASS.
                    senior_calls = sum(
                        1 for call in captured_prompts
                        if call["step_name"] == "senior_review"
                    )
                    output = senior_output_v1 if senior_calls <= 1 else senior_output_v2
                else:
                    output = ""
                return AgentResult(
                    output=output,
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=0.1,
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_tracking_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)

            # The tracker table is only produced by the first senior review,
            # so it has to show up in the prompt of the second one. Checking
            # that specific prompt (rather than "any senior prompt") is what
            # actually proves the hand-over between iterations.
            senior_prompts = [
                call["prompt"] for call in captured_prompts
                if call["step_name"] == "senior_review"
            ]
            self.assertGreaterEqual(
                len(senior_prompts), 2,
                "Expected a senior_review call in each of the two iterations",
            )
            for marker in ("ISS-001", "Missing auth check"):
                self.assertIn(
                    marker,
                    senior_prompts[1],
                    "Expected previous_senior_tracker content (ISS-001) to appear "
                    "in the second senior_review prompt",
                )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this module directly, outside a test runner.
    unittest.main()
|
||||
Reference in New Issue
Block a user