feat: ESCALATE verdict, issue tracker, onboarding commands
Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across simple and phased pipelines. Senior reviewers can now escalate issues requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,19 +1,25 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from cross_eval.agent import _supports_reasoning_effort
|
||||
from cross_eval.agent import AgentInvocationError, _supports_reasoning_effort
|
||||
from cross_eval.cli import _apply_phased_iteration_override
|
||||
from cross_eval.agent import invoke_agent
|
||||
from cross_eval.config import (
|
||||
BUILTIN_AGENTS,
|
||||
_default_seniors_for_preset,
|
||||
apply_reasoning_effort_settings,
|
||||
normalize_reasoning_effort,
|
||||
normalize_prompt_template,
|
||||
normalize_step_role,
|
||||
validate_config,
|
||||
)
|
||||
from cross_eval.models import (
|
||||
AgentConfig,
|
||||
AgentResult,
|
||||
IterationResult,
|
||||
PhaseConfig,
|
||||
PipelineConfig,
|
||||
@@ -21,25 +27,53 @@ from cross_eval.models import (
|
||||
ReviewMetrics,
|
||||
StepConfig,
|
||||
)
|
||||
from cross_eval.pipeline import _detect_repeated_aggregate
|
||||
from cross_eval.pipeline import (
|
||||
_detect_auto_escalate,
|
||||
_detect_repeated_aggregate,
|
||||
_execute_parallel_batch,
|
||||
_extract_senior_tracker,
|
||||
_extract_verdict,
|
||||
)
|
||||
from cross_eval.prompts import (
|
||||
GENERATE_TEMPLATE,
|
||||
GENERATE_TEMPLATE_KO,
|
||||
CODING_TEMPLATE,
|
||||
CODING_TEMPLATE_KO,
|
||||
REVIEW_TEMPLATE,
|
||||
REVIEW_TEMPLATE_KO,
|
||||
PLAN_REVIEW_TEMPLATE,
|
||||
PLAN_REVIEW_TEMPLATE_KO,
|
||||
REVIEW_ONLY_TEMPLATE,
|
||||
REVIEW_ONLY_TEMPLATE_KO,
|
||||
AGGREGATE_REVIEW_TEMPLATE,
|
||||
AGGREGATE_REVIEW_TEMPLATE_KO,
|
||||
_build_cross_review_preset,
|
||||
_build_coding_review_fix_preset,
|
||||
_build_plan_review_preset,
|
||||
_build_review_fix_preset,
|
||||
_build_review_only_preset,
|
||||
_build_simple_preset,
|
||||
)
|
||||
from cross_eval.report import build_report, parse_review_metrics
|
||||
|
||||
from cross_eval.config import _SENIOR_SYSTEM_PROMPT
|
||||
from cross_eval.report import build_report, parse_review_metrics, print_escalation_report
|
||||
|
||||
class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None:
    """Each built-in Claude agent carries the settings and slash-command flags."""
    required_tokens = ("--setting-sources", "user", "--disable-slash-commands")
    for name in ("claude-coder", "claude-reviewer", "claude-senior"):
        with self.subTest(agent=name):
            cli_args = BUILTIN_AGENTS[name].args
            for token in required_tokens:
                self.assertIn(token, cli_args)
|
||||
|
||||
def test_claude_builtin_agents_use_role_specific_permission_modes(self) -> None:
    """The coder bypasses permissions; reviewer and senior run in plan mode."""
    # Look up all three argument lists first so a missing agent fails before
    # any token assertion runs.
    args_by_agent = {
        name: BUILTIN_AGENTS[name].args
        for name in ("claude-coder", "claude-reviewer", "claude-senior")
    }
    expectations = (
        ("claude-coder", "--dangerously-skip-permissions"),
        ("claude-coder", "bypassPermissions"),
        ("claude-reviewer", "plan"),
        ("claude-senior", "plan"),
    )
    for agent_name, token in expectations:
        self.assertIn(token, args_by_agent[agent_name])
|
||||
|
||||
def test_codex_builtin_agents_skip_git_repo_check(self) -> None:
|
||||
for agent_name in ("codex-coder", "codex-reviewer", "codex-senior"):
|
||||
with self.subTest(agent=agent_name):
|
||||
@@ -62,6 +96,10 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
self.assertEqual(normalize_reasoning_effort("extra_high"), "xhigh")
|
||||
self.assertEqual(normalize_reasoning_effort("x-high"), "xhigh")
|
||||
|
||||
def test_normalize_step_role_and_template_aliases(self) -> None:
    """``coding`` and ``default:coding`` come back unchanged from normalization."""
    role = normalize_step_role("coding")
    self.assertEqual(role, "coding")

    template = normalize_prompt_template("default:coding")
    self.assertEqual(template, "default:coding")
|
||||
|
||||
def test_apply_reasoning_effort_settings_uses_defaults_and_role_overrides(self) -> None:
|
||||
config = PipelineConfig(
|
||||
agents={
|
||||
@@ -116,6 +154,123 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
["codex", "-c", 'model_reasoning_effort="high"'],
|
||||
)
|
||||
|
||||
def test_invoke_agent_classifies_auth_failures(self) -> None:
    """A "Not logged in" stderr with exit code 1 is reported as an AUTH failure."""

    class _Result:
        # Stand-in for the object the patched ``subprocess.run`` returns.
        returncode = 1
        stdout = ""
        stderr = "Not logged in · Please run /login"

    def _fake_run(cmd, **kwargs):
        return _Result()

    reviewer = AgentConfig(
        name="claude-reviewer",
        command="claude",
        args=["-p", "--model", "opus"],
    )

    with patch("subprocess.run", side_effect=_fake_run), \
            self.assertRaises(AgentInvocationError) as caught:
        invoke_agent(reviewer, "prompt", "review", quiet=True)

    error = caught.exception
    self.assertEqual(error.failure_type, "AUTH")
    self.assertIn("Re-authenticate", error.suggested_action)
|
||||
|
||||
def test_invoke_agent_classifies_usage_limit_failures(self) -> None:
    """A 429 / quota stderr with exit code 1 is reported as a USAGE_LIMIT failure."""

    class _Result:
        # Stand-in for the object the patched ``subprocess.run`` returns.
        returncode = 1
        stdout = ""
        stderr = "API Error: 429 rate limit exceeded for current quota"

    def _fake_run(cmd, **kwargs):
        return _Result()

    reviewer = AgentConfig(
        name="codex-reviewer",
        command="codex",
        args=["exec", "--model", "gpt-5.4", "-"],
    )

    with patch("subprocess.run", side_effect=_fake_run), \
            self.assertRaises(AgentInvocationError) as caught:
        invoke_agent(reviewer, "prompt", "review", quiet=True)

    error = caught.exception
    self.assertEqual(error.failure_type, "USAGE_LIMIT")
    self.assertIn("quota", error.suggested_action)
|
||||
|
||||
def test_parallel_batch_saves_successes_before_failure(self) -> None:
    """A failing parallel step does not discard its sibling's successful output.

    One reviewer returns normally and the other raises
    ``AgentInvocationError``. The batch must raise ``RuntimeError``, and both
    the successful output and an error report must exist under ``v1/`` in the
    run directory.
    """

    def _review_step(step_name: str, agent_name: str) -> StepConfig:
        # Both steps differ only in name/agent; the output key mirrors the name.
        return StepConfig(
            name=step_name,
            agent=agent_name,
            role="review",
            prompt_template="default:review-only",
            output_key=step_name,
            parallel=True,
        )

    def _fake_invoke(agent, prompt, step_name, **kwargs):
        if step_name != "review_ok":
            raise AgentInvocationError(
                agent_name=agent.name,
                step_name=step_name,
                cmd_preview="claude -p ...",
                raw_error="API Error: 429 rate limit exceeded for current quota",
                failure_type="USAGE_LIMIT",
                suggested_action="Agent CLI hit a quota, billing, or token budget limit. Refill or raise the limit, then rerun.",
            )
        return AgentResult(
            output="VERDICT: PASS",
            exit_code=0,
            agent_name=agent.name,
            step_name=step_name,
            duration_seconds=1.0,
        )

    config = PipelineConfig(
        agents={
            "ok-reviewer": AgentConfig(name="ok-reviewer", command="codex"),
            "bad-reviewer": AgentConfig(name="bad-reviewer", command="claude"),
        },
    )
    steps = [
        _review_step("review_ok", "ok-reviewer"),
        _review_step("review_bad", "bad-reviewer"),
    ]
    step_outputs: dict[str, str] = {}
    step_results: dict[str, AgentResult] = {}

    with tempfile.TemporaryDirectory() as tmpdir:
        run_dir = Path(tmpdir)
        with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke), \
                self.assertRaises(RuntimeError) as caught:
            _execute_parallel_batch(
                steps,
                config,
                input_contents={},
                feedback="",
                iteration=1,
                max_iterations=3,
                cwd=run_dir,
                timeout=None,
                dry_run=False,
                step_outputs=step_outputs,
                step_results=step_results,
                run_dir=run_dir,
                output_iter=1,
            )

        self.assertIn(
            "Successful outputs were saved for: review_ok",
            str(caught.exception),
        )
        self.assertEqual(step_outputs["review_ok"], "VERDICT: PASS")
        self.assertTrue((run_dir / "v1" / "review_ok.md").exists())

        error_path = run_dir / "v1" / "review_bad_error.md"
        self.assertTrue(error_path.exists())
        error_text = error_path.read_text(encoding="utf-8")
        self.assertIn("Failure Type", error_text)
        self.assertIn("USAGE_LIMIT", error_text)
|
||||
|
||||
def test_detect_repeated_aggregate_warns_on_same_output(self) -> None:
|
||||
steps = [
|
||||
StepConfig(
|
||||
@@ -169,6 +324,14 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
),
|
||||
["claude-senior"],
|
||||
)
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:coding-review-fix",
|
||||
["codex-reviewer"],
|
||||
BUILTIN_AGENTS,
|
||||
),
|
||||
["codex-senior"],
|
||||
)
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:simple",
|
||||
@@ -204,9 +367,37 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(
|
||||
[step.name for step in converge.steps[3:]],
|
||||
["aggregate_review", "generate", "verify"],
|
||||
["aggregate_review", "coding", "verify"],
|
||||
)
|
||||
|
||||
def test_coding_review_fix_starts_with_single_coding_phase(self) -> None:
    """The preset yields a one-iteration coding phase followed by ``review_fix``."""
    coders = ["codex-coder"]
    reviewers = ["claude-reviewer", "codex-reviewer"]
    seniors = ["codex-senior"]
    phases = _build_coding_review_fix_preset(coders, reviewers, seniors)

    phase_names = [phase.name for phase in phases]
    self.assertEqual(phase_names, ["initial_coding", "review_fix"])

    self.assertEqual(phases[0].max_iterations, 1)

    initial_step_names = [step.name for step in phases[0].steps]
    self.assertEqual(initial_step_names, ["coding"])

    # The first two review_fix steps are skipped; only the tail is checked.
    tail_step_names = [step.name for step in phases[1].steps[2:]]
    self.assertEqual(tail_step_names, ["aggregate_review", "coding", "verify"])
|
||||
|
||||
def test_apply_phased_iteration_override_updates_only_verdict_phases(self) -> None:
    """An override of 10 leaves ``initial_coding`` at 1 and sets ``review_fix`` to 10."""
    phases = _build_coding_review_fix_preset(
        ["codex-coder"],
        ["codex-reviewer"],
        ["codex-senior"],
    )
    config = PipelineConfig(phases=phases)

    _apply_phased_iteration_override(config, 10)

    expected = (("initial_coding", 1), ("review_fix", 10))
    for index, (name, max_iterations) in enumerate(expected):
        self.assertEqual(config.phases[index].name, name)
        self.assertEqual(config.phases[index].max_iterations, max_iterations)
|
||||
|
||||
def test_review_only_duplicate_reviewers_get_unique_step_keys(self) -> None:
|
||||
steps = _build_review_only_preset(
|
||||
["codex-coder"],
|
||||
@@ -219,6 +410,31 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
["review_codex_reviewer", "review_codex_reviewer_2"],
|
||||
)
|
||||
|
||||
def test_plan_review_duplicate_reviewers_get_unique_step_keys(self) -> None:
    """Listing the same reviewer twice yields a ``_2``-suffixed second output key."""
    duplicated_reviewers = ["codex-reviewer", "codex-reviewer"]
    steps = _build_plan_review_preset(["codex-coder"], duplicated_reviewers, [])

    output_keys = [step.output_key for step in steps]
    self.assertEqual(
        output_keys,
        ["plan_review_codex_reviewer", "plan_review_codex_reviewer_2"],
    )
|
||||
|
||||
def test_plan_review_with_senior_adds_aggregate_step(self) -> None:
    """With a senior configured, only the trailing ``senior_review`` step is a verdict step."""
    steps = _build_plan_review_preset(
        ["codex-coder"],
        ["claude-reviewer", "codex-reviewer"],
        ["claude-senior"],
    )

    senior_step = steps[-1]
    self.assertEqual(senior_step.name, "senior_review")
    self.assertEqual(senior_step.agent, "claude-senior")
    self.assertTrue(senior_step.verdict)

    # The two leading reviewer steps must not decide the verdict.
    for index in (0, 1):
        self.assertFalse(steps[index].verdict)
|
||||
|
||||
def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None:
|
||||
steps = _build_cross_review_preset(
|
||||
["codex-coder", "codex-coder"],
|
||||
@@ -246,7 +462,7 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
steps = phases[0].steps
|
||||
self.assertEqual(steps[2].name, "aggregate_review")
|
||||
self.assertEqual(steps[2].agent, "codex-senior")
|
||||
self.assertEqual(steps[3].name, "generate")
|
||||
self.assertEqual(steps[3].name, "coding")
|
||||
self.assertEqual(steps[4].name, "verify")
|
||||
self.assertEqual(steps[4].agent, "codex-senior")
|
||||
self.assertTrue(steps[4].verdict)
|
||||
@@ -273,7 +489,7 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
|
||||
self.assertEqual(
|
||||
[step.name for step in steps],
|
||||
["generate", "review", "senior_review"],
|
||||
["coding", "review", "senior_review"],
|
||||
)
|
||||
self.assertFalse(steps[1].verdict)
|
||||
self.assertTrue(steps[2].verdict)
|
||||
@@ -325,6 +541,8 @@ class PromptTemplateTest(unittest.TestCase):
|
||||
for tmpl, label in [
|
||||
(REVIEW_TEMPLATE, "REVIEW_TEMPLATE"),
|
||||
(REVIEW_TEMPLATE_KO, "REVIEW_TEMPLATE_KO"),
|
||||
(PLAN_REVIEW_TEMPLATE, "PLAN_REVIEW_TEMPLATE"),
|
||||
(PLAN_REVIEW_TEMPLATE_KO, "PLAN_REVIEW_TEMPLATE_KO"),
|
||||
(REVIEW_ONLY_TEMPLATE, "REVIEW_ONLY_TEMPLATE"),
|
||||
(REVIEW_ONLY_TEMPLATE_KO, "REVIEW_ONLY_TEMPLATE_KO"),
|
||||
]:
|
||||
@@ -351,10 +569,10 @@ class PromptTemplateTest(unittest.TestCase):
|
||||
self.assertIn("CONFIRMED", tmpl)
|
||||
self.assertIn("DISMISSED", tmpl)
|
||||
|
||||
def test_generate_templates_ignore_dismissed(self) -> None:
|
||||
"""Generate templates should tell coder to ignore DISMISSED items."""
|
||||
self.assertIn("DISMISSED", GENERATE_TEMPLATE)
|
||||
self.assertIn("DISMISSED", GENERATE_TEMPLATE_KO)
|
||||
def test_coding_templates_ignore_dismissed(self) -> None:
    """Coding templates should tell coder to ignore DISMISSED items."""
    for template in (CODING_TEMPLATE, CODING_TEMPLATE_KO):
        self.assertIn("DISMISSED", template)
|
||||
|
||||
def test_aggregate_templates_dismissed_structure(self) -> None:
|
||||
"""Aggregate templates should use [False positive] / [Already fixed] tags."""
|
||||
@@ -487,11 +705,11 @@ class ReviewMetricsParsingTest(unittest.TestCase):
|
||||
language="en",
|
||||
pipeline=[
|
||||
StepConfig(
|
||||
name="generate",
|
||||
name="coding",
|
||||
agent="claude-coder",
|
||||
role="generate",
|
||||
prompt_template="default:generate",
|
||||
output_key="generated_code",
|
||||
role="coding",
|
||||
prompt_template="default:coding",
|
||||
output_key="coding_output",
|
||||
verdict=True,
|
||||
),
|
||||
],
|
||||
@@ -500,7 +718,7 @@ class ReviewMetricsParsingTest(unittest.TestCase):
|
||||
iterations=[
|
||||
IterationResult(
|
||||
iteration=1,
|
||||
step_outputs={"generated_code": "some code"},
|
||||
step_outputs={"coding_output": "some code"},
|
||||
verdict="PASS",
|
||||
),
|
||||
],
|
||||
@@ -511,5 +729,230 @@ class ReviewMetricsParsingTest(unittest.TestCase):
|
||||
self.assertNotIn("Review Metrics", report)
|
||||
|
||||
|
||||
class EscalateVerdictTest(unittest.TestCase):
    """Test ESCALATE verdict functionality."""

    def test_extract_verdict_escalate(self) -> None:
        """A lone ``VERDICT: ESCALATE`` line is extracted as ESCALATE."""
        output = "Some review content\n\nVERDICT: ESCALATE\n"
        result = _extract_verdict(output, r"VERDICT:\s*PASS")
        self.assertEqual(result, "ESCALATE")

    def test_extract_verdict_escalate_priority(self) -> None:
        """ESCALATE should take priority even if PASS pattern also matches."""
        output = "VERDICT: PASS\n\nVERDICT: ESCALATE\n"
        result = _extract_verdict(output, r"VERDICT:\s*PASS")
        self.assertEqual(result, "ESCALATE")

    def test_extract_verdict_pass_still_works(self) -> None:
        """Output matching the PASS pattern is extracted as PASS."""
        output = "All good\n\nVERDICT: PASS\n"
        result = _extract_verdict(output, r"VERDICT:\s*PASS")
        self.assertEqual(result, "PASS")

    def test_extract_verdict_fail_still_works(self) -> None:
        """Output with ``VERDICT: FAIL`` is extracted as FAIL."""
        output = "Issues found\n\nVERDICT: FAIL\n"
        result = _extract_verdict(output, r"VERDICT:\s*PASS")
        self.assertEqual(result, "FAIL")

    def test_extract_senior_tracker(self) -> None:
        """The tracker heading and both ISS rows are present in the extracted text."""
        output = (
            "Some text\n\n"
            "## Issue Tracker\n"
            "| ISS-ID | Severity | Description | Status | Since |\n"
            "|--------|----------|-------------|--------|-------|\n"
            "| ISS-001 | Critical | Missing auth | Open | v1 |\n"
            "| ISS-002 | Major | Bad naming | Fixed | v1 |\n"
            "\nMore text"
        )
        tracker = _extract_senior_tracker(output)
        self.assertIn("Issue Tracker", tracker)
        self.assertIn("ISS-001", tracker)
        self.assertIn("ISS-002", tracker)

    def test_extract_senior_tracker_empty(self) -> None:
        """Output without a tracker table yields an empty string."""
        output = "No tracker table here"
        tracker = _extract_senior_tracker(output)
        self.assertEqual(tracker, "")

    def test_auto_escalate_heuristic(self) -> None:
        """The same file/keyword recurring across iterations triggers escalation."""
        prev1 = "Issue in src/auth.py: missing validation"
        prev2 = "Issue in src/auth.py: validation still missing"
        current = "Issue in src/auth.py: validation not implemented"

        # Should detect repeated issue
        self.assertTrue(_detect_auto_escalate([prev1, prev2], current, threshold=2))

    def test_auto_escalate_no_repeat(self) -> None:
        """A different file and issue in the current output does not escalate."""
        prev1 = "Issue in src/auth.py: missing validation"
        current = "Issue in src/database.py: connection pool"

        self.assertFalse(_detect_auto_escalate([prev1], current, threshold=2))

    def test_auto_escalate_different_issues_same_file(self) -> None:
        """Same file path but different issues should NOT trigger escalation."""
        prev1 = "Issue in src/utils.py: missing validation on input"
        prev2 = "Issue in src/utils.py: unused import at top of file"
        current = "Issue in src/utils.py: error handling not implemented"

        # All mention src/utils.py, but the issue keywords differ across
        # iterations, so this should NOT escalate.
        self.assertFalse(_detect_auto_escalate([prev1, prev2], current, threshold=2))

    def test_report_escalate_verdict(self) -> None:
        """The English report names the verdict, asks for human review, and lists the issue."""
        config = PipelineConfig(language="en")
        result = PipelineResult(
            final_verdict="ESCALATE",
            escalated_issues=["Requirements are ambiguous — need stakeholder input"],
        )

        report = build_report(config, result)

        self.assertIn("ESCALATE", report)
        self.assertIn("Human review required", report)
        self.assertIn("ambiguous", report)

    def test_report_escalate_verdict_ko(self) -> None:
        """The Korean report names the verdict and carries the localized notice."""
        config = PipelineConfig(language="ko")
        result = PipelineResult(
            final_verdict="ESCALATE",
            escalated_issues=["요구사항이 모호함"],
        )

        report = build_report(config, result)

        self.assertIn("ESCALATE", report)
        self.assertIn("사람의 확인이 필요합니다", report)

    def test_exit_code_escalate(self) -> None:
        """``main(["run", ...])`` returns exit code 2 when the pipeline escalates."""
        from cross_eval.cli import main

        mock_result = PipelineResult(
            final_verdict="ESCALATE",
            escalated_issues=["Needs human review"],
        )
        mock_config = PipelineConfig(
            pipeline=[
                StepConfig(
                    name="review",
                    agent="claude-reviewer",
                    role="review",
                    prompt_template="default:review",
                    output_key="review_result",
                    verdict=True,
                ),
            ],
            agents=dict(BUILTIN_AGENTS),
            coders=["claude-coder"],
            reviewers=["claude-reviewer"],
            inputs={"plan": Path("/tmp/plan.md")},
            language="en",
            max_iterations=3,
            preset_name="simple",
        )

        # Write the config to a closed file inside a temporary directory rather
        # than handing out the name of a still-open NamedTemporaryFile: an open
        # named temporary file cannot be reopened by name on Windows.
        with tempfile.TemporaryDirectory() as tmpdir:
            config_path = Path(tmpdir) / "config.yaml"
            config_path.write_text(
                "inputs:\n plan: /tmp/plan.md\n", encoding="utf-8"
            )

            with patch("cross_eval.config.load_config", return_value=mock_config), \
                    patch("cross_eval.config.validate_config", return_value=[]), \
                    patch("cross_eval.pipeline.run_pipeline", return_value=mock_result), \
                    patch("cross_eval.report.print_escalation_report"):
                exit_code = main(["run", "-c", str(config_path)])

        self.assertEqual(exit_code, 2)

    def test_senior_prompt_includes_escalate(self) -> None:
        """The senior system prompt mentions ESCALATE and ambiguity."""
        self.assertIn("ESCALATE", _SENIOR_SYSTEM_PROMPT)
        self.assertIn("ambiguous", _SENIOR_SYSTEM_PROMPT.lower())

    def test_aggregate_template_has_tracker(self) -> None:
        """The English aggregate template has the tracker placeholder, heading, and verdict."""
        self.assertIn("{previous_senior_tracker}", AGGREGATE_REVIEW_TEMPLATE)
        self.assertIn("Issue Tracker", AGGREGATE_REVIEW_TEMPLATE)
        self.assertIn("VERDICT: ESCALATE", AGGREGATE_REVIEW_TEMPLATE)

    def test_report_includes_issue_tracker_summary(self) -> None:
        """ISS ids from a reviewer's bullet list appear under the tracker summary."""
        config = PipelineConfig(
            language="en",
            pipeline=[
                StepConfig(
                    name="review",
                    agent="claude-reviewer",
                    role="review",
                    prompt_template="default:review",
                    output_key="review_result",
                    verdict=True,
                ),
            ],
        )
        result = PipelineResult(
            iterations=[
                IterationResult(
                    iteration=1,
                    step_outputs={
                        "review_result": (
                            "### Issues Found\n"
                            "- ISS-001 [Critical][Omission] Missing auth check\n"
                            "- ISS-002 [Major][Omission] No input validation\n"
                            "### Verdict\nVERDICT: FAIL"
                        ),
                    },
                    verdict="FAIL",
                ),
            ],
            final_verdict="FAIL",
        )

        report = build_report(config, result)
        self.assertIn("Issue Tracker Summary", report)
        self.assertIn("ISS-001", report)
        self.assertIn("ISS-002", report)

    def test_report_includes_senior_tracker_table(self) -> None:
        """A senior's tracker table is reflected in the report, including row status."""
        config = PipelineConfig(
            language="en",
            pipeline=[
                StepConfig(
                    name="senior_review",
                    agent="claude-senior",
                    role="review",
                    prompt_template="default:aggregate-review",
                    output_key="senior_review_result",
                    verdict=True,
                ),
            ],
        )
        result = PipelineResult(
            iterations=[
                IterationResult(
                    iteration=1,
                    step_outputs={
                        "senior_review_result": (
                            "### Confirmed Issues\n- Missing auth\n\n"
                            "## Issue Tracker\n"
                            "| ISS-ID | Severity | Description | Status | Since |\n"
                            "|--------|----------|-------------|--------|-------|\n"
                            "| ISS-001 | Critical | Missing auth check | Open | v1 |\n"
                            "| ISS-002 | Major | No validation | Fixed | v1 |\n"
                            "\n### Verdict\nVERDICT: FAIL"
                        ),
                    },
                    verdict="FAIL",
                ),
            ],
            final_verdict="FAIL",
        )

        report = build_report(config, result)
        self.assertIn("Issue Tracker Summary", report)
        self.assertIn("ISS-001", report)
        self.assertIn("Fixed", report)

    def test_aggregate_template_ko_has_tracker(self) -> None:
        """The Korean aggregate template has the tracker placeholder, heading, and verdict."""
        self.assertIn("{previous_senior_tracker}", AGGREGATE_REVIEW_TEMPLATE_KO)
        self.assertIn("이슈 트래커", AGGREGATE_REVIEW_TEMPLATE_KO)
        self.assertIn("VERDICT: ESCALATE", AGGREGATE_REVIEW_TEMPLATE_KO)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
267
tests/test_onboarding.py
Normal file
267
tests/test_onboarding.py
Normal file
@@ -0,0 +1,267 @@
|
||||
"""Tests for doctor, demo, and guided init features."""
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from cross_eval.doctor import (
|
||||
DoctorCheck,
|
||||
check_cli_installed,
|
||||
check_config,
|
||||
format_doctor_results,
|
||||
run_doctor,
|
||||
)
|
||||
from cross_eval.demo import (
|
||||
DEMO_CHECKLIST,
|
||||
DEMO_PLAN,
|
||||
run_mock_demo,
|
||||
)
|
||||
from cross_eval.cli import (
|
||||
_generate_guided_config,
|
||||
_prompt_choice,
|
||||
_prompt_text,
|
||||
main,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Doctor tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class DoctorCheckInstalledTest(unittest.TestCase):
    """Exercise the doctor helpers: CLI lookup, config check, formatting, exit codes."""

    def test_check_cli_installed_found(self) -> None:
        """A CLI found on PATH reports success along with its version output."""
        fake_process = MagicMock(stdout="Python 3.12.0", stderr="")
        with patch("cross_eval.doctor.shutil.which", return_value="/usr/bin/python3"), \
                patch("cross_eval.doctor.subprocess.run", return_value=fake_process):
            found, version = check_cli_installed("python3")

        self.assertTrue(found)
        self.assertIn("Python", version)

    def test_check_cli_installed_not_found(self) -> None:
        """A CLI missing from PATH reports failure with a "not found" message."""
        with patch("cross_eval.doctor.shutil.which", return_value=None):
            found, msg = check_cli_installed("nonexistent-tool")

        self.assertFalse(found)
        self.assertIn("not found", msg)

    def test_check_config_exists_valid(self) -> None:
        """A complete config with its plan file present passes with no errors."""
        config_text = (
            "inputs:\n plan: plan.md\ncoders: [claude-coder]\n"
            "reviewers: [claude-reviewer]\npipeline: preset:simple\n"
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            project_root = Path(tmpdir)
            ce_dir = project_root / ".cross-eval"
            ce_dir.mkdir()
            (ce_dir / "config.yaml").write_text(config_text, encoding="utf-8")
            # Also create plan.md so validation passes
            (ce_dir / "plan.md").write_text("# Plan", encoding="utf-8")

            ok, path, errors = check_config(project_root)

        self.assertTrue(ok)
        self.assertIsNotNone(path)
        self.assertEqual(errors, [])

    def test_check_config_not_exists(self) -> None:
        """An empty project directory reports no config and no path."""
        with tempfile.TemporaryDirectory() as tmpdir:
            ok, path, errors = check_config(Path(tmpdir))

        self.assertFalse(ok)
        self.assertIsNone(path)

    def test_check_config_invalid(self) -> None:
        """A config that fails validation is reported as not OK but still located."""
        with tempfile.TemporaryDirectory() as tmpdir:
            project_root = Path(tmpdir)
            ce_dir = project_root / ".cross-eval"
            ce_dir.mkdir()
            # Valid YAML, but only an input pointing at a nonexistent plan.
            (ce_dir / "config.yaml").write_text(
                "inputs:\n plan: /nonexistent/plan.md\n",
                encoding="utf-8",
            )

            ok, path, errors = check_config(project_root)

        self.assertFalse(ok)
        self.assertIsNotNone(path)

    def test_format_doctor_results_all_pass(self) -> None:
        """All-passing checks render a check mark and the success summary."""
        passing = [
            DoctorCheck("test", True, True, "ok"),
            DoctorCheck("test2", True, False, "ok"),
        ]
        rendered = format_doctor_results(passing)
        self.assertIn("✓", rendered)
        self.assertIn("All checks passed", rendered)

    def test_format_doctor_results_critical_fail(self) -> None:
        """A failed critical check renders a cross and mentions "critical"."""
        failing = [DoctorCheck("claude CLI", False, True, "not found")]
        rendered = format_doctor_results(failing)
        self.assertIn("✗", rendered)
        self.assertIn("critical", rendered.lower())

    def test_cmd_doctor_returns_0_all_pass(self) -> None:
        """``doctor`` exits 0 when every check passes."""
        passing = [DoctorCheck("test", True, True, "ok")]
        with patch("cross_eval.doctor.run_doctor", return_value=passing):
            exit_code = main(["doctor"])
        self.assertEqual(exit_code, 0)

    def test_cmd_doctor_returns_1_critical_fail(self) -> None:
        """``doctor`` exits 1 when a critical check fails."""
        failing = [DoctorCheck("claude CLI", False, True, "not found")]
        with patch("cross_eval.doctor.run_doctor", return_value=failing):
            exit_code = main(["doctor"])
        self.assertEqual(exit_code, 1)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Demo tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class DemoTest(unittest.TestCase):
    """Exercise the demo fixtures, the mock demo runner, and the ``demo`` command."""

    def test_demo_plan_is_nonempty(self) -> None:
        """The bundled demo plan mentions fibonacci."""
        plan_text = DEMO_PLAN.lower()
        self.assertIn("fibonacci", plan_text)

    def test_demo_checklist_is_nonempty(self) -> None:
        """The bundled demo checklist mentions fibonacci."""
        checklist_text = DEMO_CHECKLIST.lower()
        self.assertIn("fibonacci", checklist_text)

    def test_mock_demo_runs_without_error(self) -> None:
        """The mock demo completes for the simple preset; stdout is patched out."""
        # Should not raise
        with patch("sys.stdout"):
            run_mock_demo(preset="simple")

    def test_mock_demo_escalate_runs_without_error(self) -> None:
        """The mock demo also completes when asked to show the escalate path."""
        with patch("sys.stdout"):
            run_mock_demo(preset="simple", show_escalate=True)

    def test_cmd_demo_mock_default(self) -> None:
        """Plain ``demo`` runs the mock demo without escalation and exits 0."""
        with patch("cross_eval.demo.run_mock_demo") as demo_runner:
            exit_code = main(["demo"])
        demo_runner.assert_called_once_with(preset="simple", show_escalate=False)
        self.assertEqual(exit_code, 0)

    def test_cmd_demo_escalate_flag(self) -> None:
        """``demo --escalate`` forwards ``show_escalate=True`` and exits 0."""
        with patch("cross_eval.demo.run_mock_demo") as demo_runner:
            exit_code = main(["demo", "--escalate"])
        demo_runner.assert_called_once_with(preset="simple", show_escalate=True)
        self.assertEqual(exit_code, 0)

    def test_cmd_demo_live_requires_confirmation(self) -> None:
        """Answering "n" to the ``demo --live`` prompt still exits 0."""
        with patch("builtins.input", return_value="n"):
            exit_code = main(["demo", "--live"])
        self.assertEqual(exit_code, 0)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Guided init tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class GuidedInitTest(unittest.TestCase):
|
||||
def test_prompt_choice_default(self) -> None:
|
||||
with patch("builtins.input", return_value=""):
|
||||
result = _prompt_choice("Pick:", ["a", "b", "c"], default=2)
|
||||
self.assertEqual(result, "b")
|
||||
|
||||
def test_prompt_choice_by_number(self) -> None:
|
||||
with patch("builtins.input", return_value="3"):
|
||||
result = _prompt_choice("Pick:", ["a", "b", "c"], default=1)
|
||||
self.assertEqual(result, "c")
|
||||
|
||||
def test_prompt_choice_by_name(self) -> None:
|
||||
with patch("builtins.input", return_value="simple"):
|
||||
result = _prompt_choice("Pick:", ["simple", "review-fix"], default=1)
|
||||
self.assertEqual(result, "simple")
|
||||
|
||||
def test_prompt_text_default(self) -> None:
|
||||
with patch("builtins.input", return_value=""):
|
||||
result = _prompt_text("Name", default="claude")
|
||||
self.assertEqual(result, "claude")
|
||||
|
||||
def test_prompt_text_custom(self) -> None:
|
||||
with patch("builtins.input", return_value="codex"):
|
||||
result = _prompt_text("Name", default="claude")
|
||||
self.assertEqual(result, "codex")
|
||||
|
||||
def test_generate_guided_config(self) -> None:
|
||||
config = _generate_guided_config(
|
||||
"review-fix", "ko",
|
||||
{
|
||||
"coder": "claude",
|
||||
"reviewer": "codex",
|
||||
"senior": "codex",
|
||||
"max_iter": 5,
|
||||
},
|
||||
)
|
||||
self.assertIn("preset:review-fix", config)
|
||||
self.assertIn("language: ko", config)
|
||||
self.assertIn("claude-coder", config)
|
||||
self.assertIn("codex-reviewer", config)
|
||||
self.assertIn("codex-senior", config)
|
||||
self.assertIn("max_iterations: 5", config)
|
||||
|
||||
def test_generate_guided_config_full_name(self) -> None:
|
||||
config = _generate_guided_config(
|
||||
"simple", "ko",
|
||||
{
|
||||
"coder": "claude-coder",
|
||||
"reviewer": "codex-reviewer",
|
||||
"senior": "",
|
||||
"max_iter": 3,
|
||||
},
|
||||
)
|
||||
# Full names should not be double-suffixed
|
||||
self.assertIn("claude-coder", config)
|
||||
self.assertNotIn("claude-coder-coder", config)
|
||||
self.assertIn("codex-reviewer", config)
|
||||
self.assertNotIn("codex-reviewer-reviewer", config)
|
||||
|
||||
def test_generate_guided_config_no_senior(self) -> None:
|
||||
config = _generate_guided_config(
|
||||
"simple", "en",
|
||||
{
|
||||
"coder": "claude",
|
||||
"reviewer": "claude",
|
||||
"senior": "",
|
||||
"max_iter": 3,
|
||||
},
|
||||
)
|
||||
self.assertNotIn("senior", config.lower())
|
||||
|
||||
def test_guided_init_creates_files(self) -> None:
    """Guided init with every prompt left at its default writes config.yaml and exits 0."""
    # Seven empty answers; once they run out, further prompts also get "".
    answers = iter([""] * 7)

    def _answer(_prompt=""):
        return next(answers, "")

    with tempfile.TemporaryDirectory() as tmpdir:
        with patch("builtins.input", side_effect=_answer):
            exit_code = main(["init", "--guided", "--dir", tmpdir])

        # Check inside the context: the directory is removed on exit.
        config_path = Path(tmpdir) / ".cross-eval" / "config.yaml"
        self.assertTrue(config_path.exists())
        self.assertEqual(exit_code, 0)
|
||||
|
||||
def test_guided_init_preserves_existing_files(self) -> None:
    """Guided init must leave a pre-existing config.yaml untouched."""
    # Seven empty answers; once they run out, further prompts also get "".
    inputs = iter(["", "", "", "", "", "", ""])
    with tempfile.TemporaryDirectory() as tmpdir:
        ce_dir = Path(tmpdir) / ".cross-eval"
        ce_dir.mkdir()
        existing = ce_dir / "config.yaml"
        existing.write_text("# existing", encoding="utf-8")

        with patch("builtins.input", side_effect=lambda _="": next(inputs, "")):
            main(["init", "--guided", "--dir", tmpdir])

        # Should not overwrite. Read back with the same explicit encoding
        # used for writing so the comparison does not depend on the locale.
        self.assertEqual(existing.read_text(encoding="utf-8"), "# existing")
|
||||
|
||||
|
||||
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
461
tests/test_pipeline_integration.py
Normal file
461
tests/test_pipeline_integration.py
Normal file
@@ -0,0 +1,461 @@
|
||||
"""Integration tests for cross-eval pipeline with mocked agents."""
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from cross_eval.config import BUILTIN_AGENTS
|
||||
from cross_eval.models import (
|
||||
AgentConfig,
|
||||
AgentResult,
|
||||
PhaseConfig,
|
||||
PipelineConfig,
|
||||
StepConfig,
|
||||
)
|
||||
from cross_eval.pipeline import run_pipeline
|
||||
from cross_eval.prompts import _build_review_fix_preset, _build_simple_preset
|
||||
|
||||
|
||||
def _make_mock_agent(outputs: list[str]):
|
||||
"""Returns a side_effect function that returns outputs in sequence."""
|
||||
call_count = [0]
|
||||
|
||||
def _mock(agent_config, prompt, step_name, **kwargs):
|
||||
idx = min(call_count[0], len(outputs) - 1)
|
||||
call_count[0] += 1
|
||||
return AgentResult(
|
||||
output=outputs[idx],
|
||||
exit_code=0,
|
||||
agent_name=agent_config.name,
|
||||
step_name=step_name,
|
||||
duration_seconds=0.1,
|
||||
)
|
||||
|
||||
return _mock
|
||||
|
||||
|
||||
def _make_step_mock(step_outputs: dict[str, list[str]]):
|
||||
"""Returns a side_effect that dispatches by step_name, cycling through outputs."""
|
||||
counters: dict[str, int] = {}
|
||||
|
||||
def _mock(agent_config, prompt, step_name, **kwargs):
|
||||
if step_name not in counters:
|
||||
counters[step_name] = 0
|
||||
outputs = step_outputs.get(step_name, [""])
|
||||
idx = min(counters[step_name], len(outputs) - 1)
|
||||
counters[step_name] += 1
|
||||
return AgentResult(
|
||||
output=outputs[idx],
|
||||
exit_code=0,
|
||||
agent_name=agent_config.name,
|
||||
step_name=step_name,
|
||||
duration_seconds=0.1,
|
||||
)
|
||||
|
||||
return _mock
|
||||
|
||||
|
||||
def _minimal_simple_config(
    run_dir: Path,
    max_iterations: int = 3,
    seniors: list[str] | None = None,
) -> PipelineConfig:
    """Assemble the smallest "simple" preset PipelineConfig used by these tests.

    Args:
        run_dir: Directory passed through as the config's ``output_dir``.
        max_iterations: Upper bound on pipeline iterations.
        seniors: Senior agent names; ``None`` is treated as no seniors.

    Returns:
        A PipelineConfig with one coder, one reviewer, the given seniors,
        the built-in agents, and steps produced by ``_build_simple_preset``.
    """
    senior_list = [] if seniors is None else seniors
    coders = ["claude-coder"]
    reviewers = ["claude-reviewer"]
    return PipelineConfig(
        output_dir=run_dir,
        max_iterations=max_iterations,
        min_iterations=1,
        language="en",
        inputs={"plan": "Test plan", "checklist": "Test checklist"},
        # Copy so the test config does not share the module-level mapping.
        agents=dict(BUILTIN_AGENTS),
        coders=coders,
        reviewers=reviewers,
        seniors=senior_list,
        pipeline=_build_simple_preset(coders, reviewers, senior_list),
        preset_name="simple",
    )
|
||||
|
||||
|
||||
class TestSimplePipelinePassStopsLoop(unittest.TestCase):
    """Test 1: a VERDICT: PASS on the first review stops the loop at iteration 1."""

    def test_simple_pipeline_pass_stops_loop(self) -> None:
        scripted_outputs = [
            "Coding output here",  # coding step
            "All good\n\nVERDICT: PASS",  # review step
        ]
        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(Path(tmpdir))
            agent_stub = _make_mock_agent(scripted_outputs)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=agent_stub):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 1)
|
||||
|
||||
|
||||
class TestSimplePipelineFailThenPass(unittest.TestCase):
    """Test 2: the review FAILs once and then PASSes, giving two iterations."""

    def test_simple_pipeline_fail_then_pass(self) -> None:
        step_outputs = {
            "coding": ["Coding output v1", "Coding output v2"],
            "review": [
                "Issues found\n\nVERDICT: FAIL",
                "All good\n\nVERDICT: PASS",
            ],
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(Path(tmpdir), max_iterations=5)
            agent_stub = _make_step_mock(step_outputs)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=agent_stub):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)
|
||||
|
||||
|
||||
class TestSimplePipelineEscalateBreaksLoop(unittest.TestCase):
    """Test 3: ESCALATE from the senior review stops immediately, final_verdict=ESCALATE."""

    def test_simple_pipeline_escalate_breaks_loop(self) -> None:
        escalate_output = (
            "### Confirmed Issues\n"
            "- [Critical] Requirements are ambiguous\n\n"
            "### Escalated Issues\n"
            "Requirements need stakeholder clarification\n\n"
            "### Verdict\n"
            "VERDICT: ESCALATE\n"
        )
        step_outputs = {
            "coding": ["Coding output"],
            "review": ["Issues found\n\nVERDICT: FAIL"],
            "senior_review": [escalate_output],
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(
                Path(tmpdir), max_iterations=5, seniors=["claude-senior"],
            )

            with patch(
                "cross_eval.pipeline.invoke_agent",
                side_effect=_make_step_mock(step_outputs),
            ):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            # Five iterations were allowed; escalation must stop after the first.
            self.assertEqual(len(result.iterations), 1)
            # assertGreater reports the actual count on failure.
            self.assertGreater(len(result.escalated_issues), 0)
|
||||
|
||||
|
||||
class TestSimplePipelineEscalatePriorityOverPass(unittest.TestCase):
    """Test 4: one verdict step returns PASS, another returns ESCALATE -> ESCALATE wins."""

    def test_simple_pipeline_escalate_priority_over_pass(self) -> None:
        # Build a custom pipeline with 2 verdict steps (no senior): one coding
        # step followed by two review steps that differ only in their name
        # and output key.
        steps = [
            StepConfig(
                name="coding",
                agent="claude-coder",
                role="coding",
                prompt_template="default:coding",
                output_key="coding_output",
            ),
        ]
        for review_name in ("review_a", "review_b"):
            steps.append(
                StepConfig(
                    name=review_name,
                    agent="claude-reviewer",
                    role="review",
                    prompt_template="default:review",
                    output_key=f"{review_name}_result",
                    verdict=True,
                )
            )

        escalate_output = (
            "### Escalated Issues\n"
            "Ambiguous requirements need clarification\n\n"
            "VERDICT: ESCALATE\n"
        )
        step_outputs = {
            "coding": ["Coding output"],
            "review_a": ["All good\n\nVERDICT: PASS"],
            "review_b": [escalate_output],
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=3,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=steps,
                preset_name="custom",
            )

            with patch(
                "cross_eval.pipeline.invoke_agent",
                side_effect=_make_step_mock(step_outputs),
            ):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            # assertGreater reports the actual count on failure.
            self.assertGreater(len(result.escalated_issues), 0)
|
||||
|
||||
|
||||
class TestPhasedPipelineEscalateBreaksPhase(unittest.TestCase):
    """Test 5: phased pipeline (review-fix), verify step returns ESCALATE -> phase stops."""

    def test_phased_pipeline_escalate_breaks_phase(self) -> None:
        coders = ["claude-coder"]
        reviewers = ["claude-reviewer"]
        seniors = ["claude-senior"]

        escalate_output = (
            "### Escalated Issues\n"
            "Architecture decisions needed beyond plan scope\n\n"
            "### Verdict\n"
            "VERDICT: ESCALATE\n"
        )
        # Canned output per step name; only "verify" carries a verdict line.
        step_outputs = {
            "review_claude_reviewer": ["Review findings here"],
            "aggregate_review": ["Aggregated review\n\nAction items: fix X"],
            "coding": ["Fixed code"],
            "verify": [escalate_output],
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=_build_review_fix_preset(coders, reviewers, seniors),
                preset_name="review-fix",
            )

            with patch(
                "cross_eval.pipeline.invoke_agent",
                side_effect=_make_step_mock(step_outputs),
            ):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            # assertGreater reports the actual count on failure.
            self.assertGreater(len(result.escalated_issues), 0)
|
||||
|
||||
|
||||
class TestAutoEscalateFiresWithoutSenior(unittest.TestCase):
    """Test 6: simple pipeline, no senior, identical FAIL feedback 3 times -> auto-escalate."""

    def test_auto_escalate_fires_without_senior(self) -> None:
        # The same feedback, naming the same file path, is returned every time.
        repeated_fail = (
            "Issues found in src/auth.py: missing validation check.\n"
            "The file src/auth.py still has the same problem.\n\n"
            "VERDICT: FAIL"
        )
        step_outputs = {
            "coding": ["Coding output v1", "Coding output v2", "Coding output v3"],
            "review": [repeated_fail] * 3,
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            # No seniors -> review step has verdict=True
            config = _minimal_simple_config(
                Path(tmpdir), max_iterations=5, seniors=None,
            )
            agent_stub = _make_step_mock(step_outputs)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=agent_stub):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "ESCALATE")
            auto_escalated = any(
                "Auto-escalated" in issue for issue in result.escalated_issues
            )
            self.assertTrue(auto_escalated)
|
||||
|
||||
|
||||
class TestAutoEscalateDoesNotFireWithSenior(unittest.TestCase):
    """Test 7: the same repeated FAIL, but WITH a senior step -> no auto-escalate."""

    def test_auto_escalate_does_not_fire_with_senior(self) -> None:
        rounds = 5  # equals max_iterations below: one canned output per iteration

        repeated_fail_review = (
            "Issues found in src/auth.py: missing validation check.\n"
            "VERDICT: FAIL"
        )
        # Senior also returns FAIL but the auto-escalate should NOT fire
        # because has_aggregator is True (seniors list is populated)
        senior_fail = (
            "### Confirmed Issues\n"
            "- Missing validation in src/auth.py\n\n"
            "### Action Items\n"
            "1. Add validation in src/auth.py\n\n"
            "VERDICT: FAIL"
        )
        step_outputs = {
            "coding": [f"Coding output v{n}" for n in range(1, rounds + 1)],
            "review": [repeated_fail_review] * rounds,
            "senior_review": [senior_fail] * rounds,
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            # With seniors -> senior_review step has verdict=True, review does not
            config = _minimal_simple_config(
                Path(tmpdir), max_iterations=5, seniors=["claude-senior"],
            )
            agent_stub = _make_step_mock(step_outputs)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=agent_stub):
                result = run_pipeline(config)

            # Should NOT auto-escalate; should reach max iterations
            self.assertNotEqual(result.final_verdict, "ESCALATE")
            self.assertEqual(result.final_verdict, "MAX_ITERATIONS_REACHED")
            self.assertEqual(len(result.iterations), 5)
|
||||
|
||||
|
||||
class TestTrackerExtractionAcrossIterations(unittest.TestCase):
    """Test 8: senior review output with an Issue Tracker table -> passed to next iteration."""

    def test_tracker_extraction_across_iterations(self) -> None:
        def _tracker(status: str) -> str:
            # Render the Issue Tracker table with both issues in ``status``.
            return (
                "## Issue Tracker\n"
                "| ISS-ID | Severity | Description | Status | Since |\n"
                "|--------|----------|-------------|--------|-------|\n"
                f"| ISS-001 | Critical | Missing auth check | {status} | v1 |\n"
                f"| ISS-002 | Major | No validation | {status} | v1 |\n"
            )

        senior_output_v1 = (
            "### Confirmed Issues\n"
            "- Missing auth\n\n"
            + _tracker("Open")
            + "\n"
            "### Verdict\n"
            "VERDICT: FAIL"
        )
        senior_output_v2 = (
            "### Confirmed Issues\n"
            "- None remaining\n\n"
            + _tracker("Fixed")
            + "\n### Verdict\n"
            "VERDICT: PASS"
        )

        # Steps whose output is the same on every call.
        fixed_outputs = {
            "coding": "Coding output",
            "review": "Review findings\n\nVERDICT: FAIL",
        }
        captured_prompts: list[dict[str, str]] = []

        def _tracking_mock(agent_config, prompt, step_name, **kwargs):
            # Record every invocation so the prompts can be inspected afterwards.
            captured_prompts.append({
                "step_name": step_name,
                "prompt": prompt,
                "agent_name": agent_config.name,
            })
            if step_name == "senior_review":
                # The current call is already recorded, so a count of 1 means
                # this is the first senior review (FAIL with tracker); any
                # later call returns the PASS output.
                senior_calls = sum(
                    1 for p in captured_prompts if p["step_name"] == "senior_review"
                )
                output = senior_output_v1 if senior_calls <= 1 else senior_output_v2
            else:
                # Unknown steps produce an empty output.
                output = fixed_outputs.get(step_name, "")
            return AgentResult(
                output=output,
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=0.1,
            )

        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(
                Path(tmpdir), max_iterations=3, seniors=["claude-senior"],
            )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_tracking_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)

            # Verify that a senior_review prompt carries the tracker table
            # produced in iteration 1 (expected to be the second call's prompt).
            iter2_senior_prompts = [
                p for p in captured_prompts
                if p["step_name"] == "senior_review"
                and "ISS-001" in p["prompt"]
                and "Missing auth check" in p["prompt"]
            ]
            self.assertGreaterEqual(
                len(iter2_senior_prompts),
                1,
                "Expected previous_senior_tracker content (ISS-001) to appear "
                "in at least one senior_review prompt",
            )
|
||||
|
||||
|
||||
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user