feat: ESCALATE verdict, issue tracker, onboarding commands

Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across
simple and phased pipelines. Senior reviewers can now escalate issues
requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
chungyeong
2026-03-13 18:19:05 +09:00
parent ee4f1a07ef
commit 204e071b74
15 changed files with 3032 additions and 156 deletions

View File

@@ -1,19 +1,25 @@
from __future__ import annotations
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch
from cross_eval.agent import _supports_reasoning_effort
from cross_eval.agent import AgentInvocationError, _supports_reasoning_effort
from cross_eval.cli import _apply_phased_iteration_override
from cross_eval.agent import invoke_agent
from cross_eval.config import (
BUILTIN_AGENTS,
_default_seniors_for_preset,
apply_reasoning_effort_settings,
normalize_reasoning_effort,
normalize_prompt_template,
normalize_step_role,
validate_config,
)
from cross_eval.models import (
AgentConfig,
AgentResult,
IterationResult,
PhaseConfig,
PipelineConfig,
@@ -21,25 +27,53 @@ from cross_eval.models import (
ReviewMetrics,
StepConfig,
)
from cross_eval.pipeline import _detect_repeated_aggregate
from cross_eval.pipeline import (
_detect_auto_escalate,
_detect_repeated_aggregate,
_execute_parallel_batch,
_extract_senior_tracker,
_extract_verdict,
)
from cross_eval.prompts import (
GENERATE_TEMPLATE,
GENERATE_TEMPLATE_KO,
CODING_TEMPLATE,
CODING_TEMPLATE_KO,
REVIEW_TEMPLATE,
REVIEW_TEMPLATE_KO,
PLAN_REVIEW_TEMPLATE,
PLAN_REVIEW_TEMPLATE_KO,
REVIEW_ONLY_TEMPLATE,
REVIEW_ONLY_TEMPLATE_KO,
AGGREGATE_REVIEW_TEMPLATE,
AGGREGATE_REVIEW_TEMPLATE_KO,
_build_cross_review_preset,
_build_coding_review_fix_preset,
_build_plan_review_preset,
_build_review_fix_preset,
_build_review_only_preset,
_build_simple_preset,
)
from cross_eval.report import build_report, parse_review_metrics
from cross_eval.config import _SENIOR_SYSTEM_PROMPT
from cross_eval.report import build_report, parse_review_metrics, print_escalation_report
class BuiltinAgentConfigTest(unittest.TestCase):
def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None:
    """Every Claude built-in agent carries the settings-source and no-slash-command flags."""
    required_tokens = ("--setting-sources", "user", "--disable-slash-commands")
    for claude_agent in ("claude-coder", "claude-reviewer", "claude-senior"):
        with self.subTest(agent=claude_agent):
            cli_args = BUILTIN_AGENTS[claude_agent].args
            # Tokens are checked in a fixed order so the first missing one is reported.
            for token in required_tokens:
                self.assertIn(token, cli_args)
def test_claude_builtin_agents_use_role_specific_permission_modes(self) -> None:
    """Check the permission-related argument tokens expected for each Claude role."""
    # Resolve every agent's args up front, before any assertion runs.
    args_by_agent = {
        agent: BUILTIN_AGENTS[agent].args
        for agent in ("claude-coder", "claude-reviewer", "claude-senior")
    }
    expectations = (
        ("claude-coder", "--dangerously-skip-permissions"),
        ("claude-coder", "bypassPermissions"),
        ("claude-reviewer", "plan"),
        ("claude-senior", "plan"),
    )
    for agent, token in expectations:
        self.assertIn(token, args_by_agent[agent])
def test_codex_builtin_agents_skip_git_repo_check(self) -> None:
for agent_name in ("codex-coder", "codex-reviewer", "codex-senior"):
with self.subTest(agent=agent_name):
@@ -62,6 +96,10 @@ class BuiltinAgentConfigTest(unittest.TestCase):
self.assertEqual(normalize_reasoning_effort("extra_high"), "xhigh")
self.assertEqual(normalize_reasoning_effort("x-high"), "xhigh")
def test_normalize_step_role_and_template_aliases(self) -> None:
    """The "coding" role and "default:coding" template normalize to themselves."""
    normalized_role = normalize_step_role("coding")
    self.assertEqual(normalized_role, "coding")

    normalized_template = normalize_prompt_template("default:coding")
    self.assertEqual(normalized_template, "default:coding")
def test_apply_reasoning_effort_settings_uses_defaults_and_role_overrides(self) -> None:
config = PipelineConfig(
agents={
@@ -116,6 +154,123 @@ class BuiltinAgentConfigTest(unittest.TestCase):
["codex", "-c", 'model_reasoning_effort="high"'],
)
def test_invoke_agent_classifies_auth_failures(self) -> None:
    """A "Not logged in" failure from the CLI is raised as an AUTH invocation error."""

    class _FailedProcess:
        # Stand-in for the object returned by subprocess.run.
        returncode = 1
        stdout = ""
        stderr = "Not logged in · Please run /login"

    def _fake_run(cmd, **kwargs):
        return _FailedProcess()

    reviewer = AgentConfig(
        name="claude-reviewer",
        command="claude",
        args=["-p", "--model", "opus"],
    )

    with patch("subprocess.run", side_effect=_fake_run), \
            self.assertRaises(AgentInvocationError) as raised:
        invoke_agent(reviewer, "prompt", "review", quiet=True)

    failure = raised.exception
    self.assertEqual(failure.failure_type, "AUTH")
    self.assertIn("Re-authenticate", failure.suggested_action)
def test_invoke_agent_classifies_usage_limit_failures(self) -> None:
    """A 429 / quota failure from the CLI is raised as a USAGE_LIMIT invocation error."""

    class _FailedProcess:
        # Stand-in for the object returned by subprocess.run.
        returncode = 1
        stdout = ""
        stderr = "API Error: 429 rate limit exceeded for current quota"

    def _fake_run(cmd, **kwargs):
        return _FailedProcess()

    reviewer = AgentConfig(
        name="codex-reviewer",
        command="codex",
        args=["exec", "--model", "gpt-5.4", "-"],
    )

    with patch("subprocess.run", side_effect=_fake_run), \
            self.assertRaises(AgentInvocationError) as raised:
        invoke_agent(reviewer, "prompt", "review", quiet=True)

    failure = raised.exception
    self.assertEqual(failure.failure_type, "USAGE_LIMIT")
    self.assertIn("quota", failure.suggested_action)
def test_parallel_batch_saves_successes_before_failure(self) -> None:
# Runs a two-step parallel review batch in which one step's agent returns a
# result and the other raises AgentInvocationError, then checks that the batch
# raises RuntimeError while the successful output and an error report exist.
# NOTE(review): indentation is lost in this view; the assertions after the
# _execute_parallel_batch call presumably sit inside the TemporaryDirectory
# block (they read files under tmpdir) but outside assertRaises -- confirm.
config = PipelineConfig(
agents={
"ok-reviewer": AgentConfig(name="ok-reviewer", command="codex"),
"bad-reviewer": AgentConfig(name="bad-reviewer", command="claude"),
},
)
# Two steps flagged parallel=True: one wired to each agent above.
steps = [
StepConfig(
name="review_ok",
agent="ok-reviewer",
role="review",
prompt_template="default:review-only",
output_key="review_ok",
parallel=True,
),
StepConfig(
name="review_bad",
agent="bad-reviewer",
role="review",
prompt_template="default:review-only",
output_key="review_bad",
parallel=True,
),
]
# Containers handed to the batch runner; step_outputs is inspected afterwards.
step_outputs: dict[str, str] = {}
step_results: dict[str, AgentResult] = {}
# Replacement for invoke_agent: succeeds for "review_ok", raises for any other step.
def _fake_invoke(agent, prompt, step_name, **kwargs):
if step_name == "review_ok":
return AgentResult(
output="VERDICT: PASS",
exit_code=0,
agent_name=agent.name,
step_name=step_name,
duration_seconds=1.0,
)
raise AgentInvocationError(
agent_name=agent.name,
step_name=step_name,
cmd_preview="claude -p ...",
raw_error="API Error: 429 rate limit exceeded for current quota",
failure_type="USAGE_LIMIT",
suggested_action="Agent CLI hit a quota, billing, or token budget limit. Refill or raise the limit, then rerun.",
)
# The same temporary directory is used as both cwd and run_dir.
with tempfile.TemporaryDirectory() as tmpdir:
with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
with self.assertRaises(RuntimeError) as ctx:
_execute_parallel_batch(
steps,
config,
input_contents={},
feedback="",
iteration=1,
max_iterations=3,
cwd=Path(tmpdir),
timeout=None,
dry_run=False,
step_outputs=step_outputs,
step_results=step_results,
run_dir=Path(tmpdir),
output_iter=1,
)
# The error message must name the step whose output was saved.
self.assertIn("Successful outputs were saved for: review_ok", str(ctx.exception))
self.assertEqual(step_outputs["review_ok"], "VERDICT: PASS")
# Files are expected under "v1" -- presumably derived from output_iter=1; confirm.
self.assertTrue((Path(tmpdir) / "v1" / "review_ok.md").exists())
error_path = Path(tmpdir) / "v1" / "review_bad_error.md"
self.assertTrue(error_path.exists())
self.assertIn("Failure Type", error_path.read_text(encoding="utf-8"))
self.assertIn("USAGE_LIMIT", error_path.read_text(encoding="utf-8"))
def test_detect_repeated_aggregate_warns_on_same_output(self) -> None:
steps = [
StepConfig(
@@ -169,6 +324,14 @@ class BuiltinAgentConfigTest(unittest.TestCase):
),
["claude-senior"],
)
self.assertEqual(
_default_seniors_for_preset(
"preset:coding-review-fix",
["codex-reviewer"],
BUILTIN_AGENTS,
),
["codex-senior"],
)
self.assertEqual(
_default_seniors_for_preset(
"preset:simple",
@@ -204,9 +367,37 @@ class BuiltinAgentConfigTest(unittest.TestCase):
)
self.assertEqual(
[step.name for step in converge.steps[3:]],
["aggregate_review", "generate", "verify"],
["aggregate_review", "coding", "verify"],
)
def test_coding_review_fix_starts_with_single_coding_phase(self) -> None:
    """The coding-review-fix preset yields a one-iteration coding phase, then review_fix."""
    built = _build_coding_review_fix_preset(
        ["codex-coder"],
        ["claude-reviewer", "codex-reviewer"],
        ["codex-senior"],
    )

    phase_names = [phase.name for phase in built]
    self.assertEqual(phase_names, ["initial_coding", "review_fix"])

    opening_phase = built[0]
    self.assertEqual(opening_phase.max_iterations, 1)
    self.assertEqual([step.name for step in opening_phase.steps], ["coding"])

    # Only the steps from index 2 onward of the second phase are compared.
    trailing_names = [step.name for step in built[1].steps[2:]]
    self.assertEqual(trailing_names, ["aggregate_review", "coding", "verify"])
def test_apply_phased_iteration_override_updates_only_verdict_phases(self) -> None:
    """An override of 10 leaves initial_coding at 1 and sets review_fix to 10."""
    preset_phases = _build_coding_review_fix_preset(
        ["codex-coder"],
        ["codex-reviewer"],
        ["codex-senior"],
    )
    config = PipelineConfig(phases=preset_phases)

    _apply_phased_iteration_override(config, 10)

    expected = (("initial_coding", 1), ("review_fix", 10))
    for index, (phase_name, iteration_cap) in enumerate(expected):
        phase = config.phases[index]
        self.assertEqual(phase.name, phase_name)
        self.assertEqual(phase.max_iterations, iteration_cap)
def test_review_only_duplicate_reviewers_get_unique_step_keys(self) -> None:
steps = _build_review_only_preset(
["codex-coder"],
@@ -219,6 +410,31 @@ class BuiltinAgentConfigTest(unittest.TestCase):
["review_codex_reviewer", "review_codex_reviewer_2"],
)
def test_plan_review_duplicate_reviewers_get_unique_step_keys(self) -> None:
    """Listing the same reviewer twice yields two distinct plan-review output keys."""
    repeated_reviewers = ["codex-reviewer", "codex-reviewer"]
    plan_steps = _build_plan_review_preset(["codex-coder"], repeated_reviewers, [])

    output_keys = [step.output_key for step in plan_steps]
    self.assertEqual(
        output_keys,
        ["plan_review_codex_reviewer", "plan_review_codex_reviewer_2"],
    )
def test_plan_review_with_senior_adds_aggregate_step(self) -> None:
    """With a senior configured, the last step is the verdict-bearing senior_review."""
    plan_steps = _build_plan_review_preset(
        ["codex-coder"],
        ["claude-reviewer", "codex-reviewer"],
        ["claude-senior"],
    )

    closing_step = plan_steps[-1]
    self.assertEqual(closing_step.name, "senior_review")
    self.assertEqual(closing_step.agent, "claude-senior")
    self.assertTrue(closing_step.verdict)

    # The first two steps must not carry the verdict flag.
    for position in (0, 1):
        self.assertFalse(plan_steps[position].verdict)
def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None:
steps = _build_cross_review_preset(
["codex-coder", "codex-coder"],
@@ -246,7 +462,7 @@ class BuiltinAgentConfigTest(unittest.TestCase):
steps = phases[0].steps
self.assertEqual(steps[2].name, "aggregate_review")
self.assertEqual(steps[2].agent, "codex-senior")
self.assertEqual(steps[3].name, "generate")
self.assertEqual(steps[3].name, "coding")
self.assertEqual(steps[4].name, "verify")
self.assertEqual(steps[4].agent, "codex-senior")
self.assertTrue(steps[4].verdict)
@@ -273,7 +489,7 @@ class BuiltinAgentConfigTest(unittest.TestCase):
self.assertEqual(
[step.name for step in steps],
["generate", "review", "senior_review"],
["coding", "review", "senior_review"],
)
self.assertFalse(steps[1].verdict)
self.assertTrue(steps[2].verdict)
@@ -325,6 +541,8 @@ class PromptTemplateTest(unittest.TestCase):
for tmpl, label in [
(REVIEW_TEMPLATE, "REVIEW_TEMPLATE"),
(REVIEW_TEMPLATE_KO, "REVIEW_TEMPLATE_KO"),
(PLAN_REVIEW_TEMPLATE, "PLAN_REVIEW_TEMPLATE"),
(PLAN_REVIEW_TEMPLATE_KO, "PLAN_REVIEW_TEMPLATE_KO"),
(REVIEW_ONLY_TEMPLATE, "REVIEW_ONLY_TEMPLATE"),
(REVIEW_ONLY_TEMPLATE_KO, "REVIEW_ONLY_TEMPLATE_KO"),
]:
@@ -351,10 +569,10 @@ class PromptTemplateTest(unittest.TestCase):
self.assertIn("CONFIRMED", tmpl)
self.assertIn("DISMISSED", tmpl)
def test_generate_templates_ignore_dismissed(self) -> None:
    """Both generate templates (default and KO) must mention DISMISSED."""
    for template in (GENERATE_TEMPLATE, GENERATE_TEMPLATE_KO):
        self.assertIn("DISMISSED", template)
def test_coding_templates_ignore_dismissed(self) -> None:
    """Both coding templates (default and KO) must mention DISMISSED."""
    for template in (CODING_TEMPLATE, CODING_TEMPLATE_KO):
        self.assertIn("DISMISSED", template)
def test_aggregate_templates_dismissed_structure(self) -> None:
"""Aggregate templates should use [False positive] / [Already fixed] tags."""
@@ -487,11 +705,11 @@ class ReviewMetricsParsingTest(unittest.TestCase):
language="en",
pipeline=[
StepConfig(
name="generate",
name="coding",
agent="claude-coder",
role="generate",
prompt_template="default:generate",
output_key="generated_code",
role="coding",
prompt_template="default:coding",
output_key="coding_output",
verdict=True,
),
],
@@ -500,7 +718,7 @@ class ReviewMetricsParsingTest(unittest.TestCase):
iterations=[
IterationResult(
iteration=1,
step_outputs={"generated_code": "some code"},
step_outputs={"coding_output": "some code"},
verdict="PASS",
),
],
@@ -511,5 +729,230 @@ class ReviewMetricsParsingTest(unittest.TestCase):
self.assertNotIn("Review Metrics", report)
class EscalateVerdictTest(unittest.TestCase):
    """Cover the ESCALATE verdict: extraction, tracker, heuristic, reports, exit code."""

    # Pass pattern handed to _extract_verdict in every extraction test.
    PASS_PATTERN = r"VERDICT:\s*PASS"

    def test_extract_verdict_escalate(self) -> None:
        """A lone ESCALATE marker is extracted as ESCALATE."""
        review_text = "Some review content\n\nVERDICT: ESCALATE\n"
        self.assertEqual(_extract_verdict(review_text, self.PASS_PATTERN), "ESCALATE")

    def test_extract_verdict_escalate_priority(self) -> None:
        """ESCALATE should take priority even if PASS pattern also matches."""
        review_text = "VERDICT: PASS\n\nVERDICT: ESCALATE\n"
        self.assertEqual(_extract_verdict(review_text, self.PASS_PATTERN), "ESCALATE")

    def test_extract_verdict_pass_still_works(self) -> None:
        """A PASS marker on its own is still extracted as PASS."""
        review_text = "All good\n\nVERDICT: PASS\n"
        self.assertEqual(_extract_verdict(review_text, self.PASS_PATTERN), "PASS")

    def test_extract_verdict_fail_still_works(self) -> None:
        """A FAIL marker on its own is still extracted as FAIL."""
        review_text = "Issues found\n\nVERDICT: FAIL\n"
        self.assertEqual(_extract_verdict(review_text, self.PASS_PATTERN), "FAIL")

    def test_extract_senior_tracker(self) -> None:
        """The tracker heading and both ISS rows survive extraction."""
        senior_output = (
            "Some text\n\n"
            "## Issue Tracker\n"
            "| ISS-ID | Severity | Description | Status | Since |\n"
            "|--------|----------|-------------|--------|-------|\n"
            "| ISS-001 | Critical | Missing auth | Open | v1 |\n"
            "| ISS-002 | Major | Bad naming | Fixed | v1 |\n"
            "\nMore text"
        )
        extracted = _extract_senior_tracker(senior_output)
        for expected_fragment in ("Issue Tracker", "ISS-001", "ISS-002"):
            self.assertIn(expected_fragment, extracted)

    def test_extract_senior_tracker_empty(self) -> None:
        """Output without a tracker table yields an empty string."""
        extracted = _extract_senior_tracker("No tracker table here")
        self.assertEqual(extracted, "")

    def test_auto_escalate_heuristic(self) -> None:
        """The same file/issue repeated across iterations triggers auto-escalation."""
        earlier_reviews = [
            "Issue in src/auth.py: missing validation",
            "Issue in src/auth.py: validation still missing",
        ]
        latest_review = "Issue in src/auth.py: validation not implemented"
        # Should detect repeated issue
        escalated = _detect_auto_escalate(earlier_reviews, latest_review, threshold=2)
        self.assertTrue(escalated)

    def test_auto_escalate_no_repeat(self) -> None:
        """Unrelated issues in different files do not escalate."""
        earlier_reviews = ["Issue in src/auth.py: missing validation"]
        latest_review = "Issue in src/database.py: connection pool"
        escalated = _detect_auto_escalate(earlier_reviews, latest_review, threshold=2)
        self.assertFalse(escalated)

    def test_auto_escalate_different_issues_same_file(self) -> None:
        """Same file path but different issues should NOT trigger escalation."""
        earlier_reviews = [
            "Issue in src/utils.py: missing validation on input",
            "Issue in src/utils.py: unused import at top of file",
        ]
        latest_review = "Issue in src/utils.py: error handling not implemented"
        # All mention src/utils.py, but the issue keywords differ across
        # iterations, so this should NOT escalate.
        escalated = _detect_auto_escalate(earlier_reviews, latest_review, threshold=2)
        self.assertFalse(escalated)

    def test_report_escalate_verdict(self) -> None:
        """The English report names the verdict, the human-review notice and the issue."""
        config = PipelineConfig(language="en")
        escalated_result = PipelineResult(
            final_verdict="ESCALATE",
            escalated_issues=["Requirements are ambiguous — need stakeholder input"],
        )
        rendered = build_report(config, escalated_result)
        for expected_fragment in ("ESCALATE", "Human review required", "ambiguous"):
            self.assertIn(expected_fragment, rendered)

    def test_report_escalate_verdict_ko(self) -> None:
        """The Korean report names the verdict and the Korean human-review notice."""
        config = PipelineConfig(language="ko")
        escalated_result = PipelineResult(
            final_verdict="ESCALATE",
            escalated_issues=["요구사항이 모호함"],
        )
        rendered = build_report(config, escalated_result)
        for expected_fragment in ("ESCALATE", "사람의 확인이 필요합니다"):
            self.assertIn(expected_fragment, rendered)

    def test_exit_code_escalate(self) -> None:
        """`main` returns exit code 2 when the pipeline result is ESCALATE."""
        from cross_eval.cli import main

        escalated_result = PipelineResult(
            final_verdict="ESCALATE",
            escalated_issues=["Needs human review"],
        )

        with patch("cross_eval.config.load_config") as load_config_mock, \
                patch("cross_eval.config.validate_config", return_value=[]), \
                patch("cross_eval.pipeline.run_pipeline", return_value=escalated_result), \
                patch("cross_eval.report.print_escalation_report"):
            verdict_step = StepConfig(
                name="review",
                agent="claude-reviewer",
                role="review",
                prompt_template="default:review",
                output_key="review_result",
                verdict=True,
            )
            load_config_mock.return_value = PipelineConfig(
                pipeline=[verdict_step],
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                inputs={"plan": Path("/tmp/plan.md")},
                language="en",
                max_iterations=3,
                preset_name="simple",
            )

            # A real file on disk supplies the path passed to the CLI via -c.
            with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w") as config_file:
                config_file.write("inputs:\n plan: /tmp/plan.md\n")
                config_file.flush()
                exit_code = main(["run", "-c", config_file.name])

        self.assertEqual(exit_code, 2)

    def test_senior_prompt_includes_escalate(self) -> None:
        """The senior system prompt mentions ESCALATE and (case-insensitively) "ambiguous"."""
        self.assertIn("ESCALATE", _SENIOR_SYSTEM_PROMPT)
        lowered_prompt = _SENIOR_SYSTEM_PROMPT.lower()
        self.assertIn("ambiguous", lowered_prompt)

    def test_aggregate_template_has_tracker(self) -> None:
        """The aggregate template has the tracker placeholder, heading and ESCALATE verdict."""
        expected_fragments = (
            "{previous_senior_tracker}",
            "Issue Tracker",
            "VERDICT: ESCALATE",
        )
        for expected_fragment in expected_fragments:
            self.assertIn(expected_fragment, AGGREGATE_REVIEW_TEMPLATE)

    def test_report_includes_issue_tracker_summary(self) -> None:
        """ISS-tagged bullets in a reviewer output appear under Issue Tracker Summary."""
        verdict_step = StepConfig(
            name="review",
            agent="claude-reviewer",
            role="review",
            prompt_template="default:review",
            output_key="review_result",
            verdict=True,
        )
        config = PipelineConfig(language="en", pipeline=[verdict_step])

        review_text = (
            "### Issues Found\n"
            "- ISS-001 [Critical][Omission] Missing auth check\n"
            "- ISS-002 [Major][Omission] No input validation\n"
            "### Verdict\nVERDICT: FAIL"
        )
        failed_result = PipelineResult(
            iterations=[
                IterationResult(
                    iteration=1,
                    step_outputs={"review_result": review_text},
                    verdict="FAIL",
                ),
            ],
            final_verdict="FAIL",
        )

        rendered = build_report(config, failed_result)
        for expected_fragment in ("Issue Tracker Summary", "ISS-001", "ISS-002"):
            self.assertIn(expected_fragment, rendered)

    def test_report_includes_senior_tracker_table(self) -> None:
        """A senior tracker table in the step output is reflected in the report."""
        senior_step = StepConfig(
            name="senior_review",
            agent="claude-senior",
            role="review",
            prompt_template="default:aggregate-review",
            output_key="senior_review_result",
            verdict=True,
        )
        config = PipelineConfig(language="en", pipeline=[senior_step])

        senior_text = (
            "### Confirmed Issues\n- Missing auth\n\n"
            "## Issue Tracker\n"
            "| ISS-ID | Severity | Description | Status | Since |\n"
            "|--------|----------|-------------|--------|-------|\n"
            "| ISS-001 | Critical | Missing auth check | Open | v1 |\n"
            "| ISS-002 | Major | No validation | Fixed | v1 |\n"
            "\n### Verdict\nVERDICT: FAIL"
        )
        failed_result = PipelineResult(
            iterations=[
                IterationResult(
                    iteration=1,
                    step_outputs={"senior_review_result": senior_text},
                    verdict="FAIL",
                ),
            ],
            final_verdict="FAIL",
        )

        rendered = build_report(config, failed_result)
        for expected_fragment in ("Issue Tracker Summary", "ISS-001", "Fixed"):
            self.assertIn(expected_fragment, rendered)

    def test_aggregate_template_ko_has_tracker(self) -> None:
        """The Korean aggregate template has the placeholder, heading and ESCALATE verdict."""
        expected_fragments = (
            "{previous_senior_tracker}",
            "이슈 트래커",
            "VERDICT: ESCALATE",
        )
        for expected_fragment in expected_fragments:
            self.assertIn(expected_fragment, AGGREGATE_REVIEW_TEMPLATE_KO)
# Run this module's tests via unittest when the file is executed directly.
if __name__ == "__main__":
unittest.main()