feat: ESCALATE verdict, issue tracker, onboarding commands

Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across simple and phased pipelines. Senior reviewers can now escalate issues requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 18:19:05 +09:00
parent ee4f1a07ef
commit 204e071b74
15 changed files with 3032 additions and 156 deletions
--- a/cross_eval/config.py
+++ b/cross_eval/config.py
@@ -39,6 +39,26 @@ _CODEX_ARGS = [
    "-",
 ]

+_CLAUDE_BASE_ARGS = [
+    "-p",
+    "--setting-sources",
+    "user",
+    "--disable-slash-commands",
+    "--model",
+    "opus",
+]
+
+_CLAUDE_CODER_ARGS = list(_CLAUDE_BASE_ARGS) + [
+    "--dangerously-skip-permissions",
+    "--permission-mode",
+    "bypassPermissions",
+]
+
+_CLAUDE_REVIEW_ARGS = list(_CLAUDE_BASE_ARGS) + [
+    "--permission-mode",
+    "plan",
+]
+
 _CODER_SYSTEM_PROMPT = (
    "You are a senior software engineer implementing code changes.\n"
    "Rules:\n"
@@ -81,29 +101,37 @@ _SENIOR_SYSTEM_PROMPT = (
    "4. Be skeptical of false positives, but do not lower the bar on real requirement "
    "gaps.\n"
    "5. When issues remain, produce a concise prioritized action list the coder can act on.\n"
-    "6. Do NOT invent new requirements beyond the plan and checklist.\n"
-    "7. End with VERDICT: PASS or VERDICT: FAIL."
+    "6. Maintain an Issue Tracker table across iterations to track issue status.\n"
+    "7. Do NOT invent new requirements beyond the plan and checklist.\n"
+    "8. End with one of three verdicts:\n"
+    "   - VERDICT: PASS — all requirements met, no issues remain.\n"
+    "   - VERDICT: FAIL — issues found that the coder can fix.\n"
+    "   - VERDICT: ESCALATE — issues that require human intervention. Use ESCALATE when:\n"
+    "     * Requirements are ambiguous and need clarification from stakeholders\n"
+    "     * Architecture decisions are needed that go beyond the plan scope\n"
+    "     * External dependency issues block progress\n"
+    "     * The coder has failed to resolve the same issue 2+ times"
 )

 BUILTIN_AGENTS: dict[str, AgentConfig] = {
    "claude-coder": AgentConfig(
        name="claude-coder",
        command="claude",
-        args=["-p", "--model", "opus", "--permission-mode", "auto"],
+        args=list(_CLAUDE_CODER_ARGS),
        system_prompt=_CODER_SYSTEM_PROMPT,
        reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["coder"],
    ),
    "claude-reviewer": AgentConfig(
        name="claude-reviewer",
        command="claude",
-        args=["-p", "--model", "opus", "--permission-mode", "auto"],
+        args=list(_CLAUDE_REVIEW_ARGS),
        system_prompt=_REVIEWER_SYSTEM_PROMPT,
        reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["reviewer"],
    ),
    "claude-senior": AgentConfig(
        name="claude-senior",
        command="claude",
-        args=["-p", "--model", "opus", "--permission-mode", "auto"],
+        args=list(_CLAUDE_REVIEW_ARGS),
        system_prompt=_SENIOR_SYSTEM_PROMPT,
        reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["senior"],
    ),
@@ -136,6 +164,11 @@ _AGENT_ALIASES: dict[str, str] = {
    "codex": "codex",
 }

+_ROLE_ALIASES: dict[str, str] = {
+    "coding": "coding",
+    "review": "review",
+}
+

 def resolve_agent_shorthand(name: str, role: str) -> str:
    """Resolve shorthand agent name to full builtin name.
@@ -150,6 +183,16 @@ def resolve_agent_shorthand(name: str, role: str) -> str:
    return name


+def normalize_step_role(role: str) -> str:
+    """Normalize step role aliases to the canonical role name."""
+    return _ROLE_ALIASES.get(role, role)
+
+
+def normalize_prompt_template(template_ref: str) -> str:
+    """Normalize prompt template aliases to canonical template refs."""
+    return template_ref
+
+
 # ---------------------------------------------------------------------------
 # Role inference (backward compatibility)
 # ---------------------------------------------------------------------------
@@ -233,7 +276,7 @@ def _default_seniors_for_preset(
    """Infer a default senior agent for presets that benefit from adjudication."""
    if not (
        isinstance(pipeline_raw, str)
-        and pipeline_raw == "preset:review-fix"
+        and pipeline_raw in {"preset:review-fix", "preset:coding-review-fix"}
        and reviewers
    ):
        return []
@@ -465,7 +508,7 @@ def _resolve_pipeline(
    """Resolve pipeline from preset string or explicit step list.

    Returns (steps, phases) tuple.  Only one will be non-empty.
-    - Simple/cross-review/review-only → steps populated, phases empty.
+    - Simple/cross-review/plan-review/review-only → steps populated, phases empty.
    - Phased presets (review-fix) → steps empty, phases populated.
    """
    # Preset: "preset:simple" or "preset:review-fix"
@@ -485,11 +528,15 @@ def _resolve_pipeline(
    if isinstance(pipeline_raw, list):
        steps = []
        for step_data in pipeline_raw:
+            raw_role = step_data.get("role", "coding")
+            normalized_role = normalize_step_role(raw_role)
            steps.append(StepConfig(
                name=step_data["name"],
                agent=step_data["agent"],
-                role=step_data.get("role", "generate"),
-                prompt_template=step_data.get("prompt_template", f"default:{step_data.get('role', 'generate')}"),
+                role=normalized_role,
+                prompt_template=normalize_prompt_template(
+                    step_data.get("prompt_template", f"default:{normalized_role}")
+                ),
                output_key=step_data["output_key"],
                verdict=step_data.get("verdict", False),
                verdict_pattern=step_data.get("verdict_pattern", r"VERDICT:\s*PASS"),
@@ -524,10 +571,6 @@ def validate_config(config: PipelineConfig) -> list[str]:
                errors,
                scope=f"Phase '{phase.name}'",
            )
-            if not any(s.verdict for s in phase.steps):
-                errors.append(
-                    f"Phase '{phase.name}' must have at least one step with verdict: true"
-                )
            # Validate verdict patterns
            for step in phase.steps:
                if step.verdict: