feat: ESCALATE verdict, issue tracker, onboarding commands

Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across simple and phased pipelines. Senior reviewers can now escalate issues requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 18:19:05 +09:00
parent ee4f1a07ef
commit 204e071b74
15 changed files with 3032 additions and 156 deletions
--- a/cross_eval/demo.py
+++ b/cross_eval/demo.py
@@ -0,0 +1,282 @@
+"""Built-in demo for cross-eval — lets new users see the full lifecycle."""
+from __future__ import annotations
+
+import sys
+import time
+from pathlib import Path
+
+from cross_eval.models import PipelineConfig, PipelineResult
+
+
+# ---------------------------------------------------------------------------
+# Built-in demo plan & checklist
+# ---------------------------------------------------------------------------
+# run_live_demo() writes both strings verbatim to plan.md / checklist.md.
+DEMO_PLAN = """\
+# Demo: Fibonacci Function
+
+## Objective
+Implement a `fibonacci(n)` function in Python.
+
+## Requirements
+1. `fibonacci(0)` returns `0`, `fibonacci(1)` returns `1`.
+2. For `n >= 2`, return the sum of the two preceding values.
+3. Raise `ValueError` for negative `n`.
+4. Use an iterative approach (not recursive).
+
+## Constraints
+- Single file: `fib.py`
+- No external dependencies.
+"""
+
+DEMO_CHECKLIST = """\
+# Demo Checklist
+- [ ] fibonacci(0) → 0
+- [ ] fibonacci(1) → 1
+- [ ] fibonacci(10) → 55
+- [ ] fibonacci(-1) raises ValueError
+- [ ] Iterative implementation (no recursion)
+- [ ] No unnecessary abstractions
+"""
+
+# ---------------------------------------------------------------------------
+# Mock outputs (realistic-looking)
+# ---------------------------------------------------------------------------
+# Iteration 1 coder output: returns -1 for negative n, violating requirement 3.
+_MOCK_CODING_V1 = """\
+I'll implement the fibonacci function in `fib.py`.
+
+```python
+# fib.py
+
+def fibonacci(n: int) -> int:
+    \"\"\"Return the nth Fibonacci number using iteration.\"\"\"
+    if n < 0:
+        return -1  # invalid input
+    if n <= 1:
+        return n
+    a, b = 0, 1
+    for _ in range(2, n + 1):
+        a, b = b, a + b
+    return b
+```
+
+Created `fib.py` with the iterative fibonacci function.
+"""
+# Iteration 1 review: reports the missing ValueError as ISS-001 and votes FAIL.
+_MOCK_REVIEW_V1 = """\
+### Previous Feedback Assessment
+N/A — first iteration.
+
+### Issues Found
+- ISS-001 [Major][Omission] Negative input should raise `ValueError`, \
+but implementation returns `-1` instead. (Requirement 3: "Raise ValueError for negative n")
+
+### Out of Scope Issues
+None
+
+### Summary
+- Critical: 0, Major: 1, Minor: 0
+- Over-engineering count: 0
+- Omission count: 1
+- CONFIRMED: 0, DISMISSED: 0
+- Overall quality: Good structure, one requirement gap.
+
+### Verdict
+VERDICT: FAIL
+"""
+# Iteration 2 coder output: negative n now raises ValueError.
+_MOCK_CODING_V2 = """\
+Fixing the negative input handling per review feedback (ISS-001).
+
+```python
+# fib.py
+
+def fibonacci(n: int) -> int:
+    \"\"\"Return the nth Fibonacci number using iteration.\"\"\"
+    if n < 0:
+        raise ValueError(f"n must be non-negative, got {n}")
+    if n <= 1:
+        return n
+    a, b = 0, 1
+    for _ in range(2, n + 1):
+        a, b = b, a + b
+    return b
+```
+
+Updated `fib.py`: negative input now raises `ValueError`.
+"""
+# Iteration 2 review: no issues remain, votes PASS.
+_MOCK_REVIEW_V2 = """\
+### Previous Feedback Assessment
+- DISMISSED (false positive): None
+- CONFIRMED: None — ISS-001 has been fixed.
+
+### Issues Found
+None — all checklist items satisfied.
+
+### Out of Scope Issues
+None
+
+### Summary
+- Critical: 0, Major: 0, Minor: 0
+- Over-engineering count: 0
+- Omission count: 0
+- CONFIRMED: 0, DISMISSED: 0
+- Overall quality: All requirements met, clean implementation.
+
+### Verdict
+VERDICT: PASS
+"""
+
+_MOCK_STEPS = [
+    # (iteration, step_name, agent, seconds, output_chars, verdict or None, output [not printed])
+    (1, "coding", "claude-coder", 2.1, 347, None, _MOCK_CODING_V1),
+    (1, "review", "claude-reviewer", 1.8, 423, "FAIL", _MOCK_REVIEW_V1),
+    (2, "coding", "claude-coder", 2.3, 382, None, _MOCK_CODING_V2),
+    (2, "review", "claude-reviewer", 1.5, 312, "PASS", _MOCK_REVIEW_V2),
+]
+
+_MOCK_ESCALATE_REVIEW = """\
+### Issues Found
+- ISS-001 [Critical][Omission] Requirements are ambiguous: "iterative approach" is unclear — \
+does this exclude memoization? The plan needs clarification from stakeholders.
+
+### Verdict
+VERDICT: ESCALATE
+"""
+# Escalation scenario (show_escalate=True): one iteration whose review votes ESCALATE.
+_MOCK_ESCALATE_STEPS = [
+    (1, "coding", "claude-coder", 2.1, 347, None, _MOCK_CODING_V1),
+    (1, "review", "claude-reviewer", 1.8, 520, "ESCALATE", _MOCK_ESCALATE_REVIEW),
+]
+
+
+# ---------------------------------------------------------------------------
+# Mock demo runner
+# ---------------------------------------------------------------------------
+# ANSI SGR escape codes; run_mock_demo emits them unconditionally (no TTY check).
+DIM = "\033[2m"
+BOLD = "\033[1m"
+GREEN = "\033[32m"
+RED = "\033[31m"
+YELLOW = "\033[33m"
+CYAN = "\033[36m"
+RESET = "\033[0m"
+
+
+def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None:
+    """Replay a canned pipeline run on stdout; no agents are invoked.
+
+    ``preset`` is only echoed in the header line. ``show_escalate`` swaps the
+    two-iteration FAIL → PASS script for the single-iteration ESCALATE one
+    and appends an escalation report.
+    """
+    script = _MOCK_ESCALATE_STEPS if show_escalate else _MOCK_STEPS
+    # Verdicts missing from this map are shown in RED.
+    verdict_colors = {"PASS": GREEN, "ESCALATE": YELLOW}
+    rule = "━" * 50
+    out = sys.stdout
+
+    print(f"\n{BOLD}=== cross-eval demo (mock) ==={RESET}")
+    print(f"{DIM}Preset: {preset} | Coder: claude-coder | Reviewer: claude-reviewer{RESET}")
+    print(f"{DIM}Plan: fibonacci function | Max iterations: 3{RESET}\n")
+
+    last_iter = 0
+    for iter_no, step, agent, secs, n_chars, verdict, _output in script:
+        if iter_no != last_iter:
+            # First step of a new iteration: print its banner once.
+            last_iter = iter_no
+            print(f"{BOLD}{rule}")
+            print(f"  Iteration {iter_no}/3")
+            print(f"{rule}{RESET}")
+
+        # Fake a running step: spinner text, short pause, then overwrite it via "\r".
+        out.write(f"  ⠋ [{step}] {agent} running...")
+        out.flush()
+        time.sleep(0.5)
+        out.write(f"\r  {GREEN}✓{RESET} [{step}] {agent} — {n_chars} chars ({secs}s)\n")
+
+        if not verdict:
+            continue  # coding steps carry no verdict
+
+        tint = verdict_colors.get(verdict, RED)
+        print(f"  {tint}{BOLD}Verdict: {verdict}{RESET}")
+        if verdict == "FAIL":
+            # Canned summary of the mock reviewer's finding.
+            print(f"  {DIM}Feedback: ISS-001 [Major] Negative input returns -1 instead of ValueError{RESET}")
+        elif verdict == "ESCALATE":
+            print(f"  {YELLOW}Reason: Requirements need clarification from stakeholders{RESET}")
+        print()
+
+    # The overall result follows the chosen scenario; it is not derived from the steps.
+    final, tint = ("ESCALATE", YELLOW) if show_escalate else ("PASS", GREEN)
+    print(f"{BOLD}Result: {tint}{final}{RESET}")
+    print(f"Iterations: {last_iter}")
+
+    if show_escalate:
+        box = "=" * 50
+        print(f"\n{RED}{BOLD}{box}")
+        print("  Escalation Report")
+        print(f"{box}{RESET}")
+        print(f"{YELLOW}Human review required.{RESET}")
+        print(f"  {RED}•{RESET} Requirements are ambiguous — needs stakeholder clarification")
+        print(f"{RED}{BOLD}{box}{RESET}")
+
+    print(f"\n{DIM}This was a mock demo. To run with real agents:{RESET}")
+    print(f"{DIM}  cross-eval demo --live{RESET}")
+    print(f"{DIM}  cross-eval run --plan plan.md{RESET}\n")
+
+
+def run_live_demo(
+    preset: str = "simple",
+    timeout: int | None = None,
+) -> PipelineResult:
+    """Run the built-in plan with real agents; unknown presets run as "simple"."""
+    import tempfile
+
+    from cross_eval.config import (
+        BUILTIN_AGENTS,
+        _resolve_agents,
+        apply_reasoning_effort_settings,
+    )
+    from cross_eval.pipeline import run_pipeline
+    from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS
+
+    coders = ["claude-coder"]
+    reviewers = ["claude-reviewer"]
+    seniors: list[str] = []
+    agents = _resolve_agents(dict(BUILTIN_AGENTS), coders, reviewers, seniors)
+
+    known = preset in PIPELINE_PRESETS or preset in PHASED_PRESETS
+    resolved = preset if known else "simple"
+    if resolved in PIPELINE_PRESETS:
+        pipeline = PIPELINE_PRESETS[resolved](coders, reviewers, seniors)
+        phases = []
+    else:
+        pipeline = []
+        phases = PHASED_PRESETS[resolved](coders, reviewers, seniors)
+
+    # Plan/checklist files are deleted when this block exits, after run_pipeline returns.
+    with tempfile.TemporaryDirectory() as tmpdir:
+        plan_path = Path(tmpdir) / "plan.md"
+        checklist_path = Path(tmpdir) / "checklist.md"
+        plan_path.write_text(DEMO_PLAN, encoding="utf-8")
+        checklist_path.write_text(DEMO_CHECKLIST, encoding="utf-8")
+
+        config = PipelineConfig(
+            output_dir=Path("output"),  # relative path (cwd unless run_pipeline rebases it)
+            max_iterations=3,
+            language="en",
+            inputs={"plan": plan_path, "checklist": checklist_path},
+            agents=agents,
+            coders=coders,
+            reviewers=reviewers,
+            seniors=seniors,
+            pipeline=pipeline,
+            phases=phases,
+            preset_name=f"demo-{resolved}",  # label the preset that actually ran
+        )
+        apply_reasoning_effort_settings(config)
+
+        return run_pipeline(config, timeout=timeout)