feat: ESCALATE verdict, issue tracker, onboarding commands

Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across
simple and phased pipelines. Senior reviewers can now escalate issues
requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
chungyeong
2026-03-13 18:19:05 +09:00
parent ee4f1a07ef
commit 204e071b74
15 changed files with 3032 additions and 156 deletions

282
cross_eval/demo.py Normal file
View File

@@ -0,0 +1,282 @@
"""Built-in demo for cross-eval — lets new users see the full lifecycle."""
from __future__ import annotations
import sys
import time
from pathlib import Path
from cross_eval.models import PipelineConfig, PipelineResult
# ---------------------------------------------------------------------------
# Built-in demo plan & checklist
# ---------------------------------------------------------------------------
# Markdown plan for the demo task. run_live_demo() writes this text to a
# temporary plan.md and passes that file to the pipeline as the "plan" input.
DEMO_PLAN = """\
# Demo: Fibonacci Function
## Objective
Implement a `fibonacci(n)` function in Python.
## Requirements
1. `fibonacci(0)` returns `0`, `fibonacci(1)` returns `1`.
2. For `n >= 2`, return the sum of the two preceding values.
3. Raise `ValueError` for negative `n`.
4. Use an iterative approach (not recursive).
## Constraints
- Single file: `fib.py`
- No external dependencies.
"""
# Markdown checklist matching DEMO_PLAN. run_live_demo() writes this text to a
# temporary checklist.md and passes that file as the "checklist" input.
DEMO_CHECKLIST = """\
# Demo Checklist
- [ ] fibonacci(0) → 0
- [ ] fibonacci(1) → 1
- [ ] fibonacci(10) → 55
- [ ] fibonacci(-1) raises ValueError
- [ ] Iterative implementation (no recursion)
- [ ] No unnecessary abstractions
"""
# ---------------------------------------------------------------------------
# Mock outputs (realistic-looking)
# ---------------------------------------------------------------------------
# Canned coder output for iteration 1. The embedded implementation returns -1
# for negative input instead of raising, which is the gap that
# _MOCK_REVIEW_V1 reports as ISS-001.
_MOCK_CODING_V1 = """\
I'll implement the fibonacci function in `fib.py`.
```python
# fib.py
def fibonacci(n: int) -> int:
\"\"\"Return the nth Fibonacci number using iteration.\"\"\"
if n < 0:
return -1 # invalid input
if n <= 1:
return n
a, b = 0, 1
for _ in range(2, n + 1):
a, b = b, a + b
return b
```
Created `fib.py` with the iterative fibonacci function.
"""
# Canned reviewer output for iteration 1: one Major omission (ISS-001) and a
# FAIL verdict.
_MOCK_REVIEW_V1 = """\
### Previous Feedback Assessment
N/A — first iteration.
### Issues Found
- ISS-001 [Major][Omission] Negative input should raise `ValueError`, \
but implementation returns `-1` instead. (Requirement 3: "Raise ValueError for negative n")
### Out of Scope Issues
None
### Summary
- Critical: 0, Major: 1, Minor: 0
- Over-engineering count: 0
- Omission count: 1
- CONFIRMED: 0, DISMISSED: 0
- Overall quality: Good structure, one requirement gap.
### Verdict
VERDICT: FAIL
"""
# Canned coder output for iteration 2: the same function, now raising
# ValueError for negative input.
_MOCK_CODING_V2 = """\
Fixing the negative input handling per review feedback (ISS-001).
```python
# fib.py
def fibonacci(n: int) -> int:
\"\"\"Return the nth Fibonacci number using iteration.\"\"\"
if n < 0:
raise ValueError(f"n must be non-negative, got {n}")
if n <= 1:
return n
a, b = 0, 1
for _ in range(2, n + 1):
a, b = b, a + b
return b
```
Updated `fib.py`: negative input now raises `ValueError`.
"""
# Canned reviewer output for iteration 2: no remaining issues and a PASS
# verdict.
_MOCK_REVIEW_V2 = """\
### Previous Feedback Assessment
- DISMISSED (false positive): None
- CONFIRMED: None — ISS-001 has been fixed.
### Issues Found
None — all checklist items satisfied.
### Out of Scope Issues
None
### Summary
- Critical: 0, Major: 0, Minor: 0
- Over-engineering count: 0
- Omission count: 0
- CONFIRMED: 0, DISMISSED: 0
- Overall quality: All requirements met, clean implementation.
### Verdict
VERDICT: PASS
"""
# Default scenario replayed by run_mock_demo(): FAIL in iteration 1, PASS in
# iteration 2. The duration and output_chars fields are fixed display values
# printed by run_mock_demo(); they are not computed from the output text.
_MOCK_STEPS = [
# (iteration, step_name, agent, duration, output_chars, verdict, output)
(1, "coding", "claude-coder", 2.1, 347, None, _MOCK_CODING_V1),
(1, "review", "claude-reviewer", 1.8, 423, "FAIL", _MOCK_REVIEW_V1),
(2, "coding", "claude-coder", 2.3, 382, None, _MOCK_CODING_V2),
(2, "review", "claude-reviewer", 1.5, 312, "PASS", _MOCK_REVIEW_V2),
]
# Canned reviewer output for the escalation scenario: a Critical issue about
# ambiguous requirements and an ESCALATE verdict.
_MOCK_ESCALATE_REVIEW = """\
### Issues Found
- ISS-001 [Critical][Omission] Requirements are ambiguous: "iterative approach" is unclear — \
does this exclude memoization? The plan needs clarification from stakeholders.
### Verdict
VERDICT: ESCALATE
"""
# Alternative scenario used by run_mock_demo(show_escalate=True): a single
# iteration whose review ends in ESCALATE. Same tuple layout as _MOCK_STEPS.
_MOCK_ESCALATE_STEPS = [
(1, "coding", "claude-coder", 2.1, 347, None, _MOCK_CODING_V1),
(1, "review", "claude-reviewer", 1.8, 520, "ESCALATE", _MOCK_ESCALATE_REVIEW),
]
# ---------------------------------------------------------------------------
# Mock demo runner
# ---------------------------------------------------------------------------
# ANSI SGR escape sequences used to style terminal output. Each styled span
# should be terminated with RESET so the style does not leak into later text.
DIM = "\033[2m"  # faint / decreased intensity
BOLD = "\033[1m"  # bold / increased intensity
GREEN = "\033[32m"  # green foreground
RED = "\033[31m"  # red foreground
YELLOW = "\033[33m"  # yellow foreground
CYAN = "\033[36m"  # cyan foreground
RESET = "\033[0m"  # clear all attributes
def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None:
    """Run a simulated demo showing the full pipeline lifecycle.

    No agents are invoked. A canned table of steps is replayed to stdout with
    a short pause per step so the output resembles a real run.

    Args:
        preset: Preset name shown in the banner. It is display-only and does
            not change which steps are replayed.
        show_escalate: When true, replay the ESCALATE scenario
            (``_MOCK_ESCALATE_STEPS``) and print an escalation report;
            otherwise replay the FAIL-then-PASS scenario (``_MOCK_STEPS``).
    """
    steps = _MOCK_ESCALATE_STEPS if show_escalate else _MOCK_STEPS
    # Single source for the iteration limit shown in the banner and in each
    # iteration header, so the two cannot drift apart.
    max_iterations = 3
    thin_rule = "─" * 50
    thick_rule = "=" * 50
    verdict_colors = {"PASS": GREEN, "ESCALATE": YELLOW}

    print(f"\n{BOLD}=== cross-eval demo (mock) ==={RESET}")
    print(f"{DIM}Preset: {preset} | Coder: claude-coder | Reviewer: claude-reviewer{RESET}")
    print(f"{DIM}Plan: fibonacci function | Max iterations: {max_iterations}{RESET}\n")

    current_iter = 0
    for iteration, step_name, agent, duration, chars, verdict, _output in steps:
        # Print an iteration header once, when the iteration number changes.
        if iteration != current_iter:
            current_iter = iteration
            print(f"{BOLD}{thin_rule}")
            print(f" Iteration {iteration}/{max_iterations}")
            print(f"{thin_rule}{RESET}")

        # Simulate running: show a "running" line, pause, then overwrite it
        # in place (carriage return) with the completed-step summary.
        sys.stdout.write(f" ⠋ [{step_name}] {agent} running...")
        sys.stdout.flush()
        time.sleep(0.5)
        sys.stdout.write(
            f"\r {GREEN}✓{RESET} [{step_name}] {agent} — {chars} chars ({duration}s)\n"
        )

        if verdict:
            # Anything other than PASS / ESCALATE is rendered as a failure.
            color = verdict_colors.get(verdict, RED)
            print(f" {color}{BOLD}Verdict: {verdict}{RESET}")
            if verdict == "FAIL":
                # Show key feedback
                print(f" {DIM}Feedback: ISS-001 [Major] Negative input returns -1 instead of ValueError{RESET}")
            elif verdict == "ESCALATE":
                print(f" {YELLOW}Reason: Requirements need clarification from stakeholders{RESET}")
            print()

    # Final result
    if show_escalate:
        final, color = "ESCALATE", YELLOW
    else:
        final, color = "PASS", GREEN
    print(f"{BOLD}Result: {color}{final}{RESET}")
    print(f"Iterations: {current_iter}")

    if show_escalate:
        print(f"\n{RED}{BOLD}{thick_rule}")
        print(" Escalation Report")
        print(f"{thick_rule}{RESET}")
        print(f"{YELLOW}Human review required.{RESET}")
        print(f" {RED}●{RESET} Requirements are ambiguous — needs stakeholder clarification")
        print(f"{RED}{BOLD}{thick_rule}{RESET}")

    print(f"\n{DIM}This was a mock demo. To run with real agents:{RESET}")
    print(f"{DIM} cross-eval demo --live{RESET}")
    print(f"{DIM} cross-eval run --plan plan.md{RESET}\n")
def run_live_demo(
    preset: str = "simple",
    timeout: int | None = None,
) -> PipelineResult:
    """Run a live demo with real agents using the built-in plan.

    The built-in ``DEMO_PLAN`` and ``DEMO_CHECKLIST`` are written to a
    temporary directory and handed to ``run_pipeline`` as the "plan" and
    "checklist" inputs, with ``claude-coder`` as the only coder and
    ``claude-reviewer`` as the only reviewer.

    Args:
        preset: Name of a preset from ``PIPELINE_PRESETS`` or
            ``PHASED_PRESETS``. An unknown name falls back to ``"simple"``.
        timeout: Forwarded unchanged to ``run_pipeline``.

    Returns:
        Whatever ``run_pipeline`` returns for the demo configuration.
    """
    import tempfile

    from cross_eval.config import (
        BUILTIN_AGENTS,
        _resolve_agents,
        apply_reasoning_effort_settings,
    )
    from cross_eval.pipeline import run_pipeline
    from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS

    coders = ["claude-coder"]
    reviewers = ["claude-reviewer"]
    seniors: list[str] = []
    # A copy of BUILTIN_AGENTS is passed — presumably so the shared mapping is
    # not mutated by the resolver; confirm against _resolve_agents.
    agents = _resolve_agents(dict(BUILTIN_AGENTS), coders, reviewers, seniors)

    # Resolve the preset that will actually run so that the preset_name
    # recorded below cannot name a preset that was never used.
    known = preset in PIPELINE_PRESETS or preset in PHASED_PRESETS
    effective_preset = preset if known else "simple"

    # PIPELINE_PRESETS takes precedence when a name exists in both tables.
    if effective_preset in PIPELINE_PRESETS:
        pipeline = PIPELINE_PRESETS[effective_preset](coders, reviewers, seniors)
        phases = []
    else:
        pipeline = []
        phases = PHASED_PRESETS[effective_preset](coders, reviewers, seniors)

    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        plan_path = workdir / "plan.md"
        checklist_path = workdir / "checklist.md"
        plan_path.write_text(DEMO_PLAN, encoding="utf-8")
        checklist_path.write_text(DEMO_CHECKLIST, encoding="utf-8")

        config = PipelineConfig(
            output_dir=Path("output"),
            max_iterations=3,
            language="en",
            inputs={"plan": plan_path, "checklist": checklist_path},
            agents=agents,
            coders=coders,
            reviewers=reviewers,
            seniors=seniors,
            pipeline=pipeline,
            phases=phases,
            preset_name=f"demo-{effective_preset}",
        )
        apply_reasoning_effort_settings(config)
        # The pipeline must run inside the context manager: the plan and
        # checklist files are deleted as soon as the block exits.
        return run_pipeline(config, timeout=timeout)