feat: ESCALATE verdict, issue tracker, onboarding commands
Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across simple and phased pipelines. Senior reviewers can now escalate issues requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
282
cross_eval/demo.py
Normal file
282
cross_eval/demo.py
Normal file
@@ -0,0 +1,282 @@
|
||||
"""Built-in demo for cross-eval — lets new users see the full lifecycle."""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from cross_eval.models import PipelineConfig, PipelineResult
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Built-in demo plan & checklist
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Markdown plan written to ``plan.md`` for the live demo. The leading
# backslash keeps the text from starting with an empty line.
DEMO_PLAN = """\
# Demo: Fibonacci Function

## Objective
Implement a `fibonacci(n)` function in Python.

## Requirements
1. `fibonacci(0)` returns `0`, `fibonacci(1)` returns `1`.
2. For `n >= 2`, return the sum of the two preceding values.
3. Raise `ValueError` for negative `n`.
4. Use an iterative approach (not recursive).

## Constraints
- Single file: `fib.py`
- No external dependencies.
"""
|
||||
|
||||
# Markdown checklist written to ``checklist.md`` for the live demo; one
# unchecked task-list item per acceptance criterion.
DEMO_CHECKLIST = """\
# Demo Checklist
- [ ] fibonacci(0) → 0
- [ ] fibonacci(1) → 1
- [ ] fibonacci(10) → 55
- [ ] fibonacci(-1) raises ValueError
- [ ] Iterative implementation (no recursion)
- [ ] No unnecessary abstractions
"""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mock outputs (realistic-looking)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Canned coder output for iteration 1. The sample deliberately returns -1
# for negative input instead of raising, which is what the first mock
# review (ISS-001) objects to.
_MOCK_CODING_V1 = """\
I'll implement the fibonacci function in `fib.py`.

```python
# fib.py

def fibonacci(n: int) -> int:
    \"\"\"Return the nth Fibonacci number using iteration.\"\"\"
    if n < 0:
        return -1  # invalid input
    if n <= 1:
        return n
    a, b = 0, 1
    for _ in range(2, n + 1):
        a, b = b, a + b
    return b
```

Created `fib.py` with the iterative fibonacci function.
"""
|
||||
|
||||
# Canned reviewer output for iteration 1: one Major omission (ISS-001) and
# a FAIL verdict. The trailing backslash inside the issue line joins it
# with the next source line so the rendered bullet stays on one line.
_MOCK_REVIEW_V1 = """\
### Previous Feedback Assessment
N/A — first iteration.

### Issues Found
- ISS-001 [Major][Omission] Negative input should raise `ValueError`, \
but implementation returns `-1` instead. (Requirement 3: "Raise ValueError for negative n")

### Out of Scope Issues
None

### Summary
- Critical: 0, Major: 1, Minor: 0
- Over-engineering count: 0
- Omission count: 1
- CONFIRMED: 0, DISMISSED: 0
- Overall quality: Good structure, one requirement gap.

### Verdict
VERDICT: FAIL
"""
|
||||
|
||||
# Canned coder output for iteration 2: same implementation as V1 except
# that negative input now raises ValueError, addressing ISS-001.
_MOCK_CODING_V2 = """\
Fixing the negative input handling per review feedback (ISS-001).

```python
# fib.py

def fibonacci(n: int) -> int:
    \"\"\"Return the nth Fibonacci number using iteration.\"\"\"
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if n <= 1:
        return n
    a, b = 0, 1
    for _ in range(2, n + 1):
        a, b = b, a + b
    return b
```

Updated `fib.py`: negative input now raises `ValueError`.
"""
|
||||
|
||||
# Canned reviewer output for iteration 2: no remaining issues, PASS verdict.
_MOCK_REVIEW_V2 = """\
### Previous Feedback Assessment
- DISMISSED (false positive): None
- CONFIRMED: None — ISS-001 has been fixed.

### Issues Found
None — all checklist items satisfied.

### Out of Scope Issues
None

### Summary
- Critical: 0, Major: 0, Minor: 0
- Over-engineering count: 0
- Omission count: 0
- CONFIRMED: 0, DISMISSED: 0
- Overall quality: All requirements met, clean implementation.

### Verdict
VERDICT: PASS
"""
|
||||
|
||||
# Default mock scenario replayed by run_mock_demo: the first review fails,
# the second passes. ``verdict`` is None for coding steps. The duration and
# character counts are display-only figures; they are not computed from
# the output text.
_MOCK_STEPS = [
    # (iteration, step_name, agent, duration, output_chars, verdict, output)
    (1, "coding", "claude-coder", 2.1, 347, None, _MOCK_CODING_V1),
    (1, "review", "claude-reviewer", 1.8, 423, "FAIL", _MOCK_REVIEW_V1),
    (2, "coding", "claude-coder", 2.3, 382, None, _MOCK_CODING_V2),
    (2, "review", "claude-reviewer", 1.5, 312, "PASS", _MOCK_REVIEW_V2),
]
|
||||
|
||||
# Canned reviewer output for the escalation scenario: a Critical issue
# that needs stakeholder input, ending in an ESCALATE verdict. The
# trailing backslash joins the two source lines of the issue bullet.
_MOCK_ESCALATE_REVIEW = """\
### Issues Found
- ISS-001 [Critical][Omission] Requirements are ambiguous: "iterative approach" is unclear — \
does this exclude memoization? The plan needs clarification from stakeholders.

### Verdict
VERDICT: ESCALATE
"""
|
||||
|
||||
# Escalation mock scenario: a single iteration whose review ends in
# ESCALATE. Tuple layout matches _MOCK_STEPS:
# (iteration, step_name, agent, duration, output_chars, verdict, output)
_MOCK_ESCALATE_STEPS = [
    (1, "coding", "claude-coder", 2.1, 347, None, _MOCK_CODING_V1),
    (1, "review", "claude-reviewer", 1.8, 520, "ESCALATE", _MOCK_ESCALATE_REVIEW),
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mock demo runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# ANSI SGR escape sequences used to style the mock demo's terminal output.
DIM = "\033[2m"
BOLD = "\033[1m"
GREEN = "\033[32m"
RED = "\033[31m"
YELLOW = "\033[33m"
CYAN = "\033[36m"
RESET = "\033[0m"  # clears every attribute set by the codes above
|
||||
|
||||
|
||||
def run_mock_demo(
    preset: str = "simple",
    show_escalate: bool = False,
    *,
    delay: float = 0.5,
) -> None:
    """Run a simulated demo showing the full pipeline lifecycle.

    Replays a canned list of coding/review steps to stdout. No agents are
    invoked; every figure and message shown comes from the mock data in
    this module.

    Args:
        preset: Preset name shown in the banner. It is display-only and
            does not change which steps are replayed.
        show_escalate: Replay the single-iteration ESCALATE scenario
            instead of the default FAIL-then-PASS scenario, and append an
            escalation report.
        delay: Seconds to pause on each step to imitate a running agent.
            Zero or a negative value skips the pause (useful in tests).
    """
    steps = _MOCK_ESCALATE_STEPS if show_escalate else _MOCK_STEPS
    max_iterations = 3  # shown in the banner and in every iteration header
    verdict_colors = {"PASS": GREEN, "ESCALATE": YELLOW}  # anything else: RED
    rule = "━" * 50

    print(f"\n{BOLD}=== cross-eval demo (mock) ==={RESET}")
    print(f"{DIM}Preset: {preset} | Coder: claude-coder | Reviewer: claude-reviewer{RESET}")
    print(f"{DIM}Plan: fibonacci function | Max iterations: {max_iterations}{RESET}\n")

    current_iter = 0
    # The last tuple element (the full mock output text) is not displayed.
    for iteration, step_name, agent, duration, chars, verdict, _output in steps:
        if iteration != current_iter:
            # First step of a new iteration: print the header once.
            current_iter = iteration
            print(f"{BOLD}{rule}")
            print(f" Iteration {iteration}/{max_iterations}")
            print(f"{rule}{RESET}")

        # Simulate running: show a spinner frame, pause, then overwrite the
        # same line (carriage return) with the completed-step summary.
        sys.stdout.write(f" ⠋ [{step_name}] {agent} running...")
        sys.stdout.flush()
        if delay > 0:
            time.sleep(delay)
        sys.stdout.write(f"\r {GREEN}✓{RESET} [{step_name}] {agent} — {chars} chars ({duration}s)\n")

        if verdict:
            color = verdict_colors.get(verdict, RED)
            print(f" {color}{BOLD}Verdict: {verdict}{RESET}")

            if verdict == "FAIL":
                # Show key feedback
                print(f" {DIM}Feedback: ISS-001 [Major] Negative input returns -1 instead of ValueError{RESET}")
            elif verdict == "ESCALATE":
                print(f" {YELLOW}Reason: Requirements need clarification from stakeholders{RESET}")

            # NOTE(review): indentation was lost in the reviewed source; this
            # blank line is assumed to follow each verdict — confirm.
            print()

    # Final result
    if show_escalate:
        final = "ESCALATE"
        color = YELLOW
    else:
        final = "PASS"
        color = GREEN

    print(f"{BOLD}Result: {color}{final}{RESET}")
    print(f"Iterations: {current_iter}")

    if show_escalate:
        banner = "=" * 50
        print(f"\n{RED}{BOLD}{banner}")
        print(" Escalation Report")
        print(f"{banner}{RESET}")
        print(f"{YELLOW}Human review required.{RESET}")
        print(f" {RED}•{RESET} Requirements are ambiguous — needs stakeholder clarification")
        print(f"{RED}{BOLD}{banner}{RESET}")

    print(f"\n{DIM}This was a mock demo. To run with real agents:{RESET}")
    print(f"{DIM} cross-eval demo --live{RESET}")
    print(f"{DIM} cross-eval run --plan plan.md{RESET}\n")
|
||||
|
||||
|
||||
def run_live_demo(
    preset: str = "simple",
    timeout: int | None = None,
) -> PipelineResult:
    """Run a live demo with real agents using the built-in plan.

    Writes DEMO_PLAN and DEMO_CHECKLIST into a temporary directory, builds
    a PipelineConfig with one coder (``claude-coder``), one reviewer
    (``claude-reviewer``) and no seniors, and runs the pipeline on it.

    Args:
        preset: Name looked up first in PIPELINE_PRESETS, then in
            PHASED_PRESETS. An unknown name falls back to the ``"simple"``
            pipeline preset and a warning is written to stderr.
        timeout: Passed through unchanged to ``run_pipeline``.

    Returns:
        Whatever ``run_pipeline`` returns for the demo configuration.
    """
    # Imported inside the function rather than at module level.
    # NOTE(review): presumably to keep importing this module cheap or to
    # avoid an import cycle — confirm.
    import tempfile

    from cross_eval.config import (
        BUILTIN_AGENTS,
        _resolve_agents,
        apply_reasoning_effort_settings,
    )
    from cross_eval.pipeline import run_pipeline
    from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS

    coders = ["claude-coder"]
    reviewers = ["claude-reviewer"]
    seniors: list[str] = []
    # Pass a copy so the shared BUILTIN_AGENTS mapping is not handed out directly.
    agents = _resolve_agents(dict(BUILTIN_AGENTS), coders, reviewers, seniors)

    # Exactly one of ``pipeline`` / ``phases`` is populated, depending on
    # which preset table the name belongs to.
    effective_preset = preset
    if preset in PIPELINE_PRESETS:
        pipeline = PIPELINE_PRESETS[preset](coders, reviewers, seniors)
        phases = []
    elif preset in PHASED_PRESETS:
        pipeline = []
        phases = PHASED_PRESETS[preset](coders, reviewers, seniors)
    else:
        # Keep the best-effort fallback, but make it visible and label the
        # run with the preset that is actually used.
        effective_preset = "simple"
        print(
            f"Unknown preset {preset!r}; falling back to 'simple'.",
            file=sys.stderr,
        )
        pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
        phases = []

    # The pipeline runs inside the ``with`` block: the config's inputs point
    # at files in the temporary directory, which is removed on exit.
    with tempfile.TemporaryDirectory() as tmpdir:
        plan_path = Path(tmpdir) / "plan.md"
        checklist_path = Path(tmpdir) / "checklist.md"
        plan_path.write_text(DEMO_PLAN, encoding="utf-8")
        checklist_path.write_text(DEMO_CHECKLIST, encoding="utf-8")

        config = PipelineConfig(
            output_dir=Path("output"),  # relative path, not inside tmpdir
            max_iterations=3,
            language="en",
            inputs={"plan": plan_path, "checklist": checklist_path},
            agents=agents,
            coders=coders,
            reviewers=reviewers,
            seniors=seniors,
            pipeline=pipeline,
            phases=phases,
            preset_name=f"demo-{effective_preset}",
        )
        apply_reasoning_effort_settings(config)

        return run_pipeline(config, timeout=timeout)
|
||||
Reference in New Issue
Block a user