"""Built-in demo for cross-eval — lets new users see the full lifecycle.""" from __future__ import annotations import sys import time from pathlib import Path from cross_eval.models import PipelineConfig, PipelineResult # --------------------------------------------------------------------------- # Built-in demo plan & checklist # --------------------------------------------------------------------------- DEMO_PLAN = """\ # Demo: Fibonacci Function ## Objective Implement a `fibonacci(n)` function in Python. ## Requirements 1. `fibonacci(0)` returns `0`, `fibonacci(1)` returns `1`. 2. For `n >= 2`, return the sum of the two preceding values. 3. Raise `ValueError` for negative `n`. 4. Use an iterative approach (not recursive). ## Constraints - Single file: `fib.py` - No external dependencies. """ DEMO_CHECKLIST = """\ # Demo Checklist - [ ] fibonacci(0) → 0 - [ ] fibonacci(1) → 1 - [ ] fibonacci(10) → 55 - [ ] fibonacci(-1) raises ValueError - [ ] Iterative implementation (no recursion) - [ ] No unnecessary abstractions """ # --------------------------------------------------------------------------- # Mock outputs (realistic-looking) # --------------------------------------------------------------------------- _MOCK_CODING_V1 = """\ I'll implement the fibonacci function in `fib.py`. ```python # fib.py def fibonacci(n: int) -> int: \"\"\"Return the nth Fibonacci number using iteration.\"\"\" if n < 0: return -1 # invalid input if n <= 1: return n a, b = 0, 1 for _ in range(2, n + 1): a, b = b, a + b return b ``` Created `fib.py` with the iterative fibonacci function. """ _MOCK_REVIEW_V1 = """\ ### Previous Feedback Assessment N/A — first iteration. ### Issues Found - ISS-001 [Major][Omission] Negative input should raise `ValueError`, \ but implementation returns `-1` instead. (Requirement 3: "Raise ValueError for negative n") ### Out of Scope Issues None ### Summary - Critical: 0, Major: 1, Minor: 0 - Over-engineering count: 0 - Omission count: 1 - CONFIRMED: 0, DISMISSED: 0 - Overall quality: Good structure, one requirement gap. ### Verdict VERDICT: FAIL """ _MOCK_CODING_V2 = """\ Fixing the negative input handling per review feedback (ISS-001). ```python # fib.py def fibonacci(n: int) -> int: \"\"\"Return the nth Fibonacci number using iteration.\"\"\" if n < 0: raise ValueError(f"n must be non-negative, got {n}") if n <= 1: return n a, b = 0, 1 for _ in range(2, n + 1): a, b = b, a + b return b ``` Updated `fib.py`: negative input now raises `ValueError`. """ _MOCK_REVIEW_V2 = """\ ### Previous Feedback Assessment - DISMISSED (false positive): None - CONFIRMED: None — ISS-001 has been fixed. ### Issues Found None — all checklist items satisfied. ### Out of Scope Issues None ### Summary - Critical: 0, Major: 0, Minor: 0 - Over-engineering count: 0 - Omission count: 0 - CONFIRMED: 0, DISMISSED: 0 - Overall quality: All requirements met, clean implementation. ### Verdict VERDICT: PASS """ _MOCK_STEPS = [ # (iteration, step_name, agent, duration, output_chars, verdict, output) (1, "coding", "claude-coder", 2.1, 347, None, _MOCK_CODING_V1), (1, "review", "claude-reviewer", 1.8, 423, "FAIL", _MOCK_REVIEW_V1), (2, "coding", "claude-coder", 2.3, 382, None, _MOCK_CODING_V2), (2, "review", "claude-reviewer", 1.5, 312, "PASS", _MOCK_REVIEW_V2), ] _MOCK_ESCALATE_REVIEW = """\ ### Issues Found - ISS-001 [Critical][Omission] Requirements are ambiguous: "iterative approach" is unclear — \ does this exclude memoization? The plan needs clarification from stakeholders. ### Verdict VERDICT: ESCALATE """ _MOCK_ESCALATE_STEPS = [ (1, "coding", "claude-coder", 2.1, 347, None, _MOCK_CODING_V1), (1, "review", "claude-reviewer", 1.8, 520, "ESCALATE", _MOCK_ESCALATE_REVIEW), ] # --------------------------------------------------------------------------- # Mock demo runner # --------------------------------------------------------------------------- DIM = "\033[2m" BOLD = "\033[1m" GREEN = "\033[32m" RED = "\033[31m" YELLOW = "\033[33m" CYAN = "\033[36m" RESET = "\033[0m" def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None: """Run a simulated demo showing the full pipeline lifecycle.""" steps = _MOCK_ESCALATE_STEPS if show_escalate else _MOCK_STEPS print(f"\n{BOLD}=== cross-eval demo (mock) ==={RESET}") print(f"{DIM}Preset: {preset} | Coder: claude-coder | Reviewer: claude-reviewer{RESET}") print(f"{DIM}Plan: fibonacci function | Max iterations: 3{RESET}\n") current_iter = 0 for iteration, step_name, agent, duration, chars, verdict, output in steps: if iteration != current_iter: current_iter = iteration print(f"{BOLD}{'━' * 50}") print(f" Iteration {iteration}/3") print(f"{'━' * 50}{RESET}") # Simulate running sys.stdout.write(f" ⠋ [{step_name}] {agent} running...") sys.stdout.flush() time.sleep(0.5) sys.stdout.write(f"\r {GREEN}✓{RESET} [{step_name}] {agent} — {chars} chars ({duration}s)\n") if verdict: if verdict == "PASS": color = GREEN elif verdict == "ESCALATE": color = YELLOW else: color = RED print(f" {color}{BOLD}Verdict: {verdict}{RESET}") if verdict == "FAIL": # Show key feedback print(f" {DIM}Feedback: ISS-001 [Major] Negative input returns -1 instead of ValueError{RESET}") elif verdict == "ESCALATE": print(f" {YELLOW}Reason: Requirements need clarification from stakeholders{RESET}") print() # Final result if show_escalate: final = "ESCALATE" color = YELLOW else: final = "PASS" color = GREEN print(f"{BOLD}Result: {color}{final}{RESET}") print(f"Iterations: {current_iter}") if show_escalate: print(f"\n{RED}{BOLD}{'=' * 50}") print(" Escalation Report") print(f"{'=' * 50}{RESET}") print(f"{YELLOW}Human review required.{RESET}") print(f" {RED}•{RESET} Requirements are ambiguous — needs stakeholder clarification") print(f"{RED}{BOLD}{'=' * 50}{RESET}") print(f"\n{DIM}This was a mock demo. To run with real agents:{RESET}") print(f"{DIM} cross-eval demo --live{RESET}") print(f"{DIM} cross-eval run --plan plan.md{RESET}\n") def run_live_demo( preset: str = "simple", timeout: int | None = None, ) -> PipelineResult: """Run a live demo with real agents using the built-in plan.""" import tempfile from cross_eval.config import ( BUILTIN_AGENTS, _resolve_agents, apply_reasoning_effort_settings, ) from cross_eval.pipeline import run_pipeline from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS coders = ["claude-coder"] reviewers = ["claude-reviewer"] seniors: list[str] = [] agents = _resolve_agents(dict(BUILTIN_AGENTS), coders, reviewers, seniors) if preset in PIPELINE_PRESETS: pipeline = PIPELINE_PRESETS[preset](coders, reviewers, seniors) phases = [] elif preset in PHASED_PRESETS: pipeline = [] phases = PHASED_PRESETS[preset](coders, reviewers, seniors) else: pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors) phases = [] with tempfile.TemporaryDirectory() as tmpdir: plan_path = Path(tmpdir) / "plan.md" checklist_path = Path(tmpdir) / "checklist.md" plan_path.write_text(DEMO_PLAN, encoding="utf-8") checklist_path.write_text(DEMO_CHECKLIST, encoding="utf-8") config = PipelineConfig( output_dir=Path(".cross-eval/output"), max_iterations=3, language="en", inputs={"plan": plan_path, "checklist": checklist_path}, agents=agents, coders=coders, reviewers=reviewers, seniors=seniors, pipeline=pipeline, phases=phases, preset_name=f"demo-{preset}", ) apply_reasoning_effort_settings(config) return run_pipeline(config, timeout=timeout)