feat: ESCALATE verdict, issue tracker, onboarding commands

Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across simple and phased pipelines. Senior reviewers can now escalate issues requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 18:19:05 +09:00
parent ee4f1a07ef
commit 204e071b74
15 changed files with 3032 additions and 156 deletions
--- a/cross_eval/pipeline.py
+++ b/cross_eval/pipeline.py
@@ -10,7 +10,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 from pathlib import Path

-from cross_eval.agent import invoke_agent
+from cross_eval.agent import AgentInvocationError, invoke_agent
 from cross_eval.config import try_reload_config
 from cross_eval.models import (
    AgentResult,
@@ -68,6 +68,8 @@ def _run_simple_pipeline(
    final_verdict = "MAX_ITERATIONS_REACHED"
    aggregate_history: dict[str, int] = {}
    aggregate_warnings: list[str] = []
+    escalated_issues: list[str] = []
+    all_feedbacks: list[str] = []

    for i in range(1, config.max_iterations + 1):
        config = try_reload_config(config)
@@ -100,8 +102,34 @@ def _run_simple_pipeline(

        iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
        feedback = iter_result.feedback or feedback
+        all_feedbacks.append(feedback)
+
+        # Extract tracker from verdict/review steps for next iteration
+        for step in config.pipeline:
+            if step.verdict or step.role == "review":
+                tracker = _extract_senior_tracker(
+                    step_outputs.get(step.output_key, ""),
+                )
+                if tracker:
+                    input_contents["previous_senior_tracker"] = tracker
+
        iterations.append(iter_result)

+        # ESCALATE check (highest priority)
+        if verdict == "ESCALATE":
+            final_verdict = "ESCALATE"
+            # Extract escalation details from verdict step outputs
+            for step in config.pipeline:
+                if step.verdict:
+                    esc = _extract_escalated_issues(
+                        step_outputs.get(step.output_key, ""),
+                    )
+                    if esc:
+                        escalated_issues.append(esc)
+                        iter_result.escalated_issues = esc
+            logger.info("  ESCALATE at iteration %d — stopping loop.", i)
+            break
+
        if verdict == "PASS":
            final_verdict = "PASS"
            if i >= config.min_iterations:
@@ -113,6 +141,26 @@ def _run_simple_pipeline(
                    i, config.min_iterations,
                )

+        # Auto-escalate: no senior/aggregator + repeated FAIL
+        has_aggregator = config.seniors or any(
+            s.prompt_template == "default:aggregate-review" for s in config.pipeline
+        )
+        if (
+            verdict == "FAIL"
+            and not has_aggregator
+            and i >= 2
+            and _detect_auto_escalate(all_feedbacks[:-1], feedback)
+        ):
+            final_verdict = "ESCALATE"
+            auto_msg = (
+                f"Auto-escalated: same issues detected across {i} iterations "
+                f"without resolution (no senior reviewer configured)."
+            )
+            escalated_issues.append(auto_msg)
+            iter_result.escalated_issues = auto_msg
+            logger.info("  AUTO-ESCALATE at iteration %d", i)
+            break
+
        if dry_run:
            logger.info("  (dry-run: stopping after iteration 1)")
            break
@@ -125,6 +173,7 @@ def _run_simple_pipeline(
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
+        escalated_issues=escalated_issues,
    )

    if not dry_run:
@@ -154,8 +203,14 @@ def _run_phased_pipeline(
    global_iter = 0
    aggregate_history_by_phase: dict[str, dict[str, int]] = {}
    aggregate_warnings: list[str] = []
+    escalated_issues: list[str] = []
+    all_feedbacks: list[str] = []
+    escalated = False

    for phase_idx, phase in enumerate(config.phases):
+        if escalated:
+            break
+
        logger.info("=" * 60)
        logger.info(
            "  Phase: %s (max_iter=%d, consecutive_pass=%d)",
@@ -205,8 +260,45 @@ def _run_phased_pipeline(

            iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
            feedback = iter_result.feedback or feedback
+            all_feedbacks.append(feedback)
+
+            # Extract tracker from verdict/review steps
+            for step in phase.steps:
+                if step.verdict or step.role == "review":
+                    tracker = _extract_senior_tracker(
+                        step_outputs.get(step.output_key, ""),
+                    )
+                    if tracker:
+                        input_contents["previous_senior_tracker"] = tracker
+
            iterations.append(iter_result)

+            # ESCALATE check
+            if verdict == "ESCALATE":
+                final_verdict = "ESCALATE"
+                for step in phase.steps:
+                    if step.verdict:
+                        esc = _extract_escalated_issues(
+                            step_outputs.get(step.output_key, ""),
+                        )
+                        if esc:
+                            escalated_issues.append(esc)
+                            iter_result.escalated_issues = esc
+                logger.info(
+                    "  [%s] ESCALATE at iteration %d — stopping.",
+                    phase.name, pi,
+                )
+                escalated = True
+                break
+
+            if verdict is None:
+                logger.info(
+                    "  [%s] completed (no verdict step; single-pass phase)",
+                    phase.name,
+                )
+                phase_converged = True
+                break
+
            if verdict == "PASS":
                consecutive_passes += 1
                logger.info(
@@ -223,9 +315,33 @@ def _run_phased_pipeline(
            else:
                consecutive_passes = 0

+            # Auto-escalate in phased pipeline
+            has_aggregator = config.seniors or any(
+                s.prompt_template == "default:aggregate-review" for s in phase.steps
+            )
+            if (
+                verdict == "FAIL"
+                and not has_aggregator
+                and pi >= 2
+                and _detect_auto_escalate(all_feedbacks[:-1], feedback)
+            ):
+                final_verdict = "ESCALATE"
+                auto_msg = (
+                    f"Auto-escalated: same issues detected across {pi} iterations "
+                    f"in phase '{phase.name}' without resolution."
+                )
+                escalated_issues.append(auto_msg)
+                iter_result.escalated_issues = auto_msg
+                logger.info("  [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
+                escalated = True
+                break
+
            if dry_run:
                break

+        if escalated:
+            break
+
        if phase_converged:
            logger.info("  Phase '%s' completed: CONVERGED", phase.name)
        else:
@@ -245,6 +361,7 @@ def _run_phased_pipeline(
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
+        escalated_issues=escalated_issues,
    )

    if not dry_run:
@@ -373,15 +490,17 @@ def _run_steps(
                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
            )

-    # Extract verdict from all verdict steps (ALL must PASS)
+    # Extract verdict from all verdict steps (ALL must PASS; ESCALATE wins over all)
    for step in steps:
        if step.verdict:
            output = step_outputs.get(step.output_key, "")
            step_verdict = _extract_verdict(output, step.verdict_pattern)
            logger.info("  [%s] verdict: %s", step.name, step_verdict)
-            if verdict is None:
+            if step_verdict == "ESCALATE":
+                verdict = "ESCALATE"
+            elif verdict is None:
                verdict = step_verdict
-            elif step_verdict == "FAIL":
+            elif verdict != "ESCALATE" and step_verdict == "FAIL":
                verdict = "FAIL"

    return step_outputs, step_results, verdict
@@ -466,10 +585,11 @@ def _execute_step(
            f"Try --timeout 0 (unlimited)"
        )
    except RuntimeError as e:
-        phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
-        error_msg = (
-            f"# Agent Error\n\n{phase_info}"
-            f"- **Step**: {step.name}\n- **Agent**: {step.agent}\n\n```\n{e}\n```\n"
+        error_msg = _format_runtime_error_markdown(
+            e,
+            step_name=step.name,
+            agent_name=step.agent,
+            phase_name=phase_name,
        )
        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
        logger.error("  [%s] FAILED — saved to output", step.name)
@@ -527,7 +647,7 @@ def _execute_parallel_batch(
    # Collect results from parallel threads
    local_outputs: dict[str, str] = {}
    local_results: dict[str, AgentResult] = {}
-    errors: list[Exception] = []
+    errors: list[tuple[StepConfig, Exception]] = []

    # Show a single spinner for the batch
    from cross_eval.agent import _Spinner
@@ -563,19 +683,15 @@ def _execute_parallel_batch(
                local_results[output_key] = result
                local_outputs[output_key] = output
            except Exception as e:
-                errors.append(e)
+                errors.append((step, e))

    batch_elapsed = round(time.monotonic() - batch_start, 1)

-    if errors:
-        spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
-        raise errors[0]
-
-    spinner.stop(f"[parallel] {len(batch)} agents done ({batch_elapsed}s)")
-
-    # Merge results
+    # Persist successful outputs even if a sibling step failed.
    for step in batch:
        key = step.output_key
+        if key not in local_outputs:
+            continue
        step_outputs[key] = local_outputs[key]
        step_results[key] = local_results[key]
        r = local_results[key]
@@ -585,6 +701,48 @@ def _execute_parallel_batch(
        )
        _save_step_output(run_dir, output_iter, step.name, r.output)

+    if errors:
+        spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
+        for failed_step, exc in errors:
+            if isinstance(exc, subprocess.TimeoutExpired):
+                stdout = (exc.stdout or b"") if isinstance(exc.stdout, bytes) else (exc.stdout or "")
+                stderr = (exc.stderr or b"") if isinstance(exc.stderr, bytes) else (exc.stderr or "")
+                if isinstance(stdout, bytes):
+                    stdout = stdout.decode("utf-8", errors="replace")
+                if isinstance(stderr, bytes):
+                    stderr = stderr.decode("utf-8", errors="replace")
+                phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
+                error_msg = (
+                    f"# Agent Timeout\n\n"
+                    f"{phase_info}"
+                    f"- **Step**: {failed_step.name}\n"
+                    f"- **Agent**: {failed_step.agent}\n"
+                    f"- **Timeout**: {timeout}s\n\n"
+                    f"Partial stdout ({len(stdout)} chars):\n"
+                    f"```\n{stdout[:2000] or '(none)'}\n```\n\n"
+                    f"Stderr:\n```\n{stderr[:2000] or '(none)'}\n```\n"
+                )
+            else:
+                error_msg = _format_runtime_error_markdown(
+                    exc,
+                    step_name=failed_step.name,
+                    agent_name=failed_step.agent,
+                    phase_name=phase_name,
+                )
+            _save_step_output(run_dir, output_iter, f"{failed_step.name}_error", error_msg)
+            logger.error("  [%s] FAILED — saved to output", failed_step.name)
+
+        failed_steps = ", ".join(step.name for step, _ in errors)
+        saved_steps = ", ".join(step.name for step in batch if step.output_key in local_outputs)
+        first_error = errors[0][1]
+        saved_note = f" Successful outputs were saved for: {saved_steps}." if saved_steps else ""
+        raise RuntimeError(
+            f"Parallel batch failed: {len(errors)}/{len(batch)} steps failed ({failed_steps})."
+            f"{saved_note} First error:\n{first_error}"
+        )
+
+    spinner.stop(f"[parallel] {len(batch)} agents done ({batch_elapsed}s)")
+

 # ---------------------------------------------------------------------------
 # Context and template helpers
@@ -671,13 +829,104 @@ def _normalize_aggregate_output(output: str) -> str:
    return " ".join(output.lower().split())


+# Case-insensitive "VERDICT: ESCALATE" marker; any whitespace (or none) may
+# follow the colon.
+_ESCALATE_PATTERN = re.compile(r"VERDICT:\s*ESCALATE", re.IGNORECASE)
+
+# A markdown heading of level 2 or deeper that starts with "Issue Tracker",
+# followed by one or more pipe-delimited table rows.
+# NOTE(review): re.DOTALL looks like a no-op here because the pattern contains
+# no "." metacharacter.
+_TRACKER_TABLE_PATTERN = re.compile(
+    r"(##+ Issue Tracker[^\n]*\n(?:\|[^\n]+\|\n?)+)", re.DOTALL,
+)
+
+
 def _extract_verdict(output: str, pattern: str) -> str:
-    """Extract PASS or FAIL from output using regex pattern."""
+    """Extract PASS, FAIL, or ESCALATE from output.
+
+    ESCALATE is detected with the fixed ``_ESCALATE_PATTERN`` (independent of
+    *pattern*) and is checked first. Otherwise PASS is returned when *pattern*
+    matches anywhere in *output*, and FAIL when it does not.
+    """
+    if re.search(_ESCALATE_PATTERN, output):
+        return "ESCALATE"  # highest priority
     if re.search(pattern, output):
         return "PASS"
     return "FAIL"


+def _extract_senior_tracker(output: str) -> str:
+    """Return the first Issue Tracker table found in *output*, or ``""``."""
+    found = _TRACKER_TABLE_PATTERN.search(output)
+    if found is None:
+        return ""
+    return found.group(0)
+
+
+def _extract_escalated_issues(output: str) -> str:
+    """Extract escalation details from senior review output.
+
+    Looks for a markdown section whose heading (level 2-6) starts with
+    "Escalated" / "Escalation". If there is no such section, or it is empty,
+    falls back to an "Action Items" section.
+
+    A section runs from its heading line to the next heading of the same or
+    a shallower level (down to level 2), or to the end of *output*. Nested
+    sub-headings therefore stay inside the section, and a following sibling
+    section is never swallowed.
+
+    Args:
+        output: Raw text produced by a verdict step.
+
+    Returns:
+        The stripped section body, or ``""`` when neither section has content.
+    """
+    for title in (r"Escalat(?:ed|ion)", r"Action Items"):
+        heading = re.search(
+            rf"^[ \t]*(#{{2,6}})[ \t]*{title}[^\n]*(?:\n|\Z)",
+            output,
+            re.IGNORECASE | re.MULTILINE,
+        )
+        if heading is None:
+            continue
+
+        level = len(heading.group(1))
+        remainder = output[heading.end():]
+        # Only headings of level 2..level end the section. Lines starting with
+        # a single "#" are ignored on purpose so that comment lines inside
+        # fenced code samples do not truncate the extracted details.
+        terminator = re.search(
+            rf"^[ \t]*#{{2,{level}}}(?!#)", remainder, re.MULTILINE,
+        )
+        body = remainder[: terminator.start()] if terminator else remainder
+        body = body.strip()
+        if body:
+            return body
+    return ""
+
+
+# Path-like token: a run of word characters, "/" or "\", then a dot and a
+# 1-5 character extension. NOTE(review): this also matches non-path tokens
+# such as "e.g" or version numbers like "1.2".
+_FP_PATTERN = re.compile(r"[\w/\\]+\.\w{1,5}")
+# Issue-category stems. The trailing \w* lets a stem match its inflections
+# (e.g. "truncat" matches "truncated" and "truncation").
+_ISSUE_KEYWORDS = re.compile(
+    r"\b(missing|validation|error[\s_-]?handling|unused|import|"
+    r"injection|auth(?:entication|orization)?|deprecated|"
+    r"leak|overflow|null|undefined|timeout|deadlock|race[\s_-]?condition|"
+    r"security|permission|encoding|format|parsing|connection|"
+    r"boundary|initialization|cleanup|resource|concurrency|"
+    r"exception|crash|hang|corrupt|truncat|duplicat|inconsisten|"
+    r"omission|over[\s_-]?engineer|refactor|naming|docstring|"
+    r"type[\s_-]?hint|test|coverage|logging|config|performance)\w*",
+    re.IGNORECASE,
+)
+
+
+def _issue_fingerprints(text: str) -> set[tuple[str, str]]:
+    """Extract (file_path, issue_keyword) pairs from feedback text.
+
+    For each file path found, issue keywords are searched in a window that
+    extends 60 characters before and 60 characters after the path mention,
+    and every hit becomes a composite ``(path, keyword)`` key.
+
+    File paths are blanked out before the keyword search. Without that, a
+    path such as ``cross_eval/config.py`` or ``tests/test_x.py`` would
+    contribute a keyword ("config", "tests") from its own name every time it
+    is mentioned, making unrelated issues in the same file look identical.
+
+    Args:
+        text: Reviewer feedback; matching is case-insensitive.
+
+    Returns:
+        A set of lower-cased ``(path, keyword)`` pairs; empty when *text*
+        contains no path-like token.
+    """
+    lower = text.lower()
+    path_matches = list(_FP_PATTERN.finditer(lower))
+    if not path_matches:
+        return set()
+
+    # Replace every path with spaces of equal length so match offsets from
+    # ``lower`` remain valid indices into ``masked``.
+    masked = _FP_PATTERN.sub(lambda m: " " * len(m.group()), lower)
+
+    pairs: set[tuple[str, str]] = set()
+    for m in path_matches:
+        fp = m.group()
+        window = masked[max(0, m.start() - 60):m.end() + 60]
+        for kw_match in _ISSUE_KEYWORDS.finditer(window):
+            pairs.add((fp, kw_match.group()))
+    return pairs
+
+
+def _detect_auto_escalate(
+    feedbacks: list[str],
+    current_feedback: str,
+    threshold: int = 2,
+) -> bool:
+    """Detect repeated identical issues across iterations (for auto-escalation).
+
+    Extracts (file_path, issue_keyword) fingerprints from feedback and checks
+    whether any single fingerprint of *current_feedback* also appears in at
+    least *threshold* of the previous feedbacks. Counting is done per
+    fingerprint, so two different issues that each recurred once do not add
+    up to an escalation.
+
+    Args:
+        feedbacks: Feedback texts from earlier iterations.
+        current_feedback: Feedback text of the iteration being evaluated.
+        threshold: Minimum number of earlier feedbacks that must contain the
+            same fingerprint.
+
+    Returns:
+        ``True`` when some fingerprint recurs often enough, ``False``
+        otherwise (including when *current_feedback* has no fingerprints).
+    """
+    current_fps = _issue_fingerprints(current_feedback)
+    if not current_fps:
+        return False
+
+    # Fingerprint each earlier feedback once, then count per-pair recurrences.
+    previous = [_issue_fingerprints(prev) for prev in feedbacks]
+    return any(
+        sum(fp in prev_fps for prev_fps in previous) >= threshold
+        for fp in current_fps
+    )
+
+
 def _save_step_output(
    run_dir: Path,
    iteration: int,
@@ -691,8 +940,56 @@ def _save_step_output(
    return path


+def _format_runtime_error_markdown(
+    exc: Exception,
+    *,
+    step_name: str,
+    agent_name: str,
+    phase_name: str | None = None,
+) -> str:
+    """Render a structured markdown error report for a failed step.
+
+    Args:
+        exc: The exception raised by the step. ``AgentInvocationError``
+            instances get extra "Failure Type", "Suggested Action",
+            "Command" and "Raw Error" sections; any other exception is
+            rendered as ``str(exc)`` inside a fenced block.
+        step_name: Name of the failed step.
+        agent_name: Name of the agent the step invoked.
+        phase_name: Optional phase name; the "Phase" bullet is omitted when
+            it is empty or ``None``.
+
+    Returns:
+        The markdown report, terminated by a single newline.
+    """
+    # Build the header explicitly instead of filtering out empty strings:
+    # the blank line separating the heading from the bullet list must stay.
+    lines = ["# Agent Error", ""]
+    if phase_name:
+        lines.append(f"- **Phase**: {phase_name}")
+    lines.append(f"- **Step**: {step_name}")
+    lines.append(f"- **Agent**: {agent_name}")
+
+    if isinstance(exc, AgentInvocationError):
+        lines.extend(
+            [
+                f"- **Failure Type**: {exc.failure_type}",
+                f"- **Suggested Action**: {exc.suggested_action}",
+                "",
+                "## Command",
+                "```",
+                exc.cmd_preview,
+                "```",
+                "",
+                "## Raw Error",
+                "```",
+                exc.raw_error,
+                "```",
+            ],
+        )
+    else:
+        lines.extend(
+            [
+                "",
+                "```",
+                str(exc),
+                "```",
+            ],
+        )
+
+    return "\n".join(lines) + "\n"
+
+
 def _save_report(run_dir: Path, config: PipelineConfig, result: PipelineResult) -> None:
-    """Generate and save the final markdown report."""
+    """Build and save the final markdown report."""
    report = build_report(config, result)
    report_path = run_dir / "final-report.md"
    report_path.parent.mkdir(parents=True, exist_ok=True)