feat: ESCALATE verdict, issue tracker, onboarding commands
Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across simple and phased pipelines. Senior reviewers can now escalate issues requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -10,7 +10,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from cross_eval.agent import invoke_agent
|
||||
from cross_eval.agent import AgentInvocationError, invoke_agent
|
||||
from cross_eval.config import try_reload_config
|
||||
from cross_eval.models import (
|
||||
AgentResult,
|
||||
@@ -68,6 +68,8 @@ def _run_simple_pipeline(
|
||||
final_verdict = "MAX_ITERATIONS_REACHED"
|
||||
aggregate_history: dict[str, int] = {}
|
||||
aggregate_warnings: list[str] = []
|
||||
escalated_issues: list[str] = []
|
||||
all_feedbacks: list[str] = []
|
||||
|
||||
for i in range(1, config.max_iterations + 1):
|
||||
config = try_reload_config(config)
|
||||
@@ -100,8 +102,34 @@ def _run_simple_pipeline(
|
||||
|
||||
iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
|
||||
feedback = iter_result.feedback or feedback
|
||||
all_feedbacks.append(feedback)
|
||||
|
||||
# Extract tracker from verdict/review steps for next iteration
|
||||
for step in config.pipeline:
|
||||
if step.verdict or step.role == "review":
|
||||
tracker = _extract_senior_tracker(
|
||||
step_outputs.get(step.output_key, ""),
|
||||
)
|
||||
if tracker:
|
||||
input_contents["previous_senior_tracker"] = tracker
|
||||
|
||||
iterations.append(iter_result)
|
||||
|
||||
# ESCALATE check (highest priority)
|
||||
if verdict == "ESCALATE":
|
||||
final_verdict = "ESCALATE"
|
||||
# Extract escalation details from verdict step outputs
|
||||
for step in config.pipeline:
|
||||
if step.verdict:
|
||||
esc = _extract_escalated_issues(
|
||||
step_outputs.get(step.output_key, ""),
|
||||
)
|
||||
if esc:
|
||||
escalated_issues.append(esc)
|
||||
iter_result.escalated_issues = esc
|
||||
logger.info(" ESCALATE at iteration %d — stopping loop.", i)
|
||||
break
|
||||
|
||||
if verdict == "PASS":
|
||||
final_verdict = "PASS"
|
||||
if i >= config.min_iterations:
|
||||
@@ -113,6 +141,26 @@ def _run_simple_pipeline(
|
||||
i, config.min_iterations,
|
||||
)
|
||||
|
||||
# Auto-escalate: no senior/aggregator + repeated FAIL
|
||||
has_aggregator = config.seniors or any(
|
||||
s.prompt_template == "default:aggregate-review" for s in config.pipeline
|
||||
)
|
||||
if (
|
||||
verdict == "FAIL"
|
||||
and not has_aggregator
|
||||
and i >= 2
|
||||
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
|
||||
):
|
||||
final_verdict = "ESCALATE"
|
||||
auto_msg = (
|
||||
f"Auto-escalated: same issues detected across {i} iterations "
|
||||
f"without resolution (no senior reviewer configured)."
|
||||
)
|
||||
escalated_issues.append(auto_msg)
|
||||
iter_result.escalated_issues = auto_msg
|
||||
logger.info(" AUTO-ESCALATE at iteration %d", i)
|
||||
break
|
||||
|
||||
if dry_run:
|
||||
logger.info(" (dry-run: stopping after iteration 1)")
|
||||
break
|
||||
@@ -125,6 +173,7 @@ def _run_simple_pipeline(
|
||||
total_duration=round(total_duration, 1),
|
||||
run_dir=run_dir,
|
||||
repeated_aggregate_warnings=aggregate_warnings,
|
||||
escalated_issues=escalated_issues,
|
||||
)
|
||||
|
||||
if not dry_run:
|
||||
@@ -154,8 +203,14 @@ def _run_phased_pipeline(
|
||||
global_iter = 0
|
||||
aggregate_history_by_phase: dict[str, dict[str, int]] = {}
|
||||
aggregate_warnings: list[str] = []
|
||||
escalated_issues: list[str] = []
|
||||
all_feedbacks: list[str] = []
|
||||
escalated = False
|
||||
|
||||
for phase_idx, phase in enumerate(config.phases):
|
||||
if escalated:
|
||||
break
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info(
|
||||
" Phase: %s (max_iter=%d, consecutive_pass=%d)",
|
||||
@@ -205,8 +260,45 @@ def _run_phased_pipeline(
|
||||
|
||||
iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
|
||||
feedback = iter_result.feedback or feedback
|
||||
all_feedbacks.append(feedback)
|
||||
|
||||
# Extract tracker from verdict/review steps
|
||||
for step in phase.steps:
|
||||
if step.verdict or step.role == "review":
|
||||
tracker = _extract_senior_tracker(
|
||||
step_outputs.get(step.output_key, ""),
|
||||
)
|
||||
if tracker:
|
||||
input_contents["previous_senior_tracker"] = tracker
|
||||
|
||||
iterations.append(iter_result)
|
||||
|
||||
# ESCALATE check
|
||||
if verdict == "ESCALATE":
|
||||
final_verdict = "ESCALATE"
|
||||
for step in phase.steps:
|
||||
if step.verdict:
|
||||
esc = _extract_escalated_issues(
|
||||
step_outputs.get(step.output_key, ""),
|
||||
)
|
||||
if esc:
|
||||
escalated_issues.append(esc)
|
||||
iter_result.escalated_issues = esc
|
||||
logger.info(
|
||||
" [%s] ESCALATE at iteration %d — stopping.",
|
||||
phase.name, pi,
|
||||
)
|
||||
escalated = True
|
||||
break
|
||||
|
||||
if verdict is None:
|
||||
logger.info(
|
||||
" [%s] completed (no verdict step; single-pass phase)",
|
||||
phase.name,
|
||||
)
|
||||
phase_converged = True
|
||||
break
|
||||
|
||||
if verdict == "PASS":
|
||||
consecutive_passes += 1
|
||||
logger.info(
|
||||
@@ -223,9 +315,33 @@ def _run_phased_pipeline(
|
||||
else:
|
||||
consecutive_passes = 0
|
||||
|
||||
# Auto-escalate in phased pipeline
|
||||
has_aggregator = config.seniors or any(
|
||||
s.prompt_template == "default:aggregate-review" for s in phase.steps
|
||||
)
|
||||
if (
|
||||
verdict == "FAIL"
|
||||
and not has_aggregator
|
||||
and pi >= 2
|
||||
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
|
||||
):
|
||||
final_verdict = "ESCALATE"
|
||||
auto_msg = (
|
||||
f"Auto-escalated: same issues detected across {pi} iterations "
|
||||
f"in phase '{phase.name}' without resolution."
|
||||
)
|
||||
escalated_issues.append(auto_msg)
|
||||
iter_result.escalated_issues = auto_msg
|
||||
logger.info(" [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
|
||||
escalated = True
|
||||
break
|
||||
|
||||
if dry_run:
|
||||
break
|
||||
|
||||
if escalated:
|
||||
break
|
||||
|
||||
if phase_converged:
|
||||
logger.info(" Phase '%s' completed: CONVERGED", phase.name)
|
||||
else:
|
||||
@@ -245,6 +361,7 @@ def _run_phased_pipeline(
|
||||
total_duration=round(total_duration, 1),
|
||||
run_dir=run_dir,
|
||||
repeated_aggregate_warnings=aggregate_warnings,
|
||||
escalated_issues=escalated_issues,
|
||||
)
|
||||
|
||||
if not dry_run:
|
||||
@@ -373,15 +490,17 @@ def _run_steps(
|
||||
run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
|
||||
)
|
||||
|
||||
# Extract verdict from all verdict steps (ALL must PASS)
|
||||
# Extract verdict from all verdict steps (ALL must PASS; ESCALATE wins over all)
|
||||
for step in steps:
|
||||
if step.verdict:
|
||||
output = step_outputs.get(step.output_key, "")
|
||||
step_verdict = _extract_verdict(output, step.verdict_pattern)
|
||||
logger.info(" [%s] verdict: %s", step.name, step_verdict)
|
||||
if verdict is None:
|
||||
if step_verdict == "ESCALATE":
|
||||
verdict = "ESCALATE"
|
||||
elif verdict is None:
|
||||
verdict = step_verdict
|
||||
elif step_verdict == "FAIL":
|
||||
elif verdict != "ESCALATE" and step_verdict == "FAIL":
|
||||
verdict = "FAIL"
|
||||
|
||||
return step_outputs, step_results, verdict
|
||||
@@ -466,10 +585,11 @@ def _execute_step(
|
||||
f"Try --timeout 0 (unlimited)"
|
||||
)
|
||||
except RuntimeError as e:
|
||||
phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
|
||||
error_msg = (
|
||||
f"# Agent Error\n\n{phase_info}"
|
||||
f"- **Step**: {step.name}\n- **Agent**: {step.agent}\n\n```\n{e}\n```\n"
|
||||
error_msg = _format_runtime_error_markdown(
|
||||
e,
|
||||
step_name=step.name,
|
||||
agent_name=step.agent,
|
||||
phase_name=phase_name,
|
||||
)
|
||||
_save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
|
||||
logger.error(" [%s] FAILED — saved to output", step.name)
|
||||
@@ -527,7 +647,7 @@ def _execute_parallel_batch(
|
||||
# Collect results from parallel threads
|
||||
local_outputs: dict[str, str] = {}
|
||||
local_results: dict[str, AgentResult] = {}
|
||||
errors: list[Exception] = []
|
||||
errors: list[tuple[StepConfig, Exception]] = []
|
||||
|
||||
# Show a single spinner for the batch
|
||||
from cross_eval.agent import _Spinner
|
||||
@@ -563,19 +683,15 @@ def _execute_parallel_batch(
|
||||
local_results[output_key] = result
|
||||
local_outputs[output_key] = output
|
||||
except Exception as e:
|
||||
errors.append(e)
|
||||
errors.append((step, e))
|
||||
|
||||
batch_elapsed = round(time.monotonic() - batch_start, 1)
|
||||
|
||||
if errors:
|
||||
spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
|
||||
raise errors[0]
|
||||
|
||||
spinner.stop(f"[parallel] {len(batch)} agents done ({batch_elapsed}s)")
|
||||
|
||||
# Merge results
|
||||
# Persist successful outputs even if a sibling step failed.
|
||||
for step in batch:
|
||||
key = step.output_key
|
||||
if key not in local_outputs:
|
||||
continue
|
||||
step_outputs[key] = local_outputs[key]
|
||||
step_results[key] = local_results[key]
|
||||
r = local_results[key]
|
||||
@@ -585,6 +701,48 @@ def _execute_parallel_batch(
|
||||
)
|
||||
_save_step_output(run_dir, output_iter, step.name, r.output)
|
||||
|
||||
if errors:
|
||||
spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
|
||||
for failed_step, exc in errors:
|
||||
if isinstance(exc, subprocess.TimeoutExpired):
|
||||
stdout = (exc.stdout or b"") if isinstance(exc.stdout, bytes) else (exc.stdout or "")
|
||||
stderr = (exc.stderr or b"") if isinstance(exc.stderr, bytes) else (exc.stderr or "")
|
||||
if isinstance(stdout, bytes):
|
||||
stdout = stdout.decode("utf-8", errors="replace")
|
||||
if isinstance(stderr, bytes):
|
||||
stderr = stderr.decode("utf-8", errors="replace")
|
||||
phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
|
||||
error_msg = (
|
||||
f"# Agent Timeout\n\n"
|
||||
f"{phase_info}"
|
||||
f"- **Step**: {failed_step.name}\n"
|
||||
f"- **Agent**: {failed_step.agent}\n"
|
||||
f"- **Timeout**: {timeout}s\n\n"
|
||||
f"Partial stdout ({len(stdout)} chars):\n"
|
||||
f"```\n{stdout[:2000] or '(none)'}\n```\n\n"
|
||||
f"Stderr:\n```\n{stderr[:2000] or '(none)'}\n```\n"
|
||||
)
|
||||
else:
|
||||
error_msg = _format_runtime_error_markdown(
|
||||
exc,
|
||||
step_name=failed_step.name,
|
||||
agent_name=failed_step.agent,
|
||||
phase_name=phase_name,
|
||||
)
|
||||
_save_step_output(run_dir, output_iter, f"{failed_step.name}_error", error_msg)
|
||||
logger.error(" [%s] FAILED — saved to output", failed_step.name)
|
||||
|
||||
failed_steps = ", ".join(step.name for step, _ in errors)
|
||||
saved_steps = ", ".join(step.name for step in batch if step.output_key in local_outputs)
|
||||
first_error = errors[0][1]
|
||||
saved_note = f" Successful outputs were saved for: {saved_steps}." if saved_steps else ""
|
||||
raise RuntimeError(
|
||||
f"Parallel batch failed: {len(errors)}/{len(batch)} steps failed ({failed_steps})."
|
||||
f"{saved_note} First error:\n{first_error}"
|
||||
)
|
||||
|
||||
spinner.stop(f"[parallel] {len(batch)} agents done ({batch_elapsed}s)")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Context and template helpers
|
||||
@@ -671,13 +829,104 @@ def _normalize_aggregate_output(output: str) -> str:
|
||||
return " ".join(output.lower().split())
|
||||
|
||||
|
||||
# Matches an explicit "VERDICT: ESCALATE" marker anywhere in an agent's output
# (case-insensitive, any amount of whitespace after the colon).
_ESCALATE_PATTERN = re.compile(r"VERDICT:\s*ESCALATE", re.IGNORECASE)

# Matches a markdown "Issue Tracker" heading (two or more '#') plus the
# pipe-delimited table that follows it; the whole span is capture group 1.
#
# Blank lines between the heading and the first table row are tolerated, and a
# row may carry trailing spaces/tabs before its newline.  Without that
# tolerance the usual "heading, blank line, table" layout would not match at
# all, and a single row with trailing whitespace would cut the table short.
_TRACKER_TABLE_PATTERN = re.compile(
    r"(##+ Issue Tracker[^\n]*\n"      # heading line
    r"(?:[ \t]*\n)*"                   # optional blank lines
    r"(?:\|[^\n]+\|(?:[ \t]*\n)?)+)",  # one or more "| ... |" rows
)
|
||||
|
||||
|
||||
def _extract_verdict(output: str, pattern: str) -> str:
    """Classify a verdict step's output as ``ESCALATE``, ``PASS`` or ``FAIL``.

    Args:
        output: Raw text produced by the verdict step.
        pattern: Regular expression whose presence in *output* means PASS.

    Returns:
        ``"ESCALATE"`` when the escalate marker is present (checked first, so
        it wins even if *pattern* also matches), ``"PASS"`` when *pattern*
        matches, otherwise ``"FAIL"``.
    """
    if _ESCALATE_PATTERN.search(output):
        return "ESCALATE"  # highest priority
    return "PASS" if re.search(pattern, output) else "FAIL"
|
||||
|
||||
|
||||
def _extract_senior_tracker(output: str) -> str:
    """Return the Issue Tracker heading and table found in *output*.

    Args:
        output: Raw text of a review/verdict step.

    Returns:
        The first span matched by ``_TRACKER_TABLE_PATTERN``, or an empty
        string when the output contains no tracker table.
    """
    found = _TRACKER_TABLE_PATTERN.search(output)
    if found is None:
        return ""
    return found.group(0)
|
||||
|
||||
|
||||
def _markdown_section_body(output: str, title_pattern: str) -> str:
    """Return the stripped body of the first heading matching *title_pattern*.

    The heading must use two or more ``#``.  The body runs until the next
    heading of the same or a shallower level (never shallower than ``##``, so
    a ``# comment`` line inside a code fence does not end the section) or
    until the end of *output*.  Deeper sub-headings stay part of the body.

    Returns an empty string when no such heading exists or its body is blank.
    """
    header = re.search(
        rf"(#{{2,}})\s*{title_pattern}[^\n]*\n", output, re.IGNORECASE,
    )
    if header is None:
        return ""
    level = len(header.group(1))
    rest = output[header.end():]
    next_heading = re.search(rf"^#{{2,{level}}}(?!#)", rest, re.MULTILINE)
    body = rest[:next_heading.start()] if next_heading else rest
    return body.strip()


def _extract_escalated_issues(output: str) -> str:
    """Extract escalation details from senior review output.

    Looks for an "Escalated ..." / "Escalation ..." section first and falls
    back to the "Action Items" section when the former is missing or empty.

    Args:
        output: Raw text of the verdict step that escalated.

    Returns:
        The stripped section body, or ``""`` when neither section has content.
    """
    for title_pattern in (r"Escalat(?:ed|ion)", r"Action Items"):
        body = _markdown_section_body(output, title_pattern)
        if body:
            return body
    return ""
|
||||
|
||||
|
||||
# Path-like token: a run of word characters, '/' or '\' followed by a dot and
# a 1-5 character extension.
# NOTE(review): this also matches non-path tokens such as "e.g" or "1.5".
_FP_PATTERN = re.compile(r"[\w/\\]+\.\w{1,5}")

# Vocabulary of issue keywords; the trailing \w* lets inflected forms match
# ("duplicat" -> "duplicated", "test" -> "tests").
_ISSUE_KEYWORDS = re.compile(
    r"\b(missing|validation|error[\s_-]?handling|unused|import|"
    r"injection|auth(?:entication|orization)?|deprecated|"
    r"leak|overflow|null|undefined|timeout|deadlock|race[\s_-]?condition|"
    r"security|permission|encoding|format|parsing|connection|"
    r"boundary|initialization|cleanup|resource|concurrency|"
    r"exception|crash|hang|corrupt|truncat|duplicat|inconsisten|"
    r"omission|over[\s_-]?engineer|refactor|naming|docstring|"
    r"type[\s_-]?hint|test|coverage|logging|config|performance)\w*",
    re.IGNORECASE,
)


def _issue_fingerprints(text: str) -> set[tuple[str, str]]:
    """Extract (file_path, issue_keyword) pairs from feedback text.

    For each path-like token, issue keywords are collected from a window of
    60 characters on either side of it and paired with the lower-cased path.

    Path tokens are blanked out before the keyword search.  Otherwise a file
    whose name contains a keyword (``config.py``, ``tests/test_auth.py``)
    would fingerprint itself and yield the same pair every time it is
    mentioned, whatever the reported issue is.

    Args:
        text: Feedback text of one iteration.

    Returns:
        Set of ``(path, keyword)`` tuples, both lower-case; empty when the
        text mentions no path or no keyword lies near one.
    """
    lower = text.lower()
    # Same-length blanks keep every offset aligned with ``lower``.
    masked = _FP_PATTERN.sub(lambda m: " " * len(m.group()), lower)

    pairs: set[tuple[str, str]] = set()
    for path_match in _FP_PATTERN.finditer(lower):
        window_start = max(0, path_match.start() - 60)
        window_end = min(len(masked), path_match.end() + 60)
        window = masked[window_start:window_end]
        for kw_match in _ISSUE_KEYWORDS.finditer(window):
            pairs.add((path_match.group(), kw_match.group()))
    return pairs
|
||||
|
||||
|
||||
def _detect_auto_escalate(
    feedbacks: list[str],
    current_feedback: str,
    threshold: int = 2,
) -> bool:
    """Detect repeated identical issues across iterations (for auto-escalation).

    Extracts (file_path, issue_keyword) fingerprints from the current feedback
    and reports whether any single one of them also appears in at least
    *threshold* of the previous feedbacks.

    Counting is done per fingerprint: two different pairs that each recur only
    once do not add up to a repeat, since they describe different issues.

    Args:
        feedbacks: Feedback texts of the earlier iterations.
        current_feedback: Feedback text of the iteration being evaluated.
        threshold: Minimum number of earlier iterations in which one and the
            same fingerprint must occur.

    Returns:
        ``True`` when some current fingerprint reaches *threshold*; ``False``
        otherwise, including when the current feedback has no fingerprints.
    """
    current_fps = _issue_fingerprints(current_feedback)
    if not current_fps:
        return False

    # How many earlier iterations reported each of the current fingerprints.
    hits: dict[tuple[str, str], int] = dict.fromkeys(current_fps, 0)
    for prev in feedbacks:
        for fp in current_fps & _issue_fingerprints(prev):
            hits[fp] += 1
    return any(count >= threshold for count in hits.values())
|
||||
|
||||
|
||||
def _save_step_output(
|
||||
run_dir: Path,
|
||||
iteration: int,
|
||||
@@ -691,8 +940,56 @@ def _save_step_output(
|
||||
return path
|
||||
|
||||
|
||||
def _format_runtime_error_markdown(
    exc: Exception,
    *,
    step_name: str,
    agent_name: str,
    phase_name: str | None = None,
) -> str:
    """Render a structured markdown error report for a failed step.

    Args:
        exc: The exception raised by the step.  For ``AgentInvocationError``
            its ``failure_type``, ``suggested_action``, ``cmd_preview`` and
            ``raw_error`` attributes are rendered (assumed to be strings --
            TODO confirm); any other exception is rendered via ``str(exc)``.
        step_name: Name of the failed step.
        agent_name: Name of the agent that ran the step.
        phase_name: Phase the step belongs to; the line is omitted when falsy.

    Returns:
        Markdown text ending with a newline.
    """
    # Build the header explicitly.  Filtering out empty strings to drop an
    # absent phase line would also remove the blank line after the heading.
    lines = ["# Agent Error", ""]
    if phase_name:
        lines.append(f"- **Phase**: {phase_name}")
    lines.append(f"- **Step**: {step_name}")
    lines.append(f"- **Agent**: {agent_name}")

    if isinstance(exc, AgentInvocationError):
        lines.extend(
            [
                f"- **Failure Type**: {exc.failure_type}",
                f"- **Suggested Action**: {exc.suggested_action}",
                "",
                "## Command",
                "```",
                exc.cmd_preview,
                "```",
                "",
                "## Raw Error",
                "```",
                exc.raw_error,
                "```",
            ],
        )
    else:
        lines.extend(
            [
                "",
                "```",
                str(exc),
                "```",
            ],
        )

    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def _save_report(run_dir: Path, config: PipelineConfig, result: PipelineResult) -> None:
|
||||
"""Generate and save the final markdown report."""
|
||||
"""Build and save the final markdown report."""
|
||||
report = build_report(config, result)
|
||||
report_path = run_dir / "final-report.md"
|
||||
report_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
Reference in New Issue
Block a user