Files
cross-eval/cross_eval/pipeline.py
chungyeong 204e071b74 feat: ESCALATE verdict, issue tracker, onboarding commands
Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across
simple and phased pipelines. Senior reviewers can now escalate issues
requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 18:19:05 +09:00

998 lines
34 KiB
Python

"""Main pipeline execution engine."""
from __future__ import annotations
import logging
import os
import re
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from cross_eval.agent import AgentInvocationError, invoke_agent
from cross_eval.config import try_reload_config
from cross_eval.models import (
AgentResult,
IterationResult,
PipelineConfig,
PipelineResult,
StepConfig,
)
from cross_eval.prompts import render_template, resolve_template, set_language
from cross_eval.report import build_report
logger = logging.getLogger(__name__)
def run_pipeline(
    config: PipelineConfig,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Run the cross-eval pipeline described by *config*.

    A run directory is created first via ``_make_run_dir``. Configs that
    define ``phases`` are handed to the phased runner; every other config is
    handed to the simple runner. The remaining arguments are forwarded
    unchanged.
    """
    run_dir = _make_run_dir(config)
    runner = _run_phased_pipeline if config.phases else _run_simple_pipeline
    return runner(config, run_dir, cwd, dry_run, timeout)
def _make_run_dir(config: PipelineConfig) -> Path:
    """Create and return ``<output_dir>/<preset_name>_<YYYYmmdd_HHMMSS>``.

    Missing parent directories are created; an already existing directory is
    reused without error.
    """
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    target = config.output_dir / f"{config.preset_name}_{stamp}"
    target.mkdir(parents=True, exist_ok=True)
    return target
def _run_simple_pipeline(
    config: PipelineConfig,
    run_dir: Path,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Execute a simple (non-phased) pipeline.

    Runs ``config.pipeline`` for at most ``config.max_iterations`` iterations,
    feeding the feedback collected from one iteration into the next. The loop
    ends early when:

    * a verdict step returns ESCALATE,
    * the iteration verdict is PASS and ``config.min_iterations`` is reached,
    * the auto-escalate heuristic fires (FAIL, no senior/aggregate step), or
    * ``dry_run`` is set (only the first iteration is rendered).

    Args:
        config: Pipeline configuration; re-resolved through
            ``try_reload_config`` at the start of every iteration.
        run_dir: Directory that receives step outputs and the final report.
        cwd: Working directory passed to the steps; defaults to the process
            working directory.
        dry_run: Render prompts only; no report is written.
        timeout: Per-agent timeout forwarded to the steps.

    Returns:
        A ``PipelineResult`` whose ``final_verdict`` is ``"PASS"``,
        ``"ESCALATE"`` or ``"MAX_ITERATIONS_REACHED"``.
    """
    if cwd is None:
        cwd = Path(os.getcwd())
    set_language(config.language)
    input_contents = _load_inputs(config)
    feedback = "(no feedback — first iteration)"
    iterations: list[IterationResult] = []
    start_time = time.monotonic()
    final_verdict = "MAX_ITERATIONS_REACHED"
    aggregate_history: dict[str, int] = {}
    aggregate_warnings: list[str] = []
    escalated_issues: list[str] = []
    all_feedbacks: list[str] = []
    for i in range(1, config.max_iterations + 1):
        # try_reload_config may hand back a different config object, so the
        # language and the input contents are re-applied every iteration.
        config = try_reload_config(config)
        set_language(config.language)
        _refresh_inputs(config, input_contents)
        logger.info("=" * 50)
        logger.info(" Iteration %d/%d", i, config.max_iterations)
        logger.info("=" * 50)
        step_outputs, step_results, verdict = _run_steps(
            config.pipeline, config, input_contents, feedback,
            i, config.max_iterations, cwd, timeout, dry_run,
            run_dir=run_dir, output_iter=i,
        )
        iter_result = IterationResult(
            iteration=i,
            step_results=step_results,
            step_outputs=step_outputs,
            verdict=verdict,
        )
        warning = _detect_repeated_aggregate(
            config.pipeline, step_outputs, aggregate_history, iteration=i,
        )
        if warning:
            iter_result.repeated_aggregate_warning = warning
            aggregate_warnings.append(warning)
            logger.warning(" %s", warning)
        iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
        # Keep the previous feedback when this iteration produced none.
        feedback = iter_result.feedback or feedback
        all_feedbacks.append(feedback)
        # Extract tracker from verdict/review steps for next iteration
        for step in config.pipeline:
            if step.verdict or step.role == "review":
                tracker = _extract_senior_tracker(
                    step_outputs.get(step.output_key, ""),
                )
                if tracker:
                    input_contents["previous_senior_tracker"] = tracker
        iterations.append(iter_result)
        # ESCALATE check (highest priority)
        if verdict == "ESCALATE":
            final_verdict = "ESCALATE"
            # Extract escalation details from verdict step outputs
            for step in config.pipeline:
                if step.verdict:
                    esc = _extract_escalated_issues(
                        step_outputs.get(step.output_key, ""),
                    )
                    if esc:
                        escalated_issues.append(esc)
                        iter_result.escalated_issues = esc
            logger.info(" ESCALATE at iteration %d — stopping loop.", i)
            break
        if verdict == "PASS":
            final_verdict = "PASS"
            if i >= config.min_iterations:
                logger.info(" PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
                break
            logger.info(
                " PASS at iteration %d, but min_iterations=%d — continuing",
                i, config.min_iterations,
            )
        else:
            # A non-PASS result supersedes a PASS recorded earlier while still
            # below min_iterations; otherwise a run that ends on FAIL would
            # still be reported as PASS.
            final_verdict = "MAX_ITERATIONS_REACHED"
        # Auto-escalate: no senior/aggregator + repeated FAIL
        has_aggregator = config.seniors or any(
            s.prompt_template == "default:aggregate-review" for s in config.pipeline
        )
        if (
            verdict == "FAIL"
            and not has_aggregator
            and i >= 2
            and _detect_auto_escalate(all_feedbacks[:-1], feedback)
        ):
            final_verdict = "ESCALATE"
            auto_msg = (
                f"Auto-escalated: same issues detected across {i} iterations "
                f"without resolution (no senior reviewer configured)."
            )
            escalated_issues.append(auto_msg)
            iter_result.escalated_issues = auto_msg
            logger.info(" AUTO-ESCALATE at iteration %d", i)
            break
        if dry_run:
            logger.info(" (dry-run: stopping after iteration 1)")
            break
    total_duration = time.monotonic() - start_time
    pipeline_result = PipelineResult(
        iterations=iterations,
        final_verdict=final_verdict,
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
        escalated_issues=escalated_issues,
    )
    if not dry_run:
        _save_report(run_dir, config, pipeline_result)
    return pipeline_result
def _run_phased_pipeline(
    config: PipelineConfig,
    run_dir: Path,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Execute a multi-phase pipeline (e.g. review-fix).

    Phases from ``config.phases`` run in order. Each phase repeats its steps
    for at most ``phase.max_iterations`` iterations and is considered
    converged after ``phase.consecutive_pass`` PASS verdicts in a row, or
    immediately when the phase has no verdict step. An ESCALATE verdict or an
    auto-escalation stops the whole pipeline, including later phases.

    Args:
        config: Pipeline configuration; re-resolved through
            ``try_reload_config`` at the start of every iteration.
        run_dir: Directory that receives step outputs and the final report.
        cwd: Working directory passed to the steps; defaults to the process
            working directory.
        dry_run: Run only the first iteration of each phase and skip the
            report.
        timeout: Per-agent timeout forwarded to the steps.

    Returns:
        A ``PipelineResult``. Unless escalated, ``final_verdict`` reflects
        only whether the last phase converged.
    """
    if cwd is None:
        cwd = Path(os.getcwd())
    set_language(config.language)
    input_contents = _load_inputs(config)
    iterations: list[IterationResult] = []
    feedback = "(no feedback — first iteration)"
    start_time = time.monotonic()
    final_verdict = "MAX_ITERATIONS_REACHED"
    # Iteration counter shared by all phases; used for output directories
    # (output_iter) and IterationResult.iteration.
    global_iter = 0
    # Repeat-detection history is kept separately per phase name.
    aggregate_history_by_phase: dict[str, dict[str, int]] = {}
    aggregate_warnings: list[str] = []
    escalated_issues: list[str] = []
    # Feedback of every iteration so far, across all phases.
    all_feedbacks: list[str] = []
    escalated = False
    for phase_idx, phase in enumerate(config.phases):
        if escalated:
            break
        logger.info("=" * 60)
        logger.info(
            " Phase: %s (max_iter=%d, consecutive_pass=%d)",
            phase.name, phase.max_iterations, phase.consecutive_pass,
        )
        logger.info("=" * 60)
        consecutive_passes = 0
        phase_converged = False
        for pi in range(1, phase.max_iterations + 1):
            global_iter += 1
            # The config is re-resolved each iteration; `phase` itself still
            # comes from the enumerate() started above.
            config = try_reload_config(config)
            set_language(config.language)
            _refresh_inputs(config, input_contents)
            logger.info("-" * 50)
            logger.info(
                " [%s] Iteration %d/%d (global: v%d)",
                phase.name, pi, phase.max_iterations, global_iter,
            )
            logger.info("-" * 50)
            step_outputs, step_results, verdict = _run_steps(
                phase.steps, config, input_contents, feedback,
                pi, phase.max_iterations, cwd, timeout, dry_run,
                run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
            )
            iter_result = IterationResult(
                iteration=global_iter,
                step_results=step_results,
                step_outputs=step_outputs,
                verdict=verdict,
                phase_name=phase.name,
            )
            phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
            warning = _detect_repeated_aggregate(
                phase.steps, step_outputs, phase_history, iteration=global_iter,
                phase_name=phase.name,
            )
            if warning:
                iter_result.repeated_aggregate_warning = warning
                aggregate_warnings.append(warning)
                logger.warning(" %s", warning)
            iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
            # Keep the previous feedback when this iteration produced none.
            feedback = iter_result.feedback or feedback
            all_feedbacks.append(feedback)
            # Extract tracker from verdict/review steps
            for step in phase.steps:
                if step.verdict or step.role == "review":
                    tracker = _extract_senior_tracker(
                        step_outputs.get(step.output_key, ""),
                    )
                    if tracker:
                        input_contents["previous_senior_tracker"] = tracker
            iterations.append(iter_result)
            # ESCALATE check
            if verdict == "ESCALATE":
                final_verdict = "ESCALATE"
                for step in phase.steps:
                    if step.verdict:
                        esc = _extract_escalated_issues(
                            step_outputs.get(step.output_key, ""),
                        )
                        if esc:
                            escalated_issues.append(esc)
                            iter_result.escalated_issues = esc
                logger.info(
                    " [%s] ESCALATE at iteration %d — stopping.",
                    phase.name, pi,
                )
                escalated = True
                break
            # _run_steps returns None when the phase has no verdict step.
            if verdict is None:
                logger.info(
                    " [%s] completed (no verdict step; single-pass phase)",
                    phase.name,
                )
                phase_converged = True
                break
            if verdict == "PASS":
                consecutive_passes += 1
                logger.info(
                    " [%s] PASS (%d/%d consecutive)",
                    phase.name, consecutive_passes, phase.consecutive_pass,
                )
                if consecutive_passes >= phase.consecutive_pass:
                    logger.info(
                        " [%s] Converged! %d consecutive PASSes.",
                        phase.name, phase.consecutive_pass,
                    )
                    phase_converged = True
                    break
            else:
                # Any non-PASS verdict breaks the streak.
                consecutive_passes = 0
            # Auto-escalate in phased pipeline
            has_aggregator = config.seniors or any(
                s.prompt_template == "default:aggregate-review" for s in phase.steps
            )
            if (
                verdict == "FAIL"
                and not has_aggregator
                and pi >= 2
                and _detect_auto_escalate(all_feedbacks[:-1], feedback)
            ):
                final_verdict = "ESCALATE"
                auto_msg = (
                    f"Auto-escalated: same issues detected across {pi} iterations "
                    f"in phase '{phase.name}' without resolution."
                )
                escalated_issues.append(auto_msg)
                iter_result.escalated_issues = auto_msg
                logger.info(" [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
                escalated = True
                break
            if dry_run:
                break
        # Leave the phase loop before the convergence bookkeeping below so
        # the ESCALATE verdict is not overwritten.
        if escalated:
            break
        if phase_converged:
            logger.info(" Phase '%s' completed: CONVERGED", phase.name)
        else:
            logger.info(
                " Phase '%s' completed: max iterations (%d) reached",
                phase.name, phase.max_iterations,
            )
        # Only the last phase decides the final verdict.
        if phase_idx == len(config.phases) - 1:
            final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"
    total_duration = time.monotonic() - start_time
    pipeline_result = PipelineResult(
        iterations=iterations,
        final_verdict=final_verdict,
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
        escalated_issues=escalated_issues,
    )
    if not dry_run:
        _save_report(run_dir, config, pipeline_result)
    return pipeline_result
# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------
def _load_inputs(config: PipelineConfig) -> dict[str, str]:
    """Resolve every entry of ``config.inputs`` to its text.

    String values are taken literally; any other value is read with
    ``read_text(encoding="utf-8")``.
    """
    return {
        name: source if isinstance(source, str) else source.read_text(encoding="utf-8")
        for name, source in config.inputs.items()
    }
def _refresh_inputs(
    config: PipelineConfig, input_contents: dict[str, str],
) -> None:
    """Update *input_contents* in place from ``config.inputs``.

    String inputs are copied as-is. ``Path`` inputs are re-read only when the
    file currently exists; otherwise the entry already present in
    *input_contents* is left untouched.
    """
    for name, source in config.inputs.items():
        if isinstance(source, str):
            input_contents[name] = source
            continue
        if isinstance(source, Path) and source.exists():
            input_contents[name] = source.read_text(encoding="utf-8")
# ---------------------------------------------------------------------------
# Parallel step grouping
# ---------------------------------------------------------------------------
def _get_step_dependencies(step: StepConfig) -> set[str]:
    """Return every ``{name}`` placeholder used in the step's context overrides.

    Only placeholders made of word characters are recognised; the names are
    collected from all values of ``step.context_override``.
    """
    return {
        placeholder.group(1)
        for template in step.context_override.values()
        for placeholder in re.finditer(r"\{(\w+)\}", template)
    }
def _group_parallel_steps(steps: list[StepConfig]) -> list[list[StepConfig]]:
    """Split *steps* into execution batches, preserving order.

    A step with ``parallel`` unset always forms a batch of its own. Adjacent
    parallel steps share a batch, except that a step referencing the
    ``output_key`` of a step already in the open batch closes that batch and
    starts a new one.
    """
    batches: list[list[StepConfig]] = []
    pending: list[StepConfig] = []
    pending_keys: set[str] = set()
    for step in steps:
        # Dependencies are only inspected for parallel steps.
        closes_batch = (not step.parallel) or bool(
            _get_step_dependencies(step) & pending_keys
        )
        if closes_batch and pending:
            batches.append(pending)
            pending, pending_keys = [], set()
        if not step.parallel:
            batches.append([step])
            continue
        pending.append(step)
        pending_keys.add(step.output_key)
    if pending:
        batches.append(pending)
    return batches
# ---------------------------------------------------------------------------
# Step execution
# ---------------------------------------------------------------------------
def _run_steps(
    steps: list[StepConfig],
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
    """Run one iteration of *steps* and derive the iteration verdict.

    Steps are batched by ``_group_parallel_steps``; single-step batches run
    directly, larger batches run through ``_execute_parallel_batch``.

    Returns:
        ``(step_outputs, step_results, verdict)``. The verdict is ``None``
        when no step is a verdict step; otherwise ESCALATE beats FAIL, and
        FAIL beats PASS.
    """
    step_outputs: dict[str, str] = {}
    step_results: dict[str, AgentResult] = {}
    for batch in _group_parallel_steps(steps):
        if len(batch) > 1:
            # Several steps in one batch run concurrently.
            _execute_parallel_batch(
                batch, config, input_contents, feedback,
                iteration, max_iterations, cwd, timeout, dry_run,
                step_outputs, step_results,
                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
            )
            continue
        (only_step,) = batch
        _execute_step(
            only_step, config, input_contents, feedback,
            iteration, max_iterations, cwd, timeout, dry_run,
            step_outputs, step_results,
            run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
        )
    # Gather the verdict of every verdict step, logging each as it is read.
    seen: list[str] = []
    for step in steps:
        if not step.verdict:
            continue
        step_verdict = _extract_verdict(
            step_outputs.get(step.output_key, ""), step.verdict_pattern,
        )
        logger.info(" [%s] verdict: %s", step.name, step_verdict)
        seen.append(step_verdict)
    verdict: str | None
    if "ESCALATE" in seen:
        verdict = "ESCALATE"
    elif "FAIL" in seen:
        verdict = "FAIL"
    elif seen:
        verdict = seen[0]
    else:
        verdict = None
    return step_outputs, step_results, verdict
def _execute_step(
    step: StepConfig,
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    step_outputs: dict[str, str],
    step_results: dict[str, AgentResult],
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
    quiet: bool = False,
) -> None:
    """Execute a single step, updating step_outputs and step_results in place.

    The prompt is rendered from the step's template and the current context
    (inputs, earlier step outputs, feedback, iteration counters, plus any
    ``context_override``). In dry-run mode the prompt is printed and a
    placeholder output is stored; no agent is invoked and ``step_results`` is
    not touched. Otherwise the agent output is stored under
    ``step.output_key`` and written to ``run_dir``.

    Raises:
        RuntimeError: When the agent times out (chained to the original
            ``subprocess.TimeoutExpired``), or re-raised unchanged when
            ``invoke_agent`` raises a ``RuntimeError``. In both cases an
            ``<step>_error`` report is saved first.
    """
    if not quiet:
        logger.info(" [%s] agent='%s' role='%s'", step.name, step.agent, step.role)
    # 1. Resolve template
    template = resolve_template(step.prompt_template)
    # 2. Build context
    context = _build_context(
        input_contents, step_outputs, feedback, iteration, max_iterations,
    )
    # 3. Apply context overrides
    if step.context_override:
        context = _apply_context_override(context, step.context_override)
    # 4. Render prompt
    prompt = render_template(template, context)
    # 5. Dry run: print and skip
    if dry_run:
        phase_label = f" phase={phase_name}" if phase_name else ""
        print(f"\n--- Step: {step.name} (agent={step.agent}{phase_label}) ---")
        print(prompt)
        print(f"--- end {step.name} ---\n")
        step_outputs[step.output_key] = f"(dry-run: no output for {step.output_key})"
        return
    # 6. Invoke agent
    agent_config = config.agents[step.agent]
    try:
        result = invoke_agent(
            agent_config, prompt, step.name,
            cwd=cwd, timeout=timeout, quiet=quiet,
        )
    except subprocess.TimeoutExpired as e:
        # Partial output may be bytes, str or None; normalise to str.
        stdout = (e.stdout or b"") if isinstance(e.stdout, bytes) else (e.stdout or "")
        stderr = (e.stderr or b"") if isinstance(e.stderr, bytes) else (e.stderr or "")
        if isinstance(stdout, bytes):
            stdout = stdout.decode("utf-8", errors="replace")
        if isinstance(stderr, bytes):
            stderr = stderr.decode("utf-8", errors="replace")
        phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
        error_msg = (
            f"# Agent Timeout\n\n"
            f"{phase_info}"
            f"- **Step**: {step.name}\n"
            f"- **Agent**: {step.agent}\n"
            f"- **Timeout**: {timeout}s\n\n"
            f"Partial stdout ({len(stdout)} chars):\n"
            f"```\n{stdout[:2000] or '(none)'}\n```\n\n"
            f"Stderr:\n```\n{stderr[:2000] or '(none)'}\n```\n"
        )
        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
        logger.error(" [%s] TIMEOUT after %ss — saved to output", step.name, timeout)
        # Chain explicitly so the timeout shows up as the direct cause.
        raise RuntimeError(
            f"Agent '{step.agent}' timed out after {timeout}s at step '{step.name}'. "
            f"Error saved to {run_dir}/v{output_iter}/{step.name}_error.md. "
            f"Try --timeout 0 (unlimited)"
        ) from e
    except RuntimeError as e:
        error_msg = _format_runtime_error_markdown(
            e,
            step_name=step.name,
            agent_name=step.agent,
            phase_name=phase_name,
        )
        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
        logger.error(" [%s] FAILED — saved to output", step.name)
        raise
    # 7. Store output
    step_outputs[step.output_key] = result.output
    step_results[step.output_key] = result
    if not quiet:
        logger.info(
            " [%s] completed (%.1fs, %d chars)",
            step.name, result.duration_seconds, len(result.output),
        )
    # 8. Save to disk
    _save_step_output(run_dir, output_iter, step.name, result.output)
def _execute_parallel_batch(
    batch: list[StepConfig],
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    step_outputs: dict[str, str],
    step_results: dict[str, AgentResult],
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
) -> None:
    """Execute multiple steps in parallel using threads.

    Every step of the batch is rendered against the same snapshot of inputs
    and earlier step outputs, then invoked on its own worker thread. Outputs
    of successful steps are stored in ``step_outputs`` / ``step_results`` and
    saved to disk even if a sibling step fails. In dry-run mode the steps are
    simply executed one after another through ``_execute_step``.

    Raises:
        RuntimeError: When at least one step failed. An ``<step>_error``
            report is saved for each failure and the first collected
            exception is attached as the cause.
    """
    agent_names = ", ".join(s.agent for s in batch)
    logger.info(" [parallel] %d agents: %s", len(batch), agent_names)
    if dry_run:
        for step in batch:
            _execute_step(
                step, config, input_contents, feedback,
                iteration, max_iterations, cwd, timeout, dry_run,
                step_outputs, step_results,
                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
            )
        return
    # Snapshot context before parallel execution (all steps see same state)
    context_snapshot = dict(input_contents)
    context_snapshot.update(step_outputs)
    # Collect results from parallel threads
    local_outputs: dict[str, str] = {}
    local_results: dict[str, AgentResult] = {}
    errors: list[tuple[StepConfig, Exception]] = []
    # Show a single spinner for the batch
    from cross_eval.agent import _Spinner
    spinner = _Spinner(
        f"[parallel] {len(batch)} agents running ({agent_names})..."
    )
    spinner.start()
    batch_start = time.monotonic()

    def _run_one(step: StepConfig) -> tuple[str, str, AgentResult]:
        """Run one step, return (output_key, output, result)."""
        template = resolve_template(step.prompt_template)
        context = _build_context(
            context_snapshot, {}, feedback, iteration, max_iterations,
        )
        if step.context_override:
            context = _apply_context_override(context, step.context_override)
        prompt = render_template(template, context)
        agent_config = config.agents[step.agent]
        result = invoke_agent(
            agent_config, prompt, step.name,
            cwd=cwd, timeout=timeout, quiet=True,
        )
        return step.output_key, result.output, result

    with ThreadPoolExecutor(max_workers=len(batch)) as executor:
        futures = {executor.submit(_run_one, step): step for step in batch}
        for future in as_completed(futures):
            step = futures[future]
            try:
                output_key, output, result = future.result()
                local_results[output_key] = result
                local_outputs[output_key] = output
            except Exception as e:
                # Collected here and reported together after the batch ends.
                errors.append((step, e))
    batch_elapsed = round(time.monotonic() - batch_start, 1)
    # Persist successful outputs even if a sibling step failed.
    for step in batch:
        key = step.output_key
        if key not in local_outputs:
            continue
        step_outputs[key] = local_outputs[key]
        step_results[key] = local_results[key]
        r = local_results[key]
        logger.info(
            " [%s] completed (%.1fs, %d chars)",
            step.name, r.duration_seconds, len(r.output),
        )
        _save_step_output(run_dir, output_iter, step.name, r.output)
    if errors:
        spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
        for failed_step, exc in errors:
            if isinstance(exc, subprocess.TimeoutExpired):
                # Partial output may be bytes, str or None; normalise to str.
                stdout = (exc.stdout or b"") if isinstance(exc.stdout, bytes) else (exc.stdout or "")
                stderr = (exc.stderr or b"") if isinstance(exc.stderr, bytes) else (exc.stderr or "")
                if isinstance(stdout, bytes):
                    stdout = stdout.decode("utf-8", errors="replace")
                if isinstance(stderr, bytes):
                    stderr = stderr.decode("utf-8", errors="replace")
                phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
                error_msg = (
                    f"# Agent Timeout\n\n"
                    f"{phase_info}"
                    f"- **Step**: {failed_step.name}\n"
                    f"- **Agent**: {failed_step.agent}\n"
                    f"- **Timeout**: {timeout}s\n\n"
                    f"Partial stdout ({len(stdout)} chars):\n"
                    f"```\n{stdout[:2000] or '(none)'}\n```\n\n"
                    f"Stderr:\n```\n{stderr[:2000] or '(none)'}\n```\n"
                )
            else:
                error_msg = _format_runtime_error_markdown(
                    exc,
                    step_name=failed_step.name,
                    agent_name=failed_step.agent,
                    phase_name=phase_name,
                )
            _save_step_output(run_dir, output_iter, f"{failed_step.name}_error", error_msg)
            logger.error(" [%s] FAILED — saved to output", failed_step.name)
        failed_steps = ", ".join(step.name for step, _ in errors)
        saved_steps = ", ".join(step.name for step in batch if step.output_key in local_outputs)
        first_error = errors[0][1]
        saved_note = f" Successful outputs were saved for: {saved_steps}." if saved_steps else ""
        # This raise is outside any `except`, so without explicit chaining the
        # worker's exception and traceback would be dropped.
        raise RuntimeError(
            f"Parallel batch failed: {len(errors)}/{len(batch)} steps failed ({failed_steps})."
            f"{saved_note} First error:\n{first_error}"
        ) from first_error
    spinner.stop(f"[parallel] {len(batch)} agents done ({batch_elapsed}s)")
# ---------------------------------------------------------------------------
# Context and template helpers
# ---------------------------------------------------------------------------
def _build_context(
    input_contents: dict[str, str],
    step_outputs: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
) -> dict[str, str]:
    """Assemble the template context as a new dict.

    Later sources win on key clashes: inputs, then step outputs, then the
    reserved ``feedback`` / ``iteration`` / ``max_iterations`` entries (the
    two counters are stored as strings).
    """
    return {
        **input_contents,
        **step_outputs,
        "feedback": feedback,
        "iteration": str(iteration),
        "max_iterations": str(max_iterations),
    }
def _apply_context_override(
    context: dict[str, str],
    overrides: dict[str, str],
) -> dict[str, str]:
    """Return a copy of *context* with the override entries applied.

    Each override value is a template rendered against the original
    *context*, so one override never sees the result of another. The input
    dict is not modified.
    """
    rendered = {
        key: render_template(value_template, context)
        for key, value_template in overrides.items()
    }
    return {**context, **rendered}
def _collect_feedback(
    steps: list[StepConfig],
    step_outputs: dict[str, str],
) -> str:
    """Build the feedback text from the outputs of the verdict steps.

    With exactly one verdict step its raw output is returned. Otherwise each
    non-empty verdict output gets a ``## Review by <agent> (<name>)`` header
    and the sections are joined with a horizontal-rule separator; the result
    is an empty string when there is nothing to join.
    """
    reviewers = [s for s in steps if s.verdict]
    if len(reviewers) == 1:
        (only,) = reviewers
        return step_outputs.get(only.output_key, "")
    outputs = ((s, step_outputs.get(s.output_key, "")) for s in reviewers)
    return "\n\n---\n\n".join(
        f"## Review by {s.agent} ({s.name})\n{text}"
        for s, text in outputs
        if text
    )
def _detect_repeated_aggregate(
    steps: list[StepConfig],
    step_outputs: dict[str, str],
    history: dict[str, int],
    *,
    iteration: int,
    phase_name: str | None = None,
) -> str | None:
    """Warn when the aggregate-review output repeats an earlier iteration.

    Only the first step using the ``default:aggregate-review`` template is
    examined. Its normalized output is looked up in *history* (normalized
    text -> iteration first seen); new outputs are recorded there.

    Returns:
        A warning message when the output was seen before, otherwise ``None``
        (also when there is no aggregate step or its output is empty).
    """
    aggregate_step = next(
        (s for s in steps if s.prompt_template == "default:aggregate-review"),
        None,
    )
    if aggregate_step is None:
        return None
    normalized = _normalize_aggregate_output(
        step_outputs.get(aggregate_step.output_key, ""),
    )
    if not normalized:
        return None
    if normalized not in history:
        history[normalized] = iteration
        return None
    prev_iter = history[normalized]
    phase_prefix = f"[{phase_name}] " if phase_name else ""
    return (
        f"{phase_prefix}Repeated aggregate_review detected at iteration {iteration} "
        f"(same as iteration {prev_iter})."
    )
def _normalize_aggregate_output(output: str) -> str:
    """Lower-case *output* and collapse every whitespace run to one space.

    Leading and trailing whitespace is dropped, so whitespace-only input
    yields an empty string.
    """
    words = output.lower().split()
    return " ".join(words)
# Matches "VERDICT: ESCALATE" (case-insensitive, optional whitespace after
# the colon) anywhere in a step output.
_ESCALATE_PATTERN = re.compile(r"VERDICT:\s*ESCALATE", re.IGNORECASE)
# Matches a heading of two or more '#' followed by " Issue Tracker" (plus the
# rest of that line) and the run of lines directly below it that start and
# end with '|'.
_TRACKER_TABLE_PATTERN = re.compile(
    r"(##+ Issue Tracker[^\n]*\n(?:\|[^\n]+\|\n?)+)", re.DOTALL,
)
def _extract_verdict(output: str, pattern: str) -> str:
    """Classify a step output as ``"ESCALATE"``, ``"PASS"`` or ``"FAIL"``.

    An escalation marker always wins. Otherwise the output is a PASS when the
    step's own *pattern* matches anywhere in it, and a FAIL in every other
    case (including empty output).
    """
    if _ESCALATE_PATTERN.search(output) is not None:
        return "ESCALATE"  # highest priority
    return "PASS" if re.search(pattern, output) else "FAIL"
def _extract_senior_tracker(output: str) -> str:
    """Return the first Issue Tracker heading-plus-table found in *output*.

    An empty string is returned when the output contains no such table.
    """
    found = _TRACKER_TABLE_PATTERN.search(output)
    if found is None:
        return ""
    return found.group(0)
def _extract_escalated_issues(output: str) -> str:
    """Pull the escalation details out of a review output.

    The body of the first "Escalated"/"Escalation" section is preferred; if
    there is none, the body of the "Action Items" section is used. A section
    body runs until the next line starting with ``###`` or the end of the
    text and is returned stripped. With neither section present the result is
    an empty string.
    """
    section_patterns = (
        # Preferred: a dedicated escalation section.
        r"(?:###?\s*Escalat(?:ed|ion).*?\n)(.*?)(?=\n###|\Z)",
        # Fallback: grab the Action Items section.
        r"(?:###?\s*Action Items.*?\n)(.*?)(?=\n###|\Z)",
    )
    for section_pattern in section_patterns:
        found = re.search(section_pattern, output, re.DOTALL | re.IGNORECASE)
        if found:
            return found.group(1).strip()
    return ""
# File-path-like token: a run of word characters, '/' or '\', followed by a
# dot and an extension of 1-5 word characters.
_FP_PATTERN = re.compile(r"[\w/\\]+\.\w{1,5}")
# Issue vocabulary used for fingerprinting. A match must start at a word
# boundary; the trailing \w* lets stems such as "truncat" or "duplicat" also
# cover their inflected forms.
_ISSUE_KEYWORDS = re.compile(
    r"\b(missing|validation|error[\s_-]?handling|unused|import|"
    r"injection|auth(?:entication|orization)?|deprecated|"
    r"leak|overflow|null|undefined|timeout|deadlock|race[\s_-]?condition|"
    r"security|permission|encoding|format|parsing|connection|"
    r"boundary|initialization|cleanup|resource|concurrency|"
    r"exception|crash|hang|corrupt|truncat|duplicat|inconsisten|"
    r"omission|over[\s_-]?engineer|refactor|naming|docstring|"
    r"type[\s_-]?hint|test|coverage|logging|config|performance)\w*",
    re.IGNORECASE,
)
def _issue_fingerprints(text: str) -> set[tuple[str, str]]:
    """Build ``(file_path, issue_keyword)`` fingerprints from feedback text.

    The text is lower-cased first. For every file-path-like token, issue
    keywords are searched in a window reaching 60 characters before and 60
    characters after the token; each keyword found is paired with that path.
    Text without any path yields an empty set.
    """
    lowered = text.lower()
    fingerprints: set[tuple[str, str]] = set()
    for path_match in _FP_PATTERN.finditer(lowered):
        # Clamp the window to the bounds of the text.
        lo = max(0, path_match.start() - 60)
        hi = min(len(lowered), path_match.end() + 60)
        fingerprints.update(
            (path_match.group(), keyword.group().lower())
            for keyword in _ISSUE_KEYWORDS.finditer(lowered[lo:hi])
        )
    return fingerprints
def _detect_auto_escalate(
    feedbacks: list[str],
    current_feedback: str,
    threshold: int = 2,
) -> bool:
    """Detect repeated identical issues across iterations (for auto-escalation).

    Extracts (file_path, issue_keyword) fingerprints from feedback and checks
    whether any single fingerprint of the current feedback also appears in
    >= *threshold* previous iterations. Counting per fingerprint (rather than
    per iteration with any overlap) avoids false positives when different
    iterations each share a *different* issue with the current one.

    Args:
        feedbacks: Feedback texts of the previous iterations.
        current_feedback: Feedback text of the current iteration.
        threshold: Minimum number of previous iterations that must contain
            the same fingerprint.

    Returns:
        ``True`` when at least one fingerprint meets the threshold; ``False``
        otherwise, including when the current feedback has no fingerprints.
    """
    current_fps = _issue_fingerprints(current_feedback)
    if not current_fps:
        return False
    # Fingerprint each previous iteration once, not once per current pair.
    previous_fps = [_issue_fingerprints(prev) for prev in feedbacks]
    return any(
        sum(fp in prev_fps for prev_fps in previous_fps) >= threshold
        for fp in current_fps
    )
def _save_step_output(
    run_dir: Path,
    iteration: int,
    step_name: str,
    content: str,
) -> Path:
    """Write *content* to ``run_dir/v{iteration}/{step_name}.md`` as UTF-8.

    The iteration directory is created on demand; the path of the written
    file is returned.
    """
    target = run_dir.joinpath(f"v{iteration}", f"{step_name}.md")
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(content, encoding="utf-8")
    return target
def _format_runtime_error_markdown(
    exc: Exception,
    *,
    step_name: str,
    agent_name: str,
    phase_name: str | None = None,
) -> str:
    """Render a structured markdown error report for a failed step.

    The report starts with an ``# Agent Error`` heading, a blank line and a
    bullet list (Phase only when *phase_name* is given, then Step and Agent).
    For an ``AgentInvocationError`` the failure type, suggested action,
    command preview and raw error follow; for any other exception only
    ``str(exc)`` is appended in a fenced block.

    Returns:
        The report text, terminated by a newline.
    """
    # The blank line after the heading is added explicitly and the Phase
    # bullet conditionally, so no blanket "drop empty lines" filter is needed
    # (such a filter would also remove the blank line after the heading).
    lines = ["# Agent Error", ""]
    if phase_name:
        lines.append(f"- **Phase**: {phase_name}".rstrip())
    lines.append(f"- **Step**: {step_name}")
    lines.append(f"- **Agent**: {agent_name}")
    if isinstance(exc, AgentInvocationError):
        lines.extend(
            [
                f"- **Failure Type**: {exc.failure_type}",
                f"- **Suggested Action**: {exc.suggested_action}",
                "",
                "## Command",
                "```",
                exc.cmd_preview,
                "```",
                "",
                "## Raw Error",
                "```",
                exc.raw_error,
                "```",
            ],
        )
    else:
        lines.extend(
            [
                "",
                "```",
                str(exc),
                "```",
            ],
        )
    return "\n".join(lines) + "\n"
def _save_report(run_dir: Path, config: PipelineConfig, result: PipelineResult) -> None:
    """Render the final report and write it to ``run_dir/final-report.md``.

    The report text comes from ``build_report``; the target directory is
    created if missing and the saved path is logged.
    """
    report_text = build_report(config, result)
    destination = run_dir / "final-report.md"
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(report_text, encoding="utf-8")
    logger.info("Report saved: %s", destination)