# cross-eval/cross_eval/pipeline.py

"""Main pipeline execution engine."""
from __future__ import annotations

import logging
import os
import re
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path

from cross_eval.agent import AgentInvocationError, invoke_agent
from cross_eval.config import try_reload_config
from cross_eval.models import (
    AgentResult,
    IterationResult,
    PipelineConfig,
    PipelineResult,
    StepConfig,
)
from cross_eval.prompts import render_template, resolve_template, set_language
from cross_eval.report import build_report

logger = logging.getLogger(__name__)


def run_pipeline(
    config: PipelineConfig,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Run a cross-eval pipeline and return its aggregated result.

    A timestamped run directory (``output_dir/{preset}_{datetime}/``) is
    created first.  The work is then delegated to the phased runner when
    ``config.phases`` is truthy and to the simple runner otherwise; all
    remaining arguments are forwarded unchanged.
    """
    run_dir = _make_run_dir(config)
    runner = _run_phased_pipeline if config.phases else _run_simple_pipeline
    return runner(config, run_dir, cwd, dry_run, timeout)


def _make_run_dir(config: PipelineConfig) -> Path:
    """Create and return ``output_dir/{preset_name}_{YYYYmmdd_HHMMSS}``.

    Missing parent directories are created; an already existing directory
    is reused without error.
    """
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    target = config.output_dir / f"{config.preset_name}_{stamp}"
    target.mkdir(parents=True, exist_ok=True)
    return target


def _run_simple_pipeline(
    config: PipelineConfig,
    run_dir: Path,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Execute a simple (non-phased) pipeline.

    The steps in ``config.pipeline`` are run repeatedly, at most
    ``config.max_iterations`` times (the bound is read once, before the
    loop).  Each iteration's feedback is fed into the next one.

    The loop ends early when:

    * a verdict step reports ``ESCALATE``;
    * the verdict is ``PASS`` and ``config.min_iterations`` has been reached;
    * repeated identical issues are detected and no senior/aggregator is
      configured (auto-escalation);
    * ``dry_run`` is set (only the first iteration is rendered).

    Args:
        config: Pipeline configuration; re-loaded at the start of each iteration.
        run_dir: Directory that receives step outputs and the final report.
        cwd: Working directory for agents; defaults to the current directory.
        dry_run: Render prompts only; no report is written.
        timeout: Per-agent timeout forwarded to the step runner.

    Returns:
        The collected iterations together with the final verdict
        (``PASS``, ``ESCALATE`` or ``MAX_ITERATIONS_REACHED``).
    """
    if cwd is None:
        cwd = Path(os.getcwd())

    set_language(config.language)
    input_contents = _load_inputs(config)

    feedback = "(no feedback — first iteration)"
    iterations: list[IterationResult] = []
    start_time = time.monotonic()
    final_verdict = "MAX_ITERATIONS_REACHED"
    aggregate_history: dict[str, int] = {}
    aggregate_warnings: list[str] = []
    escalated_issues: list[str] = []
    all_feedbacks: list[str] = []

    for i in range(1, config.max_iterations + 1):
        # Pick up config / input edits made on disk between iterations.
        config = try_reload_config(config)
        set_language(config.language)
        _refresh_inputs(config, input_contents)

        logger.info("=" * 50)
        logger.info("  Iteration %d/%d", i, config.max_iterations)
        logger.info("=" * 50)

        step_outputs, step_results, verdict = _run_steps(
            config.pipeline, config, input_contents, feedback,
            i, config.max_iterations, cwd, timeout, dry_run,
            run_dir=run_dir, output_iter=i,
        )

        iter_result = IterationResult(
            iteration=i,
            step_results=step_results,
            step_outputs=step_outputs,
            verdict=verdict,
        )
        warning = _detect_repeated_aggregate(
            config.pipeline, step_outputs, aggregate_history, iteration=i,
        )
        if warning:
            iter_result.repeated_aggregate_warning = warning
            aggregate_warnings.append(warning)
            logger.warning("  %s", warning)

        # Keep the previous feedback when this iteration produced none.
        iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
        feedback = iter_result.feedback or feedback
        all_feedbacks.append(feedback)

        # Extract tracker from verdict/review steps for next iteration
        for step in config.pipeline:
            if step.verdict or step.role == "review":
                tracker = _extract_senior_tracker(
                    step_outputs.get(step.output_key, ""),
                )
                if tracker:
                    input_contents["previous_senior_tracker"] = tracker

        iterations.append(iter_result)

        # ESCALATE check (highest priority)
        if verdict == "ESCALATE":
            final_verdict = "ESCALATE"
            # Extract escalation details from verdict step outputs
            for step in config.pipeline:
                if step.verdict:
                    esc = _extract_escalated_issues(
                        step_outputs.get(step.output_key, ""),
                    )
                    if esc:
                        escalated_issues.append(esc)
                        iter_result.escalated_issues = esc
            logger.info("  ESCALATE at iteration %d — stopping loop.", i)
            break

        if verdict == "PASS":
            final_verdict = "PASS"
            if i >= config.min_iterations:
                logger.info("  PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
                break
            logger.info(
                "  PASS at iteration %d, but min_iterations=%d — continuing",
                i, config.min_iterations,
            )
        else:
            # A PASS seen before min_iterations must not survive a later
            # non-PASS iteration; otherwise exhausting the loop on FAILs
            # would still be reported as PASS.
            final_verdict = "MAX_ITERATIONS_REACHED"

        # Auto-escalate: no senior/aggregator + repeated FAIL
        has_aggregator = config.seniors or any(
            s.prompt_template == "default:aggregate-review" for s in config.pipeline
        )
        if (
            verdict == "FAIL"
            and not has_aggregator
            and i >= 2
            and _detect_auto_escalate(all_feedbacks[:-1], feedback)
        ):
            final_verdict = "ESCALATE"
            auto_msg = (
                f"Auto-escalated: same issues detected across {i} iterations "
                f"without resolution (no senior reviewer configured)."
            )
            escalated_issues.append(auto_msg)
            iter_result.escalated_issues = auto_msg
            logger.info("  AUTO-ESCALATE at iteration %d", i)
            break

        if dry_run:
            logger.info("  (dry-run: stopping after iteration 1)")
            break

    total_duration = time.monotonic() - start_time

    pipeline_result = PipelineResult(
        iterations=iterations,
        final_verdict=final_verdict,
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
        escalated_issues=escalated_issues,
    )

    if not dry_run:
        _save_report(run_dir, config, pipeline_result)

    return pipeline_result


def _run_phased_pipeline(
    config: PipelineConfig,
    run_dir: Path,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Execute a multi-phase pipeline (e.g. review-fix).

    Phases from ``config.phases`` run in order.  Each phase repeats its
    steps up to ``phase.max_iterations`` times and is considered converged
    when it collects ``phase.consecutive_pass`` PASS verdicts in a row, or
    immediately when its steps produce no verdict at all.

    An ESCALATE verdict (or an auto-escalation) stops the current phase and
    all remaining phases.  Otherwise the final verdict is decided by the
    last phase: ``PASS`` if it converged, ``MAX_ITERATIONS_REACHED`` if not.

    Args:
        config: Pipeline configuration; re-loaded before every iteration.
        run_dir: Directory that receives step outputs and the final report.
        cwd: Working directory for agents; defaults to the current directory.
        dry_run: Render prompts only; each phase stops after one iteration
            and no report is written.
        timeout: Per-agent timeout forwarded to the step runner.

    Returns:
        All iteration results (numbered globally across phases) plus the
        final verdict, warnings and escalation details.
    """
    if cwd is None:
        cwd = Path(os.getcwd())

    set_language(config.language)
    input_contents = _load_inputs(config)

    iterations: list[IterationResult] = []
    feedback = "(no feedback — first iteration)"
    start_time = time.monotonic()
    final_verdict = "MAX_ITERATIONS_REACHED"
    # Iteration counter shared by all phases; used for output directories (v{n}).
    global_iter = 0
    # Repeat detection is tracked separately per phase name.
    aggregate_history_by_phase: dict[str, dict[str, int]] = {}
    aggregate_warnings: list[str] = []
    escalated_issues: list[str] = []
    # Feedback history is shared across phases (not reset per phase).
    all_feedbacks: list[str] = []
    escalated = False

    for phase_idx, phase in enumerate(config.phases):
        if escalated:
            break

        logger.info("=" * 60)
        logger.info(
            "  Phase: %s (max_iter=%d, consecutive_pass=%d)",
            phase.name, phase.max_iterations, phase.consecutive_pass,
        )
        logger.info("=" * 60)

        consecutive_passes = 0
        phase_converged = False

        # pi is the 1-based iteration number within the current phase.
        for pi in range(1, phase.max_iterations + 1):
            global_iter += 1

            # Pick up config / input edits made on disk between iterations.
            # NOTE(review): `phase` still refers to the phase object taken from
            # the config as it was when the outer loop started iterating.
            config = try_reload_config(config)
            set_language(config.language)
            _refresh_inputs(config, input_contents)

            logger.info("-" * 50)
            logger.info(
                "  [%s] Iteration %d/%d (global: v%d)",
                phase.name, pi, phase.max_iterations, global_iter,
            )
            logger.info("-" * 50)

            step_outputs, step_results, verdict = _run_steps(
                phase.steps, config, input_contents, feedback,
                pi, phase.max_iterations, cwd, timeout, dry_run,
                run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
            )

            iter_result = IterationResult(
                iteration=global_iter,
                step_results=step_results,
                step_outputs=step_outputs,
                verdict=verdict,
                phase_name=phase.name,
            )
            phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
            warning = _detect_repeated_aggregate(
                phase.steps, step_outputs, phase_history, iteration=global_iter,
                phase_name=phase.name,
            )
            if warning:
                iter_result.repeated_aggregate_warning = warning
                aggregate_warnings.append(warning)
                logger.warning("  %s", warning)

            # Keep the previous feedback when this iteration produced none.
            iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
            feedback = iter_result.feedback or feedback
            all_feedbacks.append(feedback)

            # Extract tracker from verdict/review steps
            for step in phase.steps:
                if step.verdict or step.role == "review":
                    tracker = _extract_senior_tracker(
                        step_outputs.get(step.output_key, ""),
                    )
                    if tracker:
                        input_contents["previous_senior_tracker"] = tracker

            iterations.append(iter_result)

            # ESCALATE check
            if verdict == "ESCALATE":
                final_verdict = "ESCALATE"
                for step in phase.steps:
                    if step.verdict:
                        esc = _extract_escalated_issues(
                            step_outputs.get(step.output_key, ""),
                        )
                        if esc:
                            escalated_issues.append(esc)
                            iter_result.escalated_issues = esc
                logger.info(
                    "  [%s] ESCALATE at iteration %d — stopping.",
                    phase.name, pi,
                )
                escalated = True
                break

            # No verdict step in this phase: one pass is all there is to do.
            if verdict is None:
                logger.info(
                    "  [%s] completed (no verdict step; single-pass phase)",
                    phase.name,
                )
                phase_converged = True
                break

            if verdict == "PASS":
                consecutive_passes += 1
                logger.info(
                    "  [%s] PASS (%d/%d consecutive)",
                    phase.name, consecutive_passes, phase.consecutive_pass,
                )
                if consecutive_passes >= phase.consecutive_pass:
                    logger.info(
                        "  [%s] Converged! %d consecutive PASSes.",
                        phase.name, phase.consecutive_pass,
                    )
                    phase_converged = True
                    break
            else:
                # Any non-PASS verdict breaks the streak.
                consecutive_passes = 0

            # Auto-escalate in phased pipeline
            has_aggregator = config.seniors or any(
                s.prompt_template == "default:aggregate-review" for s in phase.steps
            )
            if (
                verdict == "FAIL"
                and not has_aggregator
                and pi >= 2
                and _detect_auto_escalate(all_feedbacks[:-1], feedback)
            ):
                final_verdict = "ESCALATE"
                auto_msg = (
                    f"Auto-escalated: same issues detected across {pi} iterations "
                    f"in phase '{phase.name}' without resolution."
                )
                escalated_issues.append(auto_msg)
                iter_result.escalated_issues = auto_msg
                logger.info("  [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
                escalated = True
                break

            # Dry run: leave this phase after its first iteration; the outer
            # loop still moves on to the next phase.
            if dry_run:
                break

        if escalated:
            break

        if phase_converged:
            logger.info("  Phase '%s' completed: CONVERGED", phase.name)
        else:
            logger.info(
                "  Phase '%s' completed: max iterations (%d) reached",
                phase.name, phase.max_iterations,
            )

        # Only the last phase decides the overall (non-escalated) verdict.
        if phase_idx == len(config.phases) - 1:
            final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"

    total_duration = time.monotonic() - start_time

    pipeline_result = PipelineResult(
        iterations=iterations,
        final_verdict=final_verdict,
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
        escalated_issues=escalated_issues,
    )

    if not dry_run:
        _save_report(run_dir, config, pipeline_result)

    return pipeline_result


# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------

def _load_inputs(config: PipelineConfig) -> dict[str, str]:
    """Materialise ``config.inputs`` as a mapping of key to text.

    String values are taken verbatim; every other value is read as a
    UTF-8 text file through its ``read_text`` method.
    """
    return {
        name: source if isinstance(source, str) else source.read_text(encoding="utf-8")
        for name, source in config.inputs.items()
    }


def _refresh_inputs(
    config: PipelineConfig, input_contents: dict[str, str],
) -> None:
    """Update *input_contents* in place from the current ``config.inputs``.

    Inline strings are copied as-is.  ``Path`` inputs are re-read from disk
    (UTF-8) when the file exists; otherwise the previously cached text is
    left untouched.
    """
    for name, source in config.inputs.items():
        if isinstance(source, str):
            input_contents[name] = source
            continue
        if not isinstance(source, Path) or not source.exists():
            # Nothing readable right now — keep whatever is cached.
            continue
        input_contents[name] = source.read_text(encoding="utf-8")


# ---------------------------------------------------------------------------
# Parallel step grouping
# ---------------------------------------------------------------------------

def _get_step_dependencies(step: StepConfig) -> set[str]:
    """Return every ``{name}`` placeholder used in the step's context overrides.

    Only the *values* of ``step.context_override`` are scanned; a name is a
    run of word characters enclosed in single braces.
    """
    placeholder = re.compile(r"\{(\w+)\}")
    return {
        name
        for template in step.context_override.values()
        for name in placeholder.findall(template)
    }


def _group_parallel_steps(steps: list[StepConfig]) -> list[list[StepConfig]]:
    """Partition *steps* into ordered execution batches.

    A non-parallel step always forms a batch of its own.  Runs of
    consecutive ``parallel=True`` steps share a batch, except that a step
    whose context overrides reference an ``output_key`` produced inside the
    batch being built closes that batch and starts a new one.
    """
    grouped: list[list[StepConfig]] = []
    pending: list[StepConfig] = []
    pending_keys: set[str] = set()

    def flush() -> None:
        """Close the batch under construction (if any) and start afresh."""
        nonlocal pending, pending_keys
        if pending:
            grouped.append(pending)
        pending = []
        pending_keys = set()

    for step in steps:
        if step.parallel:
            # A dependency on a sibling in the open batch forces a split.
            if not pending_keys.isdisjoint(_get_step_dependencies(step)):
                flush()
            pending.append(step)
            pending_keys.add(step.output_key)
        else:
            flush()
            grouped.append([step])

    flush()
    return grouped


# ---------------------------------------------------------------------------
# Step execution
# ---------------------------------------------------------------------------

def _run_steps(
    steps: list[StepConfig],
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
    """Run one iteration's steps and derive the combined verdict.

    Steps are grouped into batches; single-step batches run inline and
    larger batches run in parallel.  Afterwards every verdict step's output
    is classified: ESCALATE from any step wins, otherwise a single FAIL
    makes the whole iteration FAIL, otherwise the result is PASS.  The
    verdict is ``None`` when no step is a verdict step.

    Returns:
        ``(step_outputs, step_results, verdict)`` keyed by ``output_key``.
    """
    step_outputs: dict[str, str] = {}
    step_results: dict[str, AgentResult] = {}

    # Everything after the step/batch argument is identical for both runners.
    shared_args = (
        config, input_contents, feedback,
        iteration, max_iterations, cwd, timeout, dry_run,
        step_outputs, step_results,
    )
    shared_kwargs = {
        "run_dir": run_dir,
        "output_iter": output_iter,
        "phase_name": phase_name,
    }

    for batch in _group_parallel_steps(steps):
        if len(batch) > 1:
            _execute_parallel_batch(batch, *shared_args, **shared_kwargs)
        else:
            _execute_step(batch[0], *shared_args, **shared_kwargs)

    # Fold the per-step verdicts: ALL must PASS; ESCALATE wins over all.
    verdict: str | None = None
    for step in steps:
        if not step.verdict:
            continue
        step_verdict = _extract_verdict(
            step_outputs.get(step.output_key, ""), step.verdict_pattern,
        )
        logger.info("  [%s] verdict: %s", step.name, step_verdict)
        if step_verdict == "ESCALATE" or verdict is None:
            verdict = step_verdict
        elif step_verdict == "FAIL" and verdict != "ESCALATE":
            verdict = "FAIL"

    return step_outputs, step_results, verdict


def _execute_step(
    step: StepConfig,
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    step_outputs: dict[str, str],
    step_results: dict[str, AgentResult],
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
    quiet: bool = False,
) -> None:
    """Execute a single step, updating step_outputs and step_results in place.

    The step's prompt template is resolved and rendered against the current
    context (inputs, earlier step outputs, feedback, iteration counters,
    plus any ``context_override``).  In dry-run mode the prompt is printed
    and a placeholder output is stored; no agent is invoked and
    ``step_results`` is not touched.  Otherwise the agent is invoked and its
    output is stored under ``step.output_key`` and written to
    ``run_dir/v{output_iter}/{step.name}.md``.

    Raises:
        RuntimeError: If the agent times out (raised with the original
            ``subprocess.TimeoutExpired`` as its cause) or if the agent
            invocation itself raises ``RuntimeError`` (re-raised as is).
            In both cases an error report is first written to
            ``run_dir/v{output_iter}/{step.name}_error.md``.
    """
    if not quiet:
        logger.info("  [%s] agent='%s' role='%s'", step.name, step.agent, step.role)

    # 1. Resolve template
    template = resolve_template(step.prompt_template)

    # 2. Build context
    context = _build_context(
        input_contents, step_outputs, feedback, iteration, max_iterations,
    )

    # 3. Apply context overrides
    if step.context_override:
        context = _apply_context_override(context, step.context_override)

    # 4. Render prompt
    prompt = render_template(template, context)

    # 5. Dry run: print and skip
    if dry_run:
        phase_label = f" phase={phase_name}" if phase_name else ""
        print(f"\n--- Step: {step.name} (agent={step.agent}{phase_label}) ---")
        print(prompt)
        print(f"--- end {step.name} ---\n")
        step_outputs[step.output_key] = f"(dry-run: no output for {step.output_key})"
        return

    def _as_text(stream: bytes | str | None) -> str:
        """Normalise captured process output (bytes, str or None) to str."""
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return stream or ""

    # 6. Invoke agent
    agent_config = config.agents[step.agent]
    try:
        result = invoke_agent(
            agent_config, prompt, step.name,
            cwd=cwd, timeout=timeout, quiet=quiet,
        )
    except subprocess.TimeoutExpired as e:
        stdout = _as_text(e.stdout)
        stderr = _as_text(e.stderr)
        phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
        error_msg = (
            f"# Agent Timeout\n\n"
            f"{phase_info}"
            f"- **Step**: {step.name}\n"
            f"- **Agent**: {step.agent}\n"
            f"- **Timeout**: {timeout}s\n\n"
            f"Partial stdout ({len(stdout)} chars):\n"
            f"```\n{stdout[:2000] or '(none)'}\n```\n\n"
            f"Stderr:\n```\n{stderr[:2000] or '(none)'}\n```\n"
        )
        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
        logger.error("  [%s] TIMEOUT after %ss — saved to output", step.name, timeout)
        # Chain explicitly so the traceback shows the timeout as the cause.
        raise RuntimeError(
            f"Agent '{step.agent}' timed out after {timeout}s at step '{step.name}'. "
            f"Error saved to {run_dir}/v{output_iter}/{step.name}_error.md. "
            f"Try --timeout 0 (unlimited)"
        ) from e
    except RuntimeError as e:
        error_msg = _format_runtime_error_markdown(
            e,
            step_name=step.name,
            agent_name=step.agent,
            phase_name=phase_name,
        )
        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
        logger.error("  [%s] FAILED — saved to output", step.name)
        raise

    # 7. Store output
    step_outputs[step.output_key] = result.output
    step_results[step.output_key] = result

    if not quiet:
        logger.info(
            "  [%s] completed (%.1fs, %d chars)",
            step.name, result.duration_seconds, len(result.output),
        )

    # 8. Save to disk
    _save_step_output(run_dir, output_iter, step.name, result.output)


def _execute_parallel_batch(
    batch: list[StepConfig],
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    step_outputs: dict[str, str],
    step_results: dict[str, AgentResult],
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
) -> None:
    """Execute multiple steps in parallel using threads.

    Every step in *batch* is rendered against the same snapshot of the
    context taken before the batch starts, then all agents are invoked
    concurrently (one worker per step).  Outputs of the steps that succeed
    are stored in ``step_outputs`` / ``step_results`` and saved to disk even
    when a sibling step fails.  In dry-run mode the steps are simply
    rendered one after another without threads.

    Raises:
        RuntimeError: If at least one step failed.  An error report is
            written for each failed step first; the first collected error
            is attached as the exception's cause.
    """
    agent_names = ", ".join(s.agent for s in batch)
    logger.info("  [parallel] %d agents: %s", len(batch), agent_names)

    if dry_run:
        for step in batch:
            _execute_step(
                step, config, input_contents, feedback,
                iteration, max_iterations, cwd, timeout, dry_run,
                step_outputs, step_results,
                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
            )
        return

    # Snapshot context before parallel execution (all steps see same state)
    context_snapshot = dict(input_contents)
    context_snapshot.update(step_outputs)

    # Collect results from parallel threads
    local_outputs: dict[str, str] = {}
    local_results: dict[str, AgentResult] = {}
    errors: list[tuple[StepConfig, Exception]] = []

    # Show a single spinner for the batch
    from cross_eval.agent import _Spinner
    spinner = _Spinner(
        f"[parallel] {len(batch)} agents running ({agent_names})..."
    )
    spinner.start()
    batch_start = time.monotonic()

    def _elapsed() -> float:
        """Seconds since the batch started, rounded to one decimal."""
        return round(time.monotonic() - batch_start, 1)

    def _as_text(stream: bytes | str | None) -> str:
        """Normalise captured process output (bytes, str or None) to str."""
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return stream or ""

    def _run_one(step: StepConfig) -> tuple[str, str, AgentResult]:
        """Run one step, return (output_key, output, result)."""
        template = resolve_template(step.prompt_template)
        context = _build_context(
            context_snapshot, {}, feedback, iteration, max_iterations,
        )
        if step.context_override:
            context = _apply_context_override(context, step.context_override)
        prompt = render_template(template, context)

        agent_config = config.agents[step.agent]
        result = invoke_agent(
            agent_config, prompt, step.name,
            cwd=cwd, timeout=timeout, quiet=True,
        )
        return step.output_key, result.output, result

    try:
        with ThreadPoolExecutor(max_workers=len(batch)) as executor:
            futures = {executor.submit(_run_one, step): step for step in batch}
            for future in as_completed(futures):
                step = futures[future]
                try:
                    output_key, output, result = future.result()
                except Exception as e:
                    errors.append((step, e))
                else:
                    local_results[output_key] = result
                    local_outputs[output_key] = output

        batch_elapsed = _elapsed()

        # Persist successful outputs even if a sibling step failed.
        for step in batch:
            key = step.output_key
            if key not in local_outputs:
                continue
            step_outputs[key] = local_outputs[key]
            step_results[key] = local_results[key]
            r = local_results[key]
            logger.info(
                "  [%s] completed (%.1fs, %d chars)",
                step.name, r.duration_seconds, len(r.output),
            )
            _save_step_output(run_dir, output_iter, step.name, r.output)
    except BaseException:
        # Interrupts or I/O failures must not leave the spinner running.
        spinner.stop(f"[parallel] FAILED ({_elapsed()}s)")
        raise

    if errors:
        spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
        for failed_step, exc in errors:
            if isinstance(exc, subprocess.TimeoutExpired):
                stdout = _as_text(exc.stdout)
                stderr = _as_text(exc.stderr)
                phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
                error_msg = (
                    f"# Agent Timeout\n\n"
                    f"{phase_info}"
                    f"- **Step**: {failed_step.name}\n"
                    f"- **Agent**: {failed_step.agent}\n"
                    f"- **Timeout**: {timeout}s\n\n"
                    f"Partial stdout ({len(stdout)} chars):\n"
                    f"```\n{stdout[:2000] or '(none)'}\n```\n\n"
                    f"Stderr:\n```\n{stderr[:2000] or '(none)'}\n```\n"
                )
            else:
                error_msg = _format_runtime_error_markdown(
                    exc,
                    step_name=failed_step.name,
                    agent_name=failed_step.agent,
                    phase_name=phase_name,
                )
            _save_step_output(run_dir, output_iter, f"{failed_step.name}_error", error_msg)
            logger.error("  [%s] FAILED — saved to output", failed_step.name)

        failed_steps = ", ".join(step.name for step, _ in errors)
        saved_steps = ", ".join(step.name for step in batch if step.output_key in local_outputs)
        first_error = errors[0][1]
        saved_note = f" Successful outputs were saved for: {saved_steps}." if saved_steps else ""
        # Chain to the first failure so its traceback is not lost.
        raise RuntimeError(
            f"Parallel batch failed: {len(errors)}/{len(batch)} steps failed ({failed_steps})."
            f"{saved_note} First error:\n{first_error}"
        ) from first_error

    spinner.stop(f"[parallel] {len(batch)} agents done ({batch_elapsed}s)")


# ---------------------------------------------------------------------------
# Context and template helpers
# ---------------------------------------------------------------------------

def _build_context(
    input_contents: dict[str, str],
    step_outputs: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
) -> dict[str, str]:
    """Assemble a fresh template context.

    Later layers override earlier ones: inputs, then step outputs, then the
    reserved ``feedback`` / ``iteration`` / ``max_iterations`` entries (the
    two counters are stored as strings).  Neither argument dict is modified.
    """
    return {
        **input_contents,
        **step_outputs,
        "feedback": feedback,
        "iteration": str(iteration),
        "max_iterations": str(max_iterations),
    }


def _apply_context_override(
    context: dict[str, str],
    overrides: dict[str, str],
) -> dict[str, str]:
    """Return a copy of *context* with each override rendered and applied.

    Every override value is itself a template and is rendered against the
    original, un-overridden *context*, so overrides cannot see each other.
    The input dict is left unchanged.
    """
    rendered = {
        key: render_template(value_template, context)
        for key, value_template in overrides.items()
    }
    return {**context, **rendered}


def _collect_feedback(
    steps: list[StepConfig],
    step_outputs: dict[str, str],
) -> str:
    """Gather the outputs of the verdict steps into one feedback string.

    With exactly one verdict step its raw output is returned (empty string
    if it produced none).  Otherwise each non-empty verdict output gets a
    ``## Review by <agent> (<step name>)`` header and the sections are
    joined with a horizontal-rule separator; no verdict steps yields ``""``.
    """
    verdict_steps = [s for s in steps if s.verdict]

    if len(verdict_steps) == 1:
        (only,) = verdict_steps
        return step_outputs.get(only.output_key, "")

    sections = ((s, step_outputs.get(s.output_key, "")) for s in verdict_steps)
    return "\n\n---\n\n".join(
        f"## Review by {s.agent} ({s.name})\n{text}"
        for s, text in sections
        if text
    )


def _detect_repeated_aggregate(
    steps: list[StepConfig],
    step_outputs: dict[str, str],
    history: dict[str, int],
    *,
    iteration: int,
    phase_name: str | None = None,
) -> str | None:
    """Warn when the aggregate-review output repeats an earlier iteration.

    Only the first step using the ``default:aggregate-review`` template is
    examined.  Its normalized output is looked up in *history* (normalized
    text -> iteration first seen): a hit yields a warning message, a miss
    records the output under *iteration*.  Returns ``None`` when there is no
    such step, its output is empty, or the output is new.
    """
    aggregate = next(
        (s for s in steps if s.prompt_template == "default:aggregate-review"),
        None,
    )
    if aggregate is None:
        return None

    fingerprint = _normalize_aggregate_output(
        step_outputs.get(aggregate.output_key, ""),
    )
    if not fingerprint:
        return None

    if fingerprint not in history:
        history[fingerprint] = iteration
        return None

    first_seen = history[fingerprint]
    prefix = f"[{phase_name}] " if phase_name else ""
    return (
        f"{prefix}Repeated aggregate_review detected at iteration {iteration} "
        f"(same as iteration {first_seen})."
    )


def _normalize_aggregate_output(output: str) -> str:
    """Lower-case *output* and collapse every whitespace run to one space.

    Leading and trailing whitespace is dropped, so whitespace-only input
    normalizes to the empty string.
    """
    words = output.lower().split()
    return " ".join(words)


# Case-insensitive marker that a reviewer asked for human escalation.
_ESCALATE_PATTERN = re.compile(r"VERDICT:\s*ESCALATE", re.IGNORECASE)

# An "Issue Tracker" heading (level 2 or deeper) followed by the pipe-delimited
# table rows directly beneath it.
_TRACKER_TABLE_PATTERN = re.compile(
    r"(##+ Issue Tracker[^\n]*\n(?:\|[^\n]+\|\n?)+)", re.DOTALL,
)


def _extract_verdict(output: str, pattern: str) -> str:
    """Classify *output* as ``"ESCALATE"``, ``"PASS"`` or ``"FAIL"``.

    An escalation marker anywhere in the text takes precedence.  Otherwise
    the text passes when the caller-supplied regex *pattern* is found in it,
    and fails in every other case (including empty output).
    """
    if _ESCALATE_PATTERN.search(output) is not None:
        return "ESCALATE"  # highest priority
    return "PASS" if re.search(pattern, output) else "FAIL"


def _extract_senior_tracker(output: str) -> str:
    """Return the first "Issue Tracker" heading plus table found in *output*.

    The empty string is returned when no such table is present.
    """
    found = _TRACKER_TABLE_PATTERN.search(output)
    if found is None:
        return ""
    return found.group(0)


def _extract_escalated_issues(output: str) -> str:
    """Extract escalation details from senior review output.

    The body of the first ``##``/``###`` section whose heading starts with
    "Escalated"/"Escalation" is returned (case-insensitive).  If there is no
    such section, the "Action Items" section is used as a fallback.  A
    section ends at the next heading of level 2 or deeper, or at the end of
    the text.  Returns the stripped section body, or ``""`` when neither
    section exists.
    """
    # Preferred section first, fallback second.
    for heading in (r"Escalat(?:ed|ion)", r"Action Items"):
        section = re.search(
            # Stop at "\n##" (which also covers "###" and deeper) so that a
            # level-2 section does not swallow the level-2 sections after it.
            rf"(?:###?\s*{heading}.*?\n)(.*?)(?=\n##|\Z)",
            output,
            re.DOTALL | re.IGNORECASE,
        )
        if section:
            return section.group(1).strip()
    return ""


# Something that looks like a file path: path characters, a dot and a short extension.
_FP_PATTERN = re.compile(r"[\w/\\]+\.\w{1,5}")
# Vocabulary of issue categories; a match extends to the end of the word.
_ISSUE_KEYWORDS = re.compile(
    r"\b(missing|validation|error[\s_-]?handling|unused|import|"
    r"injection|auth(?:entication|orization)?|deprecated|"
    r"leak|overflow|null|undefined|timeout|deadlock|race[\s_-]?condition|"
    r"security|permission|encoding|format|parsing|connection|"
    r"boundary|initialization|cleanup|resource|concurrency|"
    r"exception|crash|hang|corrupt|truncat|duplicat|inconsisten|"
    r"omission|over[\s_-]?engineer|refactor|naming|docstring|"
    r"type[\s_-]?hint|test|coverage|logging|config|performance)\w*",
    re.IGNORECASE,
)


def _issue_fingerprints(text: str) -> set[tuple[str, str]]:
    """Build ``(file_path, issue_keyword)`` pairs from feedback text.

    The text is lower-cased first.  Each path-like token is paired with
    every issue keyword that occurs in a window reaching 60 characters
    before and 60 characters after the token.  Text without any path-like
    token yields an empty set.
    """
    lowered = text.lower()
    found: set[tuple[str, str]] = set()

    for path_match in _FP_PATTERN.finditer(lowered):
        lo = max(0, path_match.start() - 60)
        hi = min(len(lowered), path_match.end() + 60)
        # Slice (rather than pos/endpos) so word boundaries are evaluated
        # against the window itself.
        nearby = lowered[lo:hi]
        found.update(
            (path_match.group(), keyword.group().lower())
            for keyword in _ISSUE_KEYWORDS.finditer(nearby)
        )

    return found


def _detect_auto_escalate(
    feedbacks: list[str],
    current_feedback: str,
    threshold: int = 2,
) -> bool:
    """Detect repeated identical issues across iterations (for auto-escalation).

    Extracts (file_path, issue_keyword) fingerprints from the current
    feedback and returns True when at least one of those exact pairs also
    appears in >= *threshold* of the previous feedbacks.  Requiring the same
    pair each time avoids false positives when the same file is mentioned
    for completely different issues across iterations.

    Args:
        feedbacks: Feedback texts of the earlier iterations.
        current_feedback: Feedback text of the iteration being judged.
        threshold: Minimum number of earlier iterations that must contain
            the same fingerprint.

    Returns:
        False when the current feedback has no fingerprints or no single
        fingerprint recurs often enough.
    """
    current_fps = _issue_fingerprints(current_feedback)
    if not current_fps:
        return False

    # Fingerprint each earlier feedback once, then count per individual pair
    # so that different pairs matching in different iterations do not add up.
    previous_fps = [_issue_fingerprints(prev) for prev in feedbacks]
    return any(
        sum(fp in prev for prev in previous_fps) >= threshold
        for fp in current_fps
    )


def _save_step_output(
    run_dir: Path,
    iteration: int,
    step_name: str,
    content: str,
) -> Path:
    """Write *content* to ``run_dir/v{iteration}/{step_name}.md`` and return the path.

    The containing directory is created on demand; an existing file is
    overwritten.  The file is encoded as UTF-8.
    """
    target = run_dir.joinpath(f"v{iteration}", f"{step_name}.md")
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(content, encoding="utf-8")
    return target


def _format_runtime_error_markdown(
    exc: Exception,
    *,
    step_name: str,
    agent_name: str,
    phase_name: str | None = None,
) -> str:
    """Render a structured markdown error report for a failed step.

    The report starts with an ``# Agent Error`` heading, a blank line and a
    bullet list naming the phase (only when *phase_name* is given), step and
    agent.  For an ``AgentInvocationError`` the failure type, suggested
    action, command preview and raw error are appended; for any other
    exception only ``str(exc)`` is shown in a code fence.  The result always
    ends with a newline.
    """
    # Build the header explicitly instead of filtering out falsy lines:
    # filtering would also drop the blank line that separates heading and list.
    lines = ["# Agent Error", ""]
    if phase_name:
        lines.append(f"- **Phase**: {phase_name}")
    lines.append(f"- **Step**: {step_name}")
    lines.append(f"- **Agent**: {agent_name}")

    if isinstance(exc, AgentInvocationError):
        lines.extend(
            [
                f"- **Failure Type**: {exc.failure_type}",
                f"- **Suggested Action**: {exc.suggested_action}",
                "",
                "## Command",
                "```",
                exc.cmd_preview,
                "```",
                "",
                "## Raw Error",
                "```",
                exc.raw_error,
                "```",
            ],
        )
    else:
        lines.extend(
            [
                "",
                "```",
                str(exc),
                "```",
            ],
        )

    return "\n".join(lines) + "\n"


def _save_report(run_dir: Path, config: PipelineConfig, result: PipelineResult) -> None:
    """Render the final report and write it to ``run_dir/final-report.md``.

    The report text comes from ``build_report``; the target directory is
    created if necessary and the file is written as UTF-8.
    """
    markdown = build_report(config, result)
    destination = run_dir / "final-report.md"
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(markdown, encoding="utf-8")
    logger.info("Report saved: %s", destination)