initial commit

2026-03-11 21:53:14 +09:00
commit ee4f1a07ef
42 changed files with 4533 additions and 0 deletions
--- a/cross_eval/pipeline.py
+++ b/cross_eval/pipeline.py
@@ -0,0 +1,700 @@
+"""Main pipeline execution engine."""
+from __future__ import annotations
+
+import logging
+import os
+import re
+import subprocess
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime
+from pathlib import Path
+
+from cross_eval.agent import invoke_agent
+from cross_eval.config import try_reload_config
+from cross_eval.models import (
+    AgentResult,
+    IterationResult,
+    PipelineConfig,
+    PipelineResult,
+    StepConfig,
+)
+from cross_eval.prompts import render_template, resolve_template, set_language
+from cross_eval.report import build_report
+
+logger = logging.getLogger(__name__)
+
+
+def run_pipeline(
+    config: PipelineConfig,
+    cwd: Path | None = None,
+    dry_run: bool = False,
+    timeout: int | None = None,
+) -> PipelineResult:
+    """Run a cross-eval pipeline end to end and return its result.
+
+    A timestamped run directory is created first. The work is then handed to
+    the phased runner when ``config.phases`` is truthy and to the simple
+    runner otherwise; both receive the same arguments.
+    """
+    run_dir = _make_run_dir(config)
+    runner = _run_phased_pipeline if config.phases else _run_simple_pipeline
+    return runner(config, run_dir, cwd, dry_run, timeout)
+
+
+def _make_run_dir(config: PipelineConfig) -> Path:
+    """Create and return ``<output_dir>/<preset_name>_<YYYYmmdd_HHMMSS>``.
+
+    The timestamp is taken from the local clock; missing parent directories
+    are created and an already existing directory is reused.
+    """
+    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
+    target = config.output_dir / f"{config.preset_name}_{stamp}"
+    target.mkdir(parents=True, exist_ok=True)
+    return target
+
+
+def _run_simple_pipeline(
+    config: PipelineConfig,
+    run_dir: Path,
+    cwd: Path | None = None,
+    dry_run: bool = False,
+    timeout: int | None = None,
+) -> PipelineResult:
+    """Execute a simple (non-phased) pipeline.
+
+    Runs ``config.pipeline`` repeatedly, feeding the feedback collected from
+    each iteration's verdict steps into the next iteration. The loop stops
+    early on a PASS once ``config.min_iterations`` has been reached, or after
+    the first iteration when ``dry_run`` is set.
+
+    Args:
+        config: Pipeline configuration; it is re-read via
+            ``try_reload_config`` at the start of every iteration.
+        run_dir: Directory that receives step outputs and the final report.
+        cwd: Working directory passed to the steps; defaults to the current
+            working directory.
+        dry_run: When true, run a single iteration and skip the report.
+        timeout: Per-agent timeout forwarded to the step runner.
+
+    Returns:
+        A ``PipelineResult`` whose ``final_verdict`` is ``"PASS"`` only when
+        the last executed iteration passed, and
+        ``"MAX_ITERATIONS_REACHED"`` otherwise.
+    """
+    if cwd is None:
+        cwd = Path(os.getcwd())
+
+    set_language(config.language)
+    input_contents = _load_inputs(config)
+
+    feedback = "(no feedback — first iteration)"
+    iterations: list[IterationResult] = []
+    start_time = time.monotonic()
+    final_verdict = "MAX_ITERATIONS_REACHED"
+    aggregate_history: dict[str, int] = {}
+    aggregate_warnings: list[str] = []
+
+    # NOTE: the iteration bound is fixed here, before any config reload below.
+    for i in range(1, config.max_iterations + 1):
+        # Pick up config / input edits made on disk while the pipeline runs.
+        config = try_reload_config(config)
+        set_language(config.language)
+        _refresh_inputs(config, input_contents)
+
+        logger.info("=" * 50)
+        logger.info("  Iteration %d/%d", i, config.max_iterations)
+        logger.info("=" * 50)
+
+        step_outputs, step_results, verdict = _run_steps(
+            config.pipeline, config, input_contents, feedback,
+            i, config.max_iterations, cwd, timeout, dry_run,
+            run_dir=run_dir, output_iter=i,
+        )
+
+        iter_result = IterationResult(
+            iteration=i,
+            step_results=step_results,
+            step_outputs=step_outputs,
+            verdict=verdict,
+        )
+        warning = _detect_repeated_aggregate(
+            config.pipeline, step_outputs, aggregate_history, iteration=i,
+        )
+        if warning:
+            iter_result.repeated_aggregate_warning = warning
+            aggregate_warnings.append(warning)
+            logger.warning("  %s", warning)
+
+        # Keep the previous feedback when this iteration produced none.
+        iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
+        feedback = iter_result.feedback or feedback
+        iterations.append(iter_result)
+
+        if verdict == "PASS":
+            final_verdict = "PASS"
+            if i >= config.min_iterations:
+                logger.info("  PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
+                break
+            logger.info(
+                "  PASS at iteration %d, but min_iterations=%d — continuing",
+                i, config.min_iterations,
+            )
+        else:
+            # A failing iteration invalidates any PASS recorded earlier while
+            # min_iterations had not been reached; without this reset the run
+            # would still be reported as PASS after exhausting max_iterations.
+            final_verdict = "MAX_ITERATIONS_REACHED"
+
+        if dry_run:
+            logger.info("  (dry-run: stopping after iteration 1)")
+            break
+
+    total_duration = time.monotonic() - start_time
+
+    pipeline_result = PipelineResult(
+        iterations=iterations,
+        final_verdict=final_verdict,
+        total_duration=round(total_duration, 1),
+        run_dir=run_dir,
+        repeated_aggregate_warnings=aggregate_warnings,
+    )
+
+    if not dry_run:
+        _save_report(run_dir, config, pipeline_result)
+
+    return pipeline_result
+
+
+def _run_phased_pipeline(
+    config: PipelineConfig,
+    run_dir: Path,
+    cwd: Path | None = None,
+    dry_run: bool = False,
+    timeout: int | None = None,
+) -> PipelineResult:
+    """Execute a multi-phase pipeline (e.g. review-fix).
+
+    Phases run in the order given by ``config.phases``. Each phase repeats
+    its steps up to ``phase.max_iterations`` times and is considered
+    converged once ``phase.consecutive_pass`` PASS verdicts occur in a row.
+    Feedback is carried across iterations and across phases. The overall
+    verdict is decided by the last phase only: "PASS" if it converged,
+    "MAX_ITERATIONS_REACHED" otherwise. The report is written unless
+    ``dry_run`` is set.
+    """
+    if cwd is None:
+        cwd = Path(os.getcwd())
+
+    set_language(config.language)
+    input_contents = _load_inputs(config)
+
+    iterations: list[IterationResult] = []
+    feedback = "(no feedback — first iteration)"
+    start_time = time.monotonic()
+    final_verdict = "MAX_ITERATIONS_REACHED"
+    # Iteration counter shared by all phases; used for output directories (v1, v2, ...).
+    global_iter = 0
+    # Repeat-detection history is kept separately per phase name.
+    aggregate_history_by_phase: dict[str, dict[str, int]] = {}
+    aggregate_warnings: list[str] = []
+
+    # NOTE: the phase list is taken from the config as it was before any reload below.
+    for phase_idx, phase in enumerate(config.phases):
+        logger.info("=" * 60)
+        logger.info(
+            "  Phase: %s (max_iter=%d, consecutive_pass=%d)",
+            phase.name, phase.max_iterations, phase.consecutive_pass,
+        )
+        logger.info("=" * 60)
+
+        consecutive_passes = 0
+        phase_converged = False
+
+        for pi in range(1, phase.max_iterations + 1):
+            global_iter += 1
+
+            # Pick up config / input edits made on disk while the pipeline runs.
+            config = try_reload_config(config)
+            set_language(config.language)
+            _refresh_inputs(config, input_contents)
+
+            logger.info("-" * 50)
+            logger.info(
+                "  [%s] Iteration %d/%d (global: v%d)",
+                phase.name, pi, phase.max_iterations, global_iter,
+            )
+            logger.info("-" * 50)
+
+            # Prompts see the phase-local iteration; outputs are saved under the global one.
+            step_outputs, step_results, verdict = _run_steps(
+                phase.steps, config, input_contents, feedback,
+                pi, phase.max_iterations, cwd, timeout, dry_run,
+                run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
+            )
+
+            iter_result = IterationResult(
+                iteration=global_iter,
+                step_results=step_results,
+                step_outputs=step_outputs,
+                verdict=verdict,
+                phase_name=phase.name,
+            )
+            phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
+            warning = _detect_repeated_aggregate(
+                phase.steps, step_outputs, phase_history, iteration=global_iter,
+                phase_name=phase.name,
+            )
+            if warning:
+                iter_result.repeated_aggregate_warning = warning
+                aggregate_warnings.append(warning)
+                logger.warning("  %s", warning)
+
+            # Keep the previous feedback when this iteration produced none.
+            iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
+            feedback = iter_result.feedback or feedback
+            iterations.append(iter_result)
+
+            if verdict == "PASS":
+                consecutive_passes += 1
+                logger.info(
+                    "  [%s] PASS (%d/%d consecutive)",
+                    phase.name, consecutive_passes, phase.consecutive_pass,
+                )
+                if consecutive_passes >= phase.consecutive_pass:
+                    logger.info(
+                        "  [%s] Converged! %d consecutive PASSes.",
+                        phase.name, phase.consecutive_pass,
+                    )
+                    phase_converged = True
+                    break
+            else:
+                # Any non-PASS verdict restarts the streak.
+                consecutive_passes = 0
+
+            # A dry run executes only the first iteration of each phase.
+            if dry_run:
+                break
+
+        if phase_converged:
+            logger.info("  Phase '%s' completed: CONVERGED", phase.name)
+        else:
+            # Also reached when a dry run stopped the phase after one iteration.
+            logger.info(
+                "  Phase '%s' completed: max iterations (%d) reached",
+                phase.name, phase.max_iterations,
+            )
+
+        # Only the last phase determines the overall verdict.
+        if phase_idx == len(config.phases) - 1:
+            final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"
+
+    total_duration = time.monotonic() - start_time
+
+    pipeline_result = PipelineResult(
+        iterations=iterations,
+        final_verdict=final_verdict,
+        total_duration=round(total_duration, 1),
+        run_dir=run_dir,
+        repeated_aggregate_warnings=aggregate_warnings,
+    )
+
+    if not dry_run:
+        _save_report(run_dir, config, pipeline_result)
+
+    return pipeline_result
+
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+def _load_inputs(config: PipelineConfig) -> dict[str, str]:
+    """Return the text of every configured input, keyed by input name.
+
+    String values are used verbatim; any other value is read from disk as
+    UTF-8 through its ``read_text`` method.
+    """
+    return {
+        name: source if isinstance(source, str) else source.read_text(encoding="utf-8")
+        for name, source in config.inputs.items()
+    }
+
+
+def _refresh_inputs(
+    config: PipelineConfig, input_contents: dict[str, str],
+) -> None:
+    """Update ``input_contents`` in place from the configured inputs.
+
+    Inline strings are copied as-is and existing ``Path`` inputs are re-read
+    as UTF-8. Entries whose file is missing (or whose value is neither a
+    string nor a ``Path``) keep whatever content they already had.
+    """
+    for name, source in config.inputs.items():
+        if isinstance(source, str):
+            fresh = source
+        elif isinstance(source, Path) and source.exists():
+            fresh = source.read_text(encoding="utf-8")
+        else:
+            continue
+        input_contents[name] = fresh
+
+
+# ---------------------------------------------------------------------------
+# Parallel step grouping
+# ---------------------------------------------------------------------------
+
+def _get_step_dependencies(step: StepConfig) -> set[str]:
+    """Collect every ``{name}`` placeholder used in the step's context_override values."""
+    return {
+        found.group(1)
+        for template in step.context_override.values()
+        for found in re.finditer(r"\{(\w+)\}", template)
+    }
+
+
+def _group_parallel_steps(steps: list[StepConfig]) -> list[list[StepConfig]]:
+    """Partition steps into ordered execution batches.
+
+    A non-parallel step always forms a batch of its own. Neighbouring steps
+    flagged ``parallel`` share a batch, except that a step whose
+    context_override references an output_key produced inside the pending
+    batch closes that batch and opens a new one.
+    """
+    grouped: list[list[StepConfig]] = []
+    pending: list[StepConfig] = []
+    pending_keys: set[str] = set()
+
+    def _flush() -> None:
+        """Emit the pending batch (if any) and start an empty one."""
+        nonlocal pending, pending_keys
+        if pending:
+            grouped.append(pending)
+        pending = []
+        pending_keys = set()
+
+    for step in steps:
+        if step.parallel:
+            # A dependency on something in the pending batch forces a split.
+            if _get_step_dependencies(step) & pending_keys:
+                _flush()
+            pending.append(step)
+            pending_keys.add(step.output_key)
+        else:
+            _flush()
+            grouped.append([step])
+
+    _flush()
+    return grouped
+
+
+# ---------------------------------------------------------------------------
+# Step execution
+# ---------------------------------------------------------------------------
+
+def _run_steps(
+    steps: list[StepConfig],
+    config: PipelineConfig,
+    input_contents: dict[str, str],
+    feedback: str,
+    iteration: int,
+    max_iterations: int,
+    cwd: Path,
+    timeout: int | None,
+    dry_run: bool,
+    *,
+    run_dir: Path,
+    output_iter: int,
+    phase_name: str | None = None,
+) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
+    """Run one iteration's steps batch by batch and derive the verdict.
+
+    Returns ``(step_outputs, step_results, verdict)``. The verdict is ``None``
+    when no step is flagged as a verdict step; otherwise it is "FAIL" as soon
+    as any verdict step fails, and the first verdict step's result if not.
+    """
+    step_outputs: dict[str, str] = {}
+    step_results: dict[str, AgentResult] = {}
+
+    for batch in _group_parallel_steps(steps):
+        # A batch of one runs inline; anything else goes through the thread pool.
+        if len(batch) == 1:
+            runner, target = _execute_step, batch[0]
+        else:
+            runner, target = _execute_parallel_batch, batch
+        runner(
+            target, config, input_contents, feedback,
+            iteration, max_iterations, cwd, timeout, dry_run,
+            step_outputs, step_results,
+            run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
+        )
+
+    # Gather the verdict of every verdict step (ALL must PASS).
+    verdicts: list[str] = []
+    for step in steps:
+        if not step.verdict:
+            continue
+        step_verdict = _extract_verdict(
+            step_outputs.get(step.output_key, ""), step.verdict_pattern,
+        )
+        logger.info("  [%s] verdict: %s", step.name, step_verdict)
+        verdicts.append(step_verdict)
+
+    overall: str | None
+    if not verdicts:
+        overall = None
+    elif "FAIL" in verdicts[1:]:
+        overall = "FAIL"
+    else:
+        overall = verdicts[0]
+
+    return step_outputs, step_results, overall
+
+
+def _decode_partial_stream(stream: bytes | str | None) -> str:
+    """Return a captured stdout/stderr value as text ("" when nothing was captured)."""
+    if not stream:
+        return ""
+    if isinstance(stream, bytes):
+        return stream.decode("utf-8", errors="replace")
+    return stream
+
+
+def _execute_step(
+    step: StepConfig,
+    config: PipelineConfig,
+    input_contents: dict[str, str],
+    feedback: str,
+    iteration: int,
+    max_iterations: int,
+    cwd: Path,
+    timeout: int | None,
+    dry_run: bool,
+    step_outputs: dict[str, str],
+    step_results: dict[str, AgentResult],
+    *,
+    run_dir: Path,
+    output_iter: int,
+    phase_name: str | None = None,
+    quiet: bool = False,
+) -> None:
+    """Execute a single step, updating step_outputs and step_results in place.
+
+    The prompt is rendered from the inputs, the outputs produced so far, the
+    feedback and the iteration counters, with the step's context overrides
+    applied on top. In dry-run mode the prompt is printed and a placeholder
+    output is recorded instead of invoking the agent. Otherwise the agent
+    output is stored under ``step.output_key`` and written to
+    ``run_dir/v{output_iter}/{step.name}.md``.
+
+    Raises:
+        RuntimeError: if the agent times out (chained to the original
+            ``subprocess.TimeoutExpired``) or if the agent invocation itself
+            raises ``RuntimeError``. In both cases an
+            ``{step.name}_error.md`` file is saved before raising.
+    """
+    if not quiet:
+        logger.info("  [%s] agent='%s' role='%s'", step.name, step.agent, step.role)
+
+    # 1. Resolve template
+    template = resolve_template(step.prompt_template)
+
+    # 2. Build context
+    context = _build_context(
+        input_contents, step_outputs, feedback, iteration, max_iterations,
+    )
+
+    # 3. Apply context overrides
+    if step.context_override:
+        context = _apply_context_override(context, step.context_override)
+
+    # 4. Render prompt
+    prompt = render_template(template, context)
+
+    # 5. Dry run: print and skip
+    if dry_run:
+        phase_label = f" phase={phase_name}" if phase_name else ""
+        print(f"\n--- Step: {step.name} (agent={step.agent}{phase_label}) ---")
+        print(prompt)
+        print(f"--- end {step.name} ---\n")
+        step_outputs[step.output_key] = f"(dry-run: no output for {step.output_key})"
+        return
+
+    # 6. Invoke agent
+    agent_config = config.agents[step.agent]
+    phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
+    try:
+        result = invoke_agent(
+            agent_config, prompt, step.name,
+            cwd=cwd, timeout=timeout, quiet=quiet,
+        )
+    except subprocess.TimeoutExpired as e:
+        # The captured streams may be bytes, str or None depending on how the
+        # subprocess was started; normalise both to text before reporting.
+        stdout = _decode_partial_stream(e.stdout)
+        stderr = _decode_partial_stream(e.stderr)
+        error_msg = (
+            f"# Agent Timeout\n\n"
+            f"{phase_info}"
+            f"- **Step**: {step.name}\n"
+            f"- **Agent**: {step.agent}\n"
+            f"- **Timeout**: {timeout}s\n\n"
+            f"Partial stdout ({len(stdout)} chars):\n"
+            f"```\n{stdout[:2000] or '(none)'}\n```\n\n"
+            f"Stderr:\n```\n{stderr[:2000] or '(none)'}\n```\n"
+        )
+        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
+        logger.error("  [%s] TIMEOUT after %ss — saved to output", step.name, timeout)
+        # Chain explicitly so the traceback shows the timeout as the cause.
+        raise RuntimeError(
+            f"Agent '{step.agent}' timed out after {timeout}s at step '{step.name}'. "
+            f"Error saved to {run_dir}/v{output_iter}/{step.name}_error.md. "
+            f"Try --timeout 0 (unlimited)"
+        ) from e
+    except RuntimeError as e:
+        error_msg = (
+            f"# Agent Error\n\n{phase_info}"
+            f"- **Step**: {step.name}\n- **Agent**: {step.agent}\n\n```\n{e}\n```\n"
+        )
+        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
+        logger.error("  [%s] FAILED — saved to output", step.name)
+        raise
+
+    # 7. Store output
+    step_outputs[step.output_key] = result.output
+    step_results[step.output_key] = result
+
+    if not quiet:
+        logger.info(
+            "  [%s] completed (%.1fs, %d chars)",
+            step.name, result.duration_seconds, len(result.output),
+        )
+
+    # 8. Save to disk
+    _save_step_output(run_dir, output_iter, step.name, result.output)
+
+
+def _execute_parallel_batch(
+    batch: list[StepConfig],
+    config: PipelineConfig,
+    input_contents: dict[str, str],
+    feedback: str,
+    iteration: int,
+    max_iterations: int,
+    cwd: Path,
+    timeout: int | None,
+    dry_run: bool,
+    step_outputs: dict[str, str],
+    step_results: dict[str, AgentResult],
+    *,
+    run_dir: Path,
+    output_iter: int,
+    phase_name: str | None = None,
+) -> None:
+    """Execute multiple steps in parallel using threads.
+
+    Every step renders its prompt from the same snapshot of inputs and prior
+    outputs, so steps inside one batch never see each other's results. On
+    success the outputs are merged into ``step_outputs`` / ``step_results``
+    in batch order and written to disk. In dry-run mode the steps are simply
+    run one after another through ``_execute_step``.
+
+    Raises:
+        Exception: the first failure reported by a worker, re-raised
+            unchanged after an ``{step.name}_error.md`` file has been saved
+            for every failed step.
+    """
+    agent_names = ", ".join(s.agent for s in batch)
+    logger.info("  [parallel] %d agents: %s", len(batch), agent_names)
+
+    if dry_run:
+        for step in batch:
+            _execute_step(
+                step, config, input_contents, feedback,
+                iteration, max_iterations, cwd, timeout, dry_run,
+                step_outputs, step_results,
+                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
+            )
+        return
+
+    # Snapshot context before parallel execution (all steps see same state)
+    context_snapshot = {**input_contents, **step_outputs}
+
+    # Collect results from parallel threads
+    local_outputs: dict[str, str] = {}
+    local_results: dict[str, AgentResult] = {}
+    failures: list[tuple[StepConfig, Exception]] = []
+
+    # Show a single spinner for the batch
+    from cross_eval.agent import _Spinner
+    spinner = _Spinner(
+        f"[parallel] {len(batch)} agents running ({agent_names})..."
+    )
+    spinner.start()
+    batch_start = time.monotonic()
+
+    def _elapsed() -> float:
+        """Seconds since the batch started, rounded for display."""
+        return round(time.monotonic() - batch_start, 1)
+
+    def _run_one(step: StepConfig) -> tuple[str, str, AgentResult]:
+        """Run one step, return (output_key, output, result)."""
+        template = resolve_template(step.prompt_template)
+        context = _build_context(
+            context_snapshot, {}, feedback, iteration, max_iterations,
+        )
+        if step.context_override:
+            context = _apply_context_override(context, step.context_override)
+        prompt = render_template(template, context)
+
+        agent_config = config.agents[step.agent]
+        result = invoke_agent(
+            agent_config, prompt, step.name,
+            cwd=cwd, timeout=timeout, quiet=True,
+        )
+        return step.output_key, result.output, result
+
+    try:
+        with ThreadPoolExecutor(max_workers=len(batch)) as executor:
+            futures = {executor.submit(_run_one, step): step for step in batch}
+            for future in as_completed(futures):
+                step = futures[future]
+                try:
+                    output_key, output, result = future.result()
+                except Exception as e:
+                    # Remember which step failed so it can be reported below.
+                    failures.append((step, e))
+                else:
+                    local_results[output_key] = result
+                    local_outputs[output_key] = output
+    except BaseException:
+        # Interrupted while waiting (e.g. Ctrl-C): do not leave the spinner running.
+        spinner.stop(f"[parallel] FAILED ({_elapsed()}s)")
+        raise
+
+    batch_elapsed = _elapsed()
+
+    if failures:
+        spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
+        # Mirror _execute_step: persist an error file per failed step so a
+        # parallel failure leaves the same trace on disk as a sequential one.
+        phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
+        for failed_step, err in failures:
+            error_msg = (
+                f"# Agent Error\n\n{phase_info}"
+                f"- **Step**: {failed_step.name}\n"
+                f"- **Agent**: {failed_step.agent}\n\n```\n{err}\n```\n"
+            )
+            _save_step_output(
+                run_dir, output_iter, f"{failed_step.name}_error", error_msg,
+            )
+            logger.error("  [%s] FAILED — saved to output", failed_step.name)
+        raise failures[0][1]
+
+    spinner.stop(f"[parallel] {len(batch)} agents done ({batch_elapsed}s)")
+
+    # Merge results
+    for step in batch:
+        key = step.output_key
+        r = local_results[key]
+        step_outputs[key] = local_outputs[key]
+        step_results[key] = r
+        logger.info(
+            "  [%s] completed (%.1fs, %d chars)",
+            step.name, r.duration_seconds, len(r.output),
+        )
+        _save_step_output(run_dir, output_iter, step.name, r.output)
+
+
+# ---------------------------------------------------------------------------
+# Context and template helpers
+# ---------------------------------------------------------------------------
+
+def _build_context(
+    input_contents: dict[str, str],
+    step_outputs: dict[str, str],
+    feedback: str,
+    iteration: int,
+    max_iterations: int,
+) -> dict[str, str]:
+    """Assemble the mapping handed to the template renderer.
+
+    Later sources win on key clashes: inputs, then step outputs, then the
+    built-in ``feedback`` / ``iteration`` / ``max_iterations`` entries (the
+    two counters are stored as strings).
+    """
+    return {
+        **input_contents,
+        **step_outputs,
+        "feedback": feedback,
+        "iteration": str(iteration),
+        "max_iterations": str(max_iterations),
+    }
+
+
+def _apply_context_override(
+    context: dict[str, str],
+    overrides: dict[str, str],
+) -> dict[str, str]:
+    """Return a copy of ``context`` with the override entries applied.
+
+    Each override value is itself a template and is rendered against the
+    original, un-overridden context, so overrides cannot see one another.
+    """
+    rendered = {
+        key: render_template(value_template, context)
+        for key, value_template in overrides.items()
+    }
+    return {**context, **rendered}
+
+
+def _collect_feedback(
+    steps: list[StepConfig],
+    step_outputs: dict[str, str],
+) -> str:
+    """Build the feedback text from the outputs of the verdict steps.
+
+    Exactly one verdict step: its raw output ("" if it produced none).
+    Any other count: one "## Review by <agent> (<name>)" section per verdict
+    step with non-empty output, separated by horizontal rules; "" when there
+    is nothing to report.
+    """
+    reviewers = [s for s in steps if s.verdict]
+    if len(reviewers) == 1:
+        (only,) = reviewers
+        return step_outputs.get(only.output_key, "")
+
+    reviews = ((s, step_outputs.get(s.output_key, "")) for s in reviewers)
+    return "\n\n---\n\n".join(
+        f"## Review by {s.agent} ({s.name})\n{text}"
+        for s, text in reviews
+        if text
+    )
+
+
+def _detect_repeated_aggregate(
+    steps: list[StepConfig],
+    step_outputs: dict[str, str],
+    history: dict[str, int],
+    *,
+    iteration: int,
+    phase_name: str | None = None,
+) -> str | None:
+    """Warn when the aggregate-review output repeats an earlier iteration.
+
+    Only the first step using the "default:aggregate-review" template is
+    inspected. Its normalized output is looked up in ``history``: a new
+    output is recorded there under ``iteration`` and ``None`` is returned; a
+    known one yields a warning naming the iteration it was first seen in.
+    Empty output is ignored.
+    """
+    aggregate_step = next(
+        (s for s in steps if s.prompt_template == "default:aggregate-review"),
+        None,
+    )
+    if aggregate_step is None:
+        return None
+
+    fingerprint = _normalize_aggregate_output(
+        step_outputs.get(aggregate_step.output_key, ""),
+    )
+    if not fingerprint:
+        return None
+
+    if fingerprint not in history:
+        history[fingerprint] = iteration
+        return None
+
+    first_seen = history[fingerprint]
+    phase_prefix = f"[{phase_name}] " if phase_name else ""
+    return (
+        f"{phase_prefix}Repeated aggregate_review detected at iteration {iteration} "
+        f"(same as iteration {first_seen})."
+    )
+
+
+def _normalize_aggregate_output(output: str) -> str:
+    """Lower-case ``output`` and collapse every whitespace run to one space."""
+    words = output.lower().split()
+    return " ".join(words)
+
+
+def _extract_verdict(output: str, pattern: str) -> str:
+    """Return "PASS" when ``pattern`` matches anywhere in ``output``, else "FAIL"."""
+    return "PASS" if re.search(pattern, output) else "FAIL"
+
+
+def _save_step_output(
+    run_dir: Path,
+    iteration: int,
+    step_name: str,
+    content: str,
+) -> Path:
+    """Write ``content`` to ``run_dir/v{iteration}/{step_name}.md`` and return that path.
+
+    Missing parent directories are created; the file is written as UTF-8 and
+    replaces any previous file of the same name.
+    """
+    destination = run_dir.joinpath(f"v{iteration}", f"{step_name}.md")
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    destination.write_text(content, encoding="utf-8")
+    return destination
+
+
+def _save_report(run_dir: Path, config: PipelineConfig, result: PipelineResult) -> None:
+    """Render the final markdown report and write it to ``run_dir/final-report.md``."""
+    report_text = build_report(config, result)
+    # The report sits directly inside run_dir, so that is the directory to ensure.
+    run_dir.mkdir(parents=True, exist_ok=True)
+    destination = run_dir / "final-report.md"
+    destination.write_text(report_text, encoding="utf-8")
+    logger.info("Report saved: %s", destination)