initial commit
This commit is contained in:
700
cross_eval/pipeline.py
Normal file
700
cross_eval/pipeline.py
Normal file
@@ -0,0 +1,700 @@
|
||||
"""Main pipeline execution engine."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from cross_eval.agent import invoke_agent
|
||||
from cross_eval.config import try_reload_config
|
||||
from cross_eval.models import (
|
||||
AgentResult,
|
||||
IterationResult,
|
||||
PipelineConfig,
|
||||
PipelineResult,
|
||||
StepConfig,
|
||||
)
|
||||
from cross_eval.prompts import render_template, resolve_template, set_language
|
||||
from cross_eval.report import build_report
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def run_pipeline(
    config: PipelineConfig,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Run the cross-eval pipeline described by *config*.

    A timestamped run directory is created first, then the work is handed
    to the phased runner when ``config.phases`` is non-empty and to the
    simple runner otherwise.

    Args:
        config: Pipeline configuration.
        cwd: Working directory, forwarded unchanged to the selected runner.
        dry_run: Forwarded unchanged to the selected runner.
        timeout: Forwarded unchanged to the selected runner.

    Returns:
        The ``PipelineResult`` produced by the selected runner.
    """
    # Run directory layout: output/{preset}_{datetime}/
    run_dir = _make_run_dir(config)

    runner = _run_phased_pipeline if config.phases else _run_simple_pipeline
    return runner(config, run_dir, cwd, dry_run, timeout)
|
||||
|
||||
|
||||
def _make_run_dir(config: PipelineConfig) -> Path:
|
||||
"""Create timestamped run directory under output_dir."""
|
||||
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
run_dir = config.output_dir / f"{config.preset_name}_{ts}"
|
||||
run_dir.mkdir(parents=True, exist_ok=True)
|
||||
return run_dir
|
||||
|
||||
|
||||
def _run_simple_pipeline(
    config: PipelineConfig,
    run_dir: Path,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Execute a simple (non-phased) pipeline.

    Runs ``config.pipeline`` up to ``config.max_iterations`` times. The
    loop stops early on the first PASS verdict reached at or after
    ``config.min_iterations``, or after the first iteration in a dry run.
    The feedback collected from one iteration is passed to the next.

    Args:
        config: Pipeline configuration; re-read via ``try_reload_config``
            at the start of every iteration.
        run_dir: Directory that receives step outputs and the final report.
        cwd: Working directory for agents; defaults to the process cwd.
        dry_run: When true, stop after one iteration and skip the report.
        timeout: Timeout forwarded to step execution.

    Returns:
        A ``PipelineResult`` whose ``final_verdict`` is ``"PASS"`` only if
        the most recent verdict was PASS, else ``"MAX_ITERATIONS_REACHED"``.
    """
    if cwd is None:
        cwd = Path(os.getcwd())

    set_language(config.language)
    input_contents = _load_inputs(config)

    feedback = "(no feedback — first iteration)"
    iterations: list[IterationResult] = []
    start_time = time.monotonic()
    final_verdict = "MAX_ITERATIONS_REACHED"
    aggregate_history: dict[str, int] = {}
    aggregate_warnings: list[str] = []

    # NOTE: the range is fixed from the initial config; a reloaded
    # max_iterations only affects log output, not the loop bound.
    for i in range(1, config.max_iterations + 1):
        config = try_reload_config(config)
        set_language(config.language)
        _refresh_inputs(config, input_contents)

        logger.info("=" * 50)
        logger.info(" Iteration %d/%d", i, config.max_iterations)
        logger.info("=" * 50)

        step_outputs, step_results, verdict = _run_steps(
            config.pipeline, config, input_contents, feedback,
            i, config.max_iterations, cwd, timeout, dry_run,
            run_dir=run_dir, output_iter=i,
        )

        iter_result = IterationResult(
            iteration=i,
            step_results=step_results,
            step_outputs=step_outputs,
            verdict=verdict,
        )
        warning = _detect_repeated_aggregate(
            config.pipeline, step_outputs, aggregate_history, iteration=i,
        )
        if warning:
            iter_result.repeated_aggregate_warning = warning
            aggregate_warnings.append(warning)
            logger.warning(" %s", warning)

        iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
        # Keep the previous feedback when this iteration produced none.
        feedback = iter_result.feedback or feedback
        iterations.append(iter_result)

        if verdict == "PASS":
            final_verdict = "PASS"
            if i >= config.min_iterations:
                logger.info(" PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
                break
            logger.info(
                " PASS at iteration %d, but min_iterations=%d — continuing",
                i, config.min_iterations,
            )
        else:
            # A PASS recorded below min_iterations must not survive a later
            # non-PASS iteration; the final verdict tracks the latest one.
            final_verdict = "MAX_ITERATIONS_REACHED"

        if dry_run:
            logger.info(" (dry-run: stopping after iteration 1)")
            break

    total_duration = time.monotonic() - start_time

    pipeline_result = PipelineResult(
        iterations=iterations,
        final_verdict=final_verdict,
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
    )

    if not dry_run:
        _save_report(run_dir, config, pipeline_result)

    return pipeline_result
|
||||
|
||||
|
||||
def _run_phased_pipeline(
    config: PipelineConfig,
    run_dir: Path,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Execute a multi-phase pipeline (e.g. review-fix).

    Phases run in order. Each phase repeats its steps up to
    ``phase.max_iterations`` times and is considered converged once it
    sees ``phase.consecutive_pass`` PASS verdicts in a row; any non-PASS
    verdict resets that streak. Feedback carries over between iterations
    and across phases. In a dry run each phase executes one iteration.

    Args:
        config: Pipeline configuration; re-read via ``try_reload_config``
            at the start of every iteration.
        run_dir: Directory that receives step outputs and the final report.
        cwd: Working directory for agents; defaults to the process cwd.
        dry_run: When true, run one iteration per phase and skip the report.
        timeout: Timeout forwarded to step execution.

    Returns:
        A ``PipelineResult`` whose ``final_verdict`` is ``"PASS"`` when the
        last executed phase converged, else ``"MAX_ITERATIONS_REACHED"``.
    """
    if cwd is None:
        cwd = Path(os.getcwd())

    set_language(config.language)
    input_contents = _load_inputs(config)

    iterations: list[IterationResult] = []
    feedback = "(no feedback — first iteration)"
    start_time = time.monotonic()
    # Iteration counter across all phases; used for output directory names.
    global_iter = 0
    aggregate_history_by_phase: dict[str, dict[str, int]] = {}
    aggregate_warnings: list[str] = []
    # Convergence state of the most recently finished phase. Tracking it
    # here keeps the final verdict independent of the phase count of a
    # config that was reloaded while the loop was running.
    last_phase_converged = False

    for phase in config.phases:
        logger.info("=" * 60)
        logger.info(
            " Phase: %s (max_iter=%d, consecutive_pass=%d)",
            phase.name, phase.max_iterations, phase.consecutive_pass,
        )
        logger.info("=" * 60)

        consecutive_passes = 0
        phase_converged = False

        for pi in range(1, phase.max_iterations + 1):
            global_iter += 1

            config = try_reload_config(config)
            set_language(config.language)
            _refresh_inputs(config, input_contents)

            logger.info("-" * 50)
            logger.info(
                " [%s] Iteration %d/%d (global: v%d)",
                phase.name, pi, phase.max_iterations, global_iter,
            )
            logger.info("-" * 50)

            step_outputs, step_results, verdict = _run_steps(
                phase.steps, config, input_contents, feedback,
                pi, phase.max_iterations, cwd, timeout, dry_run,
                run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
            )

            iter_result = IterationResult(
                iteration=global_iter,
                step_results=step_results,
                step_outputs=step_outputs,
                verdict=verdict,
                phase_name=phase.name,
            )
            # Repeat detection is scoped per phase name.
            phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
            warning = _detect_repeated_aggregate(
                phase.steps, step_outputs, phase_history, iteration=global_iter,
                phase_name=phase.name,
            )
            if warning:
                iter_result.repeated_aggregate_warning = warning
                aggregate_warnings.append(warning)
                logger.warning(" %s", warning)

            iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
            # Keep the previous feedback when this iteration produced none.
            feedback = iter_result.feedback or feedback
            iterations.append(iter_result)

            if verdict == "PASS":
                consecutive_passes += 1
                logger.info(
                    " [%s] PASS (%d/%d consecutive)",
                    phase.name, consecutive_passes, phase.consecutive_pass,
                )
                if consecutive_passes >= phase.consecutive_pass:
                    logger.info(
                        " [%s] Converged! %d consecutive PASSes.",
                        phase.name, phase.consecutive_pass,
                    )
                    phase_converged = True
                    break
            else:
                consecutive_passes = 0

            if dry_run:
                break

        if phase_converged:
            logger.info(" Phase '%s' completed: CONVERGED", phase.name)
        else:
            logger.info(
                " Phase '%s' completed: max iterations (%d) reached",
                phase.name, phase.max_iterations,
            )

        last_phase_converged = phase_converged

    final_verdict = "PASS" if last_phase_converged else "MAX_ITERATIONS_REACHED"

    total_duration = time.monotonic() - start_time

    pipeline_result = PipelineResult(
        iterations=iterations,
        final_verdict=final_verdict,
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
    )

    if not dry_run:
        _save_report(run_dir, config, pipeline_result)

    return pipeline_result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Shared helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _load_inputs(config: PipelineConfig) -> dict[str, str]:
|
||||
"""Load input file contents from config."""
|
||||
input_contents: dict[str, str] = {}
|
||||
for key, val in config.inputs.items():
|
||||
if isinstance(val, str):
|
||||
input_contents[key] = val
|
||||
else:
|
||||
input_contents[key] = val.read_text(encoding="utf-8")
|
||||
return input_contents
|
||||
|
||||
|
||||
def _refresh_inputs(
|
||||
config: PipelineConfig, input_contents: dict[str, str],
|
||||
) -> None:
|
||||
"""Re-read input files (they may have changed on disk)."""
|
||||
for key, val in config.inputs.items():
|
||||
if isinstance(val, str):
|
||||
input_contents[key] = val
|
||||
elif isinstance(val, Path) and val.exists():
|
||||
input_contents[key] = val.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parallel step grouping
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get_step_dependencies(step: StepConfig) -> set[str]:
|
||||
"""Extract output_key references from context_override values."""
|
||||
deps: set[str] = set()
|
||||
for val in step.context_override.values():
|
||||
for match in re.finditer(r"\{(\w+)\}", val):
|
||||
deps.add(match.group(1))
|
||||
return deps
|
||||
|
||||
|
||||
def _group_parallel_steps(steps: list[StepConfig]) -> list[list[StepConfig]]:
    """Partition *steps* into ordered execution batches.

    A step with ``parallel`` false always forms a batch of its own.
    Consecutive parallel steps share a batch, except that a parallel step
    whose dependencies include an ``output_key`` already produced inside
    the open batch closes that batch and starts a new one.
    """
    batches: list[list[StepConfig]] = []
    pending: list[StepConfig] = []
    pending_keys: set[str] = set()

    def _flush() -> None:
        """Close the open parallel batch, if there is one."""
        nonlocal pending, pending_keys
        if pending:
            batches.append(pending)
        pending, pending_keys = [], set()

    for step in steps:
        if step.parallel:
            # A dependency on an output of the open batch breaks the batch.
            if _get_step_dependencies(step) & pending_keys:
                _flush()
            pending.append(step)
            pending_keys.add(step.output_key)
        else:
            _flush()
            batches.append([step])

    _flush()
    return batches
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step execution
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _run_steps(
    steps: list[StepConfig],
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
    """Run one iteration's steps and derive the iteration verdict.

    Steps are grouped into batches; a batch of one runs directly and a
    larger batch runs through the parallel executor.

    Returns:
        ``(step_outputs, step_results, verdict)``. ``verdict`` is ``None``
        when no step is flagged as a verdict step; otherwise it is the
        first verdict step's result, downgraded to ``"FAIL"`` if any later
        verdict step fails (all must PASS).
    """
    step_outputs: dict[str, str] = {}
    step_results: dict[str, AgentResult] = {}

    for batch in _group_parallel_steps(steps):
        single = len(batch) == 1
        # Both executors take the same arguments after the first one.
        execute = _execute_step if single else _execute_parallel_batch
        execute(
            batch[0] if single else batch,
            config, input_contents, feedback,
            iteration, max_iterations, cwd, timeout, dry_run,
            step_outputs, step_results,
            run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
        )

    verdicts: list[str] = []
    for step in steps:
        if not step.verdict:
            continue
        step_verdict = _extract_verdict(
            step_outputs.get(step.output_key, ""), step.verdict_pattern,
        )
        logger.info(" [%s] verdict: %s", step.name, step_verdict)
        verdicts.append(step_verdict)

    verdict: str | None
    if not verdicts:
        verdict = None
    elif "FAIL" in verdicts[1:]:
        verdict = "FAIL"
    else:
        verdict = verdicts[0]

    return step_outputs, step_results, verdict
|
||||
|
||||
|
||||
def _decode_stream(stream: bytes | str | None) -> str:
    """Return captured process output as text; nothing captured gives ``""``."""
    if not stream:
        return ""
    if isinstance(stream, bytes):
        return stream.decode("utf-8", errors="replace")
    return stream


def _execute_step(
    step: StepConfig,
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    step_outputs: dict[str, str],
    step_results: dict[str, AgentResult],
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
    quiet: bool = False,
) -> None:
    """Execute a single step, updating step_outputs and step_results in place.

    The prompt is rendered from the step's template and the current
    context (inputs, earlier step outputs, feedback, iteration counters,
    then the step's context overrides). In a dry run the prompt is printed
    and a placeholder output is stored instead of invoking the agent.
    Otherwise the agent is invoked and its output is stored under
    ``step.output_key`` and written to ``run_dir/v{output_iter}/``.

    Raises:
        RuntimeError: When the agent times out (chained from the
            ``subprocess.TimeoutExpired``) or when ``invoke_agent`` itself
            raises ``RuntimeError``. In both cases an error file named
            ``{step.name}_error.md`` is written first.
        KeyError: When ``step.agent`` is not a key of ``config.agents``.
    """
    if not quiet:
        logger.info(" [%s] agent='%s' role='%s'", step.name, step.agent, step.role)

    # 1. Resolve template
    template = resolve_template(step.prompt_template)

    # 2. Build context
    context = _build_context(
        input_contents, step_outputs, feedback, iteration, max_iterations,
    )

    # 3. Apply context overrides
    if step.context_override:
        context = _apply_context_override(context, step.context_override)

    # 4. Render prompt
    prompt = render_template(template, context)

    # 5. Dry run: print and skip
    if dry_run:
        phase_label = f" phase={phase_name}" if phase_name else ""
        print(f"\n--- Step: {step.name} (agent={step.agent}{phase_label}) ---")
        print(prompt)
        print(f"--- end {step.name} ---\n")
        step_outputs[step.output_key] = f"(dry-run: no output for {step.output_key})"
        return

    # 6. Invoke agent
    agent_config = config.agents[step.agent]
    phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
    try:
        result = invoke_agent(
            agent_config, prompt, step.name,
            cwd=cwd, timeout=timeout, quiet=quiet,
        )
    except subprocess.TimeoutExpired as e:
        # The captured streams may be bytes, str or None.
        stdout = _decode_stream(e.stdout)
        stderr = _decode_stream(e.stderr)
        error_msg = (
            f"# Agent Timeout\n\n"
            f"{phase_info}"
            f"- **Step**: {step.name}\n"
            f"- **Agent**: {step.agent}\n"
            f"- **Timeout**: {timeout}s\n\n"
            f"Partial stdout ({len(stdout)} chars):\n"
            f"```\n{stdout[:2000] or '(none)'}\n```\n\n"
            f"Stderr:\n```\n{stderr[:2000] or '(none)'}\n```\n"
        )
        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
        logger.error(" [%s] TIMEOUT after %ss — saved to output", step.name, timeout)
        # Chain explicitly so the traceback shows the timeout as the cause.
        raise RuntimeError(
            f"Agent '{step.agent}' timed out after {timeout}s at step '{step.name}'. "
            f"Error saved to {run_dir}/v{output_iter}/{step.name}_error.md. "
            f"Try --timeout 0 (unlimited)"
        ) from e
    except RuntimeError as e:
        error_msg = (
            f"# Agent Error\n\n{phase_info}"
            f"- **Step**: {step.name}\n- **Agent**: {step.agent}\n\n```\n{e}\n```\n"
        )
        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
        logger.error(" [%s] FAILED — saved to output", step.name)
        raise

    # 7. Store output
    step_outputs[step.output_key] = result.output
    step_results[step.output_key] = result

    if not quiet:
        logger.info(
            " [%s] completed (%.1fs, %d chars)",
            step.name, result.duration_seconds, len(result.output),
        )

    # 8. Save to disk
    _save_step_output(run_dir, output_iter, step.name, result.output)
|
||||
|
||||
|
||||
def _execute_parallel_batch(
    batch: list[StepConfig],
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    step_outputs: dict[str, str],
    step_results: dict[str, AgentResult],
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
) -> None:
    """Execute multiple steps in parallel using threads.

    Every step of the batch renders its prompt from the same snapshot of
    inputs and earlier step outputs, so steps in one batch never see each
    other's results. On success the outputs are merged into *step_outputs*
    and *step_results* and saved to disk in batch order. In a dry run the
    steps are simply executed one after another through ``_execute_step``.

    Raises:
        Exception: The first failure observed (in completion order) is
            re-raised after every failed step has been logged and an
            ``{step.name}_error.md`` file has been written for it.
    """
    agent_names = ", ".join(s.agent for s in batch)
    logger.info(" [parallel] %d agents: %s", len(batch), agent_names)

    if dry_run:
        for step in batch:
            _execute_step(
                step, config, input_contents, feedback,
                iteration, max_iterations, cwd, timeout, dry_run,
                step_outputs, step_results,
                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
            )
        return

    # Snapshot context before parallel execution (all steps see same state)
    context_snapshot = dict(input_contents)
    context_snapshot.update(step_outputs)

    # Collect results from parallel threads
    local_outputs: dict[str, str] = {}
    local_results: dict[str, AgentResult] = {}
    failures: list[tuple[StepConfig, Exception]] = []

    def _run_one(step: StepConfig) -> tuple[str, str, AgentResult]:
        """Run one step, return (output_key, output, result)."""
        template = resolve_template(step.prompt_template)
        context = _build_context(
            context_snapshot, {}, feedback, iteration, max_iterations,
        )
        if step.context_override:
            context = _apply_context_override(context, step.context_override)
        prompt = render_template(template, context)

        agent_config = config.agents[step.agent]
        result = invoke_agent(
            agent_config, prompt, step.name,
            cwd=cwd, timeout=timeout, quiet=True,
        )
        return step.output_key, result.output, result

    # Show a single spinner for the batch
    from cross_eval.agent import _Spinner
    spinner = _Spinner(
        f"[parallel] {len(batch)} agents running ({agent_names})..."
    )
    spinner.start()
    batch_start = time.monotonic()

    try:
        with ThreadPoolExecutor(max_workers=len(batch)) as executor:
            futures = {executor.submit(_run_one, step): step for step in batch}
            for future in as_completed(futures):
                step = futures[future]
                try:
                    output_key, output, result = future.result()
                except Exception as e:
                    failures.append((step, e))
                else:
                    local_results[output_key] = result
                    local_outputs[output_key] = output
    except BaseException:
        # e.g. KeyboardInterrupt: do not leave the spinner running.
        spinner.stop("[parallel] interrupted")
        raise

    batch_elapsed = round(time.monotonic() - batch_start, 1)

    if failures:
        spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
        phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
        # Report every failed step, not only the one that gets re-raised.
        for failed_step, exc in failures:
            error_msg = (
                f"# Agent Error\n\n{phase_info}"
                f"- **Step**: {failed_step.name}\n"
                f"- **Agent**: {failed_step.agent}\n\n```\n{exc}\n```\n"
            )
            _save_step_output(
                run_dir, output_iter, f"{failed_step.name}_error", error_msg,
            )
            logger.error(" [%s] FAILED — saved to output", failed_step.name)
        raise failures[0][1]

    spinner.stop(f"[parallel] {len(batch)} agents done ({batch_elapsed}s)")

    # Merge results
    for step in batch:
        key = step.output_key
        r = local_results[key]
        step_outputs[key] = local_outputs[key]
        step_results[key] = r
        logger.info(
            " [%s] completed (%.1fs, %d chars)",
            step.name, r.duration_seconds, len(r.output),
        )
        _save_step_output(run_dir, output_iter, step.name, r.output)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Context and template helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_context(
|
||||
input_contents: dict[str, str],
|
||||
step_outputs: dict[str, str],
|
||||
feedback: str,
|
||||
iteration: int,
|
||||
max_iterations: int,
|
||||
) -> dict[str, str]:
|
||||
"""Build the template context dict."""
|
||||
context: dict[str, str] = {}
|
||||
context.update(input_contents)
|
||||
context.update(step_outputs)
|
||||
context["feedback"] = feedback
|
||||
context["iteration"] = str(iteration)
|
||||
context["max_iterations"] = str(max_iterations)
|
||||
return context
|
||||
|
||||
|
||||
def _apply_context_override(
    context: dict[str, str],
    overrides: dict[str, str],
) -> dict[str, str]:
    """Return a copy of *context* with the rendered overrides applied.

    Each override value is a template rendered against the original
    *context*, so one override never sees the result of another.
    The input mapping is left unmodified.
    """
    rendered = {
        key: render_template(value_template, context)
        for key, value_template in overrides.items()
    }
    return {**context, **rendered}
|
||||
|
||||
|
||||
def _collect_feedback(
|
||||
steps: list[StepConfig],
|
||||
step_outputs: dict[str, str],
|
||||
) -> str:
|
||||
"""Collect feedback from all verdict steps.
|
||||
|
||||
Single verdict step → raw output (backward compatible).
|
||||
Multiple verdict steps → combined with agent headers for cross-referencing.
|
||||
"""
|
||||
verdict_steps = [s for s in steps if s.verdict]
|
||||
if len(verdict_steps) == 1:
|
||||
return step_outputs.get(verdict_steps[0].output_key, "")
|
||||
parts: list[str] = []
|
||||
for s in verdict_steps:
|
||||
output = step_outputs.get(s.output_key, "")
|
||||
if output:
|
||||
parts.append(f"## Review by {s.agent} ({s.name})\n{output}")
|
||||
return "\n\n---\n\n".join(parts)
|
||||
|
||||
|
||||
def _detect_repeated_aggregate(
    steps: list[StepConfig],
    step_outputs: dict[str, str],
    history: dict[str, int],
    *,
    iteration: int,
    phase_name: str | None = None,
) -> str | None:
    """Warn when the aggregate-review output repeats an earlier iteration.

    Only the first step whose template is ``default:aggregate-review`` is
    considered. Its normalized output is looked up in *history* (normalized
    output -> iteration first seen): a hit returns a warning message, a
    miss records the current iteration and returns ``None``. An empty
    normalized output is neither recorded nor reported.
    """
    aggregate_step = next(
        (s for s in steps if s.prompt_template == "default:aggregate-review"),
        None,
    )
    if aggregate_step is None:
        return None

    fingerprint = _normalize_aggregate_output(
        step_outputs.get(aggregate_step.output_key, "")
    )
    if not fingerprint:
        return None

    if fingerprint not in history:
        history[fingerprint] = iteration
        return None

    prev_iter = history[fingerprint]
    phase_prefix = f"[{phase_name}] " if phase_name else ""
    return (
        f"{phase_prefix}Repeated aggregate_review detected at iteration {iteration} "
        f"(same as iteration {prev_iter})."
    )
|
||||
|
||||
|
||||
def _normalize_aggregate_output(output: str) -> str:
|
||||
"""Normalize aggregate output for repeat detection."""
|
||||
return " ".join(output.lower().split())
|
||||
|
||||
|
||||
def _extract_verdict(output: str, pattern: str) -> str:
|
||||
"""Extract PASS or FAIL from output using regex pattern."""
|
||||
if re.search(pattern, output):
|
||||
return "PASS"
|
||||
return "FAIL"
|
||||
|
||||
|
||||
def _save_step_output(
|
||||
run_dir: Path,
|
||||
iteration: int,
|
||||
step_name: str,
|
||||
content: str,
|
||||
) -> Path:
|
||||
"""Save step output to run_dir/v{iteration}/{step_name}.md"""
|
||||
path = run_dir / f"v{iteration}" / f"{step_name}.md"
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(content, encoding="utf-8")
|
||||
return path
|
||||
|
||||
|
||||
def _save_report(run_dir: Path, config: PipelineConfig, result: PipelineResult) -> None:
    """Build the final markdown report and write it to ``run_dir/final-report.md``.

    The run directory is created when missing, the file is written as
    UTF-8, and its location is logged at INFO level.
    """
    markdown = build_report(config, result)

    destination = run_dir / "final-report.md"
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(markdown, encoding="utf-8")

    logger.info("Report saved: %s", destination)
|
||||
Reference in New Issue
Block a user