release: cut 0.2.0 baseline
This commit is contained in:
@@ -10,9 +10,11 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from cross_eval.agent import AgentInvocationError, invoke_agent
|
||||
from cross_eval.agent import AgentInvocationError, invoke_agent, invoke_agent_agentic
|
||||
from cross_eval.worktree import WorktreeError
|
||||
from cross_eval.config import try_reload_config
|
||||
from cross_eval.models import (
|
||||
AgentConfig,
|
||||
AgentResult,
|
||||
IterationResult,
|
||||
PipelineConfig,
|
||||
@@ -21,6 +23,11 @@ from cross_eval.models import (
|
||||
)
|
||||
from cross_eval.prompts import render_template, resolve_template, set_language
|
||||
from cross_eval.report import build_report
|
||||
from cross_eval.runtime_env import (
|
||||
build_execution_policy,
|
||||
build_runtime_environment,
|
||||
summarize_environment,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -48,6 +55,104 @@ def _make_run_dir(config: PipelineConfig) -> Path:
|
||||
return run_dir
|
||||
|
||||
|
||||
def _commit_iteration(
|
||||
worktree_path: Path,
|
||||
label: str,
|
||||
iteration: int,
|
||||
verdict: str | None,
|
||||
) -> None:
|
||||
"""Intermediate commit after each agentic iteration.
|
||||
|
||||
This resets the diff baseline so the next iteration only captures new changes.
|
||||
"""
|
||||
from cross_eval.worktree import commit_worktree
|
||||
committed = commit_worktree(
|
||||
worktree_path,
|
||||
f"cross-eval: {label} v{iteration} ({verdict or 'no-verdict'})",
|
||||
)
|
||||
if committed:
|
||||
logger.debug(" Intermediate commit: v%d (%s)", iteration, verdict)
|
||||
|
||||
|
||||
def _has_agentic_steps(config: PipelineConfig, steps: list[StepConfig]) -> bool:
|
||||
"""Check if any step uses an agentic agent."""
|
||||
return any(
|
||||
config.agents.get(s.agent, AgentConfig(name="", command="")).agentic
|
||||
for s in steps
|
||||
)
|
||||
|
||||
|
||||
def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, str]:
|
||||
"""Create a shared worktree for the entire pipeline run.
|
||||
|
||||
1. Generate branch name (cross-eval/<preset>_<timestamp>)
|
||||
2. Create branch from HEAD
|
||||
3. Create worktree on that branch
|
||||
|
||||
Returns (worktree_path, branch_name).
|
||||
"""
|
||||
from cross_eval.worktree import create_worktree, make_branch_name
|
||||
branch_name = make_branch_name(preset_name)
|
||||
worktree_dir = run_dir / "work"
|
||||
worktree_path = create_worktree(
|
||||
base_cwd=cwd, work_dir=worktree_dir, branch_name=branch_name,
|
||||
)
|
||||
return worktree_path, branch_name
|
||||
|
||||
|
||||
def _finalize_worktree(
|
||||
cwd: Path,
|
||||
worktree_path: Path,
|
||||
branch_name: str,
|
||||
preset_name: str,
|
||||
final_verdict: str,
|
||||
) -> str | None:
|
||||
"""Commit changes on the branch, then remove the worktree.
|
||||
|
||||
The branch survives worktree removal and stays in the original repo.
|
||||
Returns the branch name if changes were committed, None otherwise.
|
||||
"""
|
||||
from cross_eval.worktree import commit_worktree, remove_worktree
|
||||
|
||||
committed = False
|
||||
try:
|
||||
committed = commit_worktree(
|
||||
worktree_path,
|
||||
f"cross-eval: {preset_name} ({final_verdict})",
|
||||
)
|
||||
if committed:
|
||||
logger.info(" Agentic changes committed on branch: %s", branch_name)
|
||||
else:
|
||||
logger.warning(" No agentic changes to commit (empty diff)")
|
||||
except Exception:
|
||||
logger.warning(" Failed to commit agentic changes", exc_info=True)
|
||||
|
||||
try:
|
||||
remove_worktree(base_cwd=cwd, work_dir=worktree_path)
|
||||
except Exception:
|
||||
logger.warning("Failed to clean up worktree: %s", worktree_path)
|
||||
|
||||
# Check if branch has any commits beyond the base — if not, delete it
|
||||
if not committed:
|
||||
try:
|
||||
# Check if branch has diverged from its base
|
||||
result = subprocess.run(
|
||||
["git", "log", "--oneline", f"HEAD..{branch_name}"],
|
||||
cwd=cwd, capture_output=True, text=True,
|
||||
)
|
||||
if not result.stdout.strip():
|
||||
# No commits on branch beyond base — clean up
|
||||
subprocess.run(
|
||||
["git", "branch", "-D", branch_name],
|
||||
cwd=cwd, capture_output=True,
|
||||
)
|
||||
logger.info(" Deleted empty branch: %s", branch_name)
|
||||
except Exception:
|
||||
pass # best-effort cleanup
|
||||
|
||||
return branch_name if committed else None
|
||||
|
||||
|
||||
def _run_simple_pipeline(
|
||||
config: PipelineConfig,
|
||||
run_dir: Path,
|
||||
@@ -61,6 +166,15 @@ def _run_simple_pipeline(
|
||||
|
||||
set_language(config.language)
|
||||
input_contents = _load_inputs(config)
|
||||
runtime_env = _build_runtime_inputs(config, input_contents, cwd or Path(os.getcwd()))
|
||||
|
||||
# Setup shared worktree for agentic mode
|
||||
worktree_path: Path | None = None
|
||||
agentic_branch_name: str | None = None
|
||||
if not dry_run and _has_agentic_steps(config, config.pipeline):
|
||||
worktree_path, agentic_branch_name = _setup_worktree(
|
||||
cwd, run_dir, config.preset_name,
|
||||
)
|
||||
|
||||
feedback = "(no feedback — first iteration)"
|
||||
iterations: list[IterationResult] = []
|
||||
@@ -71,99 +185,114 @@ def _run_simple_pipeline(
|
||||
escalated_issues: list[str] = []
|
||||
all_feedbacks: list[str] = []
|
||||
|
||||
for i in range(1, config.max_iterations + 1):
|
||||
config = try_reload_config(config)
|
||||
set_language(config.language)
|
||||
_refresh_inputs(config, input_contents)
|
||||
try:
|
||||
for i in range(1, config.max_iterations + 1):
|
||||
config = try_reload_config(config)
|
||||
set_language(config.language)
|
||||
_refresh_inputs(config, input_contents)
|
||||
runtime_env = _build_runtime_inputs(config, input_contents, cwd)
|
||||
|
||||
logger.info("=" * 50)
|
||||
logger.info(" Iteration %d/%d", i, config.max_iterations)
|
||||
logger.info("=" * 50)
|
||||
logger.info("=" * 50)
|
||||
logger.info(" Iteration %d/%d", i, config.max_iterations)
|
||||
logger.info("=" * 50)
|
||||
|
||||
step_outputs, step_results, verdict = _run_steps(
|
||||
config.pipeline, config, input_contents, feedback,
|
||||
i, config.max_iterations, cwd, timeout, dry_run,
|
||||
run_dir=run_dir, output_iter=i,
|
||||
)
|
||||
step_outputs, step_results, verdict = _run_steps(
|
||||
config.pipeline, config, input_contents, feedback,
|
||||
i, config.max_iterations, cwd, timeout, dry_run,
|
||||
run_dir=run_dir, output_iter=i,
|
||||
worktree_path=worktree_path,
|
||||
runtime_env=runtime_env,
|
||||
)
|
||||
|
||||
iter_result = IterationResult(
|
||||
iteration=i,
|
||||
step_results=step_results,
|
||||
step_outputs=step_outputs,
|
||||
verdict=verdict,
|
||||
)
|
||||
warning = _detect_repeated_aggregate(
|
||||
config.pipeline, step_outputs, aggregate_history, iteration=i,
|
||||
)
|
||||
if warning:
|
||||
iter_result.repeated_aggregate_warning = warning
|
||||
aggregate_warnings.append(warning)
|
||||
logger.warning(" %s", warning)
|
||||
# Intermediate commit so next iteration's diff only shows new changes
|
||||
if worktree_path is not None:
|
||||
_commit_iteration(worktree_path, config.preset_name, i, verdict)
|
||||
|
||||
iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
|
||||
feedback = iter_result.feedback or feedback
|
||||
all_feedbacks.append(feedback)
|
||||
iter_result = IterationResult(
|
||||
iteration=i,
|
||||
step_results=step_results,
|
||||
step_outputs=step_outputs,
|
||||
verdict=verdict,
|
||||
)
|
||||
warning = _detect_repeated_aggregate(
|
||||
config.pipeline, step_outputs, aggregate_history, iteration=i,
|
||||
)
|
||||
if warning:
|
||||
iter_result.repeated_aggregate_warning = warning
|
||||
aggregate_warnings.append(warning)
|
||||
logger.warning(" %s", warning)
|
||||
|
||||
# Extract tracker from verdict/review steps for next iteration
|
||||
for step in config.pipeline:
|
||||
if step.verdict or step.role == "review":
|
||||
tracker = _extract_senior_tracker(
|
||||
step_outputs.get(step.output_key, ""),
|
||||
)
|
||||
if tracker:
|
||||
input_contents["previous_senior_tracker"] = tracker
|
||||
iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
|
||||
feedback = iter_result.feedback or feedback
|
||||
all_feedbacks.append(feedback)
|
||||
|
||||
iterations.append(iter_result)
|
||||
|
||||
# ESCALATE check (highest priority)
|
||||
if verdict == "ESCALATE":
|
||||
final_verdict = "ESCALATE"
|
||||
# Extract escalation details from verdict step outputs
|
||||
# Extract tracker from verdict/review steps for next iteration
|
||||
for step in config.pipeline:
|
||||
if step.verdict:
|
||||
esc = _extract_escalated_issues(
|
||||
if step.verdict or step.role == "review":
|
||||
tracker = _extract_senior_tracker(
|
||||
step_outputs.get(step.output_key, ""),
|
||||
)
|
||||
if esc:
|
||||
escalated_issues.append(esc)
|
||||
iter_result.escalated_issues = esc
|
||||
logger.info(" ESCALATE at iteration %d — stopping loop.", i)
|
||||
break
|
||||
if tracker:
|
||||
input_contents["previous_senior_tracker"] = tracker
|
||||
|
||||
if verdict == "PASS":
|
||||
final_verdict = "PASS"
|
||||
if i >= config.min_iterations:
|
||||
logger.info(" PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
|
||||
iterations.append(iter_result)
|
||||
|
||||
# ESCALATE check (highest priority)
|
||||
if verdict == "ESCALATE":
|
||||
final_verdict = "ESCALATE"
|
||||
for step in config.pipeline:
|
||||
if step.verdict:
|
||||
esc = _extract_escalated_issues(
|
||||
step_outputs.get(step.output_key, ""),
|
||||
)
|
||||
if esc:
|
||||
escalated_issues.append(esc)
|
||||
iter_result.escalated_issues = esc
|
||||
logger.info(" ESCALATE at iteration %d — stopping loop.", i)
|
||||
break
|
||||
else:
|
||||
logger.info(
|
||||
" PASS at iteration %d, but min_iterations=%d — continuing",
|
||||
i, config.min_iterations,
|
||||
)
|
||||
|
||||
# Auto-escalate: no senior/aggregator + repeated FAIL
|
||||
has_aggregator = config.seniors or any(
|
||||
s.prompt_template == "default:aggregate-review" for s in config.pipeline
|
||||
)
|
||||
if (
|
||||
verdict == "FAIL"
|
||||
and not has_aggregator
|
||||
and i >= 2
|
||||
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
|
||||
):
|
||||
final_verdict = "ESCALATE"
|
||||
auto_msg = (
|
||||
f"Auto-escalated: same issues detected across {i} iterations "
|
||||
f"without resolution (no senior reviewer configured)."
|
||||
if verdict == "PASS":
|
||||
final_verdict = "PASS"
|
||||
if i >= config.min_iterations:
|
||||
logger.info(" PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
|
||||
break
|
||||
else:
|
||||
logger.info(
|
||||
" PASS at iteration %d, but min_iterations=%d — continuing",
|
||||
i, config.min_iterations,
|
||||
)
|
||||
|
||||
# Auto-escalate: no senior/aggregator + repeated FAIL
|
||||
has_aggregator = config.seniors or any(
|
||||
s.prompt_template == "default:aggregate-review" for s in config.pipeline
|
||||
)
|
||||
escalated_issues.append(auto_msg)
|
||||
iter_result.escalated_issues = auto_msg
|
||||
logger.info(" AUTO-ESCALATE at iteration %d", i)
|
||||
break
|
||||
if (
|
||||
verdict == "FAIL"
|
||||
and not has_aggregator
|
||||
and i >= 2
|
||||
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
|
||||
):
|
||||
final_verdict = "ESCALATE"
|
||||
auto_msg = (
|
||||
f"Auto-escalated: same issues detected across {i} iterations "
|
||||
f"without resolution (no senior reviewer configured)."
|
||||
)
|
||||
escalated_issues.append(auto_msg)
|
||||
iter_result.escalated_issues = auto_msg
|
||||
logger.info(" AUTO-ESCALATE at iteration %d", i)
|
||||
break
|
||||
|
||||
if dry_run:
|
||||
logger.info(" (dry-run: stopping after iteration 1)")
|
||||
break
|
||||
if dry_run:
|
||||
logger.info(" (dry-run: stopping after iteration 1)")
|
||||
break
|
||||
|
||||
finally:
|
||||
agentic_branch: str | None = None
|
||||
if worktree_path is not None and agentic_branch_name is not None:
|
||||
agentic_branch = _finalize_worktree(
|
||||
cwd, worktree_path, agentic_branch_name,
|
||||
config.preset_name, final_verdict,
|
||||
)
|
||||
|
||||
total_duration = time.monotonic() - start_time
|
||||
|
||||
@@ -174,6 +303,7 @@ def _run_simple_pipeline(
|
||||
run_dir=run_dir,
|
||||
repeated_aggregate_warnings=aggregate_warnings,
|
||||
escalated_issues=escalated_issues,
|
||||
agentic_branch=agentic_branch,
|
||||
)
|
||||
|
||||
if not dry_run:
|
||||
@@ -195,6 +325,16 @@ def _run_phased_pipeline(
|
||||
|
||||
set_language(config.language)
|
||||
input_contents = _load_inputs(config)
|
||||
runtime_env = _build_runtime_inputs(config, input_contents, cwd)
|
||||
|
||||
# Setup shared worktree for agentic mode
|
||||
all_phase_steps = [s for p in config.phases for s in p.steps]
|
||||
worktree_path: Path | None = None
|
||||
agentic_branch_name: str | None = None
|
||||
if not dry_run and _has_agentic_steps(config, all_phase_steps):
|
||||
worktree_path, agentic_branch_name = _setup_worktree(
|
||||
cwd, run_dir, config.preset_name,
|
||||
)
|
||||
|
||||
iterations: list[IterationResult] = []
|
||||
feedback = "(no feedback — first iteration)"
|
||||
@@ -207,152 +347,171 @@ def _run_phased_pipeline(
|
||||
all_feedbacks: list[str] = []
|
||||
escalated = False
|
||||
|
||||
for phase_idx, phase in enumerate(config.phases):
|
||||
if escalated:
|
||||
break
|
||||
try:
|
||||
for phase_idx, phase in enumerate(config.phases):
|
||||
if escalated:
|
||||
break
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info(
|
||||
" Phase: %s (max_iter=%d, consecutive_pass=%d)",
|
||||
phase.name, phase.max_iterations, phase.consecutive_pass,
|
||||
)
|
||||
logger.info("=" * 60)
|
||||
|
||||
consecutive_passes = 0
|
||||
phase_converged = False
|
||||
|
||||
for pi in range(1, phase.max_iterations + 1):
|
||||
global_iter += 1
|
||||
|
||||
config = try_reload_config(config)
|
||||
set_language(config.language)
|
||||
_refresh_inputs(config, input_contents)
|
||||
|
||||
logger.info("-" * 50)
|
||||
logger.info("=" * 60)
|
||||
logger.info(
|
||||
" [%s] Iteration %d/%d (global: v%d)",
|
||||
phase.name, pi, phase.max_iterations, global_iter,
|
||||
" Phase: %s (max_iter=%d, consecutive_pass=%d)",
|
||||
phase.name, phase.max_iterations, phase.consecutive_pass,
|
||||
)
|
||||
logger.info("-" * 50)
|
||||
logger.info("=" * 60)
|
||||
|
||||
step_outputs, step_results, verdict = _run_steps(
|
||||
phase.steps, config, input_contents, feedback,
|
||||
pi, phase.max_iterations, cwd, timeout, dry_run,
|
||||
run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
|
||||
)
|
||||
consecutive_passes = 0
|
||||
phase_converged = False
|
||||
|
||||
iter_result = IterationResult(
|
||||
iteration=global_iter,
|
||||
step_results=step_results,
|
||||
step_outputs=step_outputs,
|
||||
verdict=verdict,
|
||||
phase_name=phase.name,
|
||||
)
|
||||
phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
|
||||
warning = _detect_repeated_aggregate(
|
||||
phase.steps, step_outputs, phase_history, iteration=global_iter,
|
||||
phase_name=phase.name,
|
||||
)
|
||||
if warning:
|
||||
iter_result.repeated_aggregate_warning = warning
|
||||
aggregate_warnings.append(warning)
|
||||
logger.warning(" %s", warning)
|
||||
for pi in range(1, phase.max_iterations + 1):
|
||||
global_iter += 1
|
||||
|
||||
iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
|
||||
feedback = iter_result.feedback or feedback
|
||||
all_feedbacks.append(feedback)
|
||||
config = try_reload_config(config)
|
||||
set_language(config.language)
|
||||
_refresh_inputs(config, input_contents)
|
||||
runtime_env = _build_runtime_inputs(config, input_contents, cwd)
|
||||
|
||||
# Extract tracker from verdict/review steps
|
||||
for step in phase.steps:
|
||||
if step.verdict or step.role == "review":
|
||||
tracker = _extract_senior_tracker(
|
||||
step_outputs.get(step.output_key, ""),
|
||||
logger.info("-" * 50)
|
||||
logger.info(
|
||||
" [%s] Iteration %d/%d (global: v%d)",
|
||||
phase.name, pi, phase.max_iterations, global_iter,
|
||||
)
|
||||
logger.info("-" * 50)
|
||||
|
||||
step_outputs, step_results, verdict = _run_steps(
|
||||
phase.steps, config, input_contents, feedback,
|
||||
pi, phase.max_iterations, cwd, timeout, dry_run,
|
||||
run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
|
||||
worktree_path=worktree_path,
|
||||
runtime_env=runtime_env,
|
||||
)
|
||||
|
||||
# Intermediate commit so next iteration's diff only shows new changes
|
||||
if worktree_path is not None:
|
||||
_commit_iteration(
|
||||
worktree_path, f"{config.preset_name}/{phase.name}",
|
||||
global_iter, verdict,
|
||||
)
|
||||
if tracker:
|
||||
input_contents["previous_senior_tracker"] = tracker
|
||||
|
||||
iterations.append(iter_result)
|
||||
iter_result = IterationResult(
|
||||
iteration=global_iter,
|
||||
step_results=step_results,
|
||||
step_outputs=step_outputs,
|
||||
verdict=verdict,
|
||||
phase_name=phase.name,
|
||||
)
|
||||
phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
|
||||
warning = _detect_repeated_aggregate(
|
||||
phase.steps, step_outputs, phase_history, iteration=global_iter,
|
||||
phase_name=phase.name,
|
||||
)
|
||||
if warning:
|
||||
iter_result.repeated_aggregate_warning = warning
|
||||
aggregate_warnings.append(warning)
|
||||
logger.warning(" %s", warning)
|
||||
|
||||
# ESCALATE check
|
||||
if verdict == "ESCALATE":
|
||||
final_verdict = "ESCALATE"
|
||||
iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
|
||||
feedback = iter_result.feedback or feedback
|
||||
all_feedbacks.append(feedback)
|
||||
|
||||
# Extract tracker from verdict/review steps
|
||||
for step in phase.steps:
|
||||
if step.verdict:
|
||||
esc = _extract_escalated_issues(
|
||||
if step.verdict or step.role == "review":
|
||||
tracker = _extract_senior_tracker(
|
||||
step_outputs.get(step.output_key, ""),
|
||||
)
|
||||
if esc:
|
||||
escalated_issues.append(esc)
|
||||
iter_result.escalated_issues = esc
|
||||
logger.info(
|
||||
" [%s] ESCALATE at iteration %d — stopping.",
|
||||
phase.name, pi,
|
||||
)
|
||||
escalated = True
|
||||
break
|
||||
if tracker:
|
||||
input_contents["previous_senior_tracker"] = tracker
|
||||
|
||||
if verdict is None:
|
||||
logger.info(
|
||||
" [%s] completed (no verdict step; single-pass phase)",
|
||||
phase.name,
|
||||
)
|
||||
phase_converged = True
|
||||
break
|
||||
iterations.append(iter_result)
|
||||
|
||||
if verdict == "PASS":
|
||||
consecutive_passes += 1
|
||||
logger.info(
|
||||
" [%s] PASS (%d/%d consecutive)",
|
||||
phase.name, consecutive_passes, phase.consecutive_pass,
|
||||
)
|
||||
if consecutive_passes >= phase.consecutive_pass:
|
||||
# ESCALATE check
|
||||
if verdict == "ESCALATE":
|
||||
final_verdict = "ESCALATE"
|
||||
for step in phase.steps:
|
||||
if step.verdict:
|
||||
esc = _extract_escalated_issues(
|
||||
step_outputs.get(step.output_key, ""),
|
||||
)
|
||||
if esc:
|
||||
escalated_issues.append(esc)
|
||||
iter_result.escalated_issues = esc
|
||||
logger.info(
|
||||
" [%s] Converged! %d consecutive PASSes.",
|
||||
phase.name, phase.consecutive_pass,
|
||||
" [%s] ESCALATE at iteration %d — stopping.",
|
||||
phase.name, pi,
|
||||
)
|
||||
escalated = True
|
||||
break
|
||||
|
||||
if verdict is None:
|
||||
logger.info(
|
||||
" [%s] completed (no verdict step; single-pass phase)",
|
||||
phase.name,
|
||||
)
|
||||
phase_converged = True
|
||||
break
|
||||
else:
|
||||
consecutive_passes = 0
|
||||
|
||||
# Auto-escalate in phased pipeline
|
||||
has_aggregator = config.seniors or any(
|
||||
s.prompt_template == "default:aggregate-review" for s in phase.steps
|
||||
)
|
||||
if (
|
||||
verdict == "FAIL"
|
||||
and not has_aggregator
|
||||
and pi >= 2
|
||||
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
|
||||
):
|
||||
final_verdict = "ESCALATE"
|
||||
auto_msg = (
|
||||
f"Auto-escalated: same issues detected across {pi} iterations "
|
||||
f"in phase '{phase.name}' without resolution."
|
||||
if verdict == "PASS":
|
||||
consecutive_passes += 1
|
||||
logger.info(
|
||||
" [%s] PASS (%d/%d consecutive)",
|
||||
phase.name, consecutive_passes, phase.consecutive_pass,
|
||||
)
|
||||
if consecutive_passes >= phase.consecutive_pass:
|
||||
logger.info(
|
||||
" [%s] Converged! %d consecutive PASSes.",
|
||||
phase.name, phase.consecutive_pass,
|
||||
)
|
||||
phase_converged = True
|
||||
break
|
||||
else:
|
||||
consecutive_passes = 0
|
||||
|
||||
# Auto-escalate in phased pipeline
|
||||
has_aggregator = config.seniors or any(
|
||||
s.prompt_template == "default:aggregate-review" for s in phase.steps
|
||||
)
|
||||
escalated_issues.append(auto_msg)
|
||||
iter_result.escalated_issues = auto_msg
|
||||
logger.info(" [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
|
||||
escalated = True
|
||||
if (
|
||||
verdict == "FAIL"
|
||||
and not has_aggregator
|
||||
and pi >= 2
|
||||
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
|
||||
):
|
||||
final_verdict = "ESCALATE"
|
||||
auto_msg = (
|
||||
f"Auto-escalated: same issues detected across {pi} iterations "
|
||||
f"in phase '{phase.name}' without resolution."
|
||||
)
|
||||
escalated_issues.append(auto_msg)
|
||||
iter_result.escalated_issues = auto_msg
|
||||
logger.info(" [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
|
||||
escalated = True
|
||||
break
|
||||
|
||||
if dry_run:
|
||||
break
|
||||
|
||||
if escalated:
|
||||
break
|
||||
|
||||
if dry_run:
|
||||
break
|
||||
if phase_converged:
|
||||
logger.info(" Phase '%s' completed: CONVERGED", phase.name)
|
||||
else:
|
||||
logger.info(
|
||||
" Phase '%s' completed: max iterations (%d) reached",
|
||||
phase.name, phase.max_iterations,
|
||||
)
|
||||
|
||||
if escalated:
|
||||
break
|
||||
if phase_idx == len(config.phases) - 1:
|
||||
final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"
|
||||
|
||||
if phase_converged:
|
||||
logger.info(" Phase '%s' completed: CONVERGED", phase.name)
|
||||
else:
|
||||
logger.info(
|
||||
" Phase '%s' completed: max iterations (%d) reached",
|
||||
phase.name, phase.max_iterations,
|
||||
finally:
|
||||
agentic_branch: str | None = None
|
||||
if worktree_path is not None and agentic_branch_name is not None:
|
||||
agentic_branch = _finalize_worktree(
|
||||
cwd, worktree_path, agentic_branch_name,
|
||||
config.preset_name, final_verdict,
|
||||
)
|
||||
|
||||
if phase_idx == len(config.phases) - 1:
|
||||
final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"
|
||||
|
||||
total_duration = time.monotonic() - start_time
|
||||
|
||||
pipeline_result = PipelineResult(
|
||||
@@ -362,6 +521,7 @@ def _run_phased_pipeline(
|
||||
run_dir=run_dir,
|
||||
repeated_aggregate_warnings=aggregate_warnings,
|
||||
escalated_issues=escalated_issues,
|
||||
agentic_branch=agentic_branch,
|
||||
)
|
||||
|
||||
if not dry_run:
|
||||
@@ -463,6 +623,8 @@ def _run_steps(
|
||||
run_dir: Path,
|
||||
output_iter: int,
|
||||
phase_name: str | None = None,
|
||||
worktree_path: Path | None = None,
|
||||
runtime_env: dict[str, str] | None = None,
|
||||
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
|
||||
"""Execute all steps in one iteration, parallelizing where possible."""
|
||||
step_outputs: dict[str, str] = {}
|
||||
@@ -473,21 +635,23 @@ def _run_steps(
|
||||
|
||||
for batch in batches:
|
||||
if len(batch) == 1:
|
||||
# Single step — run directly
|
||||
step = batch[0]
|
||||
_execute_step(
|
||||
step, config, input_contents, feedback,
|
||||
iteration, max_iterations, cwd, timeout, dry_run,
|
||||
step_outputs, step_results,
|
||||
run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
|
||||
run_dir=run_dir, output_iter=output_iter,
|
||||
phase_name=phase_name, worktree_path=worktree_path,
|
||||
runtime_env=runtime_env,
|
||||
)
|
||||
else:
|
||||
# Parallel batch — run with ThreadPoolExecutor
|
||||
_execute_parallel_batch(
|
||||
batch, config, input_contents, feedback,
|
||||
iteration, max_iterations, cwd, timeout, dry_run,
|
||||
step_outputs, step_results,
|
||||
run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
|
||||
run_dir=run_dir, output_iter=output_iter,
|
||||
phase_name=phase_name, worktree_path=worktree_path,
|
||||
runtime_env=runtime_env,
|
||||
)
|
||||
|
||||
# Extract verdict from all verdict steps (ALL must PASS; ESCALATE wins over all)
|
||||
@@ -506,6 +670,25 @@ def _run_steps(
|
||||
return step_outputs, step_results, verdict
|
||||
|
||||
|
||||
def _invoke_agentic(
|
||||
agent_config: AgentConfig,
|
||||
prompt: str,
|
||||
step_name: str,
|
||||
*,
|
||||
worktree_path: Path,
|
||||
env: dict[str, str] | None = None,
|
||||
timeout: int | None = None,
|
||||
quiet: bool = False,
|
||||
) -> AgentResult:
|
||||
"""Run an agent in agentic mode using an existing worktree."""
|
||||
return invoke_agent_agentic(
|
||||
agent_config, prompt, step_name,
|
||||
worktree_path=worktree_path,
|
||||
env=env,
|
||||
timeout=timeout, quiet=quiet,
|
||||
)
|
||||
|
||||
|
||||
def _execute_step(
|
||||
step: StepConfig,
|
||||
config: PipelineConfig,
|
||||
@@ -523,6 +706,8 @@ def _execute_step(
|
||||
output_iter: int,
|
||||
phase_name: str | None = None,
|
||||
quiet: bool = False,
|
||||
worktree_path: Path | None = None,
|
||||
runtime_env: dict[str, str] | None = None,
|
||||
) -> None:
|
||||
"""Execute a single step, updating step_outputs and step_results in place."""
|
||||
if not quiet:
|
||||
@@ -542,6 +727,7 @@ def _execute_step(
|
||||
|
||||
# 4. Render prompt
|
||||
prompt = render_template(template, context)
|
||||
prompt = _augment_prompt_with_runtime_context(prompt, context)
|
||||
|
||||
# 5. Dry run: print and skip
|
||||
if dry_run:
|
||||
@@ -555,10 +741,21 @@ def _execute_step(
|
||||
# 6. Invoke agent
|
||||
agent_config = config.agents[step.agent]
|
||||
try:
|
||||
result = invoke_agent(
|
||||
agent_config, prompt, step.name,
|
||||
cwd=cwd, timeout=timeout, quiet=quiet,
|
||||
)
|
||||
if agent_config.agentic and worktree_path:
|
||||
result = _invoke_agentic(
|
||||
agent_config, prompt, step.name,
|
||||
worktree_path=worktree_path,
|
||||
env=runtime_env,
|
||||
timeout=timeout, quiet=quiet,
|
||||
)
|
||||
else:
|
||||
# When worktree exists, run non-agentic agents (reviewers) in
|
||||
# the worktree too so they can inspect the modified files.
|
||||
effective_cwd = worktree_path if worktree_path else cwd
|
||||
result = invoke_agent(
|
||||
agent_config, prompt, step.name,
|
||||
cwd=effective_cwd, env=runtime_env, timeout=timeout, quiet=quiet,
|
||||
)
|
||||
except subprocess.TimeoutExpired as e:
|
||||
stdout = (e.stdout or b"") if isinstance(e.stdout, bytes) else (e.stdout or "")
|
||||
stderr = (e.stderr or b"") if isinstance(e.stderr, bytes) else (e.stderr or "")
|
||||
@@ -625,6 +822,8 @@ def _execute_parallel_batch(
|
||||
run_dir: Path,
|
||||
output_iter: int,
|
||||
phase_name: str | None = None,
|
||||
worktree_path: Path | None = None,
|
||||
runtime_env: dict[str, str] | None = None,
|
||||
) -> None:
|
||||
"""Execute multiple steps in parallel using threads."""
|
||||
agent_names = ", ".join(s.agent for s in batch)
|
||||
@@ -640,6 +839,26 @@ def _execute_parallel_batch(
|
||||
)
|
||||
return
|
||||
|
||||
# Agentic steps cannot run in parallel (they share a worktree)
|
||||
agentic_in_batch = [
|
||||
s for s in batch
|
||||
if config.agents.get(s.agent, AgentConfig(name="", command="")).agentic
|
||||
]
|
||||
if len(agentic_in_batch) > 1:
|
||||
logger.warning(
|
||||
" [parallel] %d agentic steps cannot run concurrently — running sequentially",
|
||||
len(agentic_in_batch),
|
||||
)
|
||||
for step in batch:
|
||||
_execute_step(
|
||||
step, config, input_contents, feedback,
|
||||
iteration, max_iterations, cwd, timeout, dry_run,
|
||||
step_outputs, step_results,
|
||||
run_dir=run_dir, output_iter=output_iter,
|
||||
phase_name=phase_name, worktree_path=worktree_path,
|
||||
)
|
||||
return
|
||||
|
||||
# Snapshot context before parallel execution (all steps see same state)
|
||||
context_snapshot = dict(input_contents)
|
||||
context_snapshot.update(step_outputs)
|
||||
@@ -666,12 +885,22 @@ def _execute_parallel_batch(
|
||||
if step.context_override:
|
||||
context = _apply_context_override(context, step.context_override)
|
||||
prompt = render_template(template, context)
|
||||
prompt = _augment_prompt_with_runtime_context(prompt, context)
|
||||
|
||||
agent_config = config.agents[step.agent]
|
||||
result = invoke_agent(
|
||||
agent_config, prompt, step.name,
|
||||
cwd=cwd, timeout=timeout, quiet=True,
|
||||
)
|
||||
if agent_config.agentic and worktree_path:
|
||||
result = _invoke_agentic(
|
||||
agent_config, prompt, step.name,
|
||||
worktree_path=worktree_path,
|
||||
env=runtime_env,
|
||||
timeout=timeout, quiet=True,
|
||||
)
|
||||
else:
|
||||
effective_cwd = worktree_path if worktree_path else cwd
|
||||
result = invoke_agent(
|
||||
agent_config, prompt, step.name,
|
||||
cwd=effective_cwd, env=runtime_env, timeout=timeout, quiet=True,
|
||||
)
|
||||
return step.output_key, result.output, result
|
||||
|
||||
with ThreadPoolExecutor(max_workers=len(batch)) as executor:
|
||||
@@ -765,6 +994,35 @@ def _build_context(
|
||||
return context
|
||||
|
||||
|
||||
def _build_runtime_inputs(
|
||||
config: PipelineConfig,
|
||||
input_contents: dict[str, str],
|
||||
cwd: Path,
|
||||
) -> dict[str, str]:
|
||||
"""Load runtime env and expose safe execution hints to prompts."""
|
||||
env, loaded_files, loaded_values = build_runtime_environment(config.execution, cwd)
|
||||
input_contents["execution_policy"] = build_execution_policy(config.execution)
|
||||
input_contents["environment_context"] = summarize_environment(
|
||||
config.execution, loaded_files, env, loaded_values,
|
||||
)
|
||||
return env
|
||||
|
||||
|
||||
def _augment_prompt_with_runtime_context(
|
||||
prompt: str,
|
||||
context: dict[str, str],
|
||||
) -> str:
|
||||
"""Append execution/env guidance without requiring every template to include placeholders."""
|
||||
extras: list[str] = []
|
||||
if context.get("execution_policy"):
|
||||
extras.append("## Execution Policy\n" + context["execution_policy"])
|
||||
if context.get("environment_context"):
|
||||
extras.append("## Environment Context\n" + context["environment_context"])
|
||||
if not extras:
|
||||
return prompt
|
||||
return prompt.rstrip() + "\n\n" + "\n\n".join(extras) + "\n"
|
||||
|
||||
|
||||
def _apply_context_override(
|
||||
context: dict[str, str],
|
||||
overrides: dict[str, str],
|
||||
|
||||
Reference in New Issue
Block a user