release: cut 0.2.0 baseline

2026-03-13 21:47:54 +09:00
parent 204e071b74
commit 941304398d
15 changed files with 1930 additions and 270 deletions
--- a/cross_eval/pipeline.py
+++ b/cross_eval/pipeline.py
@@ -10,9 +10,11 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 from pathlib import Path

-from cross_eval.agent import AgentInvocationError, invoke_agent
+from cross_eval.agent import AgentInvocationError, invoke_agent, invoke_agent_agentic
+from cross_eval.worktree import WorktreeError
 from cross_eval.config import try_reload_config
 from cross_eval.models import (
+    AgentConfig,
    AgentResult,
    IterationResult,
    PipelineConfig,
@@ -21,6 +23,11 @@ from cross_eval.models import (
 )
 from cross_eval.prompts import render_template, resolve_template, set_language
 from cross_eval.report import build_report
+from cross_eval.runtime_env import (
+    build_execution_policy,
+    build_runtime_environment,
+    summarize_environment,
+)

 logger = logging.getLogger(__name__)

@@ -48,6 +55,104 @@ def _make_run_dir(config: PipelineConfig) -> Path:
    return run_dir


+def _commit_iteration(
+    worktree_path: Path,
+    label: str,
+    iteration: int,
+    verdict: str | None,
+) -> None:
+    """Intermediate commit after each agentic iteration.
+
+    This resets the diff baseline so the next iteration only captures new changes.
+    """
+    from cross_eval.worktree import commit_worktree
+    committed = commit_worktree(
+        worktree_path,
+        f"cross-eval: {label} v{iteration} ({verdict or 'no-verdict'})",
+    )
+    if committed:
+        logger.debug("  Intermediate commit: v%d (%s)", iteration, verdict)
+
+
+def _has_agentic_steps(config: PipelineConfig, steps: list[StepConfig]) -> bool:
+    """Check if any step uses an agentic agent."""
+    return any(
+        config.agents.get(s.agent, AgentConfig(name="", command="")).agentic
+        for s in steps
+    )
+
+
+def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, str]:
+    """Create a shared worktree for the entire pipeline run.
+
+    1. Generate branch name (cross-eval/<preset>_<timestamp>)
+    2. Create branch from HEAD
+    3. Create worktree on that branch
+
+    Returns (worktree_path, branch_name).
+    """
+    from cross_eval.worktree import create_worktree, make_branch_name
+    branch_name = make_branch_name(preset_name)
+    worktree_dir = run_dir / "work"
+    worktree_path = create_worktree(
+        base_cwd=cwd, work_dir=worktree_dir, branch_name=branch_name,
+    )
+    return worktree_path, branch_name
+
+
+def _finalize_worktree(
+    cwd: Path,
+    worktree_path: Path,
+    branch_name: str,
+    preset_name: str,
+    final_verdict: str,
+) -> str | None:
+    """Commit changes on the branch, then remove the worktree.
+
+    The branch survives worktree removal and stays in the original repo.
+    Returns the branch name if changes were committed, None otherwise.
+    """
+    from cross_eval.worktree import commit_worktree, remove_worktree
+
+    committed = False
+    try:
+        committed = commit_worktree(
+            worktree_path,
+            f"cross-eval: {preset_name} ({final_verdict})",
+        )
+        if committed:
+            logger.info("  Agentic changes committed on branch: %s", branch_name)
+        else:
+            logger.warning("  No agentic changes to commit (empty diff)")
+    except Exception:
+        logger.warning("  Failed to commit agentic changes", exc_info=True)
+
+    try:
+        remove_worktree(base_cwd=cwd, work_dir=worktree_path)
+    except Exception:
+        logger.warning("Failed to clean up worktree: %s", worktree_path)
+
+    # Check if branch has any commits beyond the base — if not, delete it
+    if not committed:
+        try:
+            # Check if branch has diverged from its base
+            result = subprocess.run(
+                ["git", "log", "--oneline", f"HEAD..{branch_name}"],
+                cwd=cwd, capture_output=True, text=True,
+            )
+            if not result.stdout.strip():
+                # No commits on branch beyond base — clean up
+                subprocess.run(
+                    ["git", "branch", "-D", branch_name],
+                    cwd=cwd, capture_output=True,
+                )
+                logger.info("  Deleted empty branch: %s", branch_name)
+        except Exception:
+            pass  # best-effort cleanup
+
+    return branch_name if committed else None
+
+
 def _run_simple_pipeline(
    config: PipelineConfig,
    run_dir: Path,
@@ -61,6 +166,15 @@ def _run_simple_pipeline(

    set_language(config.language)
    input_contents = _load_inputs(config)
+    runtime_env = _build_runtime_inputs(config, input_contents, cwd or Path(os.getcwd()))
+
+    # Setup shared worktree for agentic mode
+    worktree_path: Path | None = None
+    agentic_branch_name: str | None = None
+    if not dry_run and _has_agentic_steps(config, config.pipeline):
+        worktree_path, agentic_branch_name = _setup_worktree(
+            cwd, run_dir, config.preset_name,
+        )

    feedback = "(no feedback — first iteration)"
    iterations: list[IterationResult] = []
@@ -71,99 +185,114 @@ def _run_simple_pipeline(
    escalated_issues: list[str] = []
    all_feedbacks: list[str] = []

-    for i in range(1, config.max_iterations + 1):
-        config = try_reload_config(config)
-        set_language(config.language)
-        _refresh_inputs(config, input_contents)
+    try:
+        for i in range(1, config.max_iterations + 1):
+            config = try_reload_config(config)
+            set_language(config.language)
+            _refresh_inputs(config, input_contents)
+            runtime_env = _build_runtime_inputs(config, input_contents, cwd)

-        logger.info("=" * 50)
-        logger.info("  Iteration %d/%d", i, config.max_iterations)
-        logger.info("=" * 50)
+            logger.info("=" * 50)
+            logger.info("  Iteration %d/%d", i, config.max_iterations)
+            logger.info("=" * 50)

-        step_outputs, step_results, verdict = _run_steps(
-            config.pipeline, config, input_contents, feedback,
-            i, config.max_iterations, cwd, timeout, dry_run,
-            run_dir=run_dir, output_iter=i,
-        )
+            step_outputs, step_results, verdict = _run_steps(
+                config.pipeline, config, input_contents, feedback,
+                i, config.max_iterations, cwd, timeout, dry_run,
+                run_dir=run_dir, output_iter=i,
+                worktree_path=worktree_path,
+                runtime_env=runtime_env,
+            )

-        iter_result = IterationResult(
-            iteration=i,
-            step_results=step_results,
-            step_outputs=step_outputs,
-            verdict=verdict,
-        )
-        warning = _detect_repeated_aggregate(
-            config.pipeline, step_outputs, aggregate_history, iteration=i,
-        )
-        if warning:
-            iter_result.repeated_aggregate_warning = warning
-            aggregate_warnings.append(warning)
-            logger.warning("  %s", warning)
+            # Intermediate commit so next iteration's diff only shows new changes
+            if worktree_path is not None:
+                _commit_iteration(worktree_path, config.preset_name, i, verdict)

-        iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
-        feedback = iter_result.feedback or feedback
-        all_feedbacks.append(feedback)
+            iter_result = IterationResult(
+                iteration=i,
+                step_results=step_results,
+                step_outputs=step_outputs,
+                verdict=verdict,
+            )
+            warning = _detect_repeated_aggregate(
+                config.pipeline, step_outputs, aggregate_history, iteration=i,
+            )
+            if warning:
+                iter_result.repeated_aggregate_warning = warning
+                aggregate_warnings.append(warning)
+                logger.warning("  %s", warning)

-        # Extract tracker from verdict/review steps for next iteration
-        for step in config.pipeline:
-            if step.verdict or step.role == "review":
-                tracker = _extract_senior_tracker(
-                    step_outputs.get(step.output_key, ""),
-                )
-                if tracker:
-                    input_contents["previous_senior_tracker"] = tracker
+            iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
+            feedback = iter_result.feedback or feedback
+            all_feedbacks.append(feedback)

-        iterations.append(iter_result)
-
-        # ESCALATE check (highest priority)
-        if verdict == "ESCALATE":
-            final_verdict = "ESCALATE"
-            # Extract escalation details from verdict step outputs
+            # Extract tracker from verdict/review steps for next iteration
            for step in config.pipeline:
-                if step.verdict:
-                    esc = _extract_escalated_issues(
+                if step.verdict or step.role == "review":
+                    tracker = _extract_senior_tracker(
                        step_outputs.get(step.output_key, ""),
                    )
-                    if esc:
-                        escalated_issues.append(esc)
-                        iter_result.escalated_issues = esc
-            logger.info("  ESCALATE at iteration %d — stopping loop.", i)
-            break
+                    if tracker:
+                        input_contents["previous_senior_tracker"] = tracker

-        if verdict == "PASS":
-            final_verdict = "PASS"
-            if i >= config.min_iterations:
-                logger.info("  PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
+            iterations.append(iter_result)
+
+            # ESCALATE check (highest priority)
+            if verdict == "ESCALATE":
+                final_verdict = "ESCALATE"
+                for step in config.pipeline:
+                    if step.verdict:
+                        esc = _extract_escalated_issues(
+                            step_outputs.get(step.output_key, ""),
+                        )
+                        if esc:
+                            escalated_issues.append(esc)
+                            iter_result.escalated_issues = esc
+                logger.info("  ESCALATE at iteration %d — stopping loop.", i)
                break
-            else:
-                logger.info(
-                    "  PASS at iteration %d, but min_iterations=%d — continuing",
-                    i, config.min_iterations,
-                )

-        # Auto-escalate: no senior/aggregator + repeated FAIL
-        has_aggregator = config.seniors or any(
-            s.prompt_template == "default:aggregate-review" for s in config.pipeline
-        )
-        if (
-            verdict == "FAIL"
-            and not has_aggregator
-            and i >= 2
-            and _detect_auto_escalate(all_feedbacks[:-1], feedback)
-        ):
-            final_verdict = "ESCALATE"
-            auto_msg = (
-                f"Auto-escalated: same issues detected across {i} iterations "
-                f"without resolution (no senior reviewer configured)."
+            if verdict == "PASS":
+                final_verdict = "PASS"
+                if i >= config.min_iterations:
+                    logger.info("  PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
+                    break
+                else:
+                    logger.info(
+                        "  PASS at iteration %d, but min_iterations=%d — continuing",
+                        i, config.min_iterations,
+                    )
+
+            # Auto-escalate: no senior/aggregator + repeated FAIL
+            has_aggregator = config.seniors or any(
+                s.prompt_template == "default:aggregate-review" for s in config.pipeline
            )
-            escalated_issues.append(auto_msg)
-            iter_result.escalated_issues = auto_msg
-            logger.info("  AUTO-ESCALATE at iteration %d", i)
-            break
+            if (
+                verdict == "FAIL"
+                and not has_aggregator
+                and i >= 2
+                and _detect_auto_escalate(all_feedbacks[:-1], feedback)
+            ):
+                final_verdict = "ESCALATE"
+                auto_msg = (
+                    f"Auto-escalated: same issues detected across {i} iterations "
+                    f"without resolution (no senior reviewer configured)."
+                )
+                escalated_issues.append(auto_msg)
+                iter_result.escalated_issues = auto_msg
+                logger.info("  AUTO-ESCALATE at iteration %d", i)
+                break

-        if dry_run:
-            logger.info("  (dry-run: stopping after iteration 1)")
-            break
+            if dry_run:
+                logger.info("  (dry-run: stopping after iteration 1)")
+                break
+
+    finally:
+        agentic_branch: str | None = None
+        if worktree_path is not None and agentic_branch_name is not None:
+            agentic_branch = _finalize_worktree(
+                cwd, worktree_path, agentic_branch_name,
+                config.preset_name, final_verdict,
+            )

    total_duration = time.monotonic() - start_time

@@ -174,6 +303,7 @@ def _run_simple_pipeline(
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
        escalated_issues=escalated_issues,
+        agentic_branch=agentic_branch,
    )

    if not dry_run:
@@ -195,6 +325,16 @@ def _run_phased_pipeline(

    set_language(config.language)
    input_contents = _load_inputs(config)
+    runtime_env = _build_runtime_inputs(config, input_contents, cwd)
+
+    # Setup shared worktree for agentic mode
+    all_phase_steps = [s for p in config.phases for s in p.steps]
+    worktree_path: Path | None = None
+    agentic_branch_name: str | None = None
+    if not dry_run and _has_agentic_steps(config, all_phase_steps):
+        worktree_path, agentic_branch_name = _setup_worktree(
+            cwd, run_dir, config.preset_name,
+        )

    iterations: list[IterationResult] = []
    feedback = "(no feedback — first iteration)"
@@ -207,152 +347,171 @@ def _run_phased_pipeline(
    all_feedbacks: list[str] = []
    escalated = False

-    for phase_idx, phase in enumerate(config.phases):
-        if escalated:
-            break
+    try:
+        for phase_idx, phase in enumerate(config.phases):
+            if escalated:
+                break

-        logger.info("=" * 60)
-        logger.info(
-            "  Phase: %s (max_iter=%d, consecutive_pass=%d)",
-            phase.name, phase.max_iterations, phase.consecutive_pass,
-        )
-        logger.info("=" * 60)
-
-        consecutive_passes = 0
-        phase_converged = False
-
-        for pi in range(1, phase.max_iterations + 1):
-            global_iter += 1
-
-            config = try_reload_config(config)
-            set_language(config.language)
-            _refresh_inputs(config, input_contents)
-
-            logger.info("-" * 50)
+            logger.info("=" * 60)
            logger.info(
-                "  [%s] Iteration %d/%d (global: v%d)",
-                phase.name, pi, phase.max_iterations, global_iter,
+                "  Phase: %s (max_iter=%d, consecutive_pass=%d)",
+                phase.name, phase.max_iterations, phase.consecutive_pass,
            )
-            logger.info("-" * 50)
+            logger.info("=" * 60)

-            step_outputs, step_results, verdict = _run_steps(
-                phase.steps, config, input_contents, feedback,
-                pi, phase.max_iterations, cwd, timeout, dry_run,
-                run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
-            )
+            consecutive_passes = 0
+            phase_converged = False

-            iter_result = IterationResult(
-                iteration=global_iter,
-                step_results=step_results,
-                step_outputs=step_outputs,
-                verdict=verdict,
-                phase_name=phase.name,
-            )
-            phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
-            warning = _detect_repeated_aggregate(
-                phase.steps, step_outputs, phase_history, iteration=global_iter,
-                phase_name=phase.name,
-            )
-            if warning:
-                iter_result.repeated_aggregate_warning = warning
-                aggregate_warnings.append(warning)
-                logger.warning("  %s", warning)
+            for pi in range(1, phase.max_iterations + 1):
+                global_iter += 1

-            iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
-            feedback = iter_result.feedback or feedback
-            all_feedbacks.append(feedback)
+                config = try_reload_config(config)
+                set_language(config.language)
+                _refresh_inputs(config, input_contents)
+                runtime_env = _build_runtime_inputs(config, input_contents, cwd)

-            # Extract tracker from verdict/review steps
-            for step in phase.steps:
-                if step.verdict or step.role == "review":
-                    tracker = _extract_senior_tracker(
-                        step_outputs.get(step.output_key, ""),
+                logger.info("-" * 50)
+                logger.info(
+                    "  [%s] Iteration %d/%d (global: v%d)",
+                    phase.name, pi, phase.max_iterations, global_iter,
+                )
+                logger.info("-" * 50)
+
+                step_outputs, step_results, verdict = _run_steps(
+                    phase.steps, config, input_contents, feedback,
+                    pi, phase.max_iterations, cwd, timeout, dry_run,
+                    run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
+                    worktree_path=worktree_path,
+                    runtime_env=runtime_env,
+                )
+
+                # Intermediate commit so next iteration's diff only shows new changes
+                if worktree_path is not None:
+                    _commit_iteration(
+                        worktree_path, f"{config.preset_name}/{phase.name}",
+                        global_iter, verdict,
                    )
-                    if tracker:
-                        input_contents["previous_senior_tracker"] = tracker

-            iterations.append(iter_result)
+                iter_result = IterationResult(
+                    iteration=global_iter,
+                    step_results=step_results,
+                    step_outputs=step_outputs,
+                    verdict=verdict,
+                    phase_name=phase.name,
+                )
+                phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
+                warning = _detect_repeated_aggregate(
+                    phase.steps, step_outputs, phase_history, iteration=global_iter,
+                    phase_name=phase.name,
+                )
+                if warning:
+                    iter_result.repeated_aggregate_warning = warning
+                    aggregate_warnings.append(warning)
+                    logger.warning("  %s", warning)

-            # ESCALATE check
-            if verdict == "ESCALATE":
-                final_verdict = "ESCALATE"
+                iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
+                feedback = iter_result.feedback or feedback
+                all_feedbacks.append(feedback)
+
+                # Extract tracker from verdict/review steps
                for step in phase.steps:
-                    if step.verdict:
-                        esc = _extract_escalated_issues(
+                    if step.verdict or step.role == "review":
+                        tracker = _extract_senior_tracker(
                            step_outputs.get(step.output_key, ""),
                        )
-                        if esc:
-                            escalated_issues.append(esc)
-                            iter_result.escalated_issues = esc
-                logger.info(
-                    "  [%s] ESCALATE at iteration %d — stopping.",
-                    phase.name, pi,
-                )
-                escalated = True
-                break
+                        if tracker:
+                            input_contents["previous_senior_tracker"] = tracker

-            if verdict is None:
-                logger.info(
-                    "  [%s] completed (no verdict step; single-pass phase)",
-                    phase.name,
-                )
-                phase_converged = True
-                break
+                iterations.append(iter_result)

-            if verdict == "PASS":
-                consecutive_passes += 1
-                logger.info(
-                    "  [%s] PASS (%d/%d consecutive)",
-                    phase.name, consecutive_passes, phase.consecutive_pass,
-                )
-                if consecutive_passes >= phase.consecutive_pass:
+                # ESCALATE check
+                if verdict == "ESCALATE":
+                    final_verdict = "ESCALATE"
+                    for step in phase.steps:
+                        if step.verdict:
+                            esc = _extract_escalated_issues(
+                                step_outputs.get(step.output_key, ""),
+                            )
+                            if esc:
+                                escalated_issues.append(esc)
+                                iter_result.escalated_issues = esc
                    logger.info(
-                        "  [%s] Converged! %d consecutive PASSes.",
-                        phase.name, phase.consecutive_pass,
+                        "  [%s] ESCALATE at iteration %d — stopping.",
+                        phase.name, pi,
+                    )
+                    escalated = True
+                    break
+
+                if verdict is None:
+                    logger.info(
+                        "  [%s] completed (no verdict step; single-pass phase)",
+                        phase.name,
                    )
                    phase_converged = True
                    break
-            else:
-                consecutive_passes = 0

-            # Auto-escalate in phased pipeline
-            has_aggregator = config.seniors or any(
-                s.prompt_template == "default:aggregate-review" for s in phase.steps
-            )
-            if (
-                verdict == "FAIL"
-                and not has_aggregator
-                and pi >= 2
-                and _detect_auto_escalate(all_feedbacks[:-1], feedback)
-            ):
-                final_verdict = "ESCALATE"
-                auto_msg = (
-                    f"Auto-escalated: same issues detected across {pi} iterations "
-                    f"in phase '{phase.name}' without resolution."
+                if verdict == "PASS":
+                    consecutive_passes += 1
+                    logger.info(
+                        "  [%s] PASS (%d/%d consecutive)",
+                        phase.name, consecutive_passes, phase.consecutive_pass,
+                    )
+                    if consecutive_passes >= phase.consecutive_pass:
+                        logger.info(
+                            "  [%s] Converged! %d consecutive PASSes.",
+                            phase.name, phase.consecutive_pass,
+                        )
+                        phase_converged = True
+                        break
+                else:
+                    consecutive_passes = 0
+
+                # Auto-escalate in phased pipeline
+                has_aggregator = config.seniors or any(
+                    s.prompt_template == "default:aggregate-review" for s in phase.steps
                )
-                escalated_issues.append(auto_msg)
-                iter_result.escalated_issues = auto_msg
-                logger.info("  [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
-                escalated = True
+                if (
+                    verdict == "FAIL"
+                    and not has_aggregator
+                    and pi >= 2
+                    and _detect_auto_escalate(all_feedbacks[:-1], feedback)
+                ):
+                    final_verdict = "ESCALATE"
+                    auto_msg = (
+                        f"Auto-escalated: same issues detected across {pi} iterations "
+                        f"in phase '{phase.name}' without resolution."
+                    )
+                    escalated_issues.append(auto_msg)
+                    iter_result.escalated_issues = auto_msg
+                    logger.info("  [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
+                    escalated = True
+                    break
+
+                if dry_run:
+                    break
+
+            if escalated:
                break

-            if dry_run:
-                break
+            if phase_converged:
+                logger.info("  Phase '%s' completed: CONVERGED", phase.name)
+            else:
+                logger.info(
+                    "  Phase '%s' completed: max iterations (%d) reached",
+                    phase.name, phase.max_iterations,
+                )

-        if escalated:
-            break
+            if phase_idx == len(config.phases) - 1:
+                final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"

-        if phase_converged:
-            logger.info("  Phase '%s' completed: CONVERGED", phase.name)
-        else:
-            logger.info(
-                "  Phase '%s' completed: max iterations (%d) reached",
-                phase.name, phase.max_iterations,
+    finally:
+        agentic_branch: str | None = None
+        if worktree_path is not None and agentic_branch_name is not None:
+            agentic_branch = _finalize_worktree(
+                cwd, worktree_path, agentic_branch_name,
+                config.preset_name, final_verdict,
            )

-        if phase_idx == len(config.phases) - 1:
-            final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"
-
    total_duration = time.monotonic() - start_time

    pipeline_result = PipelineResult(
@@ -362,6 +521,7 @@ def _run_phased_pipeline(
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
        escalated_issues=escalated_issues,
+        agentic_branch=agentic_branch,
    )

    if not dry_run:
@@ -463,6 +623,8 @@ def _run_steps(
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
+    worktree_path: Path | None = None,
+    runtime_env: dict[str, str] | None = None,
 ) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
    """Execute all steps in one iteration, parallelizing where possible."""
    step_outputs: dict[str, str] = {}
@@ -473,21 +635,23 @@ def _run_steps(

    for batch in batches:
        if len(batch) == 1:
-            # Single step — run directly
            step = batch[0]
            _execute_step(
                step, config, input_contents, feedback,
                iteration, max_iterations, cwd, timeout, dry_run,
                step_outputs, step_results,
-                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
+                run_dir=run_dir, output_iter=output_iter,
+                phase_name=phase_name, worktree_path=worktree_path,
+                runtime_env=runtime_env,
            )
        else:
-            # Parallel batch — run with ThreadPoolExecutor
            _execute_parallel_batch(
                batch, config, input_contents, feedback,
                iteration, max_iterations, cwd, timeout, dry_run,
                step_outputs, step_results,
-                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
+                run_dir=run_dir, output_iter=output_iter,
+                phase_name=phase_name, worktree_path=worktree_path,
+                runtime_env=runtime_env,
            )

    # Extract verdict from all verdict steps (ALL must PASS; ESCALATE wins over all)
@@ -506,6 +670,25 @@ def _run_steps(
    return step_outputs, step_results, verdict


+def _invoke_agentic(
+    agent_config: AgentConfig,
+    prompt: str,
+    step_name: str,
+    *,
+    worktree_path: Path,
+    env: dict[str, str] | None = None,
+    timeout: int | None = None,
+    quiet: bool = False,
+) -> AgentResult:
+    """Run an agent in agentic mode using an existing worktree."""
+    return invoke_agent_agentic(
+        agent_config, prompt, step_name,
+        worktree_path=worktree_path,
+        env=env,
+        timeout=timeout, quiet=quiet,
+    )
+
+
 def _execute_step(
    step: StepConfig,
    config: PipelineConfig,
@@ -523,6 +706,8 @@ def _execute_step(
    output_iter: int,
    phase_name: str | None = None,
    quiet: bool = False,
+    worktree_path: Path | None = None,
+    runtime_env: dict[str, str] | None = None,
 ) -> None:
    """Execute a single step, updating step_outputs and step_results in place."""
    if not quiet:
@@ -542,6 +727,7 @@ def _execute_step(

    # 4. Render prompt
    prompt = render_template(template, context)
+    prompt = _augment_prompt_with_runtime_context(prompt, context)

    # 5. Dry run: print and skip
    if dry_run:
@@ -555,10 +741,21 @@ def _execute_step(
    # 6. Invoke agent
    agent_config = config.agents[step.agent]
    try:
-        result = invoke_agent(
-            agent_config, prompt, step.name,
-            cwd=cwd, timeout=timeout, quiet=quiet,
-        )
+        if agent_config.agentic and worktree_path:
+            result = _invoke_agentic(
+                agent_config, prompt, step.name,
+                worktree_path=worktree_path,
+                env=runtime_env,
+                timeout=timeout, quiet=quiet,
+            )
+        else:
+            # When worktree exists, run non-agentic agents (reviewers) in
+            # the worktree too so they can inspect the modified files.
+            effective_cwd = worktree_path if worktree_path else cwd
+            result = invoke_agent(
+                agent_config, prompt, step.name,
+                cwd=effective_cwd, env=runtime_env, timeout=timeout, quiet=quiet,
+            )
    except subprocess.TimeoutExpired as e:
        stdout = (e.stdout or b"") if isinstance(e.stdout, bytes) else (e.stdout or "")
        stderr = (e.stderr or b"") if isinstance(e.stderr, bytes) else (e.stderr or "")
@@ -625,6 +822,8 @@ def _execute_parallel_batch(
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
+    worktree_path: Path | None = None,
+    runtime_env: dict[str, str] | None = None,
 ) -> None:
    """Execute multiple steps in parallel using threads."""
    agent_names = ", ".join(s.agent for s in batch)
@@ -640,6 +839,26 @@ def _execute_parallel_batch(
            )
        return

+    # Agentic steps cannot run in parallel (they share a worktree)
+    agentic_in_batch = [
+        s for s in batch
+        if config.agents.get(s.agent, AgentConfig(name="", command="")).agentic
+    ]
+    if len(agentic_in_batch) > 1:
+        logger.warning(
+            "  [parallel] %d agentic steps cannot run concurrently — running sequentially",
+            len(agentic_in_batch),
+        )
+        for step in batch:
+            _execute_step(
+                step, config, input_contents, feedback,
+                iteration, max_iterations, cwd, timeout, dry_run,
+                step_outputs, step_results,
+                run_dir=run_dir, output_iter=output_iter,
+                phase_name=phase_name, worktree_path=worktree_path,
+            )
+        return
+
    # Snapshot context before parallel execution (all steps see same state)
    context_snapshot = dict(input_contents)
    context_snapshot.update(step_outputs)
@@ -666,12 +885,22 @@ def _execute_parallel_batch(
        if step.context_override:
            context = _apply_context_override(context, step.context_override)
        prompt = render_template(template, context)
+        prompt = _augment_prompt_with_runtime_context(prompt, context)

        agent_config = config.agents[step.agent]
-        result = invoke_agent(
-            agent_config, prompt, step.name,
-            cwd=cwd, timeout=timeout, quiet=True,
-        )
+        if agent_config.agentic and worktree_path:
+            result = _invoke_agentic(
+                agent_config, prompt, step.name,
+                worktree_path=worktree_path,
+                env=runtime_env,
+                timeout=timeout, quiet=True,
+            )
+        else:
+            effective_cwd = worktree_path if worktree_path else cwd
+            result = invoke_agent(
+                agent_config, prompt, step.name,
+                cwd=effective_cwd, env=runtime_env, timeout=timeout, quiet=True,
+            )
        return step.output_key, result.output, result

    with ThreadPoolExecutor(max_workers=len(batch)) as executor:
@@ -765,6 +994,35 @@ def _build_context(
    return context


+def _build_runtime_inputs(
+    config: PipelineConfig,
+    input_contents: dict[str, str],
+    cwd: Path,
+) -> dict[str, str]:
+    """Load runtime env and expose safe execution hints to prompts."""
+    env, loaded_files, loaded_values = build_runtime_environment(config.execution, cwd)
+    input_contents["execution_policy"] = build_execution_policy(config.execution)
+    input_contents["environment_context"] = summarize_environment(
+        config.execution, loaded_files, env, loaded_values,
+    )
+    return env
+
+
+def _augment_prompt_with_runtime_context(
+    prompt: str,
+    context: dict[str, str],
+) -> str:
+    """Append execution/env guidance without requiring every template to include placeholders."""
+    extras: list[str] = []
+    if context.get("execution_policy"):
+        extras.append("## Execution Policy\n" + context["execution_policy"])
+    if context.get("environment_context"):
+        extras.append("## Environment Context\n" + context["environment_context"])
+    if not extras:
+        return prompt
+    return prompt.rstrip() + "\n\n" + "\n\n".join(extras) + "\n"
+
+
 def _apply_context_override(
    context: dict[str, str],
    overrides: dict[str, str],