release: cut 0.2.0 baseline

This commit is contained in:
chungyeong
2026-03-13 21:47:54 +09:00
parent 204e071b74
commit 941304398d
15 changed files with 1930 additions and 270 deletions

View File

@@ -1 +1 @@
__version__ = "0.1.0"
__version__ = "0.2.0"

View File

@@ -3,8 +3,10 @@ from __future__ import annotations
import itertools
import logging
import os
import subprocess
import sys
import tempfile
import threading
import time
from pathlib import Path
@@ -142,11 +144,17 @@ class _Spinner:
sys.stderr.flush()
def _is_print_mode(args: list[str]) -> bool:
"""Check if the agent args include -p / --print flag."""
return "-p" in args or "--print" in args
def invoke_agent(
agent: AgentConfig,
prompt: str,
step_name: str,
cwd: Optional[Path] = None,
env: Optional[dict[str, str]] = None,
timeout: int | None = None,
quiet: bool = False,
) -> AgentResult:
@@ -155,30 +163,67 @@ def invoke_agent(
Args:
quiet: If True, suppress spinner (for parallel execution).
"""
is_claude = "claude" in agent.command
is_interactive = is_claude and not _is_print_mode(agent.args)
cmd = [agent.command]
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
cmd.extend(agent.args)
# Build the full prompt (system prompt + user prompt)
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
# claude: --system-prompt flag supported natively
cmd.extend(["--system-prompt", agent.system_prompt])
input_data = prompt
elif agent.system_prompt:
# codex, others: no --system-prompt flag, prepend to prompt
input_data = (
f"<system>\n{agent.system_prompt}\n</system>\n\n"
f"{prompt}"
# --- Temp files for interactive (non -p) claude ---
task_file: Optional[Path] = None
output_file: Optional[Path] = None
if is_interactive:
# Write prompt + output instruction to temp task file
task_fd, task_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_task_")
task_file = Path(task_path)
os.close(task_fd)
out_fd, out_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_out_")
output_file = Path(out_path)
os.close(out_fd)
# Clear the output file so we can detect if agent wrote to it
output_file.write_text("", encoding="utf-8")
wrapped_prompt = (
f"{prompt}\n\n"
f"---\n"
f"IMPORTANT: Write your COMPLETE response to this file: {output_file}\n"
f"Do NOT modify any other files in the project."
)
task_file.write_text(wrapped_prompt, encoding="utf-8")
# System prompt via flag
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
cmd.extend(["--system-prompt", agent.system_prompt])
# Positional arg: point claude to the task file
cmd.append(
f"Read the task file at {task_file} and follow all instructions in it. "
f"Write your complete output to {output_file}."
)
input_data: str | None = None
else:
input_data = prompt
# Print mode (-p) or non-claude: deliver prompt via stdin
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
cmd.extend(["--system-prompt", agent.system_prompt])
input_data = prompt
elif agent.system_prompt:
input_data = (
f"<system>\n{agent.system_prompt}\n</system>\n\n"
f"{prompt}"
)
else:
input_data = prompt
logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")
spinner: Optional[_Spinner] = None
if not quiet:
logger.info(" cmd: %s", " ".join(cmd[:6]))
mode_label = "interactive" if is_interactive else ""
logger.info(" cmd: %s %s", " ".join(cmd[:6]), f"({mode_label})" if mode_label else "")
spinner = _Spinner(f"[{step_name}] {agent.name} running...")
spinner.start()
@@ -191,6 +236,7 @@ def invoke_agent(
text=True,
timeout=timeout,
cwd=cwd,
env=env,
)
duration = time.monotonic() - start
except subprocess.TimeoutExpired:
@@ -201,10 +247,154 @@ def invoke_agent(
if spinner:
spinner.stop(f"[{step_name}] ERROR")
raise
finally:
if task_file:
task_file.unlink(missing_ok=True)
if result.returncode != 0:
if spinner:
spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
if output_file:
output_file.unlink(missing_ok=True)
err_detail = result.stderr.strip() or result.stdout.strip()
if err_detail and len(err_detail) > 500:
err_detail = err_detail[:500] + "..."
cmd_preview = " ".join(cmd[:6])
failure_type, suggested_action = _classify_agent_failure(err_detail or "")
raise AgentInvocationError(
agent_name=agent.name,
step_name=step_name,
cmd_preview=cmd_preview,
raw_error=err_detail or "(no output)",
failure_type=failure_type,
suggested_action=suggested_action,
)
# --- Capture output ---
if output_file:
output = output_file.read_text(encoding="utf-8").strip()
output_file.unlink(missing_ok=True)
if not output:
# Fallback to stdout if agent didn't write to the file
output = result.stdout.strip()
else:
output = result.stdout.strip()
output = result.stdout.strip()
chars = len(output)
if spinner:
spinner.stop(f"[{step_name}] done — {chars} chars")
if not output:
stderr_info = result.stderr.strip()
if stderr_info:
logger.warning(
"Agent '%s' produced empty output at step '%s'. stderr: %s",
agent.name, step_name, stderr_info[:500],
)
else:
logger.warning(
"Agent '%s' produced empty output at step '%s' (no stderr either)",
agent.name, step_name,
)
return AgentResult(
output=output,
exit_code=result.returncode,
agent_name=agent.name,
step_name=step_name,
duration_seconds=round(duration, 1),
)
def invoke_agent_agentic(
agent: AgentConfig,
prompt: str,
step_name: str,
worktree_path: Path,
env: Optional[dict[str, str]] = None,
timeout: int | None = None,
quiet: bool = False,
) -> AgentResult:
"""Invoke an agent in agentic mode (no -p, runs in worktree, captures git diff).
The agent runs without print mode so it can modify files directly.
After the agent exits, git diff (since last commit) is captured as the output.
"""
from cross_eval.worktree import capture_diff
# Write prompt to a temp file (outside worktree, won't appear in diffs)
import tempfile
task_fd, task_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_task_")
task_file = Path(task_path)
task_file.write_text(prompt, encoding="utf-8")
os.close(task_fd)
cmd = [agent.command]
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
# Strip stdin sentinel ("-") from args for agentic mode
args = [a for a in agent.args if a != "-"]
cmd.extend(args)
# System prompt via flag if supported
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
cmd.extend(["--system-prompt", agent.system_prompt])
# Deliver the prompt differently per agent type
is_codex = "codex" in agent.command
input_data: str | None = None
if is_codex:
# codex: stdin mode
cmd.append("-")
if agent.system_prompt and not _supports_system_prompt_flag(agent.command):
input_data = f"<system>\n{agent.system_prompt}\n</system>\n\n{prompt}"
else:
input_data = prompt
else:
# claude: use positional arg with a pointer to the task file
# (avoids OS arg length limits for large prompts)
cmd.append(
f"Read the task file at {task_file} and execute all instructions in it. "
f"Work in the current directory."
)
logger.debug(
"Invoking agent '%s' (agentic) in worktree: %s",
agent.name, worktree_path,
)
spinner: Optional[_Spinner] = None
if not quiet:
logger.info(" cmd: %s (agentic)", " ".join(cmd[:6]))
spinner = _Spinner(f"[{step_name}] {agent.name} (agentic) running...")
spinner.start()
try:
start = time.monotonic()
result = subprocess.run(
cmd,
input=input_data,
capture_output=True,
text=True,
timeout=timeout,
cwd=worktree_path,
env=env,
)
duration = time.monotonic() - start
except subprocess.TimeoutExpired:
if spinner:
spinner.stop(f"[{step_name}] TIMEOUT after {timeout}s")
raise
except Exception:
if spinner:
spinner.stop(f"[{step_name}] ERROR")
raise
finally:
# Clean up temp task file (it's in /tmp, not in worktree)
task_file.unlink(missing_ok=True)
if result.returncode != 0:
if spinner:
spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
@@ -222,17 +412,22 @@ def invoke_agent(
suggested_action=suggested_action,
)
if spinner:
spinner.stop(f"[{step_name}] done — {chars} chars")
# Capture git diff as the output (changes since last commit on the branch)
diff_output = capture_diff(worktree_path)
if not output:
if not diff_output:
diff_output = "(no changes)"
logger.warning(
"Agent '%s' produced empty output at step '%s'",
"Agent '%s' made no file changes at step '%s'",
agent.name, step_name,
)
chars = len(diff_output)
if spinner:
spinner.stop(f"[{step_name}] done — {chars} chars (agentic)")
return AgentResult(
output=output,
output=diff_output,
exit_code=result.returncode,
agent_name=agent.name,
step_name=step_name,

View File

@@ -49,7 +49,7 @@ max_iterations: 3
language: {language}
# 결과 저장 경로
output_dir: output
output_dir: .cross-eval/output
# ─── 커스텀 에이전트 (선택) ────────────────────────────────────
# 기본 제공 에이전트를 덮어쓰거나 새 에이전트를 정의할 수 있습니다.
@@ -372,6 +372,14 @@ def main(argv: list[str] | None = None) -> int:
"--input", action="append", dest="inputs", metavar="KEY=PATH",
help="추가 입력 파일 (예: --input spec=./api-spec.md)",
)
input_group.add_argument(
"--env-file", action="append", dest="env_files", type=Path, default=None,
help="에이전트 subprocess에 주입할 추가 .env 파일 (여러 개 가능)",
)
input_group.add_argument(
"--target", action="append", dest="execution_targets", default=None,
help="에이전트에게 강조할 실행 대상 힌트 (예: clickhouse, postgres)",
)
# -- 에이전트 설정 --
agent_group = run_parser.add_argument_group(
@@ -410,6 +418,10 @@ def main(argv: list[str] | None = None) -> int:
choices=REASONING_EFFORT_CHOICES + ("extra-high", "extra_high", "x-high"),
help="Senior용 reasoning effort",
)
agent_group.add_argument(
"--agentic", action="store_true", default=False,
help="Coder를 agentic 모드로 실행 (worktree에서 파일 직접 수정, git diff로 결과 캡처)",
)
agent_group.add_argument(
"--model", default=None, metavar="MODEL",
help="모든 에이전트의 모델을 한번에 변경 (예: sonnet, opus)",
@@ -761,7 +773,7 @@ def _generate_guided_config(
"",
f"max_iterations: {settings['max_iter']}",
f"language: {lang}",
"output_dir: output",
"output_dir: .cross-eval/output",
"",
])
@@ -799,20 +811,19 @@ def _apply_model_override(config, agent_name: str, model: str) -> None:
def _apply_phased_iteration_override(config, max_iter: int | None) -> None:
"""Apply CLI max-iter to converging phases while preserving setup phases."""
if max_iter is None:
return
from cross_eval.config import sync_phased_iterations
for phase in config.phases:
if any(step.verdict for step in phase.steps):
phase.max_iterations = max_iter
sync_phased_iterations(config, max_iter)
def cmd_run(args: argparse.Namespace) -> int:
"""Load config, validate, and execute the pipeline."""
from cross_eval.config import (
ensure_fix_preset_agentic,
apply_input_overrides,
default_config,
load_config,
sync_phased_iterations,
validate_config,
)
from cross_eval.prompts import PIPELINE_PRESETS
@@ -917,6 +928,10 @@ def cmd_run(args: argparse.Namespace) -> int:
if preset in {"plan-review", "review-only"} and args.max_iter is None and args.min_iter is None:
config.max_iterations = 1
sync_phased_iterations(config)
if args.max_iter is not None:
sync_phased_iterations(config, args.max_iter)
apply_reasoning_effort_settings(
config,
reasoning_effort=args.reasoning_effort,
@@ -925,6 +940,15 @@ def cmd_run(args: argparse.Namespace) -> int:
senior_effort=args.senior_effort,
)
# --agentic: convert coder agents to agentic mode
if args.agentic:
from cross_eval.config import _make_agentic
for coder_name in config.coders:
if coder_name in config.agents:
_make_agentic(config.agents[coder_name])
ensure_fix_preset_agentic(config)
# --model: apply to ALL agents
if args.model is not None:
for agent_name in config.agents:
@@ -958,6 +982,17 @@ def cmd_run(args: argparse.Namespace) -> int:
return 1
config.inputs["docs"] = docs_content
if args.env_files:
for env_file in args.env_files:
resolved = env_file.resolve()
if not resolved.exists():
print(f"Env file not found: {resolved}", file=sys.stderr)
return 1
config.execution.env_files.append(str(resolved))
if args.execution_targets:
config.execution.auto_context_targets = list(args.execution_targets)
if args.inputs:
overrides = {}
for item in args.inputs:

View File

@@ -1,6 +1,7 @@
"""Configuration loading, validation, and preset resolution."""
from __future__ import annotations
import copy
import logging
import re
from pathlib import Path
@@ -8,7 +9,13 @@ from typing import Any
import yaml
from cross_eval.models import AgentConfig, PhaseConfig, PipelineConfig, StepConfig
from cross_eval.models import (
AgentConfig,
ExecutionConfig,
PhaseConfig,
PipelineConfig,
StepConfig,
)
from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS
logger = logging.getLogger(__name__)
@@ -24,6 +31,7 @@ DEFAULT_ROLE_REASONING_EFFORTS = {
"reviewer": "medium",
"senior": "high",
}
FIX_STYLE_PRESETS = {"review-fix", "coding-review-fix"}
# ---------------------------------------------------------------------------
@@ -54,7 +62,12 @@ _CLAUDE_CODER_ARGS = list(_CLAUDE_BASE_ARGS) + [
"bypassPermissions",
]
_CLAUDE_REVIEW_ARGS = list(_CLAUDE_BASE_ARGS) + [
_CLAUDE_REVIEW_ARGS = [
"--setting-sources",
"user",
"--disable-slash-commands",
"--model",
"opus",
"--permission-mode",
"plan",
]
@@ -64,29 +77,37 @@ _CODER_SYSTEM_PROMPT = (
"Rules:\n"
"1. FIRST explore the project directory to understand the existing codebase, "
"patterns, and conventions before writing any code.\n"
"2. Implement ONLY what the plan specifies. Do NOT add extra features, "
"2. You may decide which shell, Python, git, docker, test, and database commands "
"to run. The user does not need to pre-specify exact commands.\n"
"3. Environment variables from configured .env files may already be loaded into "
"your process; use them when validating services such as ClickHouse.\n"
"4. Implement ONLY what the plan specifies. Do NOT add extra features, "
"unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
"3. Follow the project's existing coding style, naming conventions, and directory structure.\n"
"4. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
"5. Follow the project's existing coding style, naming conventions, and directory structure.\n"
"6. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
"Do NOT refactor unrelated code.\n"
"5. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
"6. When in doubt about scope, do LESS, not more."
"7. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
"8. When in doubt about scope, do LESS, not more."
)
_REVIEWER_SYSTEM_PROMPT = (
"You are a code reviewer. You MUST NOT create, modify, or delete any files.\n"
"Rules:\n"
"1. Explore the project directory to understand the full codebase context.\n"
"2. Compare the implementation against the plan and checklist ONLY.\n"
"3. Classify every issue with BOTH severity AND category:\n"
"2. You may decide which shell, Python, test, git, docker, and database read commands "
"to run in order to verify behavior. The user does not need to pre-specify exact commands.\n"
"3. Environment variables from configured .env files may already be loaded into "
"your process; use them for verification when relevant.\n"
"4. Compare the implementation against the plan and checklist ONLY.\n"
"5. Classify every issue with BOTH severity AND category:\n"
" - Severity: Critical (breaks functionality/security) > Major (requirement mismatch) > Minor (convention/style)\n"
" - Category: Over-engineering / Omission\n"
"4. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
"6. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
"or DISMISSED (false positive) with rationale.\n"
"5. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
"6. Order issues by severity (Critical first).\n"
"7. Do NOT suggest improvements beyond the plan scope.\n"
"8. End with VERDICT: PASS (all requirements met, no over-engineering) "
"7. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
"8. Order issues by severity (Critical first).\n"
"9. Do NOT suggest improvements beyond the plan scope.\n"
"10. End with VERDICT: PASS (all requirements met, no over-engineering) "
"or VERDICT: FAIL (issues found)."
)
@@ -94,16 +115,20 @@ _SENIOR_SYSTEM_PROMPT = (
"You are a senior technical reviewer coordinating a review-fix-verification loop.\n"
"Rules:\n"
"1. Explore the project directory to understand the full codebase context.\n"
"2. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
"2. You may decide which shell, Python, test, git, docker, and database read commands "
"to run to verify disputed issues. The user does not need to pre-specify exact commands.\n"
"3. Environment variables from configured .env files may already be loaded into "
"your process; use them when validating service integrations.\n"
"4. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
"evidence-backed issues. Categorize dismissed findings as [False positive] or [Already fixed].\n"
"3. In verification mode, judge the current implementation directly against ONLY the "
"5. In verification mode, judge the current implementation directly against ONLY the "
"plan and checklist.\n"
"4. Be skeptical of false positives, but do not lower the bar on real requirement "
"6. Be skeptical of false positives, but do not lower the bar on real requirement "
"gaps.\n"
"5. When issues remain, produce a concise prioritized action list the coder can act on.\n"
"6. Maintain an Issue Tracker table across iterations to track issue status.\n"
"7. Do NOT invent new requirements beyond the plan and checklist.\n"
"8. End with one of three verdicts:\n"
"7. When issues remain, produce a concise prioritized action list the coder can act on.\n"
"8. Maintain an Issue Tracker table across iterations to track issue status.\n"
"9. Do NOT invent new requirements beyond the plan and checklist.\n"
"10. End with one of three verdicts:\n"
" - VERDICT: PASS — all requirements met, no issues remain.\n"
" - VERDICT: FAIL — issues found that the coder can fix.\n"
" - VERDICT: ESCALATE — issues that require human intervention. Use ESCALATE when:\n"
@@ -263,7 +288,7 @@ def _resolve_agents(
for name in all_referenced:
if name not in result and name in BUILTIN_AGENTS:
result[name] = BUILTIN_AGENTS[name]
result[name] = copy.deepcopy(BUILTIN_AGENTS[name])
return result
@@ -354,15 +379,16 @@ def _apply_role_effort(
def default_config() -> PipelineConfig:
"""Return a PipelineConfig with sensible defaults (no YAML needed)."""
agents = dict(BUILTIN_AGENTS)
agents = copy.deepcopy(BUILTIN_AGENTS)
coders = ["claude-coder"]
reviewers = ["claude-reviewer"]
seniors: list[str] = []
pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
return PipelineConfig(
output_dir=Path("output"),
output_dir=Path(".cross-eval/output"),
max_iterations=3,
language="ko",
execution=ExecutionConfig(),
inputs={},
agents=agents,
coders=coders,
@@ -406,6 +432,7 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
system_prompt=agent_data.get("system_prompt"),
reasoning_effort=agent_data.get("reasoning_effort"),
stdin_mode=agent_data.get("stdin_mode", False),
agentic=agent_data.get("agentic", False),
)
# --- roles: explicit or inferred ---
@@ -445,6 +472,17 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
p = config_dir / p
inputs[key] = p
execution_raw = raw.get("execution", {}) or {}
execution = ExecutionConfig(
mode=execution_raw.get("mode", "agent-decides"),
command_policy=execution_raw.get("command_policy", "broad"),
inherit_env=bool(execution_raw.get("inherit_env", True)),
auto_env_files=list(execution_raw.get("auto_env_files", [".env", ".env.local"])),
env_files=list(execution_raw.get("env_files", [])),
expose_env_names=bool(execution_raw.get("expose_env_names", True)),
auto_context_targets=list(execution_raw.get("auto_context_targets", [])),
)
# --- pipeline (preset or custom) ---
steps, phases = _resolve_pipeline(pipeline_raw, coders, reviewers, seniors)
@@ -453,12 +491,13 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
preset_name = pipeline_raw.split(":", 1)[1]
return PipelineConfig(
output_dir=Path(raw.get("output_dir", "output")),
config = PipelineConfig(
output_dir=Path(raw.get("output_dir", ".cross-eval/output")),
max_iterations=int(raw.get("max_iterations", 3)),
min_iterations=int(raw.get("min_iterations", 1)),
verbose=bool(raw.get("verbose", False)),
language=raw.get("language", "en"),
execution=execution,
inputs=inputs,
agents=agents,
coders=coders,
@@ -470,6 +509,9 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
_config_path=config_path,
_config_mtime=config_path.stat().st_mtime,
)
sync_phased_iterations(config)
ensure_fix_preset_agentic(config)
return config
def try_reload_config(config: PipelineConfig) -> PipelineConfig:
@@ -619,6 +661,16 @@ def validate_config(config: PipelineConfig) -> list[str]:
if config.language not in ("en", "ko"):
errors.append(f"Unsupported language '{config.language}'. Use 'en' or 'ko'.")
if config.execution.mode not in {"agent-decides"}:
errors.append(
f"Unsupported execution.mode '{config.execution.mode}'. Use 'agent-decides'."
)
if config.execution.command_policy not in {"broad", "restricted"}:
errors.append(
"Unsupported execution.command_policy "
f"'{config.execution.command_policy}'. Use 'broad' or 'restricted'."
)
return errors
@@ -642,6 +694,37 @@ def _validate_unique_step_fields(
seen_output_keys.add(step.output_key)
def _make_agentic(agent: AgentConfig) -> None:
"""Convert an agent to agentic mode in-place (remove -p, set agentic=True)."""
agent.agentic = True
agent.args = [a for a in agent.args if a != "-p"]
def sync_phased_iterations(
    config: PipelineConfig,
    max_iter: int | None = None,
) -> None:
    """Push the effective iteration cap onto every converging phase.

    A phase is considered converging when at least one of its steps has a
    truthy ``verdict``; phases without such a step are left untouched.

    Args:
        config: Pipeline configuration whose ``phases`` are updated in-place.
        max_iter: Explicit cap to apply. When None, ``config.max_iterations``
            is used instead.
    """
    phases = config.phases
    if not phases:
        return

    limit = max_iter if max_iter is not None else config.max_iterations
    converging = (p for p in phases if any(s.verdict for s in p.steps))
    for phase in converging:
        phase.max_iterations = limit
def ensure_fix_preset_agentic(config: PipelineConfig) -> None:
    """Switch coders to agentic mode when the config uses a fix-style preset.

    Does nothing unless ``config.preset_name`` is one of ``FIX_STYLE_PRESETS``.
    Coder names with no entry in ``config.agents`` and coders that are already
    agentic are skipped.

    Args:
        config: Pipeline configuration whose coder agents may be mutated.
    """
    if config.preset_name in FIX_STYLE_PRESETS:
        known_coders = (config.agents.get(name) for name in config.coders)
        for coder in known_coders:
            if coder is None or coder.agentic:
                continue
            _make_agentic(coder)
def apply_input_overrides(
config: PipelineConfig, overrides: dict[str, str]
) -> None:

View File

@@ -265,7 +265,7 @@ def run_live_demo(
checklist_path.write_text(DEMO_CHECKLIST, encoding="utf-8")
config = PipelineConfig(
output_dir=Path("output"),
output_dir=Path(".cross-eval/output"),
max_iterations=3,
language="en",
inputs={"plan": plan_path, "checklist": checklist_path},

View File

@@ -16,6 +16,7 @@ class AgentConfig:
system_prompt: Optional[str] = None
reasoning_effort: Optional[str] = None
stdin_mode: bool = False
agentic: bool = False # run in worktree, capture git diff instead of stdout
@dataclass
@@ -43,15 +44,29 @@ class PhaseConfig:
consecutive_pass: int = 1 # stop after N consecutive PASSes
@dataclass
class ExecutionConfig:
    """Execution-policy settings applied to agent subprocesses at runtime."""

    # Config validation accepts only "agent-decides" here.
    mode: str = "agent-decides"
    # Config validation accepts "broad" or "restricted".
    command_policy: str = "broad"
    # NOTE(review): presumably controls whether the parent process
    # environment is passed on to agents -- confirm in the runtime env builder.
    inherit_env: bool = True
    # NOTE(review): looks like env files picked up without being listed
    # explicitly -- confirm how these names are resolved.
    auto_env_files: list[str] = field(
        default_factory=lambda: [".env", ".env.local"],
    )
    # Additional env files; the CLI --env-file option appends resolved paths.
    env_files: list[str] = field(default_factory=list)
    # NOTE(review): semantics not visible in this view -- confirm.
    expose_env_names: bool = True
    # Execution-target hints; the CLI --target option replaces this list.
    auto_context_targets: list[str] = field(default_factory=list)
@dataclass
class PipelineConfig:
"""Full cross-eval configuration."""
output_dir: Path = field(default_factory=lambda: Path("output"))
output_dir: Path = field(default_factory=lambda: Path(".cross-eval/output"))
max_iterations: int = 3
min_iterations: int = 1
verbose: bool = False
language: str = "en" # "en" or "ko"
execution: ExecutionConfig = field(default_factory=ExecutionConfig)
inputs: dict[str, Path | str] = field(default_factory=dict)
agents: dict[str, AgentConfig] = field(default_factory=dict)
coders: list[str] = field(default_factory=list)
@@ -118,3 +133,4 @@ class PipelineResult:
run_dir: Optional[Path] = None
repeated_aggregate_warnings: list[str] = field(default_factory=list)
escalated_issues: list[str] = field(default_factory=list)
agentic_branch: Optional[str] = None

View File

@@ -10,9 +10,11 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from cross_eval.agent import AgentInvocationError, invoke_agent
from cross_eval.agent import AgentInvocationError, invoke_agent, invoke_agent_agentic
from cross_eval.worktree import WorktreeError
from cross_eval.config import try_reload_config
from cross_eval.models import (
AgentConfig,
AgentResult,
IterationResult,
PipelineConfig,
@@ -21,6 +23,11 @@ from cross_eval.models import (
)
from cross_eval.prompts import render_template, resolve_template, set_language
from cross_eval.report import build_report
from cross_eval.runtime_env import (
build_execution_policy,
build_runtime_environment,
summarize_environment,
)
logger = logging.getLogger(__name__)
@@ -48,6 +55,104 @@ def _make_run_dir(config: PipelineConfig) -> Path:
return run_dir
def _commit_iteration(
    worktree_path: Path,
    label: str,
    iteration: int,
    verdict: str | None,
) -> None:
    """Record an intermediate commit once an agentic iteration has finished.

    Committing here moves the diff baseline forward, so the diff captured in
    the next iteration contains only that iteration's changes.

    Args:
        worktree_path: Worktree whose pending changes are committed.
        label: Name placed in the commit message (callers pass the preset name).
        iteration: Iteration number, rendered as ``v<iteration>``.
        verdict: Verdict of the iteration; ``no-verdict`` is used in the
            commit message when it is None or empty.
    """
    from cross_eval.worktree import commit_worktree

    verdict_tag = verdict or "no-verdict"
    message = f"cross-eval: {label} v{iteration} ({verdict_tag})"
    if commit_worktree(worktree_path, message):
        logger.debug(" Intermediate commit: v%d (%s)", iteration, verdict)
def _has_agentic_steps(config: PipelineConfig, steps: list[StepConfig]) -> bool:
"""Check if any step uses an agentic agent."""
return any(
config.agents.get(s.agent, AgentConfig(name="", command="")).agentic
for s in steps
)
def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, str]:
    """Prepare the single worktree shared by every step of a pipeline run.

    The branch name is derived from ``preset_name`` by ``make_branch_name``
    and the worktree is placed at ``run_dir / "work"``.
    NOTE(review): the branch-name format (``cross-eval/<preset>_<timestamp>``)
    and branching from HEAD are implemented in ``cross_eval.worktree`` --
    confirm there.

    Args:
        cwd: Base repository directory handed to ``create_worktree``.
        run_dir: Directory of the current run; the worktree lives beneath it.
        preset_name: Preset name used to build the branch name.

    Returns:
        A ``(worktree_path, branch_name)`` tuple.
    """
    from cross_eval.worktree import create_worktree, make_branch_name

    branch = make_branch_name(preset_name)
    checkout = create_worktree(
        base_cwd=cwd,
        work_dir=run_dir / "work",
        branch_name=branch,
    )
    return checkout, branch
def _finalize_worktree(
    cwd: Path,
    worktree_path: Path,
    branch_name: str,
    preset_name: str,
    final_verdict: str,
) -> str | None:
    """Commit pending changes on the agentic branch, then remove the worktree.

    The branch survives worktree removal and stays in the original repo.
    Every stage is best-effort: failures are logged and never raised, because
    this runs from a ``finally`` block and must not mask the pipeline's own
    exception.

    Args:
        cwd: Base repository directory (where the branch lives).
        worktree_path: Worktree to commit from and then remove.
        branch_name: Branch the worktree was created on.
        preset_name: Preset name used in the final commit message.
        final_verdict: Final verdict used in the final commit message.

    Returns:
        ``branch_name`` when the branch carries agentic commits -- either the
        final commit made here or commits recorded earlier during the run --
        and None when the branch holds nothing beyond its base (or that could
        not be determined).
    """
    from cross_eval.worktree import commit_worktree, remove_worktree

    committed = False
    try:
        committed = commit_worktree(
            worktree_path,
            f"cross-eval: {preset_name} ({final_verdict})",
        )
        if committed:
            logger.info(" Agentic changes committed on branch: %s", branch_name)
        else:
            logger.warning(" No agentic changes to commit (empty diff)")
    except Exception:
        logger.warning(" Failed to commit agentic changes", exc_info=True)

    try:
        remove_worktree(base_cwd=cwd, work_dir=worktree_path)
    except Exception:
        logger.warning(
            "Failed to clean up worktree: %s", worktree_path, exc_info=True,
        )

    if committed:
        return branch_name

    # Nothing new was committed just now, but per-iteration commits made
    # earlier in the run may already sit on the branch. Report the branch in
    # that case instead of dropping it from the result.
    branch_has_commits = False
    try:
        ahead = subprocess.run(
            ["git", "log", "--oneline", f"HEAD..{branch_name}"],
            cwd=cwd, capture_output=True, text=True,
        )
        if ahead.returncode != 0:
            # Inconclusive check: keep the branch rather than force-delete it.
            logger.debug(
                " Could not inspect branch %s: %s",
                branch_name, ahead.stderr.strip(),
            )
        elif ahead.stdout.strip():
            branch_has_commits = True
        else:
            # No commits beyond the base -- remove the empty branch.
            deleted = subprocess.run(
                ["git", "branch", "-D", branch_name],
                cwd=cwd, capture_output=True, text=True,
            )
            if deleted.returncode == 0:
                logger.info(" Deleted empty branch: %s", branch_name)
            else:
                logger.debug(
                    " Could not delete empty branch %s: %s",
                    branch_name, deleted.stderr.strip(),
                )
    except Exception:
        # Still best-effort, but leave a trace instead of swallowing silently.
        logger.debug(" Branch cleanup failed for %s", branch_name, exc_info=True)

    return branch_name if branch_has_commits else None
def _run_simple_pipeline(
config: PipelineConfig,
run_dir: Path,
@@ -61,6 +166,15 @@ def _run_simple_pipeline(
set_language(config.language)
input_contents = _load_inputs(config)
runtime_env = _build_runtime_inputs(config, input_contents, cwd or Path(os.getcwd()))
# Setup shared worktree for agentic mode
worktree_path: Path | None = None
agentic_branch_name: str | None = None
if not dry_run and _has_agentic_steps(config, config.pipeline):
worktree_path, agentic_branch_name = _setup_worktree(
cwd, run_dir, config.preset_name,
)
feedback = "(no feedback — first iteration)"
iterations: list[IterationResult] = []
@@ -71,99 +185,114 @@ def _run_simple_pipeline(
escalated_issues: list[str] = []
all_feedbacks: list[str] = []
for i in range(1, config.max_iterations + 1):
config = try_reload_config(config)
set_language(config.language)
_refresh_inputs(config, input_contents)
try:
for i in range(1, config.max_iterations + 1):
config = try_reload_config(config)
set_language(config.language)
_refresh_inputs(config, input_contents)
runtime_env = _build_runtime_inputs(config, input_contents, cwd)
logger.info("=" * 50)
logger.info(" Iteration %d/%d", i, config.max_iterations)
logger.info("=" * 50)
logger.info("=" * 50)
logger.info(" Iteration %d/%d", i, config.max_iterations)
logger.info("=" * 50)
step_outputs, step_results, verdict = _run_steps(
config.pipeline, config, input_contents, feedback,
i, config.max_iterations, cwd, timeout, dry_run,
run_dir=run_dir, output_iter=i,
)
step_outputs, step_results, verdict = _run_steps(
config.pipeline, config, input_contents, feedback,
i, config.max_iterations, cwd, timeout, dry_run,
run_dir=run_dir, output_iter=i,
worktree_path=worktree_path,
runtime_env=runtime_env,
)
iter_result = IterationResult(
iteration=i,
step_results=step_results,
step_outputs=step_outputs,
verdict=verdict,
)
warning = _detect_repeated_aggregate(
config.pipeline, step_outputs, aggregate_history, iteration=i,
)
if warning:
iter_result.repeated_aggregate_warning = warning
aggregate_warnings.append(warning)
logger.warning(" %s", warning)
# Intermediate commit so next iteration's diff only shows new changes
if worktree_path is not None:
_commit_iteration(worktree_path, config.preset_name, i, verdict)
iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
feedback = iter_result.feedback or feedback
all_feedbacks.append(feedback)
iter_result = IterationResult(
iteration=i,
step_results=step_results,
step_outputs=step_outputs,
verdict=verdict,
)
warning = _detect_repeated_aggregate(
config.pipeline, step_outputs, aggregate_history, iteration=i,
)
if warning:
iter_result.repeated_aggregate_warning = warning
aggregate_warnings.append(warning)
logger.warning(" %s", warning)
# Extract tracker from verdict/review steps for next iteration
for step in config.pipeline:
if step.verdict or step.role == "review":
tracker = _extract_senior_tracker(
step_outputs.get(step.output_key, ""),
)
if tracker:
input_contents["previous_senior_tracker"] = tracker
iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
feedback = iter_result.feedback or feedback
all_feedbacks.append(feedback)
iterations.append(iter_result)
# ESCALATE check (highest priority)
if verdict == "ESCALATE":
final_verdict = "ESCALATE"
# Extract escalation details from verdict step outputs
# Extract tracker from verdict/review steps for next iteration
for step in config.pipeline:
if step.verdict:
esc = _extract_escalated_issues(
if step.verdict or step.role == "review":
tracker = _extract_senior_tracker(
step_outputs.get(step.output_key, ""),
)
if esc:
escalated_issues.append(esc)
iter_result.escalated_issues = esc
logger.info(" ESCALATE at iteration %d — stopping loop.", i)
break
if tracker:
input_contents["previous_senior_tracker"] = tracker
if verdict == "PASS":
final_verdict = "PASS"
if i >= config.min_iterations:
logger.info(" PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
iterations.append(iter_result)
# ESCALATE check (highest priority)
if verdict == "ESCALATE":
final_verdict = "ESCALATE"
for step in config.pipeline:
if step.verdict:
esc = _extract_escalated_issues(
step_outputs.get(step.output_key, ""),
)
if esc:
escalated_issues.append(esc)
iter_result.escalated_issues = esc
logger.info(" ESCALATE at iteration %d — stopping loop.", i)
break
else:
logger.info(
" PASS at iteration %d, but min_iterations=%d — continuing",
i, config.min_iterations,
)
# Auto-escalate: no senior/aggregator + repeated FAIL
has_aggregator = config.seniors or any(
s.prompt_template == "default:aggregate-review" for s in config.pipeline
)
if (
verdict == "FAIL"
and not has_aggregator
and i >= 2
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
):
final_verdict = "ESCALATE"
auto_msg = (
f"Auto-escalated: same issues detected across {i} iterations "
f"without resolution (no senior reviewer configured)."
if verdict == "PASS":
final_verdict = "PASS"
if i >= config.min_iterations:
logger.info(" PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
break
else:
logger.info(
" PASS at iteration %d, but min_iterations=%d — continuing",
i, config.min_iterations,
)
# Auto-escalate: no senior/aggregator + repeated FAIL
has_aggregator = config.seniors or any(
s.prompt_template == "default:aggregate-review" for s in config.pipeline
)
escalated_issues.append(auto_msg)
iter_result.escalated_issues = auto_msg
logger.info(" AUTO-ESCALATE at iteration %d", i)
break
if (
verdict == "FAIL"
and not has_aggregator
and i >= 2
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
):
final_verdict = "ESCALATE"
auto_msg = (
f"Auto-escalated: same issues detected across {i} iterations "
f"without resolution (no senior reviewer configured)."
)
escalated_issues.append(auto_msg)
iter_result.escalated_issues = auto_msg
logger.info(" AUTO-ESCALATE at iteration %d", i)
break
if dry_run:
logger.info(" (dry-run: stopping after iteration 1)")
break
if dry_run:
logger.info(" (dry-run: stopping after iteration 1)")
break
finally:
agentic_branch: str | None = None
if worktree_path is not None and agentic_branch_name is not None:
agentic_branch = _finalize_worktree(
cwd, worktree_path, agentic_branch_name,
config.preset_name, final_verdict,
)
total_duration = time.monotonic() - start_time
@@ -174,6 +303,7 @@ def _run_simple_pipeline(
run_dir=run_dir,
repeated_aggregate_warnings=aggregate_warnings,
escalated_issues=escalated_issues,
agentic_branch=agentic_branch,
)
if not dry_run:
@@ -195,6 +325,16 @@ def _run_phased_pipeline(
set_language(config.language)
input_contents = _load_inputs(config)
runtime_env = _build_runtime_inputs(config, input_contents, cwd)
# Setup shared worktree for agentic mode
all_phase_steps = [s for p in config.phases for s in p.steps]
worktree_path: Path | None = None
agentic_branch_name: str | None = None
if not dry_run and _has_agentic_steps(config, all_phase_steps):
worktree_path, agentic_branch_name = _setup_worktree(
cwd, run_dir, config.preset_name,
)
iterations: list[IterationResult] = []
feedback = "(no feedback — first iteration)"
@@ -207,152 +347,171 @@ def _run_phased_pipeline(
all_feedbacks: list[str] = []
escalated = False
for phase_idx, phase in enumerate(config.phases):
if escalated:
break
try:
for phase_idx, phase in enumerate(config.phases):
if escalated:
break
logger.info("=" * 60)
logger.info(
" Phase: %s (max_iter=%d, consecutive_pass=%d)",
phase.name, phase.max_iterations, phase.consecutive_pass,
)
logger.info("=" * 60)
consecutive_passes = 0
phase_converged = False
for pi in range(1, phase.max_iterations + 1):
global_iter += 1
config = try_reload_config(config)
set_language(config.language)
_refresh_inputs(config, input_contents)
logger.info("-" * 50)
logger.info("=" * 60)
logger.info(
" [%s] Iteration %d/%d (global: v%d)",
phase.name, pi, phase.max_iterations, global_iter,
" Phase: %s (max_iter=%d, consecutive_pass=%d)",
phase.name, phase.max_iterations, phase.consecutive_pass,
)
logger.info("-" * 50)
logger.info("=" * 60)
step_outputs, step_results, verdict = _run_steps(
phase.steps, config, input_contents, feedback,
pi, phase.max_iterations, cwd, timeout, dry_run,
run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
)
consecutive_passes = 0
phase_converged = False
iter_result = IterationResult(
iteration=global_iter,
step_results=step_results,
step_outputs=step_outputs,
verdict=verdict,
phase_name=phase.name,
)
phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
warning = _detect_repeated_aggregate(
phase.steps, step_outputs, phase_history, iteration=global_iter,
phase_name=phase.name,
)
if warning:
iter_result.repeated_aggregate_warning = warning
aggregate_warnings.append(warning)
logger.warning(" %s", warning)
for pi in range(1, phase.max_iterations + 1):
global_iter += 1
iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
feedback = iter_result.feedback or feedback
all_feedbacks.append(feedback)
config = try_reload_config(config)
set_language(config.language)
_refresh_inputs(config, input_contents)
runtime_env = _build_runtime_inputs(config, input_contents, cwd)
# Extract tracker from verdict/review steps
for step in phase.steps:
if step.verdict or step.role == "review":
tracker = _extract_senior_tracker(
step_outputs.get(step.output_key, ""),
logger.info("-" * 50)
logger.info(
" [%s] Iteration %d/%d (global: v%d)",
phase.name, pi, phase.max_iterations, global_iter,
)
logger.info("-" * 50)
step_outputs, step_results, verdict = _run_steps(
phase.steps, config, input_contents, feedback,
pi, phase.max_iterations, cwd, timeout, dry_run,
run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
worktree_path=worktree_path,
runtime_env=runtime_env,
)
# Intermediate commit so next iteration's diff only shows new changes
if worktree_path is not None:
_commit_iteration(
worktree_path, f"{config.preset_name}/{phase.name}",
global_iter, verdict,
)
if tracker:
input_contents["previous_senior_tracker"] = tracker
iterations.append(iter_result)
iter_result = IterationResult(
iteration=global_iter,
step_results=step_results,
step_outputs=step_outputs,
verdict=verdict,
phase_name=phase.name,
)
phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
warning = _detect_repeated_aggregate(
phase.steps, step_outputs, phase_history, iteration=global_iter,
phase_name=phase.name,
)
if warning:
iter_result.repeated_aggregate_warning = warning
aggregate_warnings.append(warning)
logger.warning(" %s", warning)
# ESCALATE check
if verdict == "ESCALATE":
final_verdict = "ESCALATE"
iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
feedback = iter_result.feedback or feedback
all_feedbacks.append(feedback)
# Extract tracker from verdict/review steps
for step in phase.steps:
if step.verdict:
esc = _extract_escalated_issues(
if step.verdict or step.role == "review":
tracker = _extract_senior_tracker(
step_outputs.get(step.output_key, ""),
)
if esc:
escalated_issues.append(esc)
iter_result.escalated_issues = esc
logger.info(
" [%s] ESCALATE at iteration %d — stopping.",
phase.name, pi,
)
escalated = True
break
if tracker:
input_contents["previous_senior_tracker"] = tracker
if verdict is None:
logger.info(
" [%s] completed (no verdict step; single-pass phase)",
phase.name,
)
phase_converged = True
break
iterations.append(iter_result)
if verdict == "PASS":
consecutive_passes += 1
logger.info(
" [%s] PASS (%d/%d consecutive)",
phase.name, consecutive_passes, phase.consecutive_pass,
)
if consecutive_passes >= phase.consecutive_pass:
# ESCALATE check
if verdict == "ESCALATE":
final_verdict = "ESCALATE"
for step in phase.steps:
if step.verdict:
esc = _extract_escalated_issues(
step_outputs.get(step.output_key, ""),
)
if esc:
escalated_issues.append(esc)
iter_result.escalated_issues = esc
logger.info(
" [%s] Converged! %d consecutive PASSes.",
phase.name, phase.consecutive_pass,
" [%s] ESCALATE at iteration %d — stopping.",
phase.name, pi,
)
escalated = True
break
if verdict is None:
logger.info(
" [%s] completed (no verdict step; single-pass phase)",
phase.name,
)
phase_converged = True
break
else:
consecutive_passes = 0
# Auto-escalate in phased pipeline
has_aggregator = config.seniors or any(
s.prompt_template == "default:aggregate-review" for s in phase.steps
)
if (
verdict == "FAIL"
and not has_aggregator
and pi >= 2
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
):
final_verdict = "ESCALATE"
auto_msg = (
f"Auto-escalated: same issues detected across {pi} iterations "
f"in phase '{phase.name}' without resolution."
if verdict == "PASS":
consecutive_passes += 1
logger.info(
" [%s] PASS (%d/%d consecutive)",
phase.name, consecutive_passes, phase.consecutive_pass,
)
if consecutive_passes >= phase.consecutive_pass:
logger.info(
" [%s] Converged! %d consecutive PASSes.",
phase.name, phase.consecutive_pass,
)
phase_converged = True
break
else:
consecutive_passes = 0
# Auto-escalate in phased pipeline
has_aggregator = config.seniors or any(
s.prompt_template == "default:aggregate-review" for s in phase.steps
)
escalated_issues.append(auto_msg)
iter_result.escalated_issues = auto_msg
logger.info(" [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
escalated = True
if (
verdict == "FAIL"
and not has_aggregator
and pi >= 2
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
):
final_verdict = "ESCALATE"
auto_msg = (
f"Auto-escalated: same issues detected across {pi} iterations "
f"in phase '{phase.name}' without resolution."
)
escalated_issues.append(auto_msg)
iter_result.escalated_issues = auto_msg
logger.info(" [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
escalated = True
break
if dry_run:
break
if escalated:
break
if dry_run:
break
if phase_converged:
logger.info(" Phase '%s' completed: CONVERGED", phase.name)
else:
logger.info(
" Phase '%s' completed: max iterations (%d) reached",
phase.name, phase.max_iterations,
)
if escalated:
break
if phase_idx == len(config.phases) - 1:
final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"
if phase_converged:
logger.info(" Phase '%s' completed: CONVERGED", phase.name)
else:
logger.info(
" Phase '%s' completed: max iterations (%d) reached",
phase.name, phase.max_iterations,
finally:
agentic_branch: str | None = None
if worktree_path is not None and agentic_branch_name is not None:
agentic_branch = _finalize_worktree(
cwd, worktree_path, agentic_branch_name,
config.preset_name, final_verdict,
)
if phase_idx == len(config.phases) - 1:
final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"
total_duration = time.monotonic() - start_time
pipeline_result = PipelineResult(
@@ -362,6 +521,7 @@ def _run_phased_pipeline(
run_dir=run_dir,
repeated_aggregate_warnings=aggregate_warnings,
escalated_issues=escalated_issues,
agentic_branch=agentic_branch,
)
if not dry_run:
@@ -463,6 +623,8 @@ def _run_steps(
run_dir: Path,
output_iter: int,
phase_name: str | None = None,
worktree_path: Path | None = None,
runtime_env: dict[str, str] | None = None,
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
"""Execute all steps in one iteration, parallelizing where possible."""
step_outputs: dict[str, str] = {}
@@ -473,21 +635,23 @@ def _run_steps(
for batch in batches:
if len(batch) == 1:
# Single step — run directly
step = batch[0]
_execute_step(
step, config, input_contents, feedback,
iteration, max_iterations, cwd, timeout, dry_run,
step_outputs, step_results,
run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
run_dir=run_dir, output_iter=output_iter,
phase_name=phase_name, worktree_path=worktree_path,
runtime_env=runtime_env,
)
else:
# Parallel batch — run with ThreadPoolExecutor
_execute_parallel_batch(
batch, config, input_contents, feedback,
iteration, max_iterations, cwd, timeout, dry_run,
step_outputs, step_results,
run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
run_dir=run_dir, output_iter=output_iter,
phase_name=phase_name, worktree_path=worktree_path,
runtime_env=runtime_env,
)
# Extract verdict from all verdict steps (ALL must PASS; ESCALATE wins over all)
@@ -506,6 +670,25 @@ def _run_steps(
return step_outputs, step_results, verdict
def _invoke_agentic(
    agent_config: AgentConfig,
    prompt: str,
    step_name: str,
    *,
    worktree_path: Path,
    env: dict[str, str] | None = None,
    timeout: int | None = None,
    quiet: bool = False,
) -> AgentResult:
    """Delegate one agentic step to ``invoke_agent_agentic``.

    All arguments are forwarded unchanged; the step runs against the
    already-created worktree given by ``worktree_path``.

    Args:
        agent_config: Configuration of the agent to run.
        prompt: Fully rendered prompt for the step.
        step_name: Name of the pipeline step being executed.
        worktree_path: Existing worktree the agent operates in.
        env: Optional environment mapping forwarded to the agent.
        timeout: Optional timeout forwarded to the agent.
        quiet: Forwarded as-is to ``invoke_agent_agentic``.

    Returns:
        The ``AgentResult`` produced by ``invoke_agent_agentic``.
    """
    agentic_result = invoke_agent_agentic(
        agent_config,
        prompt,
        step_name,
        worktree_path=worktree_path,
        env=env,
        timeout=timeout,
        quiet=quiet,
    )
    return agentic_result
def _execute_step(
step: StepConfig,
config: PipelineConfig,
@@ -523,6 +706,8 @@ def _execute_step(
output_iter: int,
phase_name: str | None = None,
quiet: bool = False,
worktree_path: Path | None = None,
runtime_env: dict[str, str] | None = None,
) -> None:
"""Execute a single step, updating step_outputs and step_results in place."""
if not quiet:
@@ -542,6 +727,7 @@ def _execute_step(
# 4. Render prompt
prompt = render_template(template, context)
prompt = _augment_prompt_with_runtime_context(prompt, context)
# 5. Dry run: print and skip
if dry_run:
@@ -555,10 +741,21 @@ def _execute_step(
# 6. Invoke agent
agent_config = config.agents[step.agent]
try:
result = invoke_agent(
agent_config, prompt, step.name,
cwd=cwd, timeout=timeout, quiet=quiet,
)
if agent_config.agentic and worktree_path:
result = _invoke_agentic(
agent_config, prompt, step.name,
worktree_path=worktree_path,
env=runtime_env,
timeout=timeout, quiet=quiet,
)
else:
# When worktree exists, run non-agentic agents (reviewers) in
# the worktree too so they can inspect the modified files.
effective_cwd = worktree_path if worktree_path else cwd
result = invoke_agent(
agent_config, prompt, step.name,
cwd=effective_cwd, env=runtime_env, timeout=timeout, quiet=quiet,
)
except subprocess.TimeoutExpired as e:
stdout = (e.stdout or b"") if isinstance(e.stdout, bytes) else (e.stdout or "")
stderr = (e.stderr or b"") if isinstance(e.stderr, bytes) else (e.stderr or "")
@@ -625,6 +822,8 @@ def _execute_parallel_batch(
run_dir: Path,
output_iter: int,
phase_name: str | None = None,
worktree_path: Path | None = None,
runtime_env: dict[str, str] | None = None,
) -> None:
"""Execute multiple steps in parallel using threads."""
agent_names = ", ".join(s.agent for s in batch)
@@ -640,6 +839,26 @@ def _execute_parallel_batch(
)
return
# Agentic steps cannot run in parallel (they share a worktree)
agentic_in_batch = [
s for s in batch
if config.agents.get(s.agent, AgentConfig(name="", command="")).agentic
]
if len(agentic_in_batch) > 1:
logger.warning(
" [parallel] %d agentic steps cannot run concurrently — running sequentially",
len(agentic_in_batch),
)
for step in batch:
_execute_step(
step, config, input_contents, feedback,
iteration, max_iterations, cwd, timeout, dry_run,
step_outputs, step_results,
run_dir=run_dir, output_iter=output_iter,
phase_name=phase_name, worktree_path=worktree_path,
)
return
# Snapshot context before parallel execution (all steps see same state)
context_snapshot = dict(input_contents)
context_snapshot.update(step_outputs)
@@ -666,12 +885,22 @@ def _execute_parallel_batch(
if step.context_override:
context = _apply_context_override(context, step.context_override)
prompt = render_template(template, context)
prompt = _augment_prompt_with_runtime_context(prompt, context)
agent_config = config.agents[step.agent]
result = invoke_agent(
agent_config, prompt, step.name,
cwd=cwd, timeout=timeout, quiet=True,
)
if agent_config.agentic and worktree_path:
result = _invoke_agentic(
agent_config, prompt, step.name,
worktree_path=worktree_path,
env=runtime_env,
timeout=timeout, quiet=True,
)
else:
effective_cwd = worktree_path if worktree_path else cwd
result = invoke_agent(
agent_config, prompt, step.name,
cwd=effective_cwd, env=runtime_env, timeout=timeout, quiet=True,
)
return step.output_key, result.output, result
with ThreadPoolExecutor(max_workers=len(batch)) as executor:
@@ -765,6 +994,35 @@ def _build_context(
return context
def _build_runtime_inputs(
    config: PipelineConfig,
    input_contents: dict[str, str],
    cwd: Path,
) -> dict[str, str]:
    """Build the agent environment and publish prompt hints about it.

    Side effect: ``input_contents`` is mutated in place. Its
    ``"execution_policy"`` and ``"environment_context"`` entries are
    (re)written from ``config.execution``.

    Args:
        config: Pipeline configuration; only ``config.execution`` is read.
        input_contents: Template inputs, updated in place.
        cwd: Directory passed to ``build_runtime_environment`` as the
            project root.

    Returns:
        The environment mapping built by ``build_runtime_environment``.
    """
    execution = config.execution
    env, loaded_files, loaded_values = build_runtime_environment(execution, cwd)

    policy_text = build_execution_policy(execution)
    input_contents["execution_policy"] = policy_text

    environment_text = summarize_environment(
        execution, loaded_files, env, loaded_values,
    )
    input_contents["environment_context"] = environment_text

    return env
def _augment_prompt_with_runtime_context(
prompt: str,
context: dict[str, str],
) -> str:
"""Append execution/env guidance without requiring every template to include placeholders."""
extras: list[str] = []
if context.get("execution_policy"):
extras.append("## Execution Policy\n" + context["execution_policy"])
if context.get("environment_context"):
extras.append("## Environment Context\n" + context["environment_context"])
if not extras:
return prompt
return prompt.rstrip() + "\n\n" + "\n\n".join(extras) + "\n"
def _apply_context_override(
context: dict[str, str],
overrides: dict[str, str],

View File

@@ -535,6 +535,10 @@ def _append_final_verdict(
lines.append("---\n")
lines.append(f"## {_t(config, 'final_verdict_title')}: {result.final_verdict}\n")
if result.agentic_branch:
lines.append(f"**Agentic branch**: `{result.agentic_branch}`")
lines.append(f"```bash\ngit checkout {result.agentic_branch}\n```\n")
if result.final_verdict == "PASS":
lines.append(_t(config, "pass_msg"))
elif result.final_verdict == "ESCALATE":

152
cross_eval/runtime_env.py Normal file
View File

@@ -0,0 +1,152 @@
"""Helpers for building agent runtime environments from .env files."""
from __future__ import annotations
import os
from pathlib import Path
from cross_eval.models import ExecutionConfig
# Name prefixes that mark an environment variable as DB/service related.
# summarize_environment() lists a variable's name in the prompt summary when
# the name starts with one of these prefixes.
_SUMMARY_PREFIXES = (
"CLICKHOUSE",
"CH_",
"DB_",
"DATABASE",
"PG",
"POSTGRES",
"MYSQL",
"REDIS",
"AWS",
"S3",
)
def _strip_quotes(value: str) -> str:
if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"'}:
unwrapped = value[1:-1]
if value[0] == '"':
return bytes(unwrapped, "utf-8").decode("unicode_escape")
return unwrapped
return value
def parse_dotenv(path: Path) -> dict[str, str]:
    """Parse a simple dotenv file into key/value pairs.

    Blank lines, lines starting with ``#``, lines without ``=``, and lines
    with an empty key are skipped. A leading ``export `` is dropped, keys and
    values are whitespace-trimmed, and values are unquoted with
    ``_strip_quotes``. When a key occurs more than once, the last occurrence
    wins.

    Args:
        path: Location of the dotenv file.

    Returns:
        Mapping of variable names to their parsed values.
    """
    values: dict[str, str] = {}
    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading byte
    # order mark, which would otherwise become part of the first key.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("export "):
            line = line[len("export "):].strip()
        if "=" not in line:
            continue
        # Split on the first "=" only, so values may themselves contain "=".
        key, value = line.split("=", 1)
        key = key.strip()
        if not key:
            continue
        values[key] = _strip_quotes(value.strip())
    return values
def resolve_env_files(execution: ExecutionConfig, project_root: Path) -> list[Path]:
    """Return the configured env files that exist, resolved and deduplicated.

    Entries of ``execution.env_files`` are used as-is when absolute and joined
    to ``project_root`` otherwise. Entries of ``execution.auto_env_files`` are
    always joined to ``project_root``. Explicit files come first, and the
    first occurrence of a resolved path is the one kept.

    Args:
        execution: Execution settings providing ``env_files`` and
            ``auto_env_files``.
        project_root: Base directory for relative entries.

    Returns:
        Resolved paths of the regular files that exist, in configuration
        order, without duplicates.
    """
    explicit = [
        Path(entry) if Path(entry).is_absolute() else project_root / Path(entry)
        for entry in execution.env_files
    ]
    automatic = [project_root / entry for entry in execution.auto_env_files]

    unique_files: list[Path] = []
    visited: set[Path] = set()
    for candidate in [*explicit, *automatic]:
        try:
            real_path = candidate.resolve()
        except OSError:
            # Fall back to the unresolved path when resolution fails.
            real_path = candidate
        if real_path in visited:
            continue
        if not (real_path.exists() and real_path.is_file()):
            continue
        visited.add(real_path)
        unique_files.append(real_path)
    return unique_files
def build_runtime_environment(
    execution: ExecutionConfig,
    project_root: Path,
) -> tuple[dict[str, str], list[Path], dict[str, str]]:
    """Assemble the subprocess environment from the process env and env files.

    Args:
        execution: Execution settings. ``inherit_env`` decides whether the
            current process environment is used as the starting point.
        project_root: Base directory used to locate the env files.

    Returns:
        A tuple ``(env, loaded_files, loaded_values)``: the final environment
        mapping, the env files that were read (in load order), and only the
        values that came from those files. Files loaded later override
        earlier ones.
    """
    if execution.inherit_env:
        env = os.environ.copy()
    else:
        env = {}

    loaded_files = resolve_env_files(execution, project_root)
    loaded_values: dict[str, str] = {}
    for env_file in loaded_files:
        parsed = parse_dotenv(env_file)
        loaded_values.update(parsed)
        env.update(parsed)

    return env, loaded_files, loaded_values
def summarize_environment(
    execution: ExecutionConfig,
    loaded_files: list[Path],
    env: dict[str, str],
    loaded_values: dict[str, str],
) -> str:
    """Describe the agent environment for prompts using names only.

    The summary mentions which env files were loaded, any user-hinted
    execution targets, and (when ``execution.expose_env_names`` is set) the
    names of DB/service-looking variables. Variable values are never included.

    Args:
        execution: Execution settings (``auto_context_targets`` and
            ``expose_env_names`` are read).
        loaded_files: Env files that were loaded.
        env: The full environment mapping.
        loaded_values: Values that came from the env files.

    Returns:
        The summary lines joined with newlines.
    """
    summary: list[str] = []

    # Which env files were loaded.
    if loaded_files:
        file_list = ", ".join(str(env_file) for env_file in loaded_files)
        summary.append(f"Loaded env files into the agent process: {file_list}")
    else:
        summary.append("No .env file was auto-loaded into the agent process.")

    # Targets the user hinted at.
    if execution.auto_context_targets:
        summary.append(
            "Execution targets hinted by the user: "
            + ", ".join(execution.auto_context_targets)
        )

    # Names (never values) of variables that look DB/service related.
    if not execution.expose_env_names:
        summary.append("Environment variable values are loaded but names are hidden from the prompt.")
    else:
        known_names = set(loaded_values).union(env)
        visible_names = sorted(
            name
            for name in known_names
            if name.startswith(_SUMMARY_PREFIXES)
            or any(marker in name for marker in ("CLICKHOUSE", "DATABASE", "DB_"))
        )
        if visible_names:
            summary.append("Relevant env var names available to commands: " + ", ".join(visible_names))
        else:
            summary.append("No DB/service env var names matched the default summary filters.")

    # ClickHouse note: emitted when such variables exist or were asked for.
    clickhouse_requested = "clickhouse" in [
        target.lower() for target in execution.auto_context_targets
    ]
    clickhouse_names = [
        name for name in env if "CLICKHOUSE" in name or name.startswith("CH_")
    ]
    if clickhouse_names:
        summary.append("ClickHouse-related environment variables are available to the agent.")
    elif clickhouse_requested:
        summary.append("No ClickHouse-specific env vars were detected in the loaded environment.")

    return "\n".join(summary)
def build_execution_policy(execution: ExecutionConfig) -> str:
    """Render the execution-latitude notice that is shown to agents.

    Args:
        execution: Execution settings; ``mode`` and ``command_policy`` are
            read.

    Returns:
        Five newline-joined lines: the mode, the command policy, two fixed
        statements about the agent's freedom to choose commands, and a closing
        hint that depends on whether ``command_policy`` equals ``"broad"``.
    """
    header = (
        f"Execution mode: {execution.mode}",
        f"Command policy: {execution.command_policy}",
        "The agent may choose shell, Python, git, docker, test, and database commands on its own when needed.",
        "The user does not need to pre-specify exact commands.",
    )
    closing_hint = (
        "Prefer direct validation by running the minimum set of commands needed to prove a fix."
        if execution.command_policy == "broad"
        else "Keep command usage minimal and focused on validation."
    )
    return "\n".join((*header, closing_hint))

135
cross_eval/worktree.py Normal file
View File

@@ -0,0 +1,135 @@
"""Git worktree lifecycle management for agentic mode."""
from __future__ import annotations
import logging
import shutil
import subprocess
from datetime import datetime
from pathlib import Path
logger = logging.getLogger(__name__)
class WorktreeError(RuntimeError):
    """Raised when a git branch or worktree operation fails."""
def make_branch_name(preset_name: str) -> str:
    """Build a timestamped branch name for an agentic run.

    Args:
        preset_name: Preset name embedded in the branch name.

    Returns:
        ``cross-eval/<preset_name>_<YYYYmmdd_HHMMSS>`` using the current
        local time.
    """
    stamp = format(datetime.now(), "%Y%m%d_%H%M%S")
    return "cross-eval/{}_{}".format(preset_name, stamp)
def create_worktree(base_cwd: Path, work_dir: Path, branch_name: str) -> Path:
    """Create a new branch at HEAD and check it out in a fresh git worktree.

    Any existing directory at ``work_dir`` is deleted first. The branch is
    created in the repository at ``base_cwd``, so it outlives the worktree.
    If adding the worktree fails, the new branch is deleted again.

    Args:
        base_cwd: Directory of the repository the git commands run in.
        work_dir: Where the worktree should be created.
        branch_name: Name of the branch to create.

    Returns:
        The resolved worktree path.

    Raises:
        WorktreeError: If the branch or the worktree cannot be created.
    """
    target = work_dir.resolve()
    if target.exists():
        shutil.rmtree(target)

    checked_git = {
        "cwd": base_cwd,
        "capture_output": True,
        "text": True,
        "check": True,
    }

    # Step 1: create the branch at HEAD.
    try:
        subprocess.run(["git", "branch", branch_name, "HEAD"], **checked_git)
    except subprocess.CalledProcessError as exc:
        raise WorktreeError(
            f"Failed to create branch '{branch_name}': {exc.stderr.strip()}"
        ) from exc

    # Step 2: add a worktree checked out to that branch.
    try:
        subprocess.run(
            ["git", "worktree", "add", str(target), branch_name],
            **checked_git,
        )
    except subprocess.CalledProcessError as exc:
        # Best-effort rollback of the branch created in step 1.
        subprocess.run(
            ["git", "branch", "-D", branch_name],
            cwd=base_cwd,
            capture_output=True,
        )
        raise WorktreeError(
            f"Failed to create worktree at {target}: {exc.stderr.strip()}"
        ) from exc

    logger.debug("Created worktree on branch '%s': %s", branch_name, target)
    return target
def capture_diff(worktree_path: Path) -> str:
    """Stage everything in the worktree and return the staged diff vs HEAD.

    Side effect: runs ``git add -A``, so all changes (including new untracked
    files) end up staged in the worktree's index.

    Args:
        worktree_path: Directory of the worktree.

    Returns:
        Output of ``git diff --cached HEAD`` with surrounding whitespace
        stripped.

    Raises:
        subprocess.CalledProcessError: If ``git add -A`` fails.
    """
    stage_cmd = ["git", "add", "-A"]
    subprocess.run(stage_cmd, cwd=worktree_path, capture_output=True, check=True)

    diff_cmd = ["git", "diff", "--cached", "HEAD"]
    diff_proc = subprocess.run(
        diff_cmd,
        cwd=worktree_path,
        capture_output=True,
        text=True,
    )
    diff_text = diff_proc.stdout
    return diff_text.strip()
def commit_worktree(worktree_path: Path, message: str) -> bool:
    """Stage and commit all changes in the worktree.

    Args:
        worktree_path: Directory of the worktree to commit in.
        message: Commit message.

    Returns:
        True if a commit was made, False otherwise (nothing to commit, or
        the commit failed; failures are logged as warnings).

    Raises:
        subprocess.CalledProcessError: If ``git add -A`` fails.
    """
    subprocess.run(
        ["git", "add", "-A"],
        cwd=worktree_path,
        capture_output=True,
        check=True,
    )
    result = subprocess.run(
        ["git", "commit", "-m", message],
        cwd=worktree_path,
        capture_output=True,
        text=True,
    )
    if result.returncode == 0:
        return True
    # Exit status 1 is the expected "nothing to commit" outcome. Any other
    # non-zero status is a real failure (e.g. git could not determine an
    # author identity); log it so it is not mistaken for "no changes".
    if result.returncode != 1:
        logger.warning(
            "git commit failed in %s (exit %d): %s",
            worktree_path, result.returncode, result.stderr.strip(),
        )
    return False
def remove_worktree(base_cwd: Path, work_dir: Path) -> None:
    """Remove a git worktree; its branch stays in the original repository.

    ``git worktree remove --force`` is tried first. If that fails, the
    directory is deleted manually (ignoring errors) and
    ``git worktree prune`` is run.

    Args:
        base_cwd: Directory of the repository the git commands run in.
        work_dir: Worktree directory to remove.
    """
    target = work_dir.resolve()
    remove_cmd = ["git", "worktree", "remove", "--force", str(target)]
    try:
        subprocess.run(
            remove_cmd,
            cwd=base_cwd,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError:
        # Fallback: delete the directory ourselves, then prune worktrees.
        if target.exists():
            shutil.rmtree(target, ignore_errors=True)
        subprocess.run(
            ["git", "worktree", "prune"],
            cwd=base_cwd,
            capture_output=True,
        )
    logger.debug("Removed worktree: %s (branch preserved)", target)