release: cut 0.2.0 baseline
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
Metadata-Version: 2.4
|
Metadata-Version: 2.4
|
||||||
Name: cross-eval
|
Name: cross-eval
|
||||||
Version: 0.1.0
|
Version: 0.2.0
|
||||||
Summary: AI agent cross-evaluation CLI tool
|
Summary: AI agent cross-evaluation CLI tool
|
||||||
Requires-Python: >=3.9
|
Requires-Python: >=3.9
|
||||||
Requires-Dist: pyyaml>=6.0
|
Requires-Dist: pyyaml>=6.0
|
||||||
|
|||||||
@@ -10,12 +10,15 @@ cross_eval/models.py
|
|||||||
cross_eval/pipeline.py
|
cross_eval/pipeline.py
|
||||||
cross_eval/prompts.py
|
cross_eval/prompts.py
|
||||||
cross_eval/report.py
|
cross_eval/report.py
|
||||||
|
cross_eval/runtime_env.py
|
||||||
|
cross_eval/worktree.py
|
||||||
cross_eval.egg-info/PKG-INFO
|
cross_eval.egg-info/PKG-INFO
|
||||||
cross_eval.egg-info/SOURCES.txt
|
cross_eval.egg-info/SOURCES.txt
|
||||||
cross_eval.egg-info/dependency_links.txt
|
cross_eval.egg-info/dependency_links.txt
|
||||||
cross_eval.egg-info/entry_points.txt
|
cross_eval.egg-info/entry_points.txt
|
||||||
cross_eval.egg-info/requires.txt
|
cross_eval.egg-info/requires.txt
|
||||||
cross_eval.egg-info/top_level.txt
|
cross_eval.egg-info/top_level.txt
|
||||||
|
tests/test_agentic.py
|
||||||
tests/test_config.py
|
tests/test_config.py
|
||||||
tests/test_onboarding.py
|
tests/test_onboarding.py
|
||||||
tests/test_pipeline_integration.py
|
tests/test_pipeline_integration.py
|
||||||
@@ -1 +1 @@
|
|||||||
__version__ = "0.1.0"
|
__version__ = "0.2.0"
|
||||||
|
|||||||
@@ -3,8 +3,10 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import tempfile
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -142,11 +144,17 @@ class _Spinner:
|
|||||||
sys.stderr.flush()
|
sys.stderr.flush()
|
||||||
|
|
||||||
|
|
||||||
|
def _is_print_mode(args: list[str]) -> bool:
|
||||||
|
"""Check if the agent args include -p / --print flag."""
|
||||||
|
return "-p" in args or "--print" in args
|
||||||
|
|
||||||
|
|
||||||
def invoke_agent(
|
def invoke_agent(
|
||||||
agent: AgentConfig,
|
agent: AgentConfig,
|
||||||
prompt: str,
|
prompt: str,
|
||||||
step_name: str,
|
step_name: str,
|
||||||
cwd: Optional[Path] = None,
|
cwd: Optional[Path] = None,
|
||||||
|
env: Optional[dict[str, str]] = None,
|
||||||
timeout: int | None = None,
|
timeout: int | None = None,
|
||||||
quiet: bool = False,
|
quiet: bool = False,
|
||||||
) -> AgentResult:
|
) -> AgentResult:
|
||||||
@@ -155,30 +163,67 @@ def invoke_agent(
|
|||||||
Args:
|
Args:
|
||||||
quiet: If True, suppress spinner (for parallel execution).
|
quiet: If True, suppress spinner (for parallel execution).
|
||||||
"""
|
"""
|
||||||
|
is_claude = "claude" in agent.command
|
||||||
|
is_interactive = is_claude and not _is_print_mode(agent.args)
|
||||||
|
|
||||||
cmd = [agent.command]
|
cmd = [agent.command]
|
||||||
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
|
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
|
||||||
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
|
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
|
||||||
cmd.extend(agent.args)
|
cmd.extend(agent.args)
|
||||||
|
|
||||||
# Build the full prompt (system prompt + user prompt)
|
# --- Temp files for interactive (non -p) claude ---
|
||||||
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
|
task_file: Optional[Path] = None
|
||||||
# claude: --system-prompt flag supported natively
|
output_file: Optional[Path] = None
|
||||||
cmd.extend(["--system-prompt", agent.system_prompt])
|
|
||||||
input_data = prompt
|
if is_interactive:
|
||||||
elif agent.system_prompt:
|
# Write prompt + output instruction to temp task file
|
||||||
# codex, others: no --system-prompt flag, prepend to prompt
|
task_fd, task_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_task_")
|
||||||
input_data = (
|
task_file = Path(task_path)
|
||||||
f"<system>\n{agent.system_prompt}\n</system>\n\n"
|
os.close(task_fd)
|
||||||
f"{prompt}"
|
|
||||||
|
out_fd, out_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_out_")
|
||||||
|
output_file = Path(out_path)
|
||||||
|
os.close(out_fd)
|
||||||
|
# Clear the output file so we can detect if agent wrote to it
|
||||||
|
output_file.write_text("", encoding="utf-8")
|
||||||
|
|
||||||
|
wrapped_prompt = (
|
||||||
|
f"{prompt}\n\n"
|
||||||
|
f"---\n"
|
||||||
|
f"IMPORTANT: Write your COMPLETE response to this file: {output_file}\n"
|
||||||
|
f"Do NOT modify any other files in the project."
|
||||||
)
|
)
|
||||||
|
task_file.write_text(wrapped_prompt, encoding="utf-8")
|
||||||
|
|
||||||
|
# System prompt via flag
|
||||||
|
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
|
||||||
|
cmd.extend(["--system-prompt", agent.system_prompt])
|
||||||
|
|
||||||
|
# Positional arg: point claude to the task file
|
||||||
|
cmd.append(
|
||||||
|
f"Read the task file at {task_file} and follow all instructions in it. "
|
||||||
|
f"Write your complete output to {output_file}."
|
||||||
|
)
|
||||||
|
input_data: str | None = None
|
||||||
else:
|
else:
|
||||||
input_data = prompt
|
# Print mode (-p) or non-claude: deliver prompt via stdin
|
||||||
|
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
|
||||||
|
cmd.extend(["--system-prompt", agent.system_prompt])
|
||||||
|
input_data = prompt
|
||||||
|
elif agent.system_prompt:
|
||||||
|
input_data = (
|
||||||
|
f"<system>\n{agent.system_prompt}\n</system>\n\n"
|
||||||
|
f"{prompt}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
input_data = prompt
|
||||||
|
|
||||||
logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")
|
logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")
|
||||||
|
|
||||||
spinner: Optional[_Spinner] = None
|
spinner: Optional[_Spinner] = None
|
||||||
if not quiet:
|
if not quiet:
|
||||||
logger.info(" cmd: %s", " ".join(cmd[:6]))
|
mode_label = "interactive" if is_interactive else ""
|
||||||
|
logger.info(" cmd: %s %s", " ".join(cmd[:6]), f"({mode_label})" if mode_label else "")
|
||||||
spinner = _Spinner(f"[{step_name}] {agent.name} running...")
|
spinner = _Spinner(f"[{step_name}] {agent.name} running...")
|
||||||
spinner.start()
|
spinner.start()
|
||||||
|
|
||||||
@@ -191,6 +236,7 @@ def invoke_agent(
|
|||||||
text=True,
|
text=True,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
cwd=cwd,
|
cwd=cwd,
|
||||||
|
env=env,
|
||||||
)
|
)
|
||||||
duration = time.monotonic() - start
|
duration = time.monotonic() - start
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
@@ -201,10 +247,154 @@ def invoke_agent(
|
|||||||
if spinner:
|
if spinner:
|
||||||
spinner.stop(f"[{step_name}] ERROR")
|
spinner.stop(f"[{step_name}] ERROR")
|
||||||
raise
|
raise
|
||||||
|
finally:
|
||||||
|
if task_file:
|
||||||
|
task_file.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
if spinner:
|
||||||
|
spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
|
||||||
|
if output_file:
|
||||||
|
output_file.unlink(missing_ok=True)
|
||||||
|
err_detail = result.stderr.strip() or result.stdout.strip()
|
||||||
|
if err_detail and len(err_detail) > 500:
|
||||||
|
err_detail = err_detail[:500] + "..."
|
||||||
|
cmd_preview = " ".join(cmd[:6])
|
||||||
|
failure_type, suggested_action = _classify_agent_failure(err_detail or "")
|
||||||
|
raise AgentInvocationError(
|
||||||
|
agent_name=agent.name,
|
||||||
|
step_name=step_name,
|
||||||
|
cmd_preview=cmd_preview,
|
||||||
|
raw_error=err_detail or "(no output)",
|
||||||
|
failure_type=failure_type,
|
||||||
|
suggested_action=suggested_action,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Capture output ---
|
||||||
|
if output_file:
|
||||||
|
output = output_file.read_text(encoding="utf-8").strip()
|
||||||
|
output_file.unlink(missing_ok=True)
|
||||||
|
if not output:
|
||||||
|
# Fallback to stdout if agent didn't write to the file
|
||||||
|
output = result.stdout.strip()
|
||||||
|
else:
|
||||||
|
output = result.stdout.strip()
|
||||||
|
|
||||||
output = result.stdout.strip()
|
|
||||||
chars = len(output)
|
chars = len(output)
|
||||||
|
|
||||||
|
if spinner:
|
||||||
|
spinner.stop(f"[{step_name}] done — {chars} chars")
|
||||||
|
|
||||||
|
if not output:
|
||||||
|
stderr_info = result.stderr.strip()
|
||||||
|
if stderr_info:
|
||||||
|
logger.warning(
|
||||||
|
"Agent '%s' produced empty output at step '%s'. stderr: %s",
|
||||||
|
agent.name, step_name, stderr_info[:500],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
"Agent '%s' produced empty output at step '%s' (no stderr either)",
|
||||||
|
agent.name, step_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
return AgentResult(
|
||||||
|
output=output,
|
||||||
|
exit_code=result.returncode,
|
||||||
|
agent_name=agent.name,
|
||||||
|
step_name=step_name,
|
||||||
|
duration_seconds=round(duration, 1),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def invoke_agent_agentic(
|
||||||
|
agent: AgentConfig,
|
||||||
|
prompt: str,
|
||||||
|
step_name: str,
|
||||||
|
worktree_path: Path,
|
||||||
|
env: Optional[dict[str, str]] = None,
|
||||||
|
timeout: int | None = None,
|
||||||
|
quiet: bool = False,
|
||||||
|
) -> AgentResult:
|
||||||
|
"""Invoke an agent in agentic mode (no -p, runs in worktree, captures git diff).
|
||||||
|
|
||||||
|
The agent runs without print mode so it can modify files directly.
|
||||||
|
After the agent exits, git diff (since last commit) is captured as the output.
|
||||||
|
"""
|
||||||
|
from cross_eval.worktree import capture_diff
|
||||||
|
|
||||||
|
# Write prompt to a temp file (outside worktree, won't appear in diffs)
|
||||||
|
import tempfile
|
||||||
|
task_fd, task_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_task_")
|
||||||
|
task_file = Path(task_path)
|
||||||
|
task_file.write_text(prompt, encoding="utf-8")
|
||||||
|
os.close(task_fd)
|
||||||
|
|
||||||
|
cmd = [agent.command]
|
||||||
|
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
|
||||||
|
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
|
||||||
|
|
||||||
|
# Strip stdin sentinel ("-") from args for agentic mode
|
||||||
|
args = [a for a in agent.args if a != "-"]
|
||||||
|
cmd.extend(args)
|
||||||
|
|
||||||
|
# System prompt via flag if supported
|
||||||
|
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
|
||||||
|
cmd.extend(["--system-prompt", agent.system_prompt])
|
||||||
|
|
||||||
|
# Deliver the prompt differently per agent type
|
||||||
|
is_codex = "codex" in agent.command
|
||||||
|
input_data: str | None = None
|
||||||
|
if is_codex:
|
||||||
|
# codex: stdin mode
|
||||||
|
cmd.append("-")
|
||||||
|
if agent.system_prompt and not _supports_system_prompt_flag(agent.command):
|
||||||
|
input_data = f"<system>\n{agent.system_prompt}\n</system>\n\n{prompt}"
|
||||||
|
else:
|
||||||
|
input_data = prompt
|
||||||
|
else:
|
||||||
|
# claude: use positional arg with a pointer to the task file
|
||||||
|
# (avoids OS arg length limits for large prompts)
|
||||||
|
cmd.append(
|
||||||
|
f"Read the task file at {task_file} and execute all instructions in it. "
|
||||||
|
f"Work in the current directory."
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
"Invoking agent '%s' (agentic) in worktree: %s",
|
||||||
|
agent.name, worktree_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
spinner: Optional[_Spinner] = None
|
||||||
|
if not quiet:
|
||||||
|
logger.info(" cmd: %s (agentic)", " ".join(cmd[:6]))
|
||||||
|
spinner = _Spinner(f"[{step_name}] {agent.name} (agentic) running...")
|
||||||
|
spinner.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
start = time.monotonic()
|
||||||
|
result = subprocess.run(
|
||||||
|
cmd,
|
||||||
|
input=input_data,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=timeout,
|
||||||
|
cwd=worktree_path,
|
||||||
|
env=env,
|
||||||
|
)
|
||||||
|
duration = time.monotonic() - start
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
if spinner:
|
||||||
|
spinner.stop(f"[{step_name}] TIMEOUT after {timeout}s")
|
||||||
|
raise
|
||||||
|
except Exception:
|
||||||
|
if spinner:
|
||||||
|
spinner.stop(f"[{step_name}] ERROR")
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
# Clean up temp task file (it's in /tmp, not in worktree)
|
||||||
|
task_file.unlink(missing_ok=True)
|
||||||
|
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
if spinner:
|
if spinner:
|
||||||
spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
|
spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
|
||||||
@@ -222,17 +412,22 @@ def invoke_agent(
|
|||||||
suggested_action=suggested_action,
|
suggested_action=suggested_action,
|
||||||
)
|
)
|
||||||
|
|
||||||
if spinner:
|
# Capture git diff as the output (changes since last commit on the branch)
|
||||||
spinner.stop(f"[{step_name}] done — {chars} chars")
|
diff_output = capture_diff(worktree_path)
|
||||||
|
|
||||||
if not output:
|
if not diff_output:
|
||||||
|
diff_output = "(no changes)"
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Agent '%s' produced empty output at step '%s'",
|
"Agent '%s' made no file changes at step '%s'",
|
||||||
agent.name, step_name,
|
agent.name, step_name,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
chars = len(diff_output)
|
||||||
|
if spinner:
|
||||||
|
spinner.stop(f"[{step_name}] done — {chars} chars (agentic)")
|
||||||
|
|
||||||
return AgentResult(
|
return AgentResult(
|
||||||
output=output,
|
output=diff_output,
|
||||||
exit_code=result.returncode,
|
exit_code=result.returncode,
|
||||||
agent_name=agent.name,
|
agent_name=agent.name,
|
||||||
step_name=step_name,
|
step_name=step_name,
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ max_iterations: 3
|
|||||||
language: {language}
|
language: {language}
|
||||||
|
|
||||||
# 결과 저장 경로
|
# 결과 저장 경로
|
||||||
output_dir: output
|
output_dir: .cross-eval/output
|
||||||
|
|
||||||
# ─── 커스텀 에이전트 (선택) ────────────────────────────────────
|
# ─── 커스텀 에이전트 (선택) ────────────────────────────────────
|
||||||
# 기본 제공 에이전트를 덮어쓰거나 새 에이전트를 정의할 수 있습니다.
|
# 기본 제공 에이전트를 덮어쓰거나 새 에이전트를 정의할 수 있습니다.
|
||||||
@@ -372,6 +372,14 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
"--input", action="append", dest="inputs", metavar="KEY=PATH",
|
"--input", action="append", dest="inputs", metavar="KEY=PATH",
|
||||||
help="추가 입력 파일 (예: --input spec=./api-spec.md)",
|
help="추가 입력 파일 (예: --input spec=./api-spec.md)",
|
||||||
)
|
)
|
||||||
|
input_group.add_argument(
|
||||||
|
"--env-file", action="append", dest="env_files", type=Path, default=None,
|
||||||
|
help="에이전트 subprocess에 주입할 추가 .env 파일 (여러 개 가능)",
|
||||||
|
)
|
||||||
|
input_group.add_argument(
|
||||||
|
"--target", action="append", dest="execution_targets", default=None,
|
||||||
|
help="에이전트에게 강조할 실행 대상 힌트 (예: clickhouse, postgres)",
|
||||||
|
)
|
||||||
|
|
||||||
# -- 에이전트 설정 --
|
# -- 에이전트 설정 --
|
||||||
agent_group = run_parser.add_argument_group(
|
agent_group = run_parser.add_argument_group(
|
||||||
@@ -410,6 +418,10 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
choices=REASONING_EFFORT_CHOICES + ("extra-high", "extra_high", "x-high"),
|
choices=REASONING_EFFORT_CHOICES + ("extra-high", "extra_high", "x-high"),
|
||||||
help="Senior용 reasoning effort",
|
help="Senior용 reasoning effort",
|
||||||
)
|
)
|
||||||
|
agent_group.add_argument(
|
||||||
|
"--agentic", action="store_true", default=False,
|
||||||
|
help="Coder를 agentic 모드로 실행 (worktree에서 파일 직접 수정, git diff로 결과 캡처)",
|
||||||
|
)
|
||||||
agent_group.add_argument(
|
agent_group.add_argument(
|
||||||
"--model", default=None, metavar="MODEL",
|
"--model", default=None, metavar="MODEL",
|
||||||
help="모든 에이전트의 모델을 한번에 변경 (예: sonnet, opus)",
|
help="모든 에이전트의 모델을 한번에 변경 (예: sonnet, opus)",
|
||||||
@@ -761,7 +773,7 @@ def _generate_guided_config(
|
|||||||
"",
|
"",
|
||||||
f"max_iterations: {settings['max_iter']}",
|
f"max_iterations: {settings['max_iter']}",
|
||||||
f"language: {lang}",
|
f"language: {lang}",
|
||||||
"output_dir: output",
|
"output_dir: .cross-eval/output",
|
||||||
"",
|
"",
|
||||||
])
|
])
|
||||||
|
|
||||||
@@ -799,20 +811,19 @@ def _apply_model_override(config, agent_name: str, model: str) -> None:
|
|||||||
|
|
||||||
def _apply_phased_iteration_override(config, max_iter: int | None) -> None:
|
def _apply_phased_iteration_override(config, max_iter: int | None) -> None:
|
||||||
"""Apply CLI max-iter to converging phases while preserving setup phases."""
|
"""Apply CLI max-iter to converging phases while preserving setup phases."""
|
||||||
if max_iter is None:
|
from cross_eval.config import sync_phased_iterations
|
||||||
return
|
|
||||||
|
|
||||||
for phase in config.phases:
|
sync_phased_iterations(config, max_iter)
|
||||||
if any(step.verdict for step in phase.steps):
|
|
||||||
phase.max_iterations = max_iter
|
|
||||||
|
|
||||||
|
|
||||||
def cmd_run(args: argparse.Namespace) -> int:
|
def cmd_run(args: argparse.Namespace) -> int:
|
||||||
"""Load config, validate, and execute the pipeline."""
|
"""Load config, validate, and execute the pipeline."""
|
||||||
from cross_eval.config import (
|
from cross_eval.config import (
|
||||||
|
ensure_fix_preset_agentic,
|
||||||
apply_input_overrides,
|
apply_input_overrides,
|
||||||
default_config,
|
default_config,
|
||||||
load_config,
|
load_config,
|
||||||
|
sync_phased_iterations,
|
||||||
validate_config,
|
validate_config,
|
||||||
)
|
)
|
||||||
from cross_eval.prompts import PIPELINE_PRESETS
|
from cross_eval.prompts import PIPELINE_PRESETS
|
||||||
@@ -917,6 +928,10 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
if preset in {"plan-review", "review-only"} and args.max_iter is None and args.min_iter is None:
|
if preset in {"plan-review", "review-only"} and args.max_iter is None and args.min_iter is None:
|
||||||
config.max_iterations = 1
|
config.max_iterations = 1
|
||||||
|
|
||||||
|
sync_phased_iterations(config)
|
||||||
|
if args.max_iter is not None:
|
||||||
|
sync_phased_iterations(config, args.max_iter)
|
||||||
|
|
||||||
apply_reasoning_effort_settings(
|
apply_reasoning_effort_settings(
|
||||||
config,
|
config,
|
||||||
reasoning_effort=args.reasoning_effort,
|
reasoning_effort=args.reasoning_effort,
|
||||||
@@ -925,6 +940,15 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
senior_effort=args.senior_effort,
|
senior_effort=args.senior_effort,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# --agentic: convert coder agents to agentic mode
|
||||||
|
if args.agentic:
|
||||||
|
from cross_eval.config import _make_agentic
|
||||||
|
for coder_name in config.coders:
|
||||||
|
if coder_name in config.agents:
|
||||||
|
_make_agentic(config.agents[coder_name])
|
||||||
|
|
||||||
|
ensure_fix_preset_agentic(config)
|
||||||
|
|
||||||
# --model: apply to ALL agents
|
# --model: apply to ALL agents
|
||||||
if args.model is not None:
|
if args.model is not None:
|
||||||
for agent_name in config.agents:
|
for agent_name in config.agents:
|
||||||
@@ -958,6 +982,17 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
return 1
|
return 1
|
||||||
config.inputs["docs"] = docs_content
|
config.inputs["docs"] = docs_content
|
||||||
|
|
||||||
|
if args.env_files:
|
||||||
|
for env_file in args.env_files:
|
||||||
|
resolved = env_file.resolve()
|
||||||
|
if not resolved.exists():
|
||||||
|
print(f"Env file not found: {resolved}", file=sys.stderr)
|
||||||
|
return 1
|
||||||
|
config.execution.env_files.append(str(resolved))
|
||||||
|
|
||||||
|
if args.execution_targets:
|
||||||
|
config.execution.auto_context_targets = list(args.execution_targets)
|
||||||
|
|
||||||
if args.inputs:
|
if args.inputs:
|
||||||
overrides = {}
|
overrides = {}
|
||||||
for item in args.inputs:
|
for item in args.inputs:
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
"""Configuration loading, validation, and preset resolution."""
|
"""Configuration loading, validation, and preset resolution."""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import copy
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -8,7 +9,13 @@ from typing import Any
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from cross_eval.models import AgentConfig, PhaseConfig, PipelineConfig, StepConfig
|
from cross_eval.models import (
|
||||||
|
AgentConfig,
|
||||||
|
ExecutionConfig,
|
||||||
|
PhaseConfig,
|
||||||
|
PipelineConfig,
|
||||||
|
StepConfig,
|
||||||
|
)
|
||||||
from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS
|
from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -24,6 +31,7 @@ DEFAULT_ROLE_REASONING_EFFORTS = {
|
|||||||
"reviewer": "medium",
|
"reviewer": "medium",
|
||||||
"senior": "high",
|
"senior": "high",
|
||||||
}
|
}
|
||||||
|
FIX_STYLE_PRESETS = {"review-fix", "coding-review-fix"}
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -54,7 +62,12 @@ _CLAUDE_CODER_ARGS = list(_CLAUDE_BASE_ARGS) + [
|
|||||||
"bypassPermissions",
|
"bypassPermissions",
|
||||||
]
|
]
|
||||||
|
|
||||||
_CLAUDE_REVIEW_ARGS = list(_CLAUDE_BASE_ARGS) + [
|
_CLAUDE_REVIEW_ARGS = [
|
||||||
|
"--setting-sources",
|
||||||
|
"user",
|
||||||
|
"--disable-slash-commands",
|
||||||
|
"--model",
|
||||||
|
"opus",
|
||||||
"--permission-mode",
|
"--permission-mode",
|
||||||
"plan",
|
"plan",
|
||||||
]
|
]
|
||||||
@@ -64,29 +77,37 @@ _CODER_SYSTEM_PROMPT = (
|
|||||||
"Rules:\n"
|
"Rules:\n"
|
||||||
"1. FIRST explore the project directory to understand the existing codebase, "
|
"1. FIRST explore the project directory to understand the existing codebase, "
|
||||||
"patterns, and conventions before writing any code.\n"
|
"patterns, and conventions before writing any code.\n"
|
||||||
"2. Implement ONLY what the plan specifies. Do NOT add extra features, "
|
"2. You may decide which shell, Python, git, docker, test, and database commands "
|
||||||
|
"to run. The user does not need to pre-specify exact commands.\n"
|
||||||
|
"3. Environment variables from configured .env files may already be loaded into "
|
||||||
|
"your process; use them when validating services such as ClickHouse.\n"
|
||||||
|
"4. Implement ONLY what the plan specifies. Do NOT add extra features, "
|
||||||
"unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
|
"unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
|
||||||
"3. Follow the project's existing coding style, naming conventions, and directory structure.\n"
|
"5. Follow the project's existing coding style, naming conventions, and directory structure.\n"
|
||||||
"4. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
|
"6. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
|
||||||
"Do NOT refactor unrelated code.\n"
|
"Do NOT refactor unrelated code.\n"
|
||||||
"5. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
|
"7. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
|
||||||
"6. When in doubt about scope, do LESS, not more."
|
"8. When in doubt about scope, do LESS, not more."
|
||||||
)
|
)
|
||||||
|
|
||||||
_REVIEWER_SYSTEM_PROMPT = (
|
_REVIEWER_SYSTEM_PROMPT = (
|
||||||
"You are a code reviewer. You MUST NOT create, modify, or delete any files.\n"
|
"You are a code reviewer. You MUST NOT create, modify, or delete any files.\n"
|
||||||
"Rules:\n"
|
"Rules:\n"
|
||||||
"1. Explore the project directory to understand the full codebase context.\n"
|
"1. Explore the project directory to understand the full codebase context.\n"
|
||||||
"2. Compare the implementation against the plan and checklist ONLY.\n"
|
"2. You may decide which shell, Python, test, git, docker, and database read commands "
|
||||||
"3. Classify every issue with BOTH severity AND category:\n"
|
"to run in order to verify behavior. The user does not need to pre-specify exact commands.\n"
|
||||||
|
"3. Environment variables from configured .env files may already be loaded into "
|
||||||
|
"your process; use them for verification when relevant.\n"
|
||||||
|
"4. Compare the implementation against the plan and checklist ONLY.\n"
|
||||||
|
"5. Classify every issue with BOTH severity AND category:\n"
|
||||||
" - Severity: Critical (breaks functionality/security) > Major (requirement mismatch) > Minor (convention/style)\n"
|
" - Severity: Critical (breaks functionality/security) > Major (requirement mismatch) > Minor (convention/style)\n"
|
||||||
" - Category: Over-engineering / Omission\n"
|
" - Category: Over-engineering / Omission\n"
|
||||||
"4. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
|
"6. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
|
||||||
"or DISMISSED (false positive) with rationale.\n"
|
"or DISMISSED (false positive) with rationale.\n"
|
||||||
"5. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
|
"7. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
|
||||||
"6. Order issues by severity (Critical first).\n"
|
"8. Order issues by severity (Critical first).\n"
|
||||||
"7. Do NOT suggest improvements beyond the plan scope.\n"
|
"9. Do NOT suggest improvements beyond the plan scope.\n"
|
||||||
"8. End with VERDICT: PASS (all requirements met, no over-engineering) "
|
"10. End with VERDICT: PASS (all requirements met, no over-engineering) "
|
||||||
"or VERDICT: FAIL (issues found)."
|
"or VERDICT: FAIL (issues found)."
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -94,16 +115,20 @@ _SENIOR_SYSTEM_PROMPT = (
|
|||||||
"You are a senior technical reviewer coordinating a review-fix-verification loop.\n"
|
"You are a senior technical reviewer coordinating a review-fix-verification loop.\n"
|
||||||
"Rules:\n"
|
"Rules:\n"
|
||||||
"1. Explore the project directory to understand the full codebase context.\n"
|
"1. Explore the project directory to understand the full codebase context.\n"
|
||||||
"2. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
|
"2. You may decide which shell, Python, test, git, docker, and database read commands "
|
||||||
|
"to run to verify disputed issues. The user does not need to pre-specify exact commands.\n"
|
||||||
|
"3. Environment variables from configured .env files may already be loaded into "
|
||||||
|
"your process; use them when validating service integrations.\n"
|
||||||
|
"4. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
|
||||||
"evidence-backed issues. Categorize dismissed findings as [False positive] or [Already fixed].\n"
|
"evidence-backed issues. Categorize dismissed findings as [False positive] or [Already fixed].\n"
|
||||||
"3. In verification mode, judge the current implementation directly against ONLY the "
|
"5. In verification mode, judge the current implementation directly against ONLY the "
|
||||||
"plan and checklist.\n"
|
"plan and checklist.\n"
|
||||||
"4. Be skeptical of false positives, but do not lower the bar on real requirement "
|
"6. Be skeptical of false positives, but do not lower the bar on real requirement "
|
||||||
"gaps.\n"
|
"gaps.\n"
|
||||||
"5. When issues remain, produce a concise prioritized action list the coder can act on.\n"
|
"7. When issues remain, produce a concise prioritized action list the coder can act on.\n"
|
||||||
"6. Maintain an Issue Tracker table across iterations to track issue status.\n"
|
"8. Maintain an Issue Tracker table across iterations to track issue status.\n"
|
||||||
"7. Do NOT invent new requirements beyond the plan and checklist.\n"
|
"9. Do NOT invent new requirements beyond the plan and checklist.\n"
|
||||||
"8. End with one of three verdicts:\n"
|
"10. End with one of three verdicts:\n"
|
||||||
" - VERDICT: PASS — all requirements met, no issues remain.\n"
|
" - VERDICT: PASS — all requirements met, no issues remain.\n"
|
||||||
" - VERDICT: FAIL — issues found that the coder can fix.\n"
|
" - VERDICT: FAIL — issues found that the coder can fix.\n"
|
||||||
" - VERDICT: ESCALATE — issues that require human intervention. Use ESCALATE when:\n"
|
" - VERDICT: ESCALATE — issues that require human intervention. Use ESCALATE when:\n"
|
||||||
@@ -263,7 +288,7 @@ def _resolve_agents(
|
|||||||
|
|
||||||
for name in all_referenced:
|
for name in all_referenced:
|
||||||
if name not in result and name in BUILTIN_AGENTS:
|
if name not in result and name in BUILTIN_AGENTS:
|
||||||
result[name] = BUILTIN_AGENTS[name]
|
result[name] = copy.deepcopy(BUILTIN_AGENTS[name])
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@@ -354,15 +379,16 @@ def _apply_role_effort(
|
|||||||
|
|
||||||
def default_config() -> PipelineConfig:
|
def default_config() -> PipelineConfig:
|
||||||
"""Return a PipelineConfig with sensible defaults (no YAML needed)."""
|
"""Return a PipelineConfig with sensible defaults (no YAML needed)."""
|
||||||
agents = dict(BUILTIN_AGENTS)
|
agents = copy.deepcopy(BUILTIN_AGENTS)
|
||||||
coders = ["claude-coder"]
|
coders = ["claude-coder"]
|
||||||
reviewers = ["claude-reviewer"]
|
reviewers = ["claude-reviewer"]
|
||||||
seniors: list[str] = []
|
seniors: list[str] = []
|
||||||
pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
|
pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
|
||||||
return PipelineConfig(
|
return PipelineConfig(
|
||||||
output_dir=Path("output"),
|
output_dir=Path(".cross-eval/output"),
|
||||||
max_iterations=3,
|
max_iterations=3,
|
||||||
language="ko",
|
language="ko",
|
||||||
|
execution=ExecutionConfig(),
|
||||||
inputs={},
|
inputs={},
|
||||||
agents=agents,
|
agents=agents,
|
||||||
coders=coders,
|
coders=coders,
|
||||||
@@ -406,6 +432,7 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
|||||||
system_prompt=agent_data.get("system_prompt"),
|
system_prompt=agent_data.get("system_prompt"),
|
||||||
reasoning_effort=agent_data.get("reasoning_effort"),
|
reasoning_effort=agent_data.get("reasoning_effort"),
|
||||||
stdin_mode=agent_data.get("stdin_mode", False),
|
stdin_mode=agent_data.get("stdin_mode", False),
|
||||||
|
agentic=agent_data.get("agentic", False),
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- roles: explicit or inferred ---
|
# --- roles: explicit or inferred ---
|
||||||
@@ -445,6 +472,17 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
|||||||
p = config_dir / p
|
p = config_dir / p
|
||||||
inputs[key] = p
|
inputs[key] = p
|
||||||
|
|
||||||
|
execution_raw = raw.get("execution", {}) or {}
|
||||||
|
execution = ExecutionConfig(
|
||||||
|
mode=execution_raw.get("mode", "agent-decides"),
|
||||||
|
command_policy=execution_raw.get("command_policy", "broad"),
|
||||||
|
inherit_env=bool(execution_raw.get("inherit_env", True)),
|
||||||
|
auto_env_files=list(execution_raw.get("auto_env_files", [".env", ".env.local"])),
|
||||||
|
env_files=list(execution_raw.get("env_files", [])),
|
||||||
|
expose_env_names=bool(execution_raw.get("expose_env_names", True)),
|
||||||
|
auto_context_targets=list(execution_raw.get("auto_context_targets", [])),
|
||||||
|
)
|
||||||
|
|
||||||
# --- pipeline (preset or custom) ---
|
# --- pipeline (preset or custom) ---
|
||||||
steps, phases = _resolve_pipeline(pipeline_raw, coders, reviewers, seniors)
|
steps, phases = _resolve_pipeline(pipeline_raw, coders, reviewers, seniors)
|
||||||
|
|
||||||
@@ -453,12 +491,13 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
|||||||
if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
|
if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
|
||||||
preset_name = pipeline_raw.split(":", 1)[1]
|
preset_name = pipeline_raw.split(":", 1)[1]
|
||||||
|
|
||||||
return PipelineConfig(
|
config = PipelineConfig(
|
||||||
output_dir=Path(raw.get("output_dir", "output")),
|
output_dir=Path(raw.get("output_dir", ".cross-eval/output")),
|
||||||
max_iterations=int(raw.get("max_iterations", 3)),
|
max_iterations=int(raw.get("max_iterations", 3)),
|
||||||
min_iterations=int(raw.get("min_iterations", 1)),
|
min_iterations=int(raw.get("min_iterations", 1)),
|
||||||
verbose=bool(raw.get("verbose", False)),
|
verbose=bool(raw.get("verbose", False)),
|
||||||
language=raw.get("language", "en"),
|
language=raw.get("language", "en"),
|
||||||
|
execution=execution,
|
||||||
inputs=inputs,
|
inputs=inputs,
|
||||||
agents=agents,
|
agents=agents,
|
||||||
coders=coders,
|
coders=coders,
|
||||||
@@ -470,6 +509,9 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
|||||||
_config_path=config_path,
|
_config_path=config_path,
|
||||||
_config_mtime=config_path.stat().st_mtime,
|
_config_mtime=config_path.stat().st_mtime,
|
||||||
)
|
)
|
||||||
|
sync_phased_iterations(config)
|
||||||
|
ensure_fix_preset_agentic(config)
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
def try_reload_config(config: PipelineConfig) -> PipelineConfig:
|
def try_reload_config(config: PipelineConfig) -> PipelineConfig:
|
||||||
@@ -619,6 +661,16 @@ def validate_config(config: PipelineConfig) -> list[str]:
|
|||||||
if config.language not in ("en", "ko"):
|
if config.language not in ("en", "ko"):
|
||||||
errors.append(f"Unsupported language '{config.language}'. Use 'en' or 'ko'.")
|
errors.append(f"Unsupported language '{config.language}'. Use 'en' or 'ko'.")
|
||||||
|
|
||||||
|
if config.execution.mode not in {"agent-decides"}:
|
||||||
|
errors.append(
|
||||||
|
f"Unsupported execution.mode '{config.execution.mode}'. Use 'agent-decides'."
|
||||||
|
)
|
||||||
|
if config.execution.command_policy not in {"broad", "restricted"}:
|
||||||
|
errors.append(
|
||||||
|
"Unsupported execution.command_policy "
|
||||||
|
f"'{config.execution.command_policy}'. Use 'broad' or 'restricted'."
|
||||||
|
)
|
||||||
|
|
||||||
return errors
|
return errors
|
||||||
|
|
||||||
|
|
||||||
@@ -642,6 +694,37 @@ def _validate_unique_step_fields(
|
|||||||
seen_output_keys.add(step.output_key)
|
seen_output_keys.add(step.output_key)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_agentic(agent: AgentConfig) -> None:
    """Switch *agent* to agentic mode, mutating it in place.

    The ``agentic`` flag is set and ``args`` is rebuilt as a new list that
    omits every ``"-p"`` entry; all other arguments keep their order.
    """
    agent.agentic = True

    kept_args = []
    for arg in agent.args:
        if arg == "-p":
            continue  # drop the flag; everything else is preserved
        kept_args.append(arg)
    agent.args = kept_args
|
||||||
|
|
||||||
|
|
||||||
|
def sync_phased_iterations(
    config: PipelineConfig,
    max_iter: int | None = None,
) -> None:
    """Propagate the effective iteration cap to every converging phase.

    A phase counts as converging when at least one of its steps has a truthy
    ``verdict``; phases without such a step are left untouched.

    Args:
        config: Pipeline configuration whose ``phases`` are updated in place.
        max_iter: Explicit cap to apply. When ``None``, ``config.max_iterations``
            is used instead.
    """
    phases = config.phases
    if phases:
        limit = max_iter if max_iter is not None else config.max_iterations
        converging = (
            candidate
            for candidate in phases
            if any(step.verdict for step in candidate.steps)
        )
        for converging_phase in converging:
            converging_phase.max_iterations = limit
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_fix_preset_agentic(config: PipelineConfig) -> None:
    """Force coder agents into agentic mode for fix-style presets.

    Does nothing unless ``config.preset_name`` is listed in
    ``FIX_STYLE_PRESETS``. Otherwise every name in ``config.coders`` that
    resolves to an agent in ``config.agents`` and is not already agentic is
    converted in place; unknown coder names are skipped.
    """
    if config.preset_name in FIX_STYLE_PRESETS:
        resolved = (config.agents.get(name) for name in config.coders)
        for coder in resolved:
            if coder is None or coder.agentic:
                continue
            _make_agentic(coder)
|
||||||
|
|
||||||
|
|
||||||
def apply_input_overrides(
|
def apply_input_overrides(
|
||||||
config: PipelineConfig, overrides: dict[str, str]
|
config: PipelineConfig, overrides: dict[str, str]
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|||||||
@@ -265,7 +265,7 @@ def run_live_demo(
|
|||||||
checklist_path.write_text(DEMO_CHECKLIST, encoding="utf-8")
|
checklist_path.write_text(DEMO_CHECKLIST, encoding="utf-8")
|
||||||
|
|
||||||
config = PipelineConfig(
|
config = PipelineConfig(
|
||||||
output_dir=Path("output"),
|
output_dir=Path(".cross-eval/output"),
|
||||||
max_iterations=3,
|
max_iterations=3,
|
||||||
language="en",
|
language="en",
|
||||||
inputs={"plan": plan_path, "checklist": checklist_path},
|
inputs={"plan": plan_path, "checklist": checklist_path},
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ class AgentConfig:
|
|||||||
system_prompt: Optional[str] = None
|
system_prompt: Optional[str] = None
|
||||||
reasoning_effort: Optional[str] = None
|
reasoning_effort: Optional[str] = None
|
||||||
stdin_mode: bool = False
|
stdin_mode: bool = False
|
||||||
|
agentic: bool = False # run in worktree, capture git diff instead of stdout
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -43,15 +44,29 @@ class PhaseConfig:
|
|||||||
consecutive_pass: int = 1 # stop after N consecutive PASSes
|
consecutive_pass: int = 1 # stop after N consecutive PASSes
|
||||||
|
|
||||||
|
|
||||||
|
def _default_auto_env_files() -> list[str]:
    """Return a fresh copy of the default auto-loaded env file names."""
    return [".env", ".env.local"]


@dataclass
class ExecutionConfig:
    """Policy settings applied when agent subprocesses are executed."""

    # The config validator shown alongside this class accepts only "agent-decides".
    mode: str = "agent-decides"
    # The same validator accepts "broad" or "restricted".
    command_policy: str = "broad"
    # NOTE(review): the remaining fields are interpreted by the runtime
    # environment builder, which is not visible here -- confirm semantics there.
    inherit_env: bool = True
    auto_env_files: list[str] = field(default_factory=_default_auto_env_files)
    env_files: list[str] = field(default_factory=list)
    expose_env_names: bool = True
    auto_context_targets: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class PipelineConfig:
|
class PipelineConfig:
|
||||||
"""Full cross-eval configuration."""
|
"""Full cross-eval configuration."""
|
||||||
|
|
||||||
output_dir: Path = field(default_factory=lambda: Path("output"))
|
output_dir: Path = field(default_factory=lambda: Path(".cross-eval/output"))
|
||||||
max_iterations: int = 3
|
max_iterations: int = 3
|
||||||
min_iterations: int = 1
|
min_iterations: int = 1
|
||||||
verbose: bool = False
|
verbose: bool = False
|
||||||
language: str = "en" # "en" or "ko"
|
language: str = "en" # "en" or "ko"
|
||||||
|
execution: ExecutionConfig = field(default_factory=ExecutionConfig)
|
||||||
inputs: dict[str, Path | str] = field(default_factory=dict)
|
inputs: dict[str, Path | str] = field(default_factory=dict)
|
||||||
agents: dict[str, AgentConfig] = field(default_factory=dict)
|
agents: dict[str, AgentConfig] = field(default_factory=dict)
|
||||||
coders: list[str] = field(default_factory=list)
|
coders: list[str] = field(default_factory=list)
|
||||||
@@ -118,3 +133,4 @@ class PipelineResult:
|
|||||||
run_dir: Optional[Path] = None
|
run_dir: Optional[Path] = None
|
||||||
repeated_aggregate_warnings: list[str] = field(default_factory=list)
|
repeated_aggregate_warnings: list[str] = field(default_factory=list)
|
||||||
escalated_issues: list[str] = field(default_factory=list)
|
escalated_issues: list[str] = field(default_factory=list)
|
||||||
|
agentic_branch: Optional[str] = None
|
||||||
|
|||||||
@@ -10,9 +10,11 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from cross_eval.agent import AgentInvocationError, invoke_agent
|
from cross_eval.agent import AgentInvocationError, invoke_agent, invoke_agent_agentic
|
||||||
|
from cross_eval.worktree import WorktreeError
|
||||||
from cross_eval.config import try_reload_config
|
from cross_eval.config import try_reload_config
|
||||||
from cross_eval.models import (
|
from cross_eval.models import (
|
||||||
|
AgentConfig,
|
||||||
AgentResult,
|
AgentResult,
|
||||||
IterationResult,
|
IterationResult,
|
||||||
PipelineConfig,
|
PipelineConfig,
|
||||||
@@ -21,6 +23,11 @@ from cross_eval.models import (
|
|||||||
)
|
)
|
||||||
from cross_eval.prompts import render_template, resolve_template, set_language
|
from cross_eval.prompts import render_template, resolve_template, set_language
|
||||||
from cross_eval.report import build_report
|
from cross_eval.report import build_report
|
||||||
|
from cross_eval.runtime_env import (
|
||||||
|
build_execution_policy,
|
||||||
|
build_runtime_environment,
|
||||||
|
summarize_environment,
|
||||||
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -48,6 +55,104 @@ def _make_run_dir(config: PipelineConfig) -> Path:
|
|||||||
return run_dir
|
return run_dir
|
||||||
|
|
||||||
|
|
||||||
|
def _commit_iteration(
    worktree_path: Path,
    label: str,
    iteration: int,
    verdict: str | None,
) -> None:
    """Record an intermediate commit in the worktree after one iteration.

    The intent (per the original author) is to reset the diff baseline so the
    next iteration only captures new changes.

    Args:
        worktree_path: Worktree directory handed to ``commit_worktree``.
        label: Prefix used in the commit message.
        iteration: Iteration number embedded in the commit message.
        verdict: Verdict of the iteration; ``"no-verdict"`` is used in the
            commit message when it is empty or ``None``.
    """
    from cross_eval.worktree import commit_worktree

    verdict_tag = verdict or "no-verdict"
    message = f"cross-eval: {label} v{iteration} ({verdict_tag})"
    if commit_worktree(worktree_path, message):
        # Only log when commit_worktree reports that a commit was made.
        logger.debug(" Intermediate commit: v%d (%s)", iteration, verdict)
|
||||||
|
|
||||||
|
|
||||||
|
def _has_agentic_steps(config: PipelineConfig, steps: list[StepConfig]) -> bool:
    """Return True when at least one of *steps* is assigned to an agentic agent.

    Args:
        config: Pipeline configuration; ``config.agents`` maps agent names to
            their configuration.
        steps: Steps to inspect; each step's ``agent`` attribute is looked up
            in ``config.agents``.

    Returns:
        True if any referenced agent exists and has a truthy ``agentic`` flag.
        Steps naming an agent that is not configured count as non-agentic.
    """
    agents = config.agents
    for step in steps:
        agent = agents.get(step.agent)
        # A plain None check avoids building a placeholder AgentConfig per step.
        if agent is not None and agent.agentic:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, str]:
    """Create the single worktree shared by a whole pipeline run.

    The branch name is obtained from ``make_branch_name(preset_name)`` and the
    worktree is requested at ``run_dir / "work"`` with *cwd* as the base
    repository directory.

    NOTE(review): the original notes describe the branch as
    ``cross-eval/<preset>_<timestamp>`` created from HEAD; that is decided
    inside ``cross_eval.worktree`` and is not visible here.

    Returns:
        A ``(worktree_path, branch_name)`` tuple, where ``worktree_path`` is
        whatever ``create_worktree`` returned.
    """
    from cross_eval.worktree import create_worktree, make_branch_name

    branch = make_branch_name(preset_name)
    created_path = create_worktree(
        base_cwd=cwd,
        work_dir=run_dir / "work",
        branch_name=branch,
    )
    return created_path, branch
|
||||||
|
|
||||||
|
|
||||||
|
def _finalize_worktree(
    cwd: Path,
    worktree_path: Path,
    branch_name: str,
    preset_name: str,
    final_verdict: str,
) -> str | None:
    """Commit pending worktree changes, remove the worktree, and tidy the branch.

    Every stage is best-effort: failures are logged and never raised, because
    this runs from the pipelines' ``finally`` blocks.

    Args:
        cwd: Directory of the original repository, used for the git commands.
        worktree_path: Worktree directory to commit and then remove.
        branch_name: Branch the worktree was created on.
        preset_name: Preset name embedded in the final commit message.
        final_verdict: Final pipeline verdict embedded in the commit message.

    Returns:
        ``branch_name`` when the branch carries changes, either from the final
        commit made here or from earlier commits that put it ahead of ``HEAD``.
        ``None`` when the branch was empty (and therefore deleted) or when its
        state could not be determined.
    """
    from cross_eval.worktree import commit_worktree, remove_worktree

    committed = False
    try:
        committed = commit_worktree(
            worktree_path,
            f"cross-eval: {preset_name} ({final_verdict})",
        )
    except Exception:
        logger.warning(" Failed to commit agentic changes", exc_info=True)
    else:
        if committed:
            logger.info(" Agentic changes committed on branch: %s", branch_name)
        else:
            logger.warning(" No agentic changes to commit (empty diff)")

    try:
        remove_worktree(base_cwd=cwd, work_dir=worktree_path)
    except Exception:
        # Keep the traceback so a stale worktree can be diagnosed later.
        logger.warning(
            "Failed to clean up worktree: %s", worktree_path, exc_info=True,
        )

    if committed:
        return branch_name

    # Nothing new was committed here, but intermediate per-iteration commits may
    # already live on the branch. Delete the branch only when git positively
    # confirms that it has no commits beyond HEAD.
    try:
        ahead = subprocess.run(
            ["git", "log", "--oneline", f"HEAD..{branch_name}"],
            cwd=cwd, capture_output=True, text=True,
        )
        if ahead.returncode != 0:
            # Empty stdout from a failed command must not be read as "no commits":
            # force-deleting here could discard real work.
            logger.warning(
                " Could not inspect branch %s; leaving it in place: %s",
                branch_name, (ahead.stderr or "").strip(),
            )
            return None
        if ahead.stdout.strip():
            # Earlier commits exist, so the branch is the run's result.
            return branch_name

        deleted = subprocess.run(
            ["git", "branch", "-D", branch_name],
            cwd=cwd, capture_output=True, text=True,
        )
        if deleted.returncode == 0:
            logger.info(" Deleted empty branch: %s", branch_name)
        else:
            logger.warning(
                " Failed to delete empty branch %s: %s",
                branch_name, (deleted.stderr or "").strip(),
            )
    except Exception:
        # Still best-effort, but no longer silent.
        logger.warning(
            " Branch cleanup failed for %s", branch_name, exc_info=True,
        )

    return None
|
||||||
|
|
||||||
|
|
||||||
def _run_simple_pipeline(
|
def _run_simple_pipeline(
|
||||||
config: PipelineConfig,
|
config: PipelineConfig,
|
||||||
run_dir: Path,
|
run_dir: Path,
|
||||||
@@ -61,6 +166,15 @@ def _run_simple_pipeline(
|
|||||||
|
|
||||||
set_language(config.language)
|
set_language(config.language)
|
||||||
input_contents = _load_inputs(config)
|
input_contents = _load_inputs(config)
|
||||||
|
runtime_env = _build_runtime_inputs(config, input_contents, cwd or Path(os.getcwd()))
|
||||||
|
|
||||||
|
# Setup shared worktree for agentic mode
|
||||||
|
worktree_path: Path | None = None
|
||||||
|
agentic_branch_name: str | None = None
|
||||||
|
if not dry_run and _has_agentic_steps(config, config.pipeline):
|
||||||
|
worktree_path, agentic_branch_name = _setup_worktree(
|
||||||
|
cwd, run_dir, config.preset_name,
|
||||||
|
)
|
||||||
|
|
||||||
feedback = "(no feedback — first iteration)"
|
feedback = "(no feedback — first iteration)"
|
||||||
iterations: list[IterationResult] = []
|
iterations: list[IterationResult] = []
|
||||||
@@ -71,99 +185,114 @@ def _run_simple_pipeline(
|
|||||||
escalated_issues: list[str] = []
|
escalated_issues: list[str] = []
|
||||||
all_feedbacks: list[str] = []
|
all_feedbacks: list[str] = []
|
||||||
|
|
||||||
for i in range(1, config.max_iterations + 1):
|
try:
|
||||||
config = try_reload_config(config)
|
for i in range(1, config.max_iterations + 1):
|
||||||
set_language(config.language)
|
config = try_reload_config(config)
|
||||||
_refresh_inputs(config, input_contents)
|
set_language(config.language)
|
||||||
|
_refresh_inputs(config, input_contents)
|
||||||
|
runtime_env = _build_runtime_inputs(config, input_contents, cwd)
|
||||||
|
|
||||||
logger.info("=" * 50)
|
logger.info("=" * 50)
|
||||||
logger.info(" Iteration %d/%d", i, config.max_iterations)
|
logger.info(" Iteration %d/%d", i, config.max_iterations)
|
||||||
logger.info("=" * 50)
|
logger.info("=" * 50)
|
||||||
|
|
||||||
step_outputs, step_results, verdict = _run_steps(
|
step_outputs, step_results, verdict = _run_steps(
|
||||||
config.pipeline, config, input_contents, feedback,
|
config.pipeline, config, input_contents, feedback,
|
||||||
i, config.max_iterations, cwd, timeout, dry_run,
|
i, config.max_iterations, cwd, timeout, dry_run,
|
||||||
run_dir=run_dir, output_iter=i,
|
run_dir=run_dir, output_iter=i,
|
||||||
)
|
worktree_path=worktree_path,
|
||||||
|
runtime_env=runtime_env,
|
||||||
|
)
|
||||||
|
|
||||||
iter_result = IterationResult(
|
# Intermediate commit so next iteration's diff only shows new changes
|
||||||
iteration=i,
|
if worktree_path is not None:
|
||||||
step_results=step_results,
|
_commit_iteration(worktree_path, config.preset_name, i, verdict)
|
||||||
step_outputs=step_outputs,
|
|
||||||
verdict=verdict,
|
|
||||||
)
|
|
||||||
warning = _detect_repeated_aggregate(
|
|
||||||
config.pipeline, step_outputs, aggregate_history, iteration=i,
|
|
||||||
)
|
|
||||||
if warning:
|
|
||||||
iter_result.repeated_aggregate_warning = warning
|
|
||||||
aggregate_warnings.append(warning)
|
|
||||||
logger.warning(" %s", warning)
|
|
||||||
|
|
||||||
iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
|
iter_result = IterationResult(
|
||||||
feedback = iter_result.feedback or feedback
|
iteration=i,
|
||||||
all_feedbacks.append(feedback)
|
step_results=step_results,
|
||||||
|
step_outputs=step_outputs,
|
||||||
|
verdict=verdict,
|
||||||
|
)
|
||||||
|
warning = _detect_repeated_aggregate(
|
||||||
|
config.pipeline, step_outputs, aggregate_history, iteration=i,
|
||||||
|
)
|
||||||
|
if warning:
|
||||||
|
iter_result.repeated_aggregate_warning = warning
|
||||||
|
aggregate_warnings.append(warning)
|
||||||
|
logger.warning(" %s", warning)
|
||||||
|
|
||||||
# Extract tracker from verdict/review steps for next iteration
|
iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
|
||||||
for step in config.pipeline:
|
feedback = iter_result.feedback or feedback
|
||||||
if step.verdict or step.role == "review":
|
all_feedbacks.append(feedback)
|
||||||
tracker = _extract_senior_tracker(
|
|
||||||
step_outputs.get(step.output_key, ""),
|
|
||||||
)
|
|
||||||
if tracker:
|
|
||||||
input_contents["previous_senior_tracker"] = tracker
|
|
||||||
|
|
||||||
iterations.append(iter_result)
|
# Extract tracker from verdict/review steps for next iteration
|
||||||
|
|
||||||
# ESCALATE check (highest priority)
|
|
||||||
if verdict == "ESCALATE":
|
|
||||||
final_verdict = "ESCALATE"
|
|
||||||
# Extract escalation details from verdict step outputs
|
|
||||||
for step in config.pipeline:
|
for step in config.pipeline:
|
||||||
if step.verdict:
|
if step.verdict or step.role == "review":
|
||||||
esc = _extract_escalated_issues(
|
tracker = _extract_senior_tracker(
|
||||||
step_outputs.get(step.output_key, ""),
|
step_outputs.get(step.output_key, ""),
|
||||||
)
|
)
|
||||||
if esc:
|
if tracker:
|
||||||
escalated_issues.append(esc)
|
input_contents["previous_senior_tracker"] = tracker
|
||||||
iter_result.escalated_issues = esc
|
|
||||||
logger.info(" ESCALATE at iteration %d — stopping loop.", i)
|
|
||||||
break
|
|
||||||
|
|
||||||
if verdict == "PASS":
|
iterations.append(iter_result)
|
||||||
final_verdict = "PASS"
|
|
||||||
if i >= config.min_iterations:
|
# ESCALATE check (highest priority)
|
||||||
logger.info(" PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
|
if verdict == "ESCALATE":
|
||||||
|
final_verdict = "ESCALATE"
|
||||||
|
for step in config.pipeline:
|
||||||
|
if step.verdict:
|
||||||
|
esc = _extract_escalated_issues(
|
||||||
|
step_outputs.get(step.output_key, ""),
|
||||||
|
)
|
||||||
|
if esc:
|
||||||
|
escalated_issues.append(esc)
|
||||||
|
iter_result.escalated_issues = esc
|
||||||
|
logger.info(" ESCALATE at iteration %d — stopping loop.", i)
|
||||||
break
|
break
|
||||||
else:
|
|
||||||
logger.info(
|
|
||||||
" PASS at iteration %d, but min_iterations=%d — continuing",
|
|
||||||
i, config.min_iterations,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Auto-escalate: no senior/aggregator + repeated FAIL
|
if verdict == "PASS":
|
||||||
has_aggregator = config.seniors or any(
|
final_verdict = "PASS"
|
||||||
s.prompt_template == "default:aggregate-review" for s in config.pipeline
|
if i >= config.min_iterations:
|
||||||
)
|
logger.info(" PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
|
||||||
if (
|
break
|
||||||
verdict == "FAIL"
|
else:
|
||||||
and not has_aggregator
|
logger.info(
|
||||||
and i >= 2
|
" PASS at iteration %d, but min_iterations=%d — continuing",
|
||||||
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
|
i, config.min_iterations,
|
||||||
):
|
)
|
||||||
final_verdict = "ESCALATE"
|
|
||||||
auto_msg = (
|
# Auto-escalate: no senior/aggregator + repeated FAIL
|
||||||
f"Auto-escalated: same issues detected across {i} iterations "
|
has_aggregator = config.seniors or any(
|
||||||
f"without resolution (no senior reviewer configured)."
|
s.prompt_template == "default:aggregate-review" for s in config.pipeline
|
||||||
)
|
)
|
||||||
escalated_issues.append(auto_msg)
|
if (
|
||||||
iter_result.escalated_issues = auto_msg
|
verdict == "FAIL"
|
||||||
logger.info(" AUTO-ESCALATE at iteration %d", i)
|
and not has_aggregator
|
||||||
break
|
and i >= 2
|
||||||
|
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
|
||||||
|
):
|
||||||
|
final_verdict = "ESCALATE"
|
||||||
|
auto_msg = (
|
||||||
|
f"Auto-escalated: same issues detected across {i} iterations "
|
||||||
|
f"without resolution (no senior reviewer configured)."
|
||||||
|
)
|
||||||
|
escalated_issues.append(auto_msg)
|
||||||
|
iter_result.escalated_issues = auto_msg
|
||||||
|
logger.info(" AUTO-ESCALATE at iteration %d", i)
|
||||||
|
break
|
||||||
|
|
||||||
if dry_run:
|
if dry_run:
|
||||||
logger.info(" (dry-run: stopping after iteration 1)")
|
logger.info(" (dry-run: stopping after iteration 1)")
|
||||||
break
|
break
|
||||||
|
|
||||||
|
finally:
|
||||||
|
agentic_branch: str | None = None
|
||||||
|
if worktree_path is not None and agentic_branch_name is not None:
|
||||||
|
agentic_branch = _finalize_worktree(
|
||||||
|
cwd, worktree_path, agentic_branch_name,
|
||||||
|
config.preset_name, final_verdict,
|
||||||
|
)
|
||||||
|
|
||||||
total_duration = time.monotonic() - start_time
|
total_duration = time.monotonic() - start_time
|
||||||
|
|
||||||
@@ -174,6 +303,7 @@ def _run_simple_pipeline(
|
|||||||
run_dir=run_dir,
|
run_dir=run_dir,
|
||||||
repeated_aggregate_warnings=aggregate_warnings,
|
repeated_aggregate_warnings=aggregate_warnings,
|
||||||
escalated_issues=escalated_issues,
|
escalated_issues=escalated_issues,
|
||||||
|
agentic_branch=agentic_branch,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not dry_run:
|
if not dry_run:
|
||||||
@@ -195,6 +325,16 @@ def _run_phased_pipeline(
|
|||||||
|
|
||||||
set_language(config.language)
|
set_language(config.language)
|
||||||
input_contents = _load_inputs(config)
|
input_contents = _load_inputs(config)
|
||||||
|
runtime_env = _build_runtime_inputs(config, input_contents, cwd)
|
||||||
|
|
||||||
|
# Setup shared worktree for agentic mode
|
||||||
|
all_phase_steps = [s for p in config.phases for s in p.steps]
|
||||||
|
worktree_path: Path | None = None
|
||||||
|
agentic_branch_name: str | None = None
|
||||||
|
if not dry_run and _has_agentic_steps(config, all_phase_steps):
|
||||||
|
worktree_path, agentic_branch_name = _setup_worktree(
|
||||||
|
cwd, run_dir, config.preset_name,
|
||||||
|
)
|
||||||
|
|
||||||
iterations: list[IterationResult] = []
|
iterations: list[IterationResult] = []
|
||||||
feedback = "(no feedback — first iteration)"
|
feedback = "(no feedback — first iteration)"
|
||||||
@@ -207,152 +347,171 @@ def _run_phased_pipeline(
|
|||||||
all_feedbacks: list[str] = []
|
all_feedbacks: list[str] = []
|
||||||
escalated = False
|
escalated = False
|
||||||
|
|
||||||
for phase_idx, phase in enumerate(config.phases):
|
try:
|
||||||
if escalated:
|
for phase_idx, phase in enumerate(config.phases):
|
||||||
break
|
if escalated:
|
||||||
|
break
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info(
|
|
||||||
" Phase: %s (max_iter=%d, consecutive_pass=%d)",
|
|
||||||
phase.name, phase.max_iterations, phase.consecutive_pass,
|
|
||||||
)
|
|
||||||
logger.info("=" * 60)
|
|
||||||
|
|
||||||
consecutive_passes = 0
|
|
||||||
phase_converged = False
|
|
||||||
|
|
||||||
for pi in range(1, phase.max_iterations + 1):
|
|
||||||
global_iter += 1
|
|
||||||
|
|
||||||
config = try_reload_config(config)
|
|
||||||
set_language(config.language)
|
|
||||||
_refresh_inputs(config, input_contents)
|
|
||||||
|
|
||||||
logger.info("-" * 50)
|
|
||||||
logger.info(
|
logger.info(
|
||||||
" [%s] Iteration %d/%d (global: v%d)",
|
" Phase: %s (max_iter=%d, consecutive_pass=%d)",
|
||||||
phase.name, pi, phase.max_iterations, global_iter,
|
phase.name, phase.max_iterations, phase.consecutive_pass,
|
||||||
)
|
)
|
||||||
logger.info("-" * 50)
|
logger.info("=" * 60)
|
||||||
|
|
||||||
step_outputs, step_results, verdict = _run_steps(
|
consecutive_passes = 0
|
||||||
phase.steps, config, input_contents, feedback,
|
phase_converged = False
|
||||||
pi, phase.max_iterations, cwd, timeout, dry_run,
|
|
||||||
run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
|
|
||||||
)
|
|
||||||
|
|
||||||
iter_result = IterationResult(
|
for pi in range(1, phase.max_iterations + 1):
|
||||||
iteration=global_iter,
|
global_iter += 1
|
||||||
step_results=step_results,
|
|
||||||
step_outputs=step_outputs,
|
|
||||||
verdict=verdict,
|
|
||||||
phase_name=phase.name,
|
|
||||||
)
|
|
||||||
phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
|
|
||||||
warning = _detect_repeated_aggregate(
|
|
||||||
phase.steps, step_outputs, phase_history, iteration=global_iter,
|
|
||||||
phase_name=phase.name,
|
|
||||||
)
|
|
||||||
if warning:
|
|
||||||
iter_result.repeated_aggregate_warning = warning
|
|
||||||
aggregate_warnings.append(warning)
|
|
||||||
logger.warning(" %s", warning)
|
|
||||||
|
|
||||||
iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
|
config = try_reload_config(config)
|
||||||
feedback = iter_result.feedback or feedback
|
set_language(config.language)
|
||||||
all_feedbacks.append(feedback)
|
_refresh_inputs(config, input_contents)
|
||||||
|
runtime_env = _build_runtime_inputs(config, input_contents, cwd)
|
||||||
|
|
||||||
# Extract tracker from verdict/review steps
|
logger.info("-" * 50)
|
||||||
for step in phase.steps:
|
logger.info(
|
||||||
if step.verdict or step.role == "review":
|
" [%s] Iteration %d/%d (global: v%d)",
|
||||||
tracker = _extract_senior_tracker(
|
phase.name, pi, phase.max_iterations, global_iter,
|
||||||
step_outputs.get(step.output_key, ""),
|
)
|
||||||
|
logger.info("-" * 50)
|
||||||
|
|
||||||
|
step_outputs, step_results, verdict = _run_steps(
|
||||||
|
phase.steps, config, input_contents, feedback,
|
||||||
|
pi, phase.max_iterations, cwd, timeout, dry_run,
|
||||||
|
run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
|
||||||
|
worktree_path=worktree_path,
|
||||||
|
runtime_env=runtime_env,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Intermediate commit so next iteration's diff only shows new changes
|
||||||
|
if worktree_path is not None:
|
||||||
|
_commit_iteration(
|
||||||
|
worktree_path, f"{config.preset_name}/{phase.name}",
|
||||||
|
global_iter, verdict,
|
||||||
)
|
)
|
||||||
if tracker:
|
|
||||||
input_contents["previous_senior_tracker"] = tracker
|
|
||||||
|
|
||||||
iterations.append(iter_result)
|
iter_result = IterationResult(
|
||||||
|
iteration=global_iter,
|
||||||
|
step_results=step_results,
|
||||||
|
step_outputs=step_outputs,
|
||||||
|
verdict=verdict,
|
||||||
|
phase_name=phase.name,
|
||||||
|
)
|
||||||
|
phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
|
||||||
|
warning = _detect_repeated_aggregate(
|
||||||
|
phase.steps, step_outputs, phase_history, iteration=global_iter,
|
||||||
|
phase_name=phase.name,
|
||||||
|
)
|
||||||
|
if warning:
|
||||||
|
iter_result.repeated_aggregate_warning = warning
|
||||||
|
aggregate_warnings.append(warning)
|
||||||
|
logger.warning(" %s", warning)
|
||||||
|
|
||||||
# ESCALATE check
|
iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
|
||||||
if verdict == "ESCALATE":
|
feedback = iter_result.feedback or feedback
|
||||||
final_verdict = "ESCALATE"
|
all_feedbacks.append(feedback)
|
||||||
|
|
||||||
|
# Extract tracker from verdict/review steps
|
||||||
for step in phase.steps:
|
for step in phase.steps:
|
||||||
if step.verdict:
|
if step.verdict or step.role == "review":
|
||||||
esc = _extract_escalated_issues(
|
tracker = _extract_senior_tracker(
|
||||||
step_outputs.get(step.output_key, ""),
|
step_outputs.get(step.output_key, ""),
|
||||||
)
|
)
|
||||||
if esc:
|
if tracker:
|
||||||
escalated_issues.append(esc)
|
input_contents["previous_senior_tracker"] = tracker
|
||||||
iter_result.escalated_issues = esc
|
|
||||||
logger.info(
|
|
||||||
" [%s] ESCALATE at iteration %d — stopping.",
|
|
||||||
phase.name, pi,
|
|
||||||
)
|
|
||||||
escalated = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if verdict is None:
|
iterations.append(iter_result)
|
||||||
logger.info(
|
|
||||||
" [%s] completed (no verdict step; single-pass phase)",
|
|
||||||
phase.name,
|
|
||||||
)
|
|
||||||
phase_converged = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if verdict == "PASS":
|
# ESCALATE check
|
||||||
consecutive_passes += 1
|
if verdict == "ESCALATE":
|
||||||
logger.info(
|
final_verdict = "ESCALATE"
|
||||||
" [%s] PASS (%d/%d consecutive)",
|
for step in phase.steps:
|
||||||
phase.name, consecutive_passes, phase.consecutive_pass,
|
if step.verdict:
|
||||||
)
|
esc = _extract_escalated_issues(
|
||||||
if consecutive_passes >= phase.consecutive_pass:
|
step_outputs.get(step.output_key, ""),
|
||||||
|
)
|
||||||
|
if esc:
|
||||||
|
escalated_issues.append(esc)
|
||||||
|
iter_result.escalated_issues = esc
|
||||||
logger.info(
|
logger.info(
|
||||||
" [%s] Converged! %d consecutive PASSes.",
|
" [%s] ESCALATE at iteration %d — stopping.",
|
||||||
phase.name, phase.consecutive_pass,
|
phase.name, pi,
|
||||||
|
)
|
||||||
|
escalated = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if verdict is None:
|
||||||
|
logger.info(
|
||||||
|
" [%s] completed (no verdict step; single-pass phase)",
|
||||||
|
phase.name,
|
||||||
)
|
)
|
||||||
phase_converged = True
|
phase_converged = True
|
||||||
break
|
break
|
||||||
else:
|
|
||||||
consecutive_passes = 0
|
|
||||||
|
|
||||||
# Auto-escalate in phased pipeline
|
if verdict == "PASS":
|
||||||
has_aggregator = config.seniors or any(
|
consecutive_passes += 1
|
||||||
s.prompt_template == "default:aggregate-review" for s in phase.steps
|
logger.info(
|
||||||
)
|
" [%s] PASS (%d/%d consecutive)",
|
||||||
if (
|
phase.name, consecutive_passes, phase.consecutive_pass,
|
||||||
verdict == "FAIL"
|
)
|
||||||
and not has_aggregator
|
if consecutive_passes >= phase.consecutive_pass:
|
||||||
and pi >= 2
|
logger.info(
|
||||||
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
|
" [%s] Converged! %d consecutive PASSes.",
|
||||||
):
|
phase.name, phase.consecutive_pass,
|
||||||
final_verdict = "ESCALATE"
|
)
|
||||||
auto_msg = (
|
phase_converged = True
|
||||||
f"Auto-escalated: same issues detected across {pi} iterations "
|
break
|
||||||
f"in phase '{phase.name}' without resolution."
|
else:
|
||||||
|
consecutive_passes = 0
|
||||||
|
|
||||||
|
# Auto-escalate in phased pipeline
|
||||||
|
has_aggregator = config.seniors or any(
|
||||||
|
s.prompt_template == "default:aggregate-review" for s in phase.steps
|
||||||
)
|
)
|
||||||
escalated_issues.append(auto_msg)
|
if (
|
||||||
iter_result.escalated_issues = auto_msg
|
verdict == "FAIL"
|
||||||
logger.info(" [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
|
and not has_aggregator
|
||||||
escalated = True
|
and pi >= 2
|
||||||
|
and _detect_auto_escalate(all_feedbacks[:-1], feedback)
|
||||||
|
):
|
||||||
|
final_verdict = "ESCALATE"
|
||||||
|
auto_msg = (
|
||||||
|
f"Auto-escalated: same issues detected across {pi} iterations "
|
||||||
|
f"in phase '{phase.name}' without resolution."
|
||||||
|
)
|
||||||
|
escalated_issues.append(auto_msg)
|
||||||
|
iter_result.escalated_issues = auto_msg
|
||||||
|
logger.info(" [%s] AUTO-ESCALATE at iteration %d", phase.name, pi)
|
||||||
|
escalated = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if dry_run:
|
||||||
|
break
|
||||||
|
|
||||||
|
if escalated:
|
||||||
break
|
break
|
||||||
|
|
||||||
if dry_run:
|
if phase_converged:
|
||||||
break
|
logger.info(" Phase '%s' completed: CONVERGED", phase.name)
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
" Phase '%s' completed: max iterations (%d) reached",
|
||||||
|
phase.name, phase.max_iterations,
|
||||||
|
)
|
||||||
|
|
||||||
if escalated:
|
if phase_idx == len(config.phases) - 1:
|
||||||
break
|
final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"
|
||||||
|
|
||||||
if phase_converged:
|
finally:
|
||||||
logger.info(" Phase '%s' completed: CONVERGED", phase.name)
|
agentic_branch: str | None = None
|
||||||
else:
|
if worktree_path is not None and agentic_branch_name is not None:
|
||||||
logger.info(
|
agentic_branch = _finalize_worktree(
|
||||||
" Phase '%s' completed: max iterations (%d) reached",
|
cwd, worktree_path, agentic_branch_name,
|
||||||
phase.name, phase.max_iterations,
|
config.preset_name, final_verdict,
|
||||||
)
|
)
|
||||||
|
|
||||||
if phase_idx == len(config.phases) - 1:
|
|
||||||
final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"
|
|
||||||
|
|
||||||
total_duration = time.monotonic() - start_time
|
total_duration = time.monotonic() - start_time
|
||||||
|
|
||||||
pipeline_result = PipelineResult(
|
pipeline_result = PipelineResult(
|
||||||
@@ -362,6 +521,7 @@ def _run_phased_pipeline(
|
|||||||
run_dir=run_dir,
|
run_dir=run_dir,
|
||||||
repeated_aggregate_warnings=aggregate_warnings,
|
repeated_aggregate_warnings=aggregate_warnings,
|
||||||
escalated_issues=escalated_issues,
|
escalated_issues=escalated_issues,
|
||||||
|
agentic_branch=agentic_branch,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not dry_run:
|
if not dry_run:
|
||||||
@@ -463,6 +623,8 @@ def _run_steps(
|
|||||||
run_dir: Path,
|
run_dir: Path,
|
||||||
output_iter: int,
|
output_iter: int,
|
||||||
phase_name: str | None = None,
|
phase_name: str | None = None,
|
||||||
|
worktree_path: Path | None = None,
|
||||||
|
runtime_env: dict[str, str] | None = None,
|
||||||
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
|
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
|
||||||
"""Execute all steps in one iteration, parallelizing where possible."""
|
"""Execute all steps in one iteration, parallelizing where possible."""
|
||||||
step_outputs: dict[str, str] = {}
|
step_outputs: dict[str, str] = {}
|
||||||
@@ -473,21 +635,23 @@ def _run_steps(
|
|||||||
|
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
if len(batch) == 1:
|
if len(batch) == 1:
|
||||||
# Single step — run directly
|
|
||||||
step = batch[0]
|
step = batch[0]
|
||||||
_execute_step(
|
_execute_step(
|
||||||
step, config, input_contents, feedback,
|
step, config, input_contents, feedback,
|
||||||
iteration, max_iterations, cwd, timeout, dry_run,
|
iteration, max_iterations, cwd, timeout, dry_run,
|
||||||
step_outputs, step_results,
|
step_outputs, step_results,
|
||||||
run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
|
run_dir=run_dir, output_iter=output_iter,
|
||||||
|
phase_name=phase_name, worktree_path=worktree_path,
|
||||||
|
runtime_env=runtime_env,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Parallel batch — run with ThreadPoolExecutor
|
|
||||||
_execute_parallel_batch(
|
_execute_parallel_batch(
|
||||||
batch, config, input_contents, feedback,
|
batch, config, input_contents, feedback,
|
||||||
iteration, max_iterations, cwd, timeout, dry_run,
|
iteration, max_iterations, cwd, timeout, dry_run,
|
||||||
step_outputs, step_results,
|
step_outputs, step_results,
|
||||||
run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
|
run_dir=run_dir, output_iter=output_iter,
|
||||||
|
phase_name=phase_name, worktree_path=worktree_path,
|
||||||
|
runtime_env=runtime_env,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract verdict from all verdict steps (ALL must PASS; ESCALATE wins over all)
|
# Extract verdict from all verdict steps (ALL must PASS; ESCALATE wins over all)
|
||||||
@@ -506,6 +670,25 @@ def _run_steps(
|
|||||||
return step_outputs, step_results, verdict
|
return step_outputs, step_results, verdict
|
||||||
|
|
||||||
|
|
||||||
|
def _invoke_agentic(
    agent_config: AgentConfig,
    prompt: str,
    step_name: str,
    *,
    worktree_path: Path,
    env: dict[str, str] | None = None,
    timeout: int | None = None,
    quiet: bool = False,
) -> AgentResult:
    """Forward one agentic step to ``invoke_agent_agentic``.

    This is a pure pass-through: every argument is handed on unchanged and
    the callee's result is returned as-is. The worktree must already exist;
    nothing is created or cleaned up here.

    Args:
        agent_config: Configuration of the agent to run.
        prompt: Fully rendered prompt text for the step.
        step_name: Name of the pipeline step being executed.
        worktree_path: Existing worktree the agent operates in.
        env: Optional environment mapping forwarded to the agent invocation.
        timeout: Optional timeout forwarded to the agent invocation.
        quiet: Forwarded verbosity flag.

    Returns:
        The ``AgentResult`` produced by ``invoke_agent_agentic``.
    """
    outcome = invoke_agent_agentic(
        agent_config,
        prompt,
        step_name,
        worktree_path=worktree_path,
        env=env,
        timeout=timeout,
        quiet=quiet,
    )
    return outcome
|
||||||
|
|
||||||
|
|
||||||
def _execute_step(
|
def _execute_step(
|
||||||
step: StepConfig,
|
step: StepConfig,
|
||||||
config: PipelineConfig,
|
config: PipelineConfig,
|
||||||
@@ -523,6 +706,8 @@ def _execute_step(
|
|||||||
output_iter: int,
|
output_iter: int,
|
||||||
phase_name: str | None = None,
|
phase_name: str | None = None,
|
||||||
quiet: bool = False,
|
quiet: bool = False,
|
||||||
|
worktree_path: Path | None = None,
|
||||||
|
runtime_env: dict[str, str] | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Execute a single step, updating step_outputs and step_results in place."""
|
"""Execute a single step, updating step_outputs and step_results in place."""
|
||||||
if not quiet:
|
if not quiet:
|
||||||
@@ -542,6 +727,7 @@ def _execute_step(
|
|||||||
|
|
||||||
# 4. Render prompt
|
# 4. Render prompt
|
||||||
prompt = render_template(template, context)
|
prompt = render_template(template, context)
|
||||||
|
prompt = _augment_prompt_with_runtime_context(prompt, context)
|
||||||
|
|
||||||
# 5. Dry run: print and skip
|
# 5. Dry run: print and skip
|
||||||
if dry_run:
|
if dry_run:
|
||||||
@@ -555,10 +741,21 @@ def _execute_step(
|
|||||||
# 6. Invoke agent
|
# 6. Invoke agent
|
||||||
agent_config = config.agents[step.agent]
|
agent_config = config.agents[step.agent]
|
||||||
try:
|
try:
|
||||||
result = invoke_agent(
|
if agent_config.agentic and worktree_path:
|
||||||
agent_config, prompt, step.name,
|
result = _invoke_agentic(
|
||||||
cwd=cwd, timeout=timeout, quiet=quiet,
|
agent_config, prompt, step.name,
|
||||||
)
|
worktree_path=worktree_path,
|
||||||
|
env=runtime_env,
|
||||||
|
timeout=timeout, quiet=quiet,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# When worktree exists, run non-agentic agents (reviewers) in
|
||||||
|
# the worktree too so they can inspect the modified files.
|
||||||
|
effective_cwd = worktree_path if worktree_path else cwd
|
||||||
|
result = invoke_agent(
|
||||||
|
agent_config, prompt, step.name,
|
||||||
|
cwd=effective_cwd, env=runtime_env, timeout=timeout, quiet=quiet,
|
||||||
|
)
|
||||||
except subprocess.TimeoutExpired as e:
|
except subprocess.TimeoutExpired as e:
|
||||||
stdout = (e.stdout or b"") if isinstance(e.stdout, bytes) else (e.stdout or "")
|
stdout = (e.stdout or b"") if isinstance(e.stdout, bytes) else (e.stdout or "")
|
||||||
stderr = (e.stderr or b"") if isinstance(e.stderr, bytes) else (e.stderr or "")
|
stderr = (e.stderr or b"") if isinstance(e.stderr, bytes) else (e.stderr or "")
|
||||||
@@ -625,6 +822,8 @@ def _execute_parallel_batch(
|
|||||||
run_dir: Path,
|
run_dir: Path,
|
||||||
output_iter: int,
|
output_iter: int,
|
||||||
phase_name: str | None = None,
|
phase_name: str | None = None,
|
||||||
|
worktree_path: Path | None = None,
|
||||||
|
runtime_env: dict[str, str] | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Execute multiple steps in parallel using threads."""
|
"""Execute multiple steps in parallel using threads."""
|
||||||
agent_names = ", ".join(s.agent for s in batch)
|
agent_names = ", ".join(s.agent for s in batch)
|
||||||
@@ -640,6 +839,26 @@ def _execute_parallel_batch(
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Agentic steps cannot run in parallel (they share a worktree)
|
||||||
|
agentic_in_batch = [
|
||||||
|
s for s in batch
|
||||||
|
if config.agents.get(s.agent, AgentConfig(name="", command="")).agentic
|
||||||
|
]
|
||||||
|
if len(agentic_in_batch) > 1:
|
||||||
|
logger.warning(
|
||||||
|
" [parallel] %d agentic steps cannot run concurrently — running sequentially",
|
||||||
|
len(agentic_in_batch),
|
||||||
|
)
|
||||||
|
for step in batch:
|
||||||
|
_execute_step(
|
||||||
|
step, config, input_contents, feedback,
|
||||||
|
iteration, max_iterations, cwd, timeout, dry_run,
|
||||||
|
step_outputs, step_results,
|
||||||
|
run_dir=run_dir, output_iter=output_iter,
|
||||||
|
phase_name=phase_name, worktree_path=worktree_path,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
# Snapshot context before parallel execution (all steps see same state)
|
# Snapshot context before parallel execution (all steps see same state)
|
||||||
context_snapshot = dict(input_contents)
|
context_snapshot = dict(input_contents)
|
||||||
context_snapshot.update(step_outputs)
|
context_snapshot.update(step_outputs)
|
||||||
@@ -666,12 +885,22 @@ def _execute_parallel_batch(
|
|||||||
if step.context_override:
|
if step.context_override:
|
||||||
context = _apply_context_override(context, step.context_override)
|
context = _apply_context_override(context, step.context_override)
|
||||||
prompt = render_template(template, context)
|
prompt = render_template(template, context)
|
||||||
|
prompt = _augment_prompt_with_runtime_context(prompt, context)
|
||||||
|
|
||||||
agent_config = config.agents[step.agent]
|
agent_config = config.agents[step.agent]
|
||||||
result = invoke_agent(
|
if agent_config.agentic and worktree_path:
|
||||||
agent_config, prompt, step.name,
|
result = _invoke_agentic(
|
||||||
cwd=cwd, timeout=timeout, quiet=True,
|
agent_config, prompt, step.name,
|
||||||
)
|
worktree_path=worktree_path,
|
||||||
|
env=runtime_env,
|
||||||
|
timeout=timeout, quiet=True,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
effective_cwd = worktree_path if worktree_path else cwd
|
||||||
|
result = invoke_agent(
|
||||||
|
agent_config, prompt, step.name,
|
||||||
|
cwd=effective_cwd, env=runtime_env, timeout=timeout, quiet=True,
|
||||||
|
)
|
||||||
return step.output_key, result.output, result
|
return step.output_key, result.output, result
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=len(batch)) as executor:
|
with ThreadPoolExecutor(max_workers=len(batch)) as executor:
|
||||||
@@ -765,6 +994,35 @@ def _build_context(
|
|||||||
return context
|
return context
|
||||||
|
|
||||||
|
|
||||||
|
def _build_runtime_inputs(
    config: PipelineConfig,
    input_contents: dict[str, str],
    cwd: Path,
) -> dict[str, str]:
    """Build the agent runtime environment and publish prompt hints.

    Side effect: ``input_contents`` is mutated in place. Its
    ``"execution_policy"`` and ``"environment_context"`` keys are set
    (overwriting any existing values) to the texts produced by
    ``build_execution_policy`` and ``summarize_environment``.

    Args:
        config: Pipeline configuration; only ``config.execution`` is read.
        input_contents: Prompt context mapping to enrich in place.
        cwd: Directory passed to ``build_runtime_environment`` as project root.

    Returns:
        The environment mapping returned by ``build_runtime_environment``.
    """
    runtime_env, env_files, file_values = build_runtime_environment(
        config.execution, cwd
    )

    policy_text = build_execution_policy(config.execution)
    input_contents["execution_policy"] = policy_text

    env_summary = summarize_environment(
        config.execution, env_files, runtime_env, file_values
    )
    input_contents["environment_context"] = env_summary

    return runtime_env
|
||||||
|
|
||||||
|
|
||||||
|
def _augment_prompt_with_runtime_context(
    prompt: str,
    context: dict[str, str],
) -> str:
    """Append execution-policy / environment sections to a rendered prompt.

    Templates do not need placeholders for these values: whenever the
    context holds a non-empty ``execution_policy`` or ``environment_context``
    entry, a matching Markdown section is added after the prompt body.

    Args:
        prompt: The already rendered prompt.
        context: Template context; only the two keys above are consulted.

    Returns:
        ``prompt`` unchanged when neither entry is present/non-empty;
        otherwise the prompt with trailing whitespace stripped, followed by
        the sections separated by blank lines and a final newline.
    """
    section_specs = (
        ("## Execution Policy\n", "execution_policy"),
        ("## Environment Context\n", "environment_context"),
    )
    sections = [
        heading + context[key]
        for heading, key in section_specs
        if context.get(key)
    ]
    if sections:
        return prompt.rstrip() + "\n\n" + "\n\n".join(sections) + "\n"
    return prompt
|
||||||
|
|
||||||
|
|
||||||
def _apply_context_override(
|
def _apply_context_override(
|
||||||
context: dict[str, str],
|
context: dict[str, str],
|
||||||
overrides: dict[str, str],
|
overrides: dict[str, str],
|
||||||
|
|||||||
@@ -535,6 +535,10 @@ def _append_final_verdict(
|
|||||||
lines.append("---\n")
|
lines.append("---\n")
|
||||||
lines.append(f"## {_t(config, 'final_verdict_title')}: {result.final_verdict}\n")
|
lines.append(f"## {_t(config, 'final_verdict_title')}: {result.final_verdict}\n")
|
||||||
|
|
||||||
|
if result.agentic_branch:
|
||||||
|
lines.append(f"**Agentic branch**: `{result.agentic_branch}`")
|
||||||
|
lines.append(f"```bash\ngit checkout {result.agentic_branch}\n```\n")
|
||||||
|
|
||||||
if result.final_verdict == "PASS":
|
if result.final_verdict == "PASS":
|
||||||
lines.append(_t(config, "pass_msg"))
|
lines.append(_t(config, "pass_msg"))
|
||||||
elif result.final_verdict == "ESCALATE":
|
elif result.final_verdict == "ESCALATE":
|
||||||
|
|||||||
152
cross_eval/runtime_env.py
Normal file
152
cross_eval/runtime_env.py
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
"""Helpers for building agent runtime environments from .env files."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from cross_eval.models import ExecutionConfig
|
||||||
|
|
||||||
|
# Name prefixes of environment variables that may be listed (by name only)
# in the environment summary; consumed via ``str.startswith``.
_SUMMARY_PREFIXES = (
    # ClickHouse
    "CLICKHOUSE", "CH_",
    # Generic database settings
    "DB_", "DATABASE",
    # PostgreSQL / MySQL / Redis
    "PG", "POSTGRES", "MYSQL", "REDIS",
    # AWS / S3
    "AWS", "S3",
)
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_quotes(value: str) -> str:
    """Remove one pair of matching surrounding quotes from a dotenv value.

    * Unquoted values (or values whose first and last characters are not the
      same quote character) are returned unchanged.
    * Single-quoted values are returned literally, without escape handling.
    * Double-quoted values additionally have backslash escapes expanded
      (``\\n``, ``\\t``, ``\\\\``, ``\\"``, ``\\xhh``, ``\\uXXXX``, octal, ...).

    Escapes are expanded one at a time instead of pushing the whole value
    through the ``unicode_escape`` codec. That codec interprets its input
    bytes as Latin-1, which turns every non-ASCII character of a UTF-8 value
    into mojibake, and it raises ``UnicodeDecodeError`` on a trailing or
    malformed backslash. Here non-escape text is never re-decoded, and
    unrecognised or malformed escapes are kept verbatim.

    Args:
        value: Raw value text, already stripped of surrounding whitespace.

    Returns:
        The unquoted (and, for double quotes, unescaped) value.
    """
    import re  # function-scope import: the module does not import ``re``

    if len(value) < 2 or value[0] != value[-1] or value[0] not in {"'", '"'}:
        return value

    unwrapped = value[1:-1]
    if value[0] == "'":
        return unwrapped

    def _expand(match):
        sequence = match.group(0)
        try:
            # The matched escape is pure ASCII, so the codec is safe here.
            return sequence.encode("ascii").decode("unicode_escape")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # e.g. unknown \N{...} name or out-of-range \U value: keep as-is.
            return sequence

    escape_pattern = (
        r"""\\(?:[\\'"abfnrtv]|[0-7]{1,3}|x[0-9a-fA-F]{2}"""
        r"""|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8}|N\{[^}]+\})"""
    )
    return re.sub(escape_pattern, _expand, unwrapped)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_dotenv(path: Path) -> dict[str, str]:
    """Parse a simple dotenv file into key/value pairs.

    Supported syntax, one assignment per line:

    * ``KEY=value`` with an optional leading ``export ``;
    * blank lines and lines starting with ``#`` are ignored;
    * lines without ``=`` or with an empty key are ignored;
    * values are whitespace-stripped and unquoted via ``_strip_quotes``.

    Inline ``# comments`` after a value and multi-line values are not
    interpreted; they end up in the value as written. When a key occurs
    more than once, the last occurrence wins.

    Args:
        path: Location of the dotenv file.

    Returns:
        Mapping of variable names to their parsed values.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    values: dict[str, str] = {}
    # "utf-8-sig" reads plain UTF-8 identically but drops a leading BOM, which
    # would otherwise become part of the first key ("\ufeffKEY").
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("export "):
            line = line[len("export "):].strip()
        key, separator, value = line.partition("=")
        key = key.strip()
        if not separator or not key:
            continue
        values[key] = _strip_quotes(value.strip())
    return values
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_env_files(execution: ExecutionConfig, project_root: Path) -> list[Path]:
    """Return the existing env files named by the execution config.

    Candidates are taken first from ``execution.env_files`` (relative entries
    are joined onto ``project_root``, absolute ones are used as given) and
    then from ``execution.auto_env_files`` (always joined onto
    ``project_root``). Each candidate is resolved; if resolving raises
    ``OSError`` the unresolved path is used instead. Candidates that are
    duplicates, do not exist, or are not regular files are dropped.

    Args:
        execution: Object providing ``env_files`` and ``auto_env_files``.
        project_root: Base directory for relative entries.

    Returns:
        Unique file paths in first-seen order.
    """
    explicit = (Path(entry) for entry in execution.env_files)
    pending: list[Path] = [
        entry if entry.is_absolute() else project_root / entry
        for entry in explicit
    ]
    pending.extend(project_root / entry for entry in execution.auto_env_files)

    accepted: list[Path] = []
    visited: set[Path] = set()
    for candidate in pending:
        try:
            target = candidate.resolve()
        except OSError:
            target = candidate
        if target in visited:
            continue
        if not (target.exists() and target.is_file()):
            continue
        visited.add(target)
        accepted.append(target)
    return accepted
|
||||||
|
|
||||||
|
|
||||||
|
def build_runtime_environment(
    execution: ExecutionConfig,
    project_root: Path,
) -> tuple[dict[str, str], list[Path], dict[str, str]]:
    """Assemble the subprocess environment from the OS env and dotenv files.

    The base is a copy of ``os.environ`` when ``execution.inherit_env`` is
    truthy, otherwise an empty mapping. Every file returned by
    ``resolve_env_files`` is parsed in order and layered on top, so later
    files override earlier ones and file values override inherited ones.

    Args:
        execution: Execution settings (``inherit_env`` plus env-file lists).
        project_root: Base directory used to resolve env files.

    Returns:
        A 3-tuple ``(env, loaded_files, loaded_values)``: the final
        environment, the files that were read, and only the key/value pairs
        that came from those files.
    """
    runtime_env: dict[str, str] = dict(os.environ) if execution.inherit_env else {}
    env_files = resolve_env_files(execution, project_root)

    from_files: dict[str, str] = {}
    for env_file in env_files:
        parsed = parse_dotenv(env_file)
        from_files.update(parsed)
        runtime_env.update(parsed)

    return runtime_env, env_files, from_files
|
||||||
|
|
||||||
|
|
||||||
|
def summarize_environment(
    execution: ExecutionConfig,
    loaded_files: list[Path],
    env: dict[str, str],
    loaded_values: dict[str, str],
) -> str:
    """Describe the runtime environment for inclusion in a prompt.

    Only file paths, user-provided target hints and variable *names* are
    emitted; variable values are never written to the summary.

    Args:
        execution: Provides ``auto_context_targets`` and ``expose_env_names``.
        loaded_files: Env files that were loaded into the agent process.
        env: The full environment handed to the agent.
        loaded_values: The subset of variables that came from env files.

    Returns:
        Newline-joined summary lines.
    """
    summary: list[str] = []

    # 1. Which env files were loaded.
    if loaded_files:
        file_list = ", ".join(map(str, loaded_files))
        summary.append(f"Loaded env files into the agent process: {file_list}")
    else:
        summary.append("No .env file was auto-loaded into the agent process.")

    # 2. Targets the user hinted at.
    if execution.auto_context_targets:
        summary.append(
            "Execution targets hinted by the user: "
            + ", ".join(execution.auto_context_targets)
        )

    # 3. Names (never values) of DB/service-looking variables.
    if not execution.expose_env_names:
        summary.append("Environment variable values are loaded but names are hidden from the prompt.")
    else:
        def _looks_relevant(name: str) -> bool:
            if name.startswith(_SUMMARY_PREFIXES):
                return True
            return any(marker in name for marker in ("CLICKHOUSE", "DATABASE", "DB_"))

        all_names = set(loaded_values) | set(env)
        visible_names = sorted(name for name in all_names if _looks_relevant(name))
        if visible_names:
            summary.append("Relevant env var names available to commands: " + ", ".join(visible_names))
        else:
            summary.append("No DB/service env var names matched the default summary filters.")

    # 4. ClickHouse note: shown when variables exist or the user asked for it.
    clickhouse_requested = "clickhouse" in {
        target.lower() for target in execution.auto_context_targets
    }
    clickhouse_names = [
        name for name in env if "CLICKHOUSE" in name or name.startswith("CH_")
    ]
    if clickhouse_names:
        summary.append("ClickHouse-related environment variables are available to the agent.")
    elif clickhouse_requested:
        summary.append("No ClickHouse-specific env vars were detected in the loaded environment.")

    return "\n".join(summary)
|
||||||
|
|
||||||
|
|
||||||
|
def build_execution_policy(execution: ExecutionConfig) -> str:
    """Render the execution-policy text that is shown to agents.

    The text states ``execution.mode`` and ``execution.command_policy``,
    two fixed sentences about the agent's freedom to pick commands, and a
    closing recommendation that depends on whether the command policy is
    ``"broad"``.

    Args:
        execution: Object providing ``mode`` and ``command_policy``.

    Returns:
        The policy as newline-joined lines.
    """
    if execution.command_policy == "broad":
        closing = "Prefer direct validation by running the minimum set of commands needed to prove a fix."
    else:
        closing = "Keep command usage minimal and focused on validation."

    return "\n".join(
        (
            f"Execution mode: {execution.mode}",
            f"Command policy: {execution.command_policy}",
            "The agent may choose shell, Python, git, docker, test, and database commands on its own when needed.",
            "The user does not need to pre-specify exact commands.",
            closing,
        )
    )
|
||||||
135
cross_eval/worktree.py
Normal file
135
cross_eval/worktree.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
"""Git worktree lifecycle management for agentic mode."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class WorktreeError(RuntimeError):
    """Raised when a git branch or worktree operation fails.

    ``create_worktree`` raises it (chained to the underlying
    ``subprocess.CalledProcessError``) when the branch or the worktree
    cannot be created.
    """
|
||||||
|
|
||||||
|
|
||||||
|
def make_branch_name(preset_name: str) -> str:
    """Build the branch name under which agentic results are stored.

    The name has the form ``cross-eval/<preset_name>_<YYYYMMDD_HHMMSS>``,
    using the current local time. ``preset_name`` is inserted verbatim.

    Args:
        preset_name: Name of the preset the run was started with.

    Returns:
        The generated branch name.
    """
    stamp = format(datetime.now(), "%Y%m%d_%H%M%S")
    return f"cross-eval/{preset_name}_{stamp}"
|
||||||
|
|
||||||
|
|
||||||
|
def create_worktree(base_cwd: Path, work_dir: Path, branch_name: str) -> Path:
    """Create a new branch at HEAD and check it out in a fresh worktree.

    Steps:
      1. Delete ``work_dir`` if it already exists.
      2. ``git branch <branch_name> HEAD`` in ``base_cwd``.
      3. ``git worktree add <work_dir> <branch_name>`` in ``base_cwd``.

    The branch belongs to the original repository, so it outlives the
    worktree directory.

    Args:
        base_cwd: Working directory of the original repository.
        work_dir: Directory that will hold the worktree (resolved first).
        branch_name: Name of the branch to create.

    Returns:
        The resolved worktree path.

    Raises:
        WorktreeError: If the branch or the worktree cannot be created. When
            the worktree step fails, the new branch is deleted again first.
    """
    target = work_dir.resolve()
    if target.exists():
        shutil.rmtree(target)

    # Options shared by the two git calls whose failure must be reported.
    checked = {"cwd": base_cwd, "capture_output": True, "text": True, "check": True}

    try:
        subprocess.run(["git", "branch", branch_name, "HEAD"], **checked)
    except subprocess.CalledProcessError as e:
        raise WorktreeError(
            f"Failed to create branch '{branch_name}': {e.stderr.strip()}"
        ) from e

    try:
        subprocess.run(["git", "worktree", "add", str(target), branch_name], **checked)
    except subprocess.CalledProcessError as e:
        # Best-effort rollback so a failed run leaves no orphan branch behind.
        subprocess.run(
            ["git", "branch", "-D", branch_name],
            cwd=base_cwd,
            capture_output=True,
        )
        raise WorktreeError(
            f"Failed to create worktree at {target}: {e.stderr.strip()}"
        ) from e

    logger.debug("Created worktree on branch '%s': %s", branch_name, target)
    return target
|
||||||
|
|
||||||
|
|
||||||
|
def capture_diff(worktree_path: Path) -> str:
    """Return every change in the worktree as one unified diff against HEAD.

    Side effect: all changes are staged first (``git add -A``) so that
    untracked files show up in the diff as well.

    Args:
        worktree_path: Directory of the worktree to inspect.

    Returns:
        Whitespace-stripped output of ``git diff --cached HEAD``.

    Raises:
        subprocess.CalledProcessError: If staging fails. The exit status of
            the diff command itself is not checked.
    """
    stage_cmd = ["git", "add", "-A"]
    diff_cmd = ["git", "diff", "--cached", "HEAD"]

    subprocess.run(stage_cmd, cwd=worktree_path, capture_output=True, check=True)
    diff_proc = subprocess.run(
        diff_cmd, cwd=worktree_path, capture_output=True, text=True
    )
    return diff_proc.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def commit_worktree(worktree_path: Path, message: str) -> bool:
    """Stage and commit all changes in the worktree.

    Args:
        worktree_path: Directory of the worktree to commit in.
        message: Commit message.

    Returns:
        True if a commit was made, False otherwise (nothing to commit, or
        the commit command failed).

    Raises:
        subprocess.CalledProcessError: If staging (``git add -A``) fails.
    """
    subprocess.run(
        ["git", "add", "-A"],
        cwd=worktree_path,
        capture_output=True,
        check=True,
    )

    result = subprocess.run(
        ["git", "commit", "-m", message],
        cwd=worktree_path,
        capture_output=True,
        text=True,
    )
    if result.returncode == 0:
        return True

    # Exit code 1 is the ordinary "nothing to commit" outcome. Any other
    # status (e.g. a missing git identity) means the changes were NOT
    # recorded, so make that visible instead of reporting it as "no changes".
    if result.returncode != 1:
        logger.warning(
            "git commit failed in %s (exit code %d): %s",
            worktree_path,
            result.returncode,
            (result.stderr or "").strip(),
        )
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def remove_worktree(base_cwd: Path, work_dir: Path) -> None:
    """Detach and delete a worktree; its branch is left untouched.

    ``git worktree remove --force`` is tried first. If git reports an error,
    the directory is deleted manually (errors ignored) and
    ``git worktree prune`` is run.

    Args:
        base_cwd: Working directory of the original repository.
        work_dir: Worktree directory to remove (resolved first).
    """
    target = work_dir.resolve()
    remove_cmd = ["git", "worktree", "remove", "--force", str(target)]

    try:
        subprocess.run(
            remove_cmd, cwd=base_cwd, capture_output=True, text=True, check=True
        )
    except subprocess.CalledProcessError:
        # Fallback: delete the directory ourselves, then prune git's records.
        if target.exists():
            shutil.rmtree(target, ignore_errors=True)
        subprocess.run(
            ["git", "worktree", "prune"], cwd=base_cwd, capture_output=True
        )

    logger.debug("Removed worktree: %s (branch preserved)", target)
|
||||||
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "cross-eval"
|
name = "cross-eval"
|
||||||
version = "0.1.0"
|
version = "0.2.0"
|
||||||
description = "AI agent cross-evaluation CLI tool"
|
description = "AI agent cross-evaluation CLI tool"
|
||||||
requires-python = ">=3.9"
|
requires-python = ">=3.9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
|||||||
701
tests/test_agentic.py
Normal file
701
tests/test_agentic.py
Normal file
@@ -0,0 +1,701 @@
|
|||||||
|
"""Comprehensive tests for the agentic worktree flow.
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
1. worktree.py unit tests (real temp git repo)
|
||||||
|
2. agent.py agentic tests (mocking subprocess)
|
||||||
|
3. config.py _make_agentic tests
|
||||||
|
4. pipeline integration tests (mock invoke_agent / invoke_agent_agentic)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, call, patch
|
||||||
|
|
||||||
|
from cross_eval.agent import invoke_agent_agentic
|
||||||
|
from cross_eval.config import BUILTIN_AGENTS, _make_agentic
|
||||||
|
from cross_eval.models import (
|
||||||
|
AgentConfig,
|
||||||
|
AgentResult,
|
||||||
|
PipelineConfig,
|
||||||
|
StepConfig,
|
||||||
|
)
|
||||||
|
from cross_eval.pipeline import (
|
||||||
|
_commit_iteration,
|
||||||
|
_finalize_worktree,
|
||||||
|
_has_agentic_steps,
|
||||||
|
_setup_worktree,
|
||||||
|
run_pipeline,
|
||||||
|
)
|
||||||
|
from cross_eval.worktree import (
|
||||||
|
capture_diff,
|
||||||
|
commit_worktree,
|
||||||
|
create_worktree,
|
||||||
|
make_branch_name,
|
||||||
|
remove_worktree,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _init_git_repo(path: Path) -> None:
    """Turn ``path`` into a git repository containing a single commit.

    A test identity is configured, ``README.md`` is written, and everything
    is committed as "initial". Every git command must succeed.
    """

    def git(*args: str) -> None:
        subprocess.run(["git", *args], cwd=path, capture_output=True, check=True)

    git("init")
    git("config", "user.email", "test@test.com")
    git("config", "user.name", "Test")

    readme = path / "README.md"
    readme.write_text("# init\n")

    git("add", ".")
    git("commit", "-m", "initial")
|
||||||
|
|
||||||
|
|
||||||
|
# ===================================================================
|
||||||
|
# 1. worktree.py unit tests (real temp git repo)
|
||||||
|
# ===================================================================
|
||||||
|
|
||||||
|
class TestCreateWorktree(unittest.TestCase):
    """create_worktree produces a worktree directory and a named branch."""

    def test_creates_worktree_and_branch(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            root = Path(td)
            base = root / "repo"
            base.mkdir()
            _init_git_repo(base)

            wt_dir = root / "wt"
            branch = "cross-eval/test_branch"
            result_path = create_worktree(base, wt_dir, branch)

            # The worktree directory was created.
            self.assertTrue(result_path.exists())

            # The branch is visible from the original repository.
            listing = subprocess.run(
                ["git", "branch", "--list", branch],
                cwd=base, capture_output=True, text=True,
            )
            self.assertIn(branch, listing.stdout)

            # Detach the worktree before the temp dir is deleted.
            remove_worktree(base, wt_dir)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCaptureDiff(unittest.TestCase):
    """capture_diff reports both new and modified files."""

    def test_captures_new_and_modified_files(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            root = Path(td)
            base = root / "repo"
            base.mkdir()
            _init_git_repo(base)

            wt_dir = root / "wt"
            create_worktree(base, wt_dir, "cross-eval/diff_test")

            # One untracked file plus one edit to a tracked file.
            (wt_dir / "new_file.txt").write_text("hello\n")
            (wt_dir / "README.md").write_text("# modified\n")

            diff = capture_diff(wt_dir)
            for expected in ("new_file.txt", "hello", "modified"):
                self.assertIn(expected, diff)

            remove_worktree(base, wt_dir)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCommitWorktree(unittest.TestCase):
    """commit_worktree returns True after committing, False when clean."""

    def test_commit_returns_true_on_changes(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            root = Path(td)
            base = root / "repo"
            base.mkdir()
            _init_git_repo(base)

            wt_dir = root / "wt"
            create_worktree(base, wt_dir, "cross-eval/commit_test")

            # A new file gives the commit something to record.
            (wt_dir / "file.txt").write_text("data\n")
            committed = commit_worktree(wt_dir, "test commit")
            self.assertTrue(committed)

            remove_worktree(base, wt_dir)

    def test_commit_returns_false_when_nothing_to_commit(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            root = Path(td)
            base = root / "repo"
            base.mkdir()
            _init_git_repo(base)

            wt_dir = root / "wt"
            create_worktree(base, wt_dir, "cross-eval/empty_commit")

            # Freshly created worktree: no changes to commit.
            committed = commit_worktree(wt_dir, "empty")
            self.assertFalse(committed)

            remove_worktree(base, wt_dir)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoveWorktree(unittest.TestCase):
    """remove_worktree deletes the worktree directory but keeps the branch."""

    def test_branch_survives_worktree_removal(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            root = Path(td)
            base = root / "repo"
            base.mkdir()
            _init_git_repo(base)

            wt_dir = root / "wt"
            branch = "cross-eval/remove_test"
            create_worktree(base, wt_dir, branch)

            remove_worktree(base, wt_dir)

            # The directory is gone ...
            self.assertFalse(wt_dir.exists())

            # ... while the branch is still listed in the original repo.
            listing = subprocess.run(
                ["git", "branch", "--list", branch],
                cwd=base, capture_output=True, text=True,
            )
            self.assertIn(branch, listing.stdout)
|
||||||
|
|
||||||
|
|
||||||
|
class TestMakeBranchName(unittest.TestCase):
    """make_branch_name yields ``cross-eval/<preset>_<timestamp>``."""

    def test_format(self) -> None:
        name = make_branch_name("review-fix")
        self.assertTrue(name.startswith("cross-eval/review-fix_"))

        # Splitting at the first underscore separates prefix and timestamp.
        parts = name.split("_", maxsplit=1)
        self.assertEqual(len(parts), 2)

        # The timestamp looks like 20260313_123456 (YYYYMMDD_HHMMSS).
        _, ts_part = parts
        self.assertEqual(len(ts_part), 15)
|
||||||
|
|
||||||
|
|
||||||
|
# ===================================================================
|
||||||
|
# 2. agent.py agentic tests (mocking subprocess)
|
||||||
|
# ===================================================================
|
||||||
|
|
||||||
|
class TestInvokeAgentAgenticClaude(unittest.TestCase):
    """For claude, the agentic command has no ``-p`` and ends with a task-file reference."""

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("subprocess.run")
    def test_claude_cmd_has_no_dash_p_and_prompt_as_positional(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")

        agent = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user", "--dangerously-skip-permissions"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            _init_git_repo(wt)

            invoke_agent_agentic(
                agent, "implement feature X", "coding",
                worktree_path=wt, quiet=True,
            )

        def command_of(recorded):
            # The command is the first positional argument, else ``args=``.
            if recorded.args:
                return recorded.args[0]
            return recorded.kwargs.get("args", [])

        # First recorded subprocess.run call that launches the agent itself.
        agent_call = next(
            (
                recorded
                for recorded in mock_run.call_args_list
                if command_of(recorded) and command_of(recorded)[0] == "claude"
            ),
            None,
        )

        self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'claude'")
        cmd = agent_call[0][0]

        # No -p flag
        self.assertNotIn("-p", cmd)
        # The last argument references a task file rather than carrying the
        # raw prompt (avoids argument length limits).
        self.assertIn("task file", cmd[-1].lower())
|
||||||
|
|
||||||
|
|
||||||
|
class TestInvokeAgentAgenticCodex(unittest.TestCase):
    """Check the command line invoke_agent_agentic builds for the codex CLI.

    Codex is expected to receive the prompt on stdin, signalled by a trailing
    ``-`` sentinel argument.
    """

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("subprocess.run")
    def test_codex_cmd_uses_stdin_with_dash_sentinel(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """Invoke the agent against a mocked ``subprocess.run`` and inspect argv/stdin."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")

        agent = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto", "--skip-git-repo-check"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            _init_git_repo(wt)

            invoke_agent_agentic(
                agent, "implement feature Y", "coding",
                worktree_path=wt, quiet=True,
            )

        # Locate the recorded call whose command starts with "codex".  Both the
        # command and the keyword arguments are kept from the matching call so
        # a command passed as ``args=...`` does not break the later checks.
        agent_cmd = None
        agent_kwargs = {}
        for recorded in mock_run.call_args_list:
            positional, keyword = recorded
            candidate = positional[0] if positional else keyword.get("args", [])
            if candidate and candidate[0] == "codex":
                agent_cmd = candidate
                agent_kwargs = keyword
                break

        self.assertIsNotNone(agent_cmd, "Expected a subprocess.run call with 'codex'")

        # Should have "-" sentinel at the end for stdin
        self.assertEqual(agent_cmd[-1], "-")
        # Stdin input should contain the prompt
        input_data = agent_kwargs.get("input")
        self.assertIsNotNone(input_data)
        self.assertIn("implement feature Y", input_data)
|
||||||
|
|
||||||
|
|
||||||
|
class TestTaskFileCleanup(unittest.TestCase):
    """After an agentic run no ``CROSS_EVAL_TASK.md`` is left in the worktree."""

    @patch("cross_eval.worktree.capture_diff", return_value="(no changes)")
    @patch("subprocess.run")
    def test_task_file_in_tmp_not_worktree(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """Run the agent with mocked subprocess/diff and inspect the worktree."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")

        agent = AgentConfig(
            name="claude-coder", command="claude", args=[], agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            _init_git_repo(wt)

            invoke_agent_agentic(
                agent, "do stuff", "coding",
                worktree_path=wt, quiet=True,
            )

            # Task file should NOT be in the worktree (it's in /tmp).  The check
            # runs while the temporary directory still exists; once the
            # directory has been removed the assertion would hold trivially.
            self.assertFalse((wt / "CROSS_EVAL_TASK.md").exists())
|
||||||
|
|
||||||
|
|
||||||
|
# ===================================================================
|
||||||
|
# 3. config.py tests
|
||||||
|
# ===================================================================
|
||||||
|
|
||||||
|
class TestMakeAgenticClaude(unittest.TestCase):
    """``_make_agentic`` on a claude agent: ``-p`` is dropped and ``agentic`` is set."""

    def test_strips_dash_p_and_sets_agentic(self) -> None:
        """A ``-p`` flag is removed while the other arguments stay present."""
        original_args = ["-p", "--setting-sources", "user", "--model", "opus"]
        claude = AgentConfig(name="claude-coder", command="claude", args=original_args)

        # The agent is not agentic before the conversion.
        self.assertFalse(claude.agentic)

        _make_agentic(claude)

        self.assertTrue(claude.agentic)
        self.assertNotIn("-p", claude.args)
        self.assertIn("--setting-sources", claude.args)

    def test_idempotent_when_no_dash_p(self) -> None:
        """Without ``-p`` the argument list comes out unchanged."""
        claude = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
        )

        _make_agentic(claude)

        self.assertTrue(claude.agentic)
        self.assertEqual(claude.args, ["--setting-sources", "user"])
|
||||||
|
|
||||||
|
|
||||||
|
class TestMakeAgenticCodex(unittest.TestCase):
    """``_make_agentic`` also works for a codex agent, which has no ``-p`` flag."""

    def test_codex_agentic_works(self) -> None:
        """The agent becomes agentic and keeps its core codex arguments."""
        codex = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto", "-"],
        )

        _make_agentic(codex)

        self.assertTrue(codex.agentic)
        # There was never a -p to strip; the core codex arguments must survive.
        for expected in ("exec", "--full-auto"):
            self.assertIn(expected, codex.args)
|
||||||
|
|
||||||
|
|
||||||
|
# ===================================================================
|
||||||
|
# 4. pipeline integration tests
|
||||||
|
# ===================================================================
|
||||||
|
|
||||||
|
def _make_agentic_config(
    run_dir: Path,
    agentic_coder: bool = True,
) -> PipelineConfig:
    """Return a two-step (coding, review) pipeline config for the tests.

    Args:
        run_dir: Directory used as the pipeline's ``output_dir``.
        agentic_coder: Value of the coder agent's ``agentic`` flag; the
            reviewer is always non-agentic.
    """
    agents = {
        "claude-coder": AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
            agentic=agentic_coder,
        ),
        "claude-reviewer": AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["-p", "--setting-sources", "user"],
            agentic=False,
        ),
    }

    coding_step = StepConfig(
        name="coding",
        agent="claude-coder",
        role="coding",
        prompt_template="default:coding",
        output_key="coding_output",
    )
    # Only the review step yields a verdict.
    review_step = StepConfig(
        name="review",
        agent="claude-reviewer",
        role="review",
        prompt_template="default:review",
        output_key="review_result",
        verdict=True,
    )

    return PipelineConfig(
        output_dir=run_dir,
        max_iterations=2,
        min_iterations=1,
        language="en",
        inputs={"plan": "Test plan", "checklist": "Test checklist"},
        agents=agents,
        coders=["claude-coder"],
        reviewers=["claude-reviewer"],
        pipeline=[coding_step, review_step],
        preset_name="simple",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class TestSetupWorktreeCalledForAgentic(unittest.TestCase):
    """A pipeline with an agentic coder calls ``_setup_worktree`` exactly once."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_setup_worktree_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """Run the pipeline with every agent/worktree helper mocked out."""
        # Canned agent results: the coder produces a diff, the reviewer passes.
        mock_invoke_agentic.return_value = AgentResult(
            output="diff output",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=0.1,
        )
        mock_invoke.return_value = AgentResult(
            output="VERDICT: PASS",
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=0.1,
        )

        with tempfile.TemporaryDirectory() as td:
            base_dir = Path(td)
            pipeline_config = _make_agentic_config(base_dir)

            worktree_dir = base_dir / "work"
            worktree_dir.mkdir()
            mock_setup.return_value = (worktree_dir, "cross-eval/test")

            run_pipeline(pipeline_config, cwd=Path(td))

        mock_setup.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
class TestReviewerRunsInWorktreeCwd(unittest.TestCase):
    """Reviewer runs with worktree cwd (not original cwd) when worktree exists."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_reviewer_uses_worktree_cwd(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """Run the mocked pipeline and check the ``cwd`` given to ``invoke_agent``."""
        with tempfile.TemporaryDirectory() as td:
            run_dir = Path(td)
            config = _make_agentic_config(run_dir)

            wt_path = run_dir / "work"
            wt_path.mkdir()
            mock_setup.return_value = (wt_path, "cross-eval/test")

            mock_invoke_agentic.return_value = AgentResult(
                output="diff output", exit_code=0,
                agent_name="claude-coder", step_name="coding",
                duration_seconds=0.1,
            )
            mock_invoke.return_value = AgentResult(
                output="VERDICT: PASS", exit_code=0,
                agent_name="claude-reviewer", step_name="review",
                duration_seconds=0.1,
            )

            run_pipeline(config, cwd=Path(td))

        # Fail with a clear assertion (not a TypeError on ``None``) when the
        # reviewer was never invoked.
        mock_invoke.assert_called()

        # The reviewer (non-agentic) should have been called with cwd=worktree_path.
        # ``cwd`` may arrive as a keyword or as the fourth positional argument;
        # guard the positional fallback so a missing value yields an assertion
        # failure instead of an IndexError.
        positional, keyword = mock_invoke.call_args
        cwd_used = keyword.get("cwd")
        if not cwd_used and len(positional) > 3:
            cwd_used = positional[3]
        self.assertEqual(cwd_used, wt_path)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCommitIterationCalled(unittest.TestCase):
    """``_commit_iteration`` is invoked once, with the worktree path, for a passing run."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_commit_iteration_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """Run the mocked pipeline and inspect the ``_commit_iteration`` call."""
        # Canned agent results: the coder produces a diff, the reviewer passes.
        mock_invoke_agentic.return_value = AgentResult(
            output="diff output",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=0.1,
        )
        mock_invoke.return_value = AgentResult(
            output="VERDICT: PASS",
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=0.1,
        )

        with tempfile.TemporaryDirectory() as td:
            base_dir = Path(td)
            pipeline_config = _make_agentic_config(base_dir)

            worktree_dir = base_dir / "work"
            worktree_dir.mkdir()
            mock_setup.return_value = (worktree_dir, "cross-eval/test")

            run_pipeline(pipeline_config, cwd=Path(td))

        mock_commit_iter.assert_called_once()
        # First positional argument of the commit helper is the worktree path.
        first_positional = mock_commit_iter.call_args[0][0]
        self.assertEqual(first_positional, worktree_dir)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFinalizeWorktreeCalled(unittest.TestCase):
    """``_finalize_worktree`` is invoked once with the worktree path and branch name."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_finalize_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """Run the mocked pipeline and inspect the ``_finalize_worktree`` call."""
        branch = "cross-eval/test"

        # Canned agent results: the coder produces a diff, the reviewer passes.
        mock_invoke_agentic.return_value = AgentResult(
            output="diff output",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=0.1,
        )
        mock_invoke.return_value = AgentResult(
            output="VERDICT: PASS",
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=0.1,
        )

        with tempfile.TemporaryDirectory() as td:
            base_dir = Path(td)
            pipeline_config = _make_agentic_config(base_dir)

            worktree_dir = base_dir / "work"
            worktree_dir.mkdir()
            mock_setup.return_value = (worktree_dir, branch)

            run_pipeline(pipeline_config, cwd=Path(td))

        mock_finalize.assert_called_once()
        # Should pass cwd, worktree_path, branch_name, preset_name, verdict;
        # only the worktree path (index 1) and branch name (index 2) are checked.
        finalize_positional = mock_finalize.call_args[0]
        self.assertEqual(finalize_positional[1], worktree_dir)
        self.assertEqual(finalize_positional[2], branch)
|
||||||
|
|
||||||
|
|
||||||
|
class TestParallelAgenticFallsBackToSequential(unittest.TestCase):
    """Agentic steps flagged ``parallel`` are detected and executed in listed order."""

    def test_has_agentic_steps_detects_agentic(self) -> None:
        """A step bound to an agentic agent makes ``_has_agentic_steps`` true."""
        agents = {
            "claude-coder": AgentConfig(
                name="claude-coder", command="claude", args=[], agentic=True,
            ),
            "claude-reviewer": AgentConfig(
                name="claude-reviewer", command="claude", args=[], agentic=False,
            ),
        }
        config = PipelineConfig(agents=agents)
        coding_step = StepConfig(
            name="a",
            agent="claude-coder",
            role="coding",
            prompt_template="default:coding",
            output_key="a",
        )

        self.assertTrue(_has_agentic_steps(config, [coding_step]))

    def test_has_agentic_steps_returns_false_without_agentic(self) -> None:
        """With only a non-agentic agent ``_has_agentic_steps`` is false."""
        config = PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(
                    name="claude-reviewer", command="claude", args=[], agentic=False,
                ),
            },
        )
        review_step = StepConfig(
            name="r",
            agent="claude-reviewer",
            role="review",
            prompt_template="default:review",
            output_key="r",
            verdict=True,
        )

        self.assertFalse(_has_agentic_steps(config, [review_step]))

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_parallel_agentic_runs_sequentially(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """When multiple agentic steps are parallel, they should run sequentially."""
        # Two agentic coders plus one non-agentic reviewer.
        agents = {
            "coder-a": AgentConfig(
                name="coder-a", command="claude", args=[], agentic=True,
            ),
            "coder-b": AgentConfig(
                name="coder-b", command="claude", args=[], agentic=True,
            ),
            "reviewer": AgentConfig(
                name="reviewer", command="claude", args=["-p"], agentic=False,
            ),
        }

        # Both coding steps are marked parallel; the review step is not.
        pipeline_steps = [
            StepConfig(
                name="code_a",
                agent="coder-a",
                role="coding",
                prompt_template="default:coding",
                output_key="code_a",
                parallel=True,
            ),
            StepConfig(
                name="code_b",
                agent="coder-b",
                role="coding",
                prompt_template="default:coding",
                output_key="code_b",
                parallel=True,
            ),
            StepConfig(
                name="review",
                agent="reviewer",
                role="review",
                prompt_template="default:review",
                output_key="review_result",
                verdict=True,
            ),
        ]

        invoked_steps: list[str] = []

        def _track_agentic(agent_config, prompt, step_name, **kwargs):
            # Record the order in which agentic steps reach the mock.
            invoked_steps.append(step_name)
            return AgentResult(
                output="diff",
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=0.1,
            )

        mock_invoke_agentic.side_effect = _track_agentic
        mock_invoke.return_value = AgentResult(
            output="VERDICT: PASS",
            exit_code=0,
            agent_name="reviewer",
            step_name="review",
            duration_seconds=0.1,
        )

        with tempfile.TemporaryDirectory() as td:
            base_dir = Path(td)
            config = PipelineConfig(
                output_dir=base_dir,
                max_iterations=1,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=agents,
                coders=["coder-a", "coder-b"],
                reviewers=["reviewer"],
                pipeline=pipeline_steps,
                preset_name="custom",
            )

            worktree_dir = base_dir / "work"
            worktree_dir.mkdir()
            mock_setup.return_value = (worktree_dir, "cross-eval/test")

            run_pipeline(config, cwd=Path(td))

        # Both agentic steps should have been called (sequentially)
        coding_calls = [name for name in invoked_steps if name.startswith("code_")]
        self.assertEqual(len(coding_calls), 2)
        # They should appear in order (sequential, not concurrent)
        self.assertEqual(coding_calls, ["code_a", "code_b"])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run this module's tests when the file is executed directly.
    unittest.main()
|
||||||
@@ -6,12 +6,14 @@ from pathlib import Path
|
|||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
from cross_eval.agent import AgentInvocationError, _supports_reasoning_effort
|
from cross_eval.agent import AgentInvocationError, _supports_reasoning_effort
|
||||||
from cross_eval.cli import _apply_phased_iteration_override
|
from cross_eval.cli import _apply_phased_iteration_override, main
|
||||||
from cross_eval.agent import invoke_agent
|
from cross_eval.agent import invoke_agent
|
||||||
from cross_eval.config import (
|
from cross_eval.config import (
|
||||||
BUILTIN_AGENTS,
|
BUILTIN_AGENTS,
|
||||||
|
_SENIOR_SYSTEM_PROMPT,
|
||||||
_default_seniors_for_preset,
|
_default_seniors_for_preset,
|
||||||
apply_reasoning_effort_settings,
|
apply_reasoning_effort_settings,
|
||||||
|
load_config,
|
||||||
normalize_reasoning_effort,
|
normalize_reasoning_effort,
|
||||||
normalize_prompt_template,
|
normalize_prompt_template,
|
||||||
normalize_step_role,
|
normalize_step_role,
|
||||||
@@ -52,7 +54,6 @@ from cross_eval.prompts import (
|
|||||||
_build_review_only_preset,
|
_build_review_only_preset,
|
||||||
_build_simple_preset,
|
_build_simple_preset,
|
||||||
)
|
)
|
||||||
from cross_eval.config import _SENIOR_SYSTEM_PROMPT
|
|
||||||
from cross_eval.report import build_report, parse_review_metrics, print_escalation_report
|
from cross_eval.report import build_report, parse_review_metrics, print_escalation_report
|
||||||
|
|
||||||
class BuiltinAgentConfigTest(unittest.TestCase):
|
class BuiltinAgentConfigTest(unittest.TestCase):
|
||||||
@@ -954,5 +955,82 @@ class EscalateVerdictTest(unittest.TestCase):
|
|||||||
self.assertIn("VERDICT: ESCALATE", AGGREGATE_REVIEW_TEMPLATE_KO)
|
self.assertIn("VERDICT: ESCALATE", AGGREGATE_REVIEW_TEMPLATE_KO)
|
||||||
|
|
||||||
|
|
||||||
|
class FixPresetBehaviorTest(unittest.TestCase):
    """Behaviour of the ``review-fix`` preset when loaded from a file or via the CLI."""

    def _write_fix_config(self, root: Path, *, max_iterations: int = 7) -> Path:
        """Write plan, checklist and ``config.yaml`` under *root*; return the config path."""
        for stem in ("plan", "checklist"):
            (root / f"{stem}.md").write_text(f"# {stem}\n", encoding="utf-8")

        yaml_text = (
            "inputs:\n"
            " plan: plan.md\n"
            " checklist: checklist.md\n"
            "coders: [claude-coder]\n"
            "reviewers: [claude-reviewer]\n"
            "pipeline: preset:review-fix\n"
            f"max_iterations: {max_iterations}\n"
            "language: en\n"
        )
        target = root / "config.yaml"
        target.write_text(yaml_text, encoding="utf-8")
        return target

    def test_load_config_syncs_phased_iterations_and_enables_agentic(self) -> None:
        """Loading the file applies ``max_iterations`` to phase 0 and makes the coder agentic."""
        with tempfile.TemporaryDirectory() as tmpdir:
            loaded = load_config(self._write_fix_config(Path(tmpdir), max_iterations=7))

        coder = loaded.agents["claude-coder"]
        self.assertEqual(loaded.preset_name, "review-fix")
        self.assertEqual(loaded.phases[0].max_iterations, 7)
        self.assertTrue(coder.agentic)
        self.assertNotIn("-p", coder.args)

    def test_run_config_max_iter_updates_existing_phased_pipeline(self) -> None:
        """``--max-iter`` on the CLI overrides the phase limit read from the config file."""
        seen: dict[str, object] = {}

        with tempfile.TemporaryDirectory() as tmpdir:
            config_path = self._write_fix_config(Path(tmpdir), max_iterations=7)

            def _fake_run_pipeline(config, **kwargs):
                # Capture what the CLI handed to the pipeline instead of running it.
                seen["phase_max"] = config.phases[0].max_iterations
                seen["agentic"] = config.agents[config.coders[0]].agentic
                return PipelineResult(
                    iterations=[],
                    final_verdict="PASS",
                    run_dir=Path(tmpdir) / "output",
                )

            argv = ["run", "--config", str(config_path), "--max-iter", "9", "--dry-run"]
            with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline):
                exit_code = main(argv)

        self.assertEqual(exit_code, 0)
        self.assertEqual(seen["phase_max"], 9)
        self.assertTrue(seen["agentic"])

    def test_run_preset_review_fix_auto_enables_agentic_without_flag(self) -> None:
        """``--preset review-fix`` alone yields an agentic coder and a phase limit of 3."""
        seen: dict[str, object] = {}

        def _fake_run_pipeline(config, **kwargs):
            # Capture what the CLI handed to the pipeline instead of running it.
            seen["preset"] = config.preset_name
            seen["agentic"] = config.agents[config.coders[0]].agentic
            seen["phase_max"] = config.phases[0].max_iterations
            return PipelineResult(
                iterations=[],
                final_verdict="PASS",
                run_dir=Path(".cross-eval/output"),
            )

        argv = ["run", "--preset", "review-fix", "--dry-run"]
        with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline):
            exit_code = main(argv)

        self.assertEqual(exit_code, 0)
        self.assertEqual(seen["preset"], "review-fix")
        self.assertTrue(seen["agentic"])
        self.assertEqual(seen["phase_max"], 3)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user