feat: tighten agentic runtime handoffs and quality gates
This commit is contained in:
@@ -415,11 +415,7 @@ def invoke_agent_agentic(
|
|||||||
timeout: int | None = None,
|
timeout: int | None = None,
|
||||||
quiet: bool = False,
|
quiet: bool = False,
|
||||||
) -> AgentResult:
|
) -> AgentResult:
|
||||||
"""Invoke an agent in agentic mode (no -p, runs in worktree, captures git diff).
|
"""Invoke an agent in agentic mode using the worktree as the source of truth."""
|
||||||
|
|
||||||
The agent runs without print mode so it can modify files directly.
|
|
||||||
After the agent exits, git diff (since last commit) is captured as the output.
|
|
||||||
"""
|
|
||||||
from cross_eval.worktree import capture_diff
|
from cross_eval.worktree import capture_diff
|
||||||
|
|
||||||
# Write prompt to a temp file (outside worktree, won't appear in diffs)
|
# Write prompt to a temp file (outside worktree, won't appear in diffs)
|
||||||
@@ -433,10 +429,10 @@ def invoke_agent_agentic(
|
|||||||
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
|
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
|
||||||
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
|
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
|
||||||
|
|
||||||
# Strip stdin sentinel ("-") from args for agentic mode.
|
# Strip print-mode flags and stdin sentinels for agentic mode.
|
||||||
# Keep -p/--print: Claude -p mode still has full tool access (Edit, Write,
|
# Agentic runs should operate on the worktree and return a real git diff,
|
||||||
# Bash, etc.) and is the correct mode for non-interactive subprocess use.
|
# not behave as a one-shot text completer.
|
||||||
args = [a for a in agent.args if a != "-"]
|
args = [a for a in agent.args if a not in {"-", "-p", "--print"}]
|
||||||
cmd.extend(args)
|
cmd.extend(args)
|
||||||
|
|
||||||
# System prompt via flag if supported
|
# System prompt via flag if supported
|
||||||
@@ -454,8 +450,8 @@ def invoke_agent_agentic(
|
|||||||
else:
|
else:
|
||||||
input_data = prompt
|
input_data = prompt
|
||||||
else:
|
else:
|
||||||
# claude -p: deliver prompt via stdin (same as codex).
|
# claude: deliver the task through stdin and let the worktree be the
|
||||||
# -p mode is non-interactive and reads from stdin, then exits.
|
# canonical place where files are read/written.
|
||||||
input_data = prompt
|
input_data = prompt
|
||||||
|
|
||||||
cmd_preview = " ".join(cmd[:6])
|
cmd_preview = " ".join(cmd[:6])
|
||||||
|
|||||||
@@ -266,7 +266,7 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
type=int,
|
type=int,
|
||||||
default=None,
|
default=None,
|
||||||
metavar="SEC",
|
metavar="SEC",
|
||||||
help="에이전트 호출 제한 시간 (--live 전용)",
|
help="에이전트 1회 호출 제한 시간(초). 0=무제한 (기본: 무제한, --live 전용)",
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- run ---
|
# --- run ---
|
||||||
@@ -981,6 +981,7 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
print(f"No files found in: {docs_dir}", file=sys.stderr)
|
print(f"No files found in: {docs_dir}", file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
config.inputs["docs"] = docs_content
|
config.inputs["docs"] = docs_content
|
||||||
|
config.inputs["docs_ref"] = str(docs_dir)
|
||||||
|
|
||||||
if args.env_files:
|
if args.env_files:
|
||||||
for env_file in args.env_files:
|
for env_file in args.env_files:
|
||||||
@@ -1007,7 +1008,6 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
apply_input_overrides(config, overrides)
|
apply_input_overrides(config, overrides)
|
||||||
|
|
||||||
# 3. Validate after all overrides
|
# 3. Validate after all overrides
|
||||||
from cross_eval.config import validate_config
|
|
||||||
errors = validate_config(config)
|
errors = validate_config(config)
|
||||||
if errors:
|
if errors:
|
||||||
print("Config error:\n " + "\n ".join(errors), file=sys.stderr)
|
print("Config error:\n " + "\n ".join(errors), file=sys.stderr)
|
||||||
|
|||||||
@@ -698,9 +698,9 @@ def _validate_unique_step_fields(
|
|||||||
|
|
||||||
|
|
||||||
def _make_agentic(agent: AgentConfig) -> None:
|
def _make_agentic(agent: AgentConfig) -> None:
|
||||||
"""Convert an agent to agentic mode in-place (remove -p, set agentic=True)."""
|
"""Convert an agent to agentic mode in-place."""
|
||||||
agent.agentic = True
|
agent.agentic = True
|
||||||
agent.args = [a for a in agent.args if a != "-p"]
|
agent.args = [a for a in agent.args if a not in {"-p", "--print"}]
|
||||||
|
|
||||||
|
|
||||||
def sync_phased_iterations(
|
def sync_phased_iterations(
|
||||||
|
|||||||
@@ -217,7 +217,7 @@ def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None:
|
|||||||
|
|
||||||
if show_escalate:
|
if show_escalate:
|
||||||
print(f"\n{RED}{BOLD}{'=' * 50}")
|
print(f"\n{RED}{BOLD}{'=' * 50}")
|
||||||
print(f" Escalation Report")
|
print(" Escalation Report")
|
||||||
print(f"{'=' * 50}{RESET}")
|
print(f"{'=' * 50}{RESET}")
|
||||||
print(f"{YELLOW}Human review required.{RESET}")
|
print(f"{YELLOW}Human review required.{RESET}")
|
||||||
print(f" {RED}•{RESET} Requirements are ambiguous — needs stakeholder clarification")
|
print(f" {RED}•{RESET} Requirements are ambiguous — needs stakeholder clarification")
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
|||||||
@@ -343,6 +343,8 @@ def _run_simple_pipeline(
|
|||||||
if step_results:
|
if step_results:
|
||||||
input_contents["execution_evidence"] = _format_execution_evidence(
|
input_contents["execution_evidence"] = _format_execution_evidence(
|
||||||
step_results,
|
step_results,
|
||||||
|
run_dir=run_dir,
|
||||||
|
iteration=i,
|
||||||
)
|
)
|
||||||
|
|
||||||
iterations.append(iter_result)
|
iterations.append(iter_result)
|
||||||
@@ -543,6 +545,8 @@ def _run_phased_pipeline(
|
|||||||
if step_results:
|
if step_results:
|
||||||
input_contents["execution_evidence"] = _format_execution_evidence(
|
input_contents["execution_evidence"] = _format_execution_evidence(
|
||||||
step_results,
|
step_results,
|
||||||
|
run_dir=run_dir,
|
||||||
|
iteration=global_iter,
|
||||||
)
|
)
|
||||||
|
|
||||||
iterations.append(iter_result)
|
iterations.append(iter_result)
|
||||||
@@ -661,10 +665,13 @@ def _load_inputs(config: PipelineConfig) -> dict[str, str]:
|
|||||||
"""Load input file contents from config."""
|
"""Load input file contents from config."""
|
||||||
input_contents: dict[str, str] = {}
|
input_contents: dict[str, str] = {}
|
||||||
for key, val in config.inputs.items():
|
for key, val in config.inputs.items():
|
||||||
if isinstance(val, str):
|
if key.endswith("_ref"):
|
||||||
|
input_contents[key] = str(val)
|
||||||
|
elif isinstance(val, str):
|
||||||
input_contents[key] = val
|
input_contents[key] = val
|
||||||
else:
|
else:
|
||||||
input_contents[key] = val.read_text(encoding="utf-8")
|
input_contents[key] = val.read_text(encoding="utf-8")
|
||||||
|
_refresh_input_references(config, input_contents)
|
||||||
return input_contents
|
return input_contents
|
||||||
|
|
||||||
|
|
||||||
@@ -673,10 +680,99 @@ def _refresh_inputs(
|
|||||||
) -> None:
|
) -> None:
|
||||||
"""Re-read input files (they may have changed on disk)."""
|
"""Re-read input files (they may have changed on disk)."""
|
||||||
for key, val in config.inputs.items():
|
for key, val in config.inputs.items():
|
||||||
if isinstance(val, str):
|
if key.endswith("_ref"):
|
||||||
|
input_contents[key] = str(val)
|
||||||
|
elif isinstance(val, str):
|
||||||
input_contents[key] = val
|
input_contents[key] = val
|
||||||
elif isinstance(val, Path) and val.exists():
|
elif isinstance(val, Path) and val.exists():
|
||||||
input_contents[key] = val.read_text(encoding="utf-8")
|
input_contents[key] = val.read_text(encoding="utf-8")
|
||||||
|
_refresh_input_references(config, input_contents)
|
||||||
|
|
||||||
|
|
||||||
|
def _refresh_input_references(
    config: PipelineConfig,
    input_contents: dict[str, str],
) -> None:
    """Expose stable file references for canonical planning inputs.

    For every entry in ``config.inputs`` a companion ``<key>_ref`` entry is
    written into ``input_contents``:

    * keys that already end in ``_ref`` are copied through as strings;
    * ``Path`` values produce the resolved path as the reference;
    * any other value gets a placeholder reference, unless a reference for
      that key is already present in ``input_contents``.

    Explicit ``*_ref`` entries always take precedence over derived ones,
    regardless of the order in which the keys appear in ``config.inputs``.

    Args:
        config: Pipeline configuration whose ``inputs`` mapping is read.
        input_contents: Mapping updated in place with the reference entries.
    """
    explicit_refs: dict[str, str] = {}
    for key, val in config.inputs.items():
        if key.endswith("_ref"):
            # Applied after the loop so a derived reference can never
            # clobber one the caller configured explicitly.
            explicit_refs[key] = str(val)
            continue
        ref_key = f"{key}_ref"
        if isinstance(val, Path):
            input_contents[ref_key] = str(val.resolve())
        else:
            # Keep any reference that is already present for inline inputs.
            input_contents.setdefault(ref_key, f"(inline {key}; no file path available)")
    input_contents.update(explicit_refs)
|
||||||
|
|
||||||
|
|
||||||
|
def _git_ref(cwd: Path, *args: str) -> str:
    """Best-effort git metadata lookup.

    Runs ``git <args>`` in ``cwd`` and returns its stripped standard output.

    Args:
        cwd: Directory in which git is executed.
        *args: Arguments passed to the ``git`` executable.

    Returns:
        The trimmed stdout of the command, or ``"(unknown)"`` when the
        command exits non-zero, prints nothing, or cannot be started at all.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            cwd=cwd,
            capture_output=True,
            text=True,
        )
    except OSError:
        # git is not installed or cwd is missing/unusable; this lookup is
        # only informational, so degrade to the placeholder.
        return "(unknown)"
    if result.returncode != 0:
        return "(unknown)"
    return result.stdout.strip() or "(unknown)"
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_markdown_refs(run_dir: Path, iteration: int) -> list[Path]:
    """Gather the markdown files written by iterations 1 through ``iteration``.

    Each iteration directory is ``run_dir / "v<N>"``. Directories that do not
    exist are skipped. Files are ordered by iteration number first and by
    sorted path within each iteration.
    """
    candidate_dirs = (run_dir / f"v{number}" for number in range(1, iteration + 1))
    return [
        markdown_file
        for directory in candidate_dirs
        if directory.exists()
        for markdown_file in sorted(directory.glob("*.md"))
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _build_artifact_references(
    context: dict[str, str],
    *,
    cwd: Path,
    run_dir: Path,
    iteration: int,
    worktree_path: Path | None,
    step_results: dict[str, AgentResult] | None = None,
) -> str:
    """Render a reference-only handoff block for agentic steps.

    The returned markdown lists where the plan, checklist and docs live
    (taken from the ``*_ref`` entries in ``context``), the run and iteration
    directories, the target repository with its current branch and commit,
    the markdown artifacts found for iterations up to ``iteration``, and the
    output/transcript paths of the steps in ``step_results``.

    Args:
        context: Template context; ``plan_ref``, ``checklist_ref`` and
            ``docs_ref`` are read from it when present.
        cwd: Repository directory used when no worktree is given.
        run_dir: Root directory of the current run's artifacts.
        iteration: Current iteration number.
        worktree_path: Worktree to inspect instead of ``cwd`` when set.
        step_results: Results of steps already run in this iteration.

    Returns:
        The reference block as a single newline-joined string.
    """
    target_repo = worktree_path or cwd
    current_branch = _git_ref(target_repo, "rev-parse", "--abbrev-ref", "HEAD")
    head_commit = _git_ref(target_repo, "rev-parse", "HEAD")
    iteration_dir = run_dir / f"v{iteration}"

    plan_ref = context.get("plan_ref", "(missing)")
    checklist_ref = context.get("checklist_ref", "(missing)")
    docs_ref = context.get("docs_ref", "(none)")

    out: list[str] = []
    out.append("### Canonical References")
    out.append(f"- Plan: {plan_ref}")
    out.append(f"- Checklist: {checklist_ref}")
    out.append(f"- Docs: {docs_ref}")
    out.append(f"- Run directory: {run_dir}")
    out.append(f"- Current iteration directory: {iteration_dir}")
    out.append(f"- Target repository: {target_repo}")
    out.append(f"- Git branch: {current_branch}")
    out.append(f"- Git commit: {head_commit}")
    out.append("")
    out.append(
        "Use git/cat to inspect the referenced files directly instead of relying on inline summaries."
    )
    out.append(
        f"Suggested git commands: `git -C {target_repo} show {head_commit}` "
        f"and `git -C {target_repo} diff HEAD`"
    )

    prior_markdown = _collect_markdown_refs(run_dir, iteration)
    if prior_markdown:
        out += ["", "### Markdown Artifacts"]
        out += [f"- {artifact}" for artifact in prior_markdown]

    if step_results:
        out += ["", "### Current Step Artifacts"]
        for step in step_results.values():
            output_path = iteration_dir / f"{step.step_name}.md"
            out.append(f"- Output: {output_path}")
            # Only steps with a recorded transcript get a transcript entry.
            if step.transcript:
                transcript_path = iteration_dir / f"{step.step_name}_transcript.md"
                out.append(f"- Transcript: {transcript_path}")

    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -850,6 +946,9 @@ def _execute_step(
|
|||||||
# 2. Build context (include prior step results for evidence)
|
# 2. Build context (include prior step results for evidence)
|
||||||
context = _build_context(
|
context = _build_context(
|
||||||
input_contents, step_outputs, feedback, iteration, max_iterations,
|
input_contents, step_outputs, feedback, iteration, max_iterations,
|
||||||
|
cwd=cwd,
|
||||||
|
run_dir=run_dir,
|
||||||
|
worktree_path=worktree_path,
|
||||||
step_results=step_results,
|
step_results=step_results,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1031,6 +1130,9 @@ def _execute_parallel_batch(
|
|||||||
template = resolve_template(step.prompt_template)
|
template = resolve_template(step.prompt_template)
|
||||||
context = _build_context(
|
context = _build_context(
|
||||||
context_snapshot, {}, feedback, iteration, max_iterations,
|
context_snapshot, {}, feedback, iteration, max_iterations,
|
||||||
|
cwd=cwd,
|
||||||
|
run_dir=run_dir,
|
||||||
|
worktree_path=worktree_path,
|
||||||
step_results=results_snapshot,
|
step_results=results_snapshot,
|
||||||
)
|
)
|
||||||
if step.context_override:
|
if step.context_override:
|
||||||
@@ -1145,6 +1247,10 @@ def _build_context(
|
|||||||
feedback: str,
|
feedback: str,
|
||||||
iteration: int,
|
iteration: int,
|
||||||
max_iterations: int,
|
max_iterations: int,
|
||||||
|
*,
|
||||||
|
cwd: Path | None = None,
|
||||||
|
run_dir: Path | None = None,
|
||||||
|
worktree_path: Path | None = None,
|
||||||
step_results: dict[str, AgentResult] | None = None,
|
step_results: dict[str, AgentResult] | None = None,
|
||||||
) -> dict[str, str]:
|
) -> dict[str, str]:
|
||||||
"""Build the template context dict.
|
"""Build the template context dict.
|
||||||
@@ -1160,11 +1266,25 @@ def _build_context(
|
|||||||
context["feedback"] = feedback
|
context["feedback"] = feedback
|
||||||
context["iteration"] = str(iteration)
|
context["iteration"] = str(iteration)
|
||||||
context["max_iterations"] = str(max_iterations)
|
context["max_iterations"] = str(max_iterations)
|
||||||
|
ref_cwd = cwd or Path.cwd()
|
||||||
|
ref_run_dir = run_dir or ref_cwd / ".cross-eval" / "output" / "ad-hoc"
|
||||||
|
context["artifact_references"] = _build_artifact_references(
|
||||||
|
context,
|
||||||
|
cwd=ref_cwd,
|
||||||
|
run_dir=ref_run_dir,
|
||||||
|
iteration=iteration,
|
||||||
|
worktree_path=worktree_path,
|
||||||
|
step_results=step_results,
|
||||||
|
)
|
||||||
# Surface execution evidence from prior steps so reviewers can inspect it.
|
# Surface execution evidence from prior steps so reviewers can inspect it.
|
||||||
# Prior-iteration evidence may already live in context via input_contents.
|
# Prior-iteration evidence may already live in context via input_contents.
|
||||||
prior_evidence = context.get("execution_evidence", "")
|
prior_evidence = context.get("execution_evidence", "")
|
||||||
if step_results:
|
if step_results:
|
||||||
current_evidence = _format_execution_evidence(step_results)
|
current_evidence = _format_execution_evidence(
|
||||||
|
step_results,
|
||||||
|
run_dir=ref_run_dir,
|
||||||
|
iteration=iteration,
|
||||||
|
)
|
||||||
if prior_evidence and prior_evidence != "(no prior execution evidence)":
|
if prior_evidence and prior_evidence != "(no prior execution evidence)":
|
||||||
context["execution_evidence"] = (
|
context["execution_evidence"] = (
|
||||||
"# Prior Iteration Evidence\n"
|
"# Prior Iteration Evidence\n"
|
||||||
@@ -1179,12 +1299,14 @@ def _build_context(
|
|||||||
|
|
||||||
def _format_execution_evidence(
|
def _format_execution_evidence(
|
||||||
step_results: dict[str, AgentResult],
|
step_results: dict[str, AgentResult],
|
||||||
|
*,
|
||||||
|
run_dir: Path | None = None,
|
||||||
|
iteration: int | None = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Format execution evidence from prior steps for reviewer consumption.
|
"""Format execution evidence from prior steps for reviewer consumption.
|
||||||
|
|
||||||
Produces a compact summary of command, exit code, duration, and a truncated
|
Produces a compact summary of command, exit code, duration, and artifact
|
||||||
transcript excerpt for each completed step so that reviewers and seniors
|
paths so that later agents can read markdown/git state directly.
|
||||||
can verify claims against real execution data.
|
|
||||||
"""
|
"""
|
||||||
if not step_results:
|
if not step_results:
|
||||||
return "(no prior execution evidence)"
|
return "(no prior execution evidence)"
|
||||||
@@ -1198,12 +1320,12 @@ def _format_execution_evidence(
|
|||||||
f"- Output size: {len(result.output)} chars",
|
f"- Output size: {len(result.output)} chars",
|
||||||
]
|
]
|
||||||
section = [line for line in section if line]
|
section = [line for line in section if line]
|
||||||
|
if run_dir is not None and iteration is not None:
|
||||||
|
section.append(f"- Output artifact: {run_dir / f'v{iteration}' / f'{result.step_name}.md'}")
|
||||||
if result.transcript:
|
if result.transcript:
|
||||||
# Include a truncated transcript excerpt for debugging
|
section.append(
|
||||||
excerpt = result.transcript[:2000]
|
f"- Transcript artifact: {run_dir / f'v{iteration}' / f'{result.step_name}_transcript.md'}"
|
||||||
if len(result.transcript) > 2000:
|
)
|
||||||
excerpt += "\n... (truncated)"
|
|
||||||
section.append(f"\n<details>\n<summary>Transcript excerpt</summary>\n\n{excerpt}\n</details>")
|
|
||||||
parts.append("\n".join(section))
|
parts.append("\n".join(section))
|
||||||
return "\n\n---\n\n".join(parts)
|
return "\n\n---\n\n".join(parts)
|
||||||
|
|
||||||
@@ -1455,7 +1577,7 @@ def _format_runtime_error_markdown(
|
|||||||
f"- **Suggested Action**: {exc.suggested_action}",
|
f"- **Suggested Action**: {exc.suggested_action}",
|
||||||
"",
|
"",
|
||||||
"## Command",
|
"## Command",
|
||||||
f"```",
|
"```",
|
||||||
exc.cmd_preview,
|
exc.cmd_preview,
|
||||||
"```",
|
"```",
|
||||||
"",
|
"",
|
||||||
|
|||||||
@@ -15,58 +15,39 @@ from cross_eval.models import PhaseConfig, StepConfig
|
|||||||
CODING_TEMPLATE = """\
|
CODING_TEMPLATE = """\
|
||||||
You are tasked with implementing code based on a plan and checklist.
|
You are tasked with implementing code based on a plan and checklist.
|
||||||
|
|
||||||
## Plan
|
## Artifact References
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## Checklist
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## Reference Documents
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## Previous Review Feedback
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## Iteration
|
## Iteration
|
||||||
This is iteration {iteration} of {max_iterations}.
|
This is iteration {iteration} of {max_iterations}.
|
||||||
|
|
||||||
## Instructions
|
## Instructions
|
||||||
1. Explore the project directory to understand the existing codebase structure.
|
1. Read the referenced plan/checklist/docs/review artifacts directly from disk.
|
||||||
2. Implement ONLY what the plan specifies. Do NOT add extra features, \
|
2. Explore the project directory and git state to understand the current codebase structure.
|
||||||
|
3. Implement ONLY what the plan specifies. Do NOT add extra features, \
|
||||||
unnecessary abstractions, or premature optimizations.
|
unnecessary abstractions, or premature optimizations.
|
||||||
3. Follow every item in the checklist.
|
4. Follow every item in the checklist.
|
||||||
4. If there is previous feedback, address ONLY the specific issues mentioned.
|
5. If there is previous feedback in the referenced markdown artifacts, address ONLY those issues.
|
||||||
5. If previous feedback contains items marked as DISMISSED or false positive, \
|
6. If previous feedback contains items marked as DISMISSED or false positive, \
|
||||||
IGNORE those items — they have been verified as correct.
|
IGNORE those items — they have been verified as correct.
|
||||||
6. Output the complete implementation.
|
7. Prefer git and markdown artifacts as the source of truth. Use commit hashes, `git show`, `git diff`, and referenced markdown files instead of relying on inline summaries.
|
||||||
|
8. Output the complete implementation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
REVIEW_TEMPLATE = """\
|
REVIEW_TEMPLATE = """\
|
||||||
You are tasked with reviewing code against a plan and checklist.
|
You are tasked with reviewing code against a plan and checklist.
|
||||||
|
|
||||||
## Plan
|
## Artifact References
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## Checklist
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## Reference Documents
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## Coding Output / Previous Step Output
|
|
||||||
{coding_output}
|
|
||||||
|
|
||||||
## Previous Review Feedback
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## Execution Evidence
|
## Execution Evidence
|
||||||
{execution_evidence}
|
{execution_evidence}
|
||||||
|
|
||||||
## Review Instructions
|
## Review Instructions
|
||||||
Explore the project directory to understand the full codebase context, \
|
Read the referenced plan/checklist/docs/review artifacts directly from disk. \
|
||||||
then evaluate the code against ONLY the plan and checklist above. \
|
Inspect the referenced commit/git state and markdown artifacts, then evaluate \
|
||||||
Use the execution evidence above to verify agent claims against actual \
|
the code against ONLY the plan and checklist. Use the execution evidence above \
|
||||||
command outputs and exit codes.
|
to verify agent claims against actual command outputs, artifact paths, and exit codes.
|
||||||
|
|
||||||
For each issue found, classify it with BOTH severity AND category:
|
For each issue found, classify it with BOTH severity AND category:
|
||||||
|
|
||||||
@@ -127,55 +108,36 @@ Otherwise output: VERDICT: FAIL
|
|||||||
CODING_TEMPLATE_KO = """\
|
CODING_TEMPLATE_KO = """\
|
||||||
당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다.
|
당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## 참고 문서
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## 이전 리뷰 피드백
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 반복 정보
|
## 반복 정보
|
||||||
현재 {max_iterations}회 중 {iteration}번째 반복입니다.
|
현재 {max_iterations}회 중 {iteration}번째 반복입니다.
|
||||||
|
|
||||||
## 지침
|
## 지침
|
||||||
1. 프로젝트 디렉토리를 탐색하여 기존 코드베이스 구조를 파악하세요.
|
1. 참조된 plan/checklist/docs/review markdown를 직접 읽으세요.
|
||||||
2. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
|
2. 프로젝트 디렉토리와 git 상태를 탐색하여 현재 코드베이스 구조를 파악하세요.
|
||||||
3. 체크리스트의 모든 항목을 충족하세요.
|
3. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
|
||||||
4. 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
|
4. 체크리스트의 모든 항목을 충족하세요.
|
||||||
5. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
|
5. 참조된 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
|
||||||
6. 완전한 구현을 출력하세요.
|
6. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
|
||||||
|
7. inline 요약보다 git commit hash, `git show`, `git diff`, markdown 아티팩트를 우선 사용하세요.
|
||||||
|
8. 완전한 구현을 출력하세요.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
REVIEW_TEMPLATE_KO = """\
|
REVIEW_TEMPLATE_KO = """\
|
||||||
당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다.
|
당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## 참고 문서
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## 검토 대상 코드
|
|
||||||
{coding_output}
|
|
||||||
|
|
||||||
## 이전 리뷰 피드백
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 실행 증거
|
## 실행 증거
|
||||||
{execution_evidence}
|
{execution_evidence}
|
||||||
|
|
||||||
## 검토 지침
|
## 검토 지침
|
||||||
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스 맥락을 파악한 뒤, \
|
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
|
||||||
위 기획서와 체크리스트 기준으로만 코드를 평가하세요. \
|
그 내용을 기준으로만 코드를 평가하세요. \
|
||||||
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요.
|
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
|
||||||
|
|
||||||
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
|
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
|
||||||
|
|
||||||
@@ -307,25 +269,16 @@ Otherwise output: VERDICT: FAIL
|
|||||||
REVIEW_ONLY_TEMPLATE_KO = """\
|
REVIEW_ONLY_TEMPLATE_KO = """\
|
||||||
당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다.
|
당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## 참고 문서
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 실행 증거
|
## 실행 증거
|
||||||
{execution_evidence}
|
{execution_evidence}
|
||||||
|
|
||||||
## 검토 지침
|
## 검토 지침
|
||||||
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \
|
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
|
||||||
위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요. \
|
그 내용을 기준으로 **기존 코드**를 평가하세요. \
|
||||||
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요.
|
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
|
||||||
|
|
||||||
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
|
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
|
||||||
|
|
||||||
@@ -522,23 +475,8 @@ PLAN_REVIEW_TEMPLATE_KO = """\
|
|||||||
AGGREGATE_REVIEW_TEMPLATE = """\
|
AGGREGATE_REVIEW_TEMPLATE = """\
|
||||||
You are adjudicating multiple review results and turning them into an actionable decision.
|
You are adjudicating multiple review results and turning them into an actionable decision.
|
||||||
|
|
||||||
## Plan
|
## Artifact References
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## Checklist
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## Reference Documents
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## Candidate Outputs
|
|
||||||
{candidate_outputs}
|
|
||||||
|
|
||||||
## Reviewer Findings
|
|
||||||
{reviews_bundle}
|
|
||||||
|
|
||||||
## Previous Verification Feedback
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## Previous Issue Tracker
|
## Previous Issue Tracker
|
||||||
{previous_senior_tracker}
|
{previous_senior_tracker}
|
||||||
@@ -547,9 +485,10 @@ You are adjudicating multiple review results and turning them into an actionable
|
|||||||
{execution_evidence}
|
{execution_evidence}
|
||||||
|
|
||||||
## Instructions
|
## Instructions
|
||||||
Explore the project directory to confirm the current codebase state. \
|
Read the referenced plan/checklist/docs/review artifacts directly from disk. \
|
||||||
Use the execution evidence above to verify claims against actual command \
|
Explore the project directory and the referenced git commit/diff to confirm the \
|
||||||
outputs and exit codes. Then:
|
current codebase state. Use the execution evidence above to verify claims against \
|
||||||
|
actual command outputs, artifact paths, and exit codes. Then:
|
||||||
1. Deduplicate overlapping issues across reviewers.
|
1. Deduplicate overlapping issues across reviewers.
|
||||||
2. Resolve disagreements explicitly.
|
2. Resolve disagreements explicitly.
|
||||||
3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
|
3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
|
||||||
@@ -594,23 +533,8 @@ VERDICT: PASS or VERDICT: FAIL or VERDICT: ESCALATE
|
|||||||
AGGREGATE_REVIEW_TEMPLATE_KO = """\
|
AGGREGATE_REVIEW_TEMPLATE_KO = """\
|
||||||
당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다.
|
당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## 참고 문서
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## 후보 결과물
|
|
||||||
{candidate_outputs}
|
|
||||||
|
|
||||||
## 개별 리뷰 결과
|
|
||||||
{reviews_bundle}
|
|
||||||
|
|
||||||
## 이전 검증 피드백
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 이전 이슈 트래커
|
## 이전 이슈 트래커
|
||||||
{previous_senior_tracker}
|
{previous_senior_tracker}
|
||||||
@@ -619,8 +543,8 @@ AGGREGATE_REVIEW_TEMPLATE_KO = """\
|
|||||||
{execution_evidence}
|
{execution_evidence}
|
||||||
|
|
||||||
## 지침
|
## 지침
|
||||||
프로젝트 디렉토리를 탐색하여 현재 코드베이스 상태를 확인한 뒤, \
|
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽어 현재 코드베이스 상태를 확인한 뒤, \
|
||||||
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요. \
|
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요. \
|
||||||
그런 다음 아래를 수행하세요.
|
그런 다음 아래를 수행하세요.
|
||||||
1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
|
1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
|
||||||
2. 의견 충돌은 명시적으로 정리하세요.
|
2. 의견 충돌은 명시적으로 정리하세요.
|
||||||
|
|||||||
@@ -11,8 +11,58 @@ dependencies = [
|
|||||||
"pyyaml>=6.0",
|
"pyyaml>=6.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"coverage[toml]>=7.6",
|
||||||
|
"pyright>=1.1.390",
|
||||||
|
"pytest-cov>=6.0",
|
||||||
|
"ruff>=0.8.0",
|
||||||
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
cross-eval = "cross_eval.cli:main"
|
cross-eval = "cross_eval.cli:main"
|
||||||
|
|
||||||
[tool.setuptools.packages.find]
|
[tool.setuptools.packages.find]
|
||||||
include = ["cross_eval*"]
|
include = ["cross_eval*"]
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
testpaths = ["tests"]
|
||||||
|
addopts = "-q"
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
target-version = "py39"
|
||||||
|
extend-exclude = [".cross-eval"]
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = ["F"]
|
||||||
|
|
||||||
|
[tool.pyright]
|
||||||
|
include = ["cross_eval", "tests"]
|
||||||
|
exclude = [".cross-eval"]
|
||||||
|
typeCheckingMode = "basic"
|
||||||
|
pythonVersion = "3.9"
|
||||||
|
reportMissingImports = true
|
||||||
|
reportMissingTypeStubs = false
|
||||||
|
|
||||||
|
[tool.coverage.run]
|
||||||
|
branch = true
|
||||||
|
source = ["cross_eval"]
|
||||||
|
omit = [
|
||||||
|
"cross_eval/config.py",
|
||||||
|
"cross_eval/discovery.py",
|
||||||
|
"cross_eval/cli.py",
|
||||||
|
"cross_eval/demo.py",
|
||||||
|
"cross_eval/doctor.py",
|
||||||
|
"cross_eval/prompts.py",
|
||||||
|
"cross_eval/report.py",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.coverage.report]
|
||||||
|
skip_empty = true
|
||||||
|
show_missing = true
|
||||||
|
fail_under = 90
|
||||||
|
exclude_lines = [
|
||||||
|
"pragma: no cover",
|
||||||
|
"if TYPE_CHECKING:",
|
||||||
|
"raise NotImplementedError",
|
||||||
|
]
|
||||||
|
|||||||
@@ -12,10 +12,10 @@ import subprocess
|
|||||||
import tempfile
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import MagicMock, call, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
from cross_eval.agent import AgentInvocationError, invoke_agent_agentic
|
from cross_eval.agent import AgentInvocationError, invoke_agent_agentic
|
||||||
from cross_eval.config import BUILTIN_AGENTS, _make_agentic
|
from cross_eval.config import _make_agentic
|
||||||
from cross_eval.models import (
|
from cross_eval.models import (
|
||||||
AgentConfig,
|
AgentConfig,
|
||||||
AgentResult,
|
AgentResult,
|
||||||
@@ -24,8 +24,6 @@ from cross_eval.models import (
|
|||||||
)
|
)
|
||||||
from cross_eval.pipeline import (
|
from cross_eval.pipeline import (
|
||||||
_assert_base_repo_isolation,
|
_assert_base_repo_isolation,
|
||||||
_commit_iteration,
|
|
||||||
_finalize_worktree,
|
|
||||||
_has_agentic_steps,
|
_has_agentic_steps,
|
||||||
_setup_worktree,
|
_setup_worktree,
|
||||||
run_pipeline,
|
run_pipeline,
|
||||||
@@ -267,6 +265,7 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase):
|
|||||||
break
|
break
|
||||||
|
|
||||||
self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'claude'")
|
self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'claude'")
|
||||||
|
assert agent_call is not None
|
||||||
cmd = agent_call[0][0]
|
cmd = agent_call[0][0]
|
||||||
|
|
||||||
# No -p flag
|
# No -p flag
|
||||||
@@ -274,6 +273,7 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase):
|
|||||||
# Prompt is delivered via stdin (input kwarg), not as a positional arg
|
# Prompt is delivered via stdin (input kwarg), not as a positional arg
|
||||||
input_data = agent_call[1].get("input")
|
input_data = agent_call[1].get("input")
|
||||||
self.assertIsNotNone(input_data)
|
self.assertIsNotNone(input_data)
|
||||||
|
assert input_data is not None
|
||||||
self.assertIn("implement feature X", input_data)
|
self.assertIn("implement feature X", input_data)
|
||||||
|
|
||||||
|
|
||||||
@@ -311,6 +311,7 @@ class TestInvokeAgentAgenticCodex(unittest.TestCase):
|
|||||||
break
|
break
|
||||||
|
|
||||||
self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'codex'")
|
self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'codex'")
|
||||||
|
assert agent_call is not None
|
||||||
cmd = agent_call[0][0]
|
cmd = agent_call[0][0]
|
||||||
|
|
||||||
# Should have "-" sentinel at the end for stdin
|
# Should have "-" sentinel at the end for stdin
|
||||||
@@ -318,6 +319,7 @@ class TestInvokeAgentAgenticCodex(unittest.TestCase):
|
|||||||
# Stdin input should contain the prompt
|
# Stdin input should contain the prompt
|
||||||
input_data = agent_call[1].get("input")
|
input_data = agent_call[1].get("input")
|
||||||
self.assertIsNotNone(input_data)
|
self.assertIsNotNone(input_data)
|
||||||
|
assert input_data is not None
|
||||||
self.assertIn("implement feature Y", input_data)
|
self.assertIn("implement feature Y", input_data)
|
||||||
|
|
||||||
|
|
||||||
@@ -435,6 +437,16 @@ class TestMakeAgenticClaude(unittest.TestCase):
|
|||||||
self.assertNotIn("-p", agent.args)
|
self.assertNotIn("-p", agent.args)
|
||||||
self.assertIn("--setting-sources", agent.args)
|
self.assertIn("--setting-sources", agent.args)
|
||||||
|
|
||||||
|
def test_strips_dash_dash_print_alias(self) -> None:
    """The long ``--print`` alias is dropped when an agent is made agentic."""
    configured_args = ["--print", "--setting-sources", "user"]
    coder = AgentConfig(name="claude-coder", command="claude", args=configured_args)

    _make_agentic(coder)

    self.assertTrue(coder.agentic)
    self.assertNotIn("--print", coder.args)
|
||||||
|
|
||||||
def test_idempotent_when_no_dash_p(self) -> None:
|
def test_idempotent_when_no_dash_p(self) -> None:
|
||||||
agent = AgentConfig(
|
agent = AgentConfig(
|
||||||
name="claude-coder",
|
name="claude-coder",
|
||||||
|
|||||||
@@ -26,7 +26,6 @@ from cross_eval.models import (
|
|||||||
PhaseConfig,
|
PhaseConfig,
|
||||||
PipelineConfig,
|
PipelineConfig,
|
||||||
PipelineResult,
|
PipelineResult,
|
||||||
ReviewMetrics,
|
|
||||||
StepConfig,
|
StepConfig,
|
||||||
)
|
)
|
||||||
from cross_eval.pipeline import (
|
from cross_eval.pipeline import (
|
||||||
@@ -54,7 +53,7 @@ from cross_eval.prompts import (
|
|||||||
_build_review_only_preset,
|
_build_review_only_preset,
|
||||||
_build_simple_preset,
|
_build_simple_preset,
|
||||||
)
|
)
|
||||||
from cross_eval.report import build_report, parse_review_metrics, print_escalation_report
|
from cross_eval.report import build_report, parse_review_metrics
|
||||||
|
|
||||||
class BuiltinAgentConfigTest(unittest.TestCase):
|
class BuiltinAgentConfigTest(unittest.TestCase):
|
||||||
def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None:
|
def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None:
|
||||||
|
|||||||
@@ -26,10 +26,9 @@ from cross_eval.models import (
|
|||||||
IterationResult,
|
IterationResult,
|
||||||
PipelineConfig,
|
PipelineConfig,
|
||||||
PipelineResult,
|
PipelineResult,
|
||||||
ReviewMetrics,
|
|
||||||
StepConfig,
|
StepConfig,
|
||||||
)
|
)
|
||||||
from cross_eval.pipeline import _format_execution_evidence, run_pipeline
|
from cross_eval.pipeline import _build_artifact_references, _format_execution_evidence, run_pipeline
|
||||||
from cross_eval.report import build_report
|
from cross_eval.report import build_report
|
||||||
|
|
||||||
|
|
||||||
@@ -59,7 +58,7 @@ class TestFormatExecutionEvidence(unittest.TestCase):
|
|||||||
self.assertIn("Exit code: 0", evidence)
|
self.assertIn("Exit code: 0", evidence)
|
||||||
self.assertIn("12.3s", evidence)
|
self.assertIn("12.3s", evidence)
|
||||||
self.assertIn("claude --setting-sources user", evidence)
|
self.assertIn("claude --setting-sources user", evidence)
|
||||||
self.assertIn("Transcript excerpt", evidence)
|
self.assertNotIn("Transcript excerpt", evidence)
|
||||||
|
|
||||||
def test_multiple_results_separated(self) -> None:
|
def test_multiple_results_separated(self) -> None:
|
||||||
r1 = AgentResult(
|
r1 = AgentResult(
|
||||||
@@ -88,10 +87,60 @@ class TestFormatExecutionEvidence(unittest.TestCase):
|
|||||||
transcript=long_transcript,
|
transcript=long_transcript,
|
||||||
)
|
)
|
||||||
evidence = _format_execution_evidence({"key": result})
|
evidence = _format_execution_evidence({"key": result})
|
||||||
self.assertIn("truncated", evidence)
|
|
||||||
# The full 3000-char transcript should NOT appear
|
|
||||||
self.assertNotIn("x" * 3000, evidence)
|
self.assertNotIn("x" * 3000, evidence)
|
||||||
|
|
||||||
|
def test_artifact_paths_included_when_run_dir_provided(self) -> None:
    """Evidence built with a run directory names the iteration's output and transcript files."""
    coding_result = AgentResult(
        output="diff",
        exit_code=0,
        agent_name="coder",
        step_name="coding",
        duration_seconds=1.2,
        transcript="stdout",
        command_preview="claude ...",
    )

    with tempfile.TemporaryDirectory() as scratch:
        evidence = _format_execution_evidence(
            {"coding_output": coding_result},
            run_dir=Path(scratch),
            iteration=2,
        )

    # Both artifacts are expected under the "v2" directory because iteration=2.
    for expected_path in ("v2/coding.md", "v2/coding_transcript.md"):
        self.assertIn(expected_path, evidence)
|
||||||
|
|
||||||
|
|
||||||
|
class TestArtifactReferences(unittest.TestCase):
    """Artifact references should point at file paths and git state, not inline text."""

    def test_contains_input_refs_and_git_context(self) -> None:
        """References for a committed repo mention the plan, the commit, and git hints."""
        import subprocess

        # Minimal sequence that turns the scratch directory into a repo with one commit.
        git_setup = (
            ["git", "init"],
            ["git", "config", "user.email", "test@test.com"],
            ["git", "config", "user.name", "Test"],
            ["git", "add", "."],
            ["git", "commit", "-m", "init"],
        )

        with tempfile.TemporaryDirectory() as scratch:
            repo = Path(scratch) / "repo"
            repo.mkdir()
            plan_file = repo / "plan.md"
            checklist_file = repo / "checklist.md"
            plan_file.write_text("plan", encoding="utf-8")
            checklist_file.write_text("checklist", encoding="utf-8")

            for git_cmd in git_setup:
                subprocess.run(git_cmd, cwd=repo, capture_output=True, check=True)

            input_refs = {
                "plan_ref": str(plan_file.resolve()),
                "checklist_ref": str(checklist_file.resolve()),
                "docs_ref": "(none)",
            }
            refs = _build_artifact_references(
                input_refs,
                cwd=repo,
                run_dir=repo / ".cross-eval" / "output" / "run",
                iteration=1,
                worktree_path=None,
            )

        for marker in ("Plan:", "Git commit:", "Suggested git commands"):
            self.assertIn(marker, refs)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# 2. Evidence in reviewer prompts (integration)
|
# 2. Evidence in reviewer prompts (integration)
|
||||||
@@ -162,7 +211,7 @@ class TestEvidenceInReviewerPrompt(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
self.assertTrue(len(review_prompts) >= 1)
|
self.assertTrue(len(review_prompts) >= 1)
|
||||||
review_prompt = review_prompts[0]["prompt"]
|
review_prompt = review_prompts[0]["prompt"]
|
||||||
# Evidence section should reference the coding step's command
|
self.assertIn("Artifact References", review_prompt)
|
||||||
self.assertIn("Execution Evidence", review_prompt)
|
self.assertIn("Execution Evidence", review_prompt)
|
||||||
self.assertIn("claude-coder", review_prompt)
|
self.assertIn("claude-coder", review_prompt)
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ from cross_eval.doctor import (
|
|||||||
check_cli_installed,
|
check_cli_installed,
|
||||||
check_config,
|
check_config,
|
||||||
format_doctor_results,
|
format_doctor_results,
|
||||||
run_doctor,
|
|
||||||
)
|
)
|
||||||
from cross_eval.demo import (
|
from cross_eval.demo import (
|
||||||
DEMO_CHECKLIST,
|
DEMO_CHECKLIST,
|
||||||
|
|||||||
@@ -8,9 +8,7 @@ from unittest.mock import patch
|
|||||||
|
|
||||||
from cross_eval.config import BUILTIN_AGENTS
|
from cross_eval.config import BUILTIN_AGENTS
|
||||||
from cross_eval.models import (
|
from cross_eval.models import (
|
||||||
AgentConfig,
|
|
||||||
AgentResult,
|
AgentResult,
|
||||||
PhaseConfig,
|
|
||||||
PipelineConfig,
|
PipelineConfig,
|
||||||
StepConfig,
|
StepConfig,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -390,6 +390,7 @@ class TranscriptSavingRegressionTest(unittest.TestCase):
|
|||||||
# Verify transcript files were saved
|
# Verify transcript files were saved
|
||||||
run_dir = result.run_dir
|
run_dir = result.run_dir
|
||||||
self.assertIsNotNone(run_dir)
|
self.assertIsNotNone(run_dir)
|
||||||
|
assert run_dir is not None
|
||||||
coding_transcript = run_dir / "v1" / "coding_transcript.md"
|
coding_transcript = run_dir / "v1" / "coding_transcript.md"
|
||||||
review_transcript = run_dir / "v1" / "review_transcript.md"
|
review_transcript = run_dir / "v1" / "review_transcript.md"
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
|
|||||||
831
tests/test_runtime_misc.py
Normal file
831
tests/test_runtime_misc.py
Normal file
@@ -0,0 +1,831 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
from cross_eval.agent import (
|
||||||
|
AgentInvocationError,
|
||||||
|
_build_transcript,
|
||||||
|
_classify_agent_failure,
|
||||||
|
invoke_agent,
|
||||||
|
invoke_agent_agentic,
|
||||||
|
)
|
||||||
|
from cross_eval.models import AgentConfig, AgentResult, ExecutionConfig, PipelineConfig, StepConfig
|
||||||
|
from cross_eval.pipeline import (
|
||||||
|
_commit_iteration,
|
||||||
|
_execute_parallel_batch,
|
||||||
|
_execute_step,
|
||||||
|
_finalize_worktree,
|
||||||
|
_format_runtime_error_markdown,
|
||||||
|
_maybe_save_step_transcript,
|
||||||
|
_snapshot_repo_state,
|
||||||
|
)
|
||||||
|
from cross_eval.runtime_env import (
|
||||||
|
build_execution_policy,
|
||||||
|
parse_dotenv,
|
||||||
|
resolve_env_files,
|
||||||
|
summarize_environment,
|
||||||
|
)
|
||||||
|
from cross_eval.worktree import WorktreeError, create_worktree, remove_worktree
|
||||||
|
|
||||||
|
|
||||||
|
def _init_git_repo(path: Path) -> None:
    """Turn *path* into a git repository holding a single commit with ``README.md``.

    Every git command runs with ``check=True``, so a failing command raises
    ``subprocess.CalledProcessError``.
    """

    def _git(*argv: str) -> None:
        # Run one git subcommand inside the repository, capturing its output.
        subprocess.run(["git", *argv], cwd=path, capture_output=True, check=True)

    _git("init")
    # Set an identity locally so the commit does not depend on global git config.
    _git("config", "user.email", "test@test.com")
    _git("config", "user.name", "Test")

    readme = path / "README.md"
    readme.write_text("# init\n", encoding="utf-8")

    _git("add", ".")
    _git("commit", "-m", "initial")
|
||||||
|
|
||||||
|
|
||||||
|
class TestInvokeAgentRuntime(unittest.TestCase):
    """Exercise ``invoke_agent`` (and two helpers) with ``subprocess.run`` mocked."""

    @staticmethod
    def _completed(returncode: int = 0, stdout: str = "", stderr: str = "") -> MagicMock:
        """Return a mock shaped like a finished ``subprocess.run`` result."""
        return MagicMock(returncode=returncode, stdout=stdout, stderr=stderr)

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_claude_reads_output_file(self, mock_run: MagicMock) -> None:
        """Text written to the file named in the last argument becomes the result output."""

        def write_then_exit(cmd: list[str], **kwargs: object) -> MagicMock:
            # The final command argument tells the agent where to write its output.
            located = re.search(r"Write your complete output to (.+)\.$", cmd[-1])
            self.assertIsNotNone(located)
            assert located is not None
            target = Path(located.group(1))
            target.write_text("review result", encoding="utf-8")
            return self._completed()

        mock_run.side_effect = write_then_exit
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["--model", "opus"],
            system_prompt="system",
        )

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "review result")
        argv = mock_run.call_args[0][0]
        self.assertIn("--system-prompt", argv)

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_claude_falls_back_to_stdout(self, mock_run: MagicMock) -> None:
        """When no output file is written, captured stdout is used as the output."""
        mock_run.return_value = self._completed(stdout="stdout fallback")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["--model", "opus"])

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "stdout fallback")

    @patch("cross_eval.agent.subprocess.run")
    def test_non_claude_wraps_system_prompt_in_stdin(self, mock_run: MagicMock) -> None:
        """For a non-claude command the system prompt is wrapped in ``<system>`` on stdin."""
        mock_run.return_value = self._completed(stdout="ok")
        reviewer = AgentConfig(
            name="custom-reviewer",
            command="custom-cli",
            args=["run"],
            system_prompt="strict mode",
        )

        invoke_agent(reviewer, "check things", "review", quiet=True)

        stdin_payload = mock_run.call_args.kwargs["input"]
        self.assertEqual(
            stdin_payload,
            "<system>\nstrict mode\n</system>\n\ncheck things",
        )

    @patch("cross_eval.agent.subprocess.run")
    def test_failure_raises_structured_error(self, mock_run: MagicMock) -> None:
        """A non-zero exit raises ``AgentInvocationError`` carrying type and raw stderr."""
        mock_run.return_value = self._completed(returncode=1, stderr="API Error: backend down")
        reviewer = AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"])

        with self.assertRaises(AgentInvocationError) as caught:
            invoke_agent(reviewer, "check", "review", quiet=True)

        error = caught.exception
        self.assertEqual(error.failure_type, "API_ERROR")
        self.assertIn("backend down", error.raw_error)

    def test_classify_unknown_failure(self) -> None:
        """Unrecognised error text is classified as ``UNKNOWN`` with an 'Inspect' hint."""
        kind, advice = _classify_agent_failure("weird crash")

        self.assertEqual(kind, "UNKNOWN")
        self.assertIn("Inspect", advice)

    def test_build_transcript_includes_cwd_and_duration(self) -> None:
        """The transcript has a working-directory section and a duration heading."""
        rendered = _build_transcript(
            command_preview="claude -p",
            stdout="ok",
            stderr="",
            exit_code=0,
            duration_seconds=1.2,
            cwd="/tmp/repo",
        )

        for heading in ("## Working Directory", "## Duration: 1.2s"):
            self.assertIn(heading, rendered)

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_timeout_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
        """A timeout propagates to the caller and the spinner is stopped exactly once."""
        mock_run.side_effect = subprocess.TimeoutExpired(cmd=["claude"], timeout=12)
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])
        spinner_instance = mock_spinner.return_value

        with self.assertRaises(subprocess.TimeoutExpired):
            invoke_agent(reviewer, "inspect code", "review", quiet=False, timeout=12)

        spinner_instance.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_generic_exception_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
        """An ``OSError`` from the subprocess propagates and the spinner is stopped once."""
        mock_run.side_effect = OSError("boom")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])
        spinner_instance = mock_spinner.return_value

        with self.assertRaises(OSError):
            invoke_agent(reviewer, "inspect code", "review", quiet=False)

        spinner_instance.stop.assert_called_once()

    @patch("cross_eval.agent.logger.warning")
    @patch("cross_eval.agent.subprocess.run")
    def test_empty_output_logs_warning(self, mock_run: MagicMock, mock_warning: MagicMock) -> None:
        """A successful run with empty stdout yields empty output and one warning."""
        mock_run.return_value = self._completed()
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "")
        mock_warning.assert_called_once()

    @patch("cross_eval.agent.subprocess.run")
    def test_print_mode_claude_uses_native_system_prompt_flag(self, mock_run: MagicMock) -> None:
        """With ``-p``, claude gets ``--system-prompt`` and stdin holds only the prompt."""
        mock_run.return_value = self._completed(stdout="ok")
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["-p"],
            system_prompt="be strict",
        )

        invoke_agent(reviewer, "review this", "review", quiet=True)

        last_call = mock_run.call_args
        self.assertIn("--system-prompt", last_call[0][0])
        self.assertEqual(last_call.kwargs["input"], "review this")

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_failure_truncates_error_and_removes_output_file(
        self,
        mock_run: MagicMock,
    ) -> None:
        """On failure the raw error is 503 characters long and the output file is gone."""
        announced_path: Path | None = None

        def fail_with_long_stderr(cmd: list[str], **kwargs: object) -> MagicMock:
            # Remember which output file the command asked for, then fail noisily.
            nonlocal announced_path
            located = re.search(r"Write your complete output to (.+)\.$", cmd[-1])
            self.assertIsNotNone(located)
            assert located is not None
            announced_path = Path(located.group(1))
            return self._completed(returncode=1, stderr="x" * 600)

        mock_run.side_effect = fail_with_long_stderr
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["--model", "opus"])

        with self.assertRaises(AgentInvocationError) as caught:
            invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(len(caught.exception.raw_error), 503)
        self.assertIsNotNone(announced_path)
        assert announced_path is not None
        self.assertFalse(announced_path.exists())

    @patch("cross_eval.agent.logger.warning")
    @patch("cross_eval.agent.subprocess.run")
    def test_empty_output_with_stderr_logs_stderr_warning(
        self,
        mock_run: MagicMock,
        mock_warning: MagicMock,
    ) -> None:
        """When stdout is empty but stderr is not, the warning message mentions ``stderr:``."""
        mock_run.return_value = self._completed(stderr="stderr text")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        invoke_agent(reviewer, "inspect code", "review", quiet=True)

        warning_message = mock_warning.call_args[0][0]
        self.assertIn("stderr:", warning_message)
|
||||||
|
|
||||||
|
|
||||||
|
class TestInvokeAgenticRuntime(unittest.TestCase):
    """Exercise ``invoke_agent_agentic`` with the agent subprocess mocked.

    Tests that patch ``cross_eval.agent.subprocess.run`` via a decorator also
    replace the ``subprocess.run`` used by ``_init_git_repo``, so in those tests
    the last recorded call (``mock_run.call_args``) is the agent invocation.
    """

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_codex_agentic_adds_reasoning_and_system_wrapper(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        """Codex gets a ``-c`` override, a trailing ``-`` and a ``<system>`` stdin wrapper."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        agent = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto"],
            system_prompt="strict mode",
            reasoning_effort="high",
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=True)

        called_cmd = mock_run.call_args[0][0]
        self.assertIn("-c", called_cmd)
        self.assertEqual(called_cmd[-1], "-")
        self.assertIn("<system>", mock_run.call_args.kwargs["input"])

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_claude_success_uses_system_prompt_and_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """Print-mode flags are stripped, the system prompt flag is used, the diff is returned."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        agent = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["-p", "--print"],
            system_prompt="stay in scope",
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            result = invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False)

        called_cmd = mock_run.call_args[0][0]
        # Both spellings of the print flag are configured above, so check both;
        # asserting only "-p" would let a regression on "--print" slip through.
        self.assertNotIn("-p", called_cmd)
        self.assertNotIn("--print", called_cmd)
        self.assertIn("--system-prompt", called_cmd)
        self.assertEqual(result.output, "diff --git a/file ...")
        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    def test_agentic_timeout_stops_spinner(self, mock_spinner: MagicMock) -> None:
        """A timeout propagates to the caller and the spinner is stopped exactly once."""
        spinner = mock_spinner.return_value
        agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)
        timeout_error = subprocess.TimeoutExpired(cmd=["codex"], timeout=20)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            # The repo is created before patching, so real git commands run here.
            _init_git_repo(repo)
            with patch("cross_eval.agent.subprocess.run", side_effect=timeout_error):
                with self.assertRaises(subprocess.TimeoutExpired):
                    invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False, timeout=20)

        spinner.stop.assert_called_once()

    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_nonzero_exit_raises_structured_error(self, mock_run: MagicMock) -> None:
        """A non-zero exit with ``unauthorized`` on stderr is reported as ``AUTH``."""
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="unauthorized")
        agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=True)

        self.assertEqual(ctx.exception.failure_type, "AUTH")

    @patch("cross_eval.agent._Spinner")
    def test_agentic_generic_exception_stops_spinner(
        self,
        mock_spinner: MagicMock,
    ) -> None:
        """An ``OSError`` from the subprocess propagates and the spinner is stopped once."""
        agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            # The repo is created before patching, so real git commands run here.
            _init_git_repo(repo)
            with patch("cross_eval.agent.subprocess.run", side_effect=OSError("boom")):
                with self.assertRaises(OSError):
                    invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False)

        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_failure_truncates_error(
        self,
        mock_run: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """600 characters of stderr surface as a 503-character ``raw_error``."""
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="x" * 600)
        agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False)

        self.assertEqual(len(ctx.exception.raw_error), 503)
        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_empty_diff_failure_truncates_error_and_stops_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """Exit 0 with an empty diff is a ``WRITE_FAILURE`` whose error text is bounded."""
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="implemented",
            stderr="permission denied " * 300,
        )
        agent = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(agent, "fix bug", "coding", repo, quiet=False)

        self.assertLessEqual(len(ctx.exception.raw_error), 2003)
        self.assertEqual(ctx.exception.failure_type, "WRITE_FAILURE")
        mock_spinner.return_value.stop.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
class TestPipelineHelpers(unittest.TestCase):
|
||||||
|
@patch("cross_eval.worktree.commit_worktree", return_value=True)
def test_commit_iteration_logs_only_when_committed(self, mock_commit: MagicMock) -> None:
    """``_commit_iteration`` calls the patched ``commit_worktree`` exactly once.

    Only the commit call is asserted here; no log output is inspected.
    """
    with tempfile.TemporaryDirectory() as scratch:
        worktree_dir = Path(scratch)
        _commit_iteration(worktree_dir, "review-fix", 2, "PASS")

    mock_commit.assert_called_once()
|
||||||
|
|
||||||
|
def test_snapshot_repo_state_includes_untracked_digest(self) -> None:
    """An untracked file is listed in the snapshot as ``UNTRACKED <name>``."""
    with tempfile.TemporaryDirectory() as scratch:
        repo = Path(scratch)
        _init_git_repo(repo)
        # Created after the initial commit, so git sees it as untracked.
        untracked = repo / "scratch.txt"
        untracked.write_text("draft", encoding="utf-8")

        snapshot = _snapshot_repo_state(repo)

    self.assertIn("UNTRACKED scratch.txt", snapshot)
|
||||||
|
|
||||||
|
def test_finalize_worktree_deletes_empty_branch(self) -> None:
    """Finalizing a worktree whose branch adds no commits returns None and removes the branch."""
    branch = "cross-eval/empty"

    with tempfile.TemporaryDirectory() as scratch:
        root = Path(scratch)
        base = root / "repo"
        base.mkdir()
        _init_git_repo(base)
        worktree = root / "wt"

        # Branch off HEAD, then attach a worktree to it without adding commits.
        prepare_cmds = (
            ["git", "branch", branch, "HEAD"],
            ["git", "worktree", "add", str(worktree), branch],
        )
        for git_cmd in prepare_cmds:
            subprocess.run(git_cmd, cwd=base, capture_output=True, check=True)

        branch_result = _finalize_worktree(base, worktree, branch, "review-fix", "PASS")

        self.assertIsNone(branch_result)
        listing = subprocess.run(
            ["git", "branch", "--list", branch],
            cwd=base,
            capture_output=True,
            text=True,
            check=True,
        )
        # An empty listing means the branch no longer exists in the base repo.
        self.assertEqual(listing.stdout.strip(), "")
|
||||||
|
|
||||||
|
def test_format_runtime_error_markdown_for_generic_exception(self) -> None:
    """A plain exception renders with the error heading, the phase name and its message."""
    failure = RuntimeError("boom")

    rendered = _format_runtime_error_markdown(
        failure,
        step_name="review",
        agent_name="claude-reviewer",
        phase_name="review_fix",
    )

    for fragment in ("# Agent Error", "review_fix", "boom"):
        self.assertIn(fragment, rendered)
|
||||||
|
|
||||||
|
def test_maybe_save_step_transcript_returns_none_without_transcript(self) -> None:
    """A result constructed without a transcript yields ``None`` from the save helper."""
    bare_result = AgentResult(
        output="ok",
        exit_code=0,
        agent_name="claude-reviewer",
        step_name="review",
        duration_seconds=0.1,
    )

    with tempfile.TemporaryDirectory() as scratch:
        saved = _maybe_save_step_transcript(Path(scratch), 1, "review", bare_result)

    self.assertIsNone(saved)
|
||||||
|
|
||||||
|
@patch("cross_eval.pipeline.invoke_agent")
def test_execute_step_saves_timeout_markdown(self, mock_invoke: MagicMock) -> None:
    """An agent timeout surfaces as ``RuntimeError`` and writes ``v1/review_error.md``."""
    mock_invoke.side_effect = subprocess.TimeoutExpired(
        cmd=["claude"],
        timeout=45,
        output="partial output",
        stderr="still running",
    )
    reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])
    config = PipelineConfig(agents={"claude-reviewer": reviewer})
    step = StepConfig(
        name="review",
        agent="claude-reviewer",
        role="review",
        prompt_template="default:review",
        output_key="review_output",
    )
    inputs = {"plan": "Plan", "checklist": "Checklist"}
    step_outputs: dict[str, str] = {}
    step_results: dict[str, AgentResult] = {}

    with tempfile.TemporaryDirectory() as scratch:
        run_dir = Path(scratch)
        with self.assertRaises(RuntimeError) as caught:
            _execute_step(
                step,
                config,
                inputs,
                "",
                1,
                3,
                run_dir,
                45,
                False,
                step_outputs,
                step_results,
                run_dir=run_dir,
                output_iter=1,
            )

        self.assertIn("timed out after 45s", str(caught.exception))
        # The error report must be read before the temporary directory is removed.
        error_path = run_dir / "v1" / "review_error.md"
        self.assertTrue(error_path.exists())
        report_text = error_path.read_text(encoding="utf-8")
        self.assertIn("# Agent Timeout", report_text)
|
||||||
|
|
||||||
|
@patch("cross_eval.pipeline.invoke_agent")
def test_execute_step_saves_runtime_error_markdown(self, mock_invoke: MagicMock) -> None:
    """An ``AgentInvocationError`` is re-raised and its details land in ``v1/review_error.md``."""
    mock_invoke.side_effect = AgentInvocationError(
        agent_name="claude-reviewer",
        step_name="review",
        cmd_preview="claude -p",
        raw_error="api broke",
        failure_type="API_ERROR",
        suggested_action="retry",
    )
    reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])
    config = PipelineConfig(agents={"claude-reviewer": reviewer})
    step = StepConfig(
        name="review",
        agent="claude-reviewer",
        role="review",
        prompt_template="default:review",
        output_key="review_output",
    )
    inputs = {"plan": "Plan", "checklist": "Checklist"}

    with tempfile.TemporaryDirectory() as scratch:
        run_dir = Path(scratch)
        with self.assertRaises(AgentInvocationError):
            _execute_step(
                step,
                config,
                inputs,
                "",
                1,
                3,
                run_dir,
                45,
                False,
                {},
                {},
                run_dir=run_dir,
                output_iter=1,
            )

        # Read the report while the temporary directory still exists.
        error_text = (run_dir / "v1" / "review_error.md").read_text(encoding="utf-8")

    for fragment in ("API_ERROR", "retry"):
        self.assertIn(fragment, error_text)
|
||||||
|
|
||||||
|
@patch("cross_eval.pipeline.invoke_agent")
def test_execute_parallel_batch_saves_success_and_timeout_error(self, mock_invoke: MagicMock) -> None:
    """A mixed batch keeps the successful output and records the timeout.

    ``review_ok`` returns a passing result while every other step raises
    ``subprocess.TimeoutExpired``. The batch is expected to raise
    ``RuntimeError`` naming the saved step, to store the successful output in
    ``step_outputs``, and to write both ``review_ok.md`` and
    ``review_slow_error.md`` under ``v1``.
    """

    def _fake_invoke(agent_config: AgentConfig, prompt: str, step_name: str, **kwargs: object) -> AgentResult:
        # Anything other than the designated successful step times out.
        if step_name != "review_ok":
            raise subprocess.TimeoutExpired(
                cmd=["codex"],
                timeout=30,
                output="halfway",
                stderr="timeout stderr",
            )
        return AgentResult(
            output="VERDICT: PASS",
            exit_code=0,
            agent_name=agent_config.name,
            step_name=step_name,
            duration_seconds=0.1,
        )

    def _review_step(name: str, agent: str) -> StepConfig:
        # Both batch members differ only by name/output key and agent.
        return StepConfig(
            name=name,
            agent=agent,
            role="review",
            prompt_template="default:review",
            output_key=name,
            parallel=True,
        )

    mock_invoke.side_effect = _fake_invoke
    batch = [
        _review_step("review_ok", "claude-reviewer"),
        _review_step("review_slow", "codex-reviewer"),
    ]
    config = PipelineConfig(
        agents={
            "claude-reviewer": AgentConfig(name="claude-reviewer", command="claude", args=["-p"]),
            "codex-reviewer": AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"]),
        },
    )
    step_outputs: dict[str, str] = {}
    step_results: dict[str, AgentResult] = {}
    inputs = {"plan": "Plan", "checklist": "Checklist"}

    with tempfile.TemporaryDirectory() as tmpdir:
        run_dir = Path(tmpdir)
        with self.assertRaises(RuntimeError) as ctx:
            _execute_parallel_batch(
                batch,
                config,
                inputs,
                "",
                1,
                3,
                run_dir,
                30,
                False,
                step_outputs,
                step_results,
                run_dir=run_dir,
                output_iter=1,
            )

        # File checks must happen before the temporary directory is removed.
        version_dir = run_dir / "v1"
        self.assertIn("Successful outputs were saved for: review_ok", str(ctx.exception))
        self.assertEqual(step_outputs["review_ok"], "VERDICT: PASS")
        self.assertTrue((version_dir / "review_ok.md").exists())
        self.assertTrue((version_dir / "review_slow_error.md").exists())
|
||||||
|
|
||||||
|
@patch("cross_eval.pipeline._execute_step")
def test_execute_parallel_batch_dry_run_uses_sequential_path(self, mock_step: MagicMock) -> None:
    """With the dry-run flag set, ``_execute_step`` is called once per batch step."""

    def _review_step(name: str, agent: str) -> StepConfig:
        # The two steps share everything except name/output key and agent.
        return StepConfig(
            name=name,
            agent=agent,
            role="review",
            prompt_template="default:review",
            output_key=name,
            parallel=True,
        )

    batch = [
        _review_step("review_a", "claude-reviewer"),
        _review_step("review_b", "codex-reviewer"),
    ]
    config = PipelineConfig(agents={})

    with tempfile.TemporaryDirectory() as tmpdir:
        _execute_parallel_batch(
            batch,
            config,
            {"plan": "Plan"},
            "",
            1,
            3,
            Path(tmpdir),
            None,
            True,
            {},
            {},
            run_dir=Path(tmpdir),
            output_iter=1,
        )

        self.assertEqual(mock_step.call_count, len(batch))
|
||||||
|
|
||||||
|
@patch("cross_eval.pipeline._execute_step")
def test_execute_parallel_batch_agentic_steps_fall_back_to_sequential(self, mock_step: MagicMock) -> None:
    """A batch of agentic agents with a worktree calls ``_execute_step`` per step."""

    def _review_step(name: str, agent: str) -> StepConfig:
        # The two steps share everything except name/output key and agent.
        return StepConfig(
            name=name,
            agent=agent,
            role="review",
            prompt_template="default:review",
            output_key=name,
            parallel=True,
        )

    batch = [
        _review_step("review_a", "agentic-a"),
        _review_step("review_b", "agentic-b"),
    ]
    agents = {
        "agentic-a": AgentConfig(name="agentic-a", command="claude", agentic=True),
        "agentic-b": AgentConfig(name="agentic-b", command="codex", agentic=True),
    }
    config = PipelineConfig(agents=agents)

    with tempfile.TemporaryDirectory() as tmpdir:
        _execute_parallel_batch(
            batch,
            config,
            {"plan": "Plan"},
            "",
            1,
            3,
            Path(tmpdir),
            None,
            False,
            {},
            {},
            run_dir=Path(tmpdir),
            output_iter=1,
            worktree_path=Path(tmpdir),
        )

        self.assertEqual(mock_step.call_count, len(batch))
|
||||||
|
|
||||||
|
@patch("cross_eval.worktree.remove_worktree", side_effect=RuntimeError("cleanup failed"))
@patch("cross_eval.worktree.commit_worktree", side_effect=RuntimeError("commit failed"))
def test_finalize_worktree_handles_cleanup_failures(
    self,
    mock_commit: MagicMock,
    mock_remove: MagicMock,
) -> None:
    """``_finalize_worktree`` returns ``None`` when commit and removal both raise.

    Both worktree helpers are patched to raise ``RuntimeError``; the test only
    checks that the call still returns and that its result is ``None``.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        repo_dir = Path(tmpdir)
        worktree_dir = Path(tmpdir) / "wt"
        branch = _finalize_worktree(
            repo_dir,
            worktree_dir,
            "cross-eval/fail",
            "review-fix",
            "FAIL",
        )

    self.assertIsNone(branch)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRuntimeEnvironmentHelpers(unittest.TestCase):
    """Tests for dotenv parsing, env-file resolution and execution-policy text."""

    def test_parse_dotenv_handles_export_and_quotes(self) -> None:
        """``export`` prefixes and quotes are stripped and escapes are decoded.

        Also checks that a line without ``=`` does not show up as a key.
        """
        dotenv_body = "export FOO='bar'\nBAR=\"line\\nvalue\"\nINVALID\n=skip\n"

        with tempfile.TemporaryDirectory() as tmpdir:
            env_path = Path(tmpdir) / ".env"
            env_path.write_text(dotenv_body, encoding="utf-8")
            values = parse_dotenv(env_path)

            self.assertEqual(values["FOO"], "bar")
            self.assertEqual(values["BAR"], "line\nvalue")
            self.assertNotIn("INVALID", values)

    def test_resolve_env_files_deduplicates_and_filters_missing(self) -> None:
        """One existing ``.env`` listed several ways resolves to a single path.

        ``.env.local`` is never created, so it must not be part of the result.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            root = Path(tmpdir)
            env_path = root / ".env"
            env_path.write_text("FOO=bar\n", encoding="utf-8")

            # The same file is named relatively, absolutely, and via auto-discovery.
            explicit_files = [".env", str(env_path)]
            auto_files = [".env", ".env.local"]
            execution = ExecutionConfig(
                env_files=explicit_files,
                auto_env_files=auto_files,
            )

            resolved = resolve_env_files(execution, root)

            expected = [env_path.resolve()]
            self.assertEqual(resolved, expected)

    def test_summarize_environment_hides_names_when_disabled(self) -> None:
        """With ``expose_env_names=False`` the summary says names are hidden.

        The user-supplied execution target must still be mentioned.
        """
        execution = ExecutionConfig(expose_env_names=False, auto_context_targets=["postgres"])
        loaded_values = {"DATABASE_URL": "postgres://localhost"}

        summary = summarize_environment(execution, [], loaded_values, {})

        for expected_text in (
            "names are hidden",
            "Execution targets hinted by the user: postgres",
        ):
            self.assertIn(expected_text, summary)

    def test_build_execution_policy_for_minimal_mode(self) -> None:
        """The ``minimal`` command policy is named and explained in the policy text."""
        execution = ExecutionConfig(mode="agent-decides", command_policy="minimal")

        policy = build_execution_policy(execution)

        for expected_text in (
            "Command policy: minimal",
            "Keep command usage minimal",
        ):
            self.assertIn(expected_text, policy)
|
||||||
|
|
||||||
|
|
||||||
|
class TestWorktreeFailures(unittest.TestCase):
    """Failure-path tests for ``create_worktree`` and ``remove_worktree``.

    ``subprocess.run`` inside ``cross_eval.worktree`` is patched in every test,
    so no real git command is executed.
    """

    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_raises_when_branch_creation_fails(self, mock_run: MagicMock) -> None:
        """A failing first subprocess call surfaces as ``WorktreeError``."""
        branch_failure = subprocess.CalledProcessError(
            1,
            ["git", "branch"],
            stderr="branch failed",
        )
        mock_run.side_effect = branch_failure

        with tempfile.TemporaryDirectory() as tmpdir:
            base = Path(tmpdir)
            work_dir = base / "wt"
            with self.assertRaises(WorktreeError) as ctx:
                create_worktree(base, work_dir, "cross-eval/fail")

        message = str(ctx.exception)
        self.assertIn("Failed to create branch", message)

    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_cleans_branch_on_worktree_failure(self, mock_run: MagicMock) -> None:
        """When the second subprocess call fails, the last call deletes the branch."""
        worktree_failure = subprocess.CalledProcessError(
            1,
            ["git", "worktree", "add"],
            stderr="worktree failed",
        )
        # Call order: first call succeeds, second raises, third succeeds.
        mock_run.side_effect = [
            MagicMock(returncode=0),
            worktree_failure,
            MagicMock(returncode=0),
        ]

        with tempfile.TemporaryDirectory() as tmpdir:
            base = Path(tmpdir)
            work_dir = base / "wt"
            with self.assertRaises(WorktreeError):
                create_worktree(base, work_dir, "cross-eval/fail")

        last_command = mock_run.call_args_list[-1].args[0]
        self.assertEqual(last_command[:3], ["git", "branch", "-D"])

    @patch("cross_eval.worktree.shutil.rmtree")
    @patch("cross_eval.worktree.subprocess.run")
    def test_remove_worktree_falls_back_to_prune(self, mock_run: MagicMock, mock_rmtree: MagicMock) -> None:
        """A failed first subprocess call leads to ``rmtree`` plus ``git worktree prune``."""
        remove_failure = subprocess.CalledProcessError(1, ["git", "worktree", "remove"])
        mock_run.side_effect = [remove_failure, MagicMock(returncode=0)]

        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_root = Path(tmpdir)
            base = tmp_root / "repo"
            work_dir = tmp_root / "wt"
            base.mkdir()
            work_dir.mkdir()

            remove_worktree(base, work_dir)

            # rmtree is mocked, so the directory still exists when resolved here.
            resolved = work_dir.resolve()
            mock_rmtree.assert_any_call(resolved, ignore_errors=True)
            final_command = mock_run.call_args_list[-1].args[0]
            self.assertEqual(final_command, ["git", "worktree", "prune"])
|
||||||
Reference in New Issue
Block a user