# Source listing metadata: 989 lines, 37 KiB, Python.
from __future__ import annotations
|
|
|
|
import re
|
|
import subprocess
|
|
import tempfile
|
|
import unittest
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
from cross_eval.agent import (
|
|
AgentInvocationError,
|
|
_build_transcript,
|
|
_classify_agent_failure,
|
|
invoke_agent,
|
|
invoke_agent_agentic,
|
|
)
|
|
from cross_eval.models import AgentConfig, AgentResult, ExecutionConfig, PipelineConfig, StepConfig
|
|
from cross_eval.pipeline import (
|
|
_apply_worktree_inputs_to_base,
|
|
_commit_base_repo_paths,
|
|
_copy_inputs_to_worktree,
|
|
_commit_iteration,
|
|
_execute_parallel_batch,
|
|
_execute_step,
|
|
_finalize_worktree,
|
|
_format_runtime_error_markdown,
|
|
_load_inputs,
|
|
_maybe_save_step_transcript,
|
|
_refresh_inputs,
|
|
_snapshot_repo_state,
|
|
)
|
|
from cross_eval.runtime_env import (
|
|
build_execution_policy,
|
|
parse_dotenv,
|
|
resolve_env_files,
|
|
summarize_environment,
|
|
)
|
|
from cross_eval.worktree import WorktreeError, create_worktree, remove_worktree
|
|
|
|
|
|
def _init_git_repo(path: Path) -> None:
    """Turn *path* into a git repository holding a single initial commit.

    Runs ``git init``, sets a repository-local test identity, writes
    ``README.md`` and commits it with the message ``initial``.

    Raises:
        subprocess.CalledProcessError: if any git command exits non-zero
            (every call is made with ``check=True``).
    """

    def _git(*args: str) -> None:
        # ``subprocess.run`` is looked up at call time, so a test that patches
        # it while this helper runs intercepts these calls as well.
        subprocess.run(["git", *args], cwd=path, capture_output=True, check=True)

    _git("init")
    _git("config", "user.email", "test@test.com")
    _git("config", "user.name", "Test")
    (path / "README.md").write_text("# init\n", encoding="utf-8")
    _git("add", ".")
    _git("commit", "-m", "initial")
|
|
|
|
|
|
class TestInvokeAgentRuntime(unittest.TestCase):
    """Exercise ``invoke_agent`` with ``subprocess.run`` replaced by a mock."""

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_claude_reads_output_file(self, mock_run: MagicMock) -> None:
        """Text written to the path named in the last argument becomes the output."""

        def _fake_run(cmd: list[str], **kwargs: object) -> MagicMock:
            # Pull the output path out of the trailing instruction and write
            # the expected text there; stdout is left empty on purpose.
            found = re.search(r"Write your complete output to (.+)\.$", cmd[-1])
            self.assertIsNotNone(found)
            assert found is not None
            destination = Path(found.group(1))
            destination.write_text("review result", encoding="utf-8")
            return MagicMock(returncode=0, stdout="", stderr="")

        mock_run.side_effect = _fake_run
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["--model", "opus"],
            system_prompt="system",
        )

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "review result")
        argv = mock_run.call_args[0][0]
        self.assertIn("--system-prompt", argv)

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_claude_falls_back_to_stdout(self, mock_run: MagicMock) -> None:
        """When the fake process only produces stdout, that stdout is the output."""
        mock_run.return_value = MagicMock(returncode=0, stdout="stdout fallback", stderr="")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["--model", "opus"])

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "stdout fallback")

    @patch("cross_eval.agent.subprocess.run")
    def test_non_claude_wraps_system_prompt_in_stdin(self, mock_run: MagicMock) -> None:
        """For a non-claude command the stdin input carries a ``<system>`` wrapper."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        reviewer = AgentConfig(
            name="custom-reviewer",
            command="custom-cli",
            args=["run"],
            system_prompt="strict mode",
        )

        invoke_agent(reviewer, "check things", "review", quiet=True)

        stdin_text = mock_run.call_args.kwargs["input"]
        self.assertEqual(stdin_text, "<system>\nstrict mode\n</system>\n\ncheck things")

    @patch("cross_eval.agent.subprocess.run")
    def test_failure_raises_structured_error(self, mock_run: MagicMock) -> None:
        """A non-zero exit raises ``AgentInvocationError`` classified as ``API_ERROR``."""
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="API Error: backend down")
        reviewer = AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"])

        with self.assertRaises(AgentInvocationError) as ctx:
            invoke_agent(reviewer, "check", "review", quiet=True)

        raised = ctx.exception
        self.assertEqual(raised.failure_type, "API_ERROR")
        self.assertIn("backend down", raised.raw_error)
|
|
|
|
|
|
class TestWorktreeInputMapping(unittest.TestCase):
    """Worktree input-mapping tests, plus further ``invoke_agent`` checks.

    The first three tests build a real git repository and worktree in a
    temporary directory; the remaining tests mock ``subprocess.run``.
    """

    def test_repo_local_plan_input_maps_to_tracked_worktree_path(self) -> None:
        """A committed ``plan.md`` input is remapped to the same file in the worktree."""
        with tempfile.TemporaryDirectory() as tmpdir:
            scratch = Path(tmpdir)
            repo = scratch / "repo"
            repo.mkdir()
            _init_git_repo(repo)
            plan_file = repo / "plan.md"
            plan_file.write_text("plan v1\n", encoding="utf-8")
            subprocess.run(["git", "add", "plan.md"], cwd=repo, capture_output=True, check=True)
            subprocess.run(
                ["git", "commit", "-m", "add plan"], cwd=repo, capture_output=True, check=True
            )

            branch = "cross-eval/test-plan-review"
            worktree_path, _ = create_worktree(repo, scratch / "wt", branch)
            try:
                config = PipelineConfig(inputs={"plan": plan_file}, preset_name="plan-review")

                _copy_inputs_to_worktree(config, worktree_path, base_cwd=repo)

                self.assertEqual(config.inputs["plan"], worktree_path / "plan.md")
            finally:
                remove_worktree(base_cwd=repo, work_dir=worktree_path)
                # Best-effort branch cleanup: no check=True, failures are ignored.
                subprocess.run(["git", "branch", "-D", branch], cwd=repo, capture_output=True)

    def test_plan_review_docs_ref_maps_to_worktree_and_refreshes_docs(self) -> None:
        """``docs_ref`` is remapped into the worktree and a refresh picks up edits there."""
        with tempfile.TemporaryDirectory() as tmpdir:
            scratch = Path(tmpdir)
            repo = scratch / "repo"
            repo.mkdir()
            _init_git_repo(repo)
            docs_dir = repo / "plans"
            docs_dir.mkdir()
            (docs_dir / "A.md").write_text("A v1\n", encoding="utf-8")
            subprocess.run(["git", "add", "."], cwd=repo, capture_output=True, check=True)
            subprocess.run(
                ["git", "commit", "-m", "add docs"], cwd=repo, capture_output=True, check=True
            )

            config = PipelineConfig(
                inputs={"docs": "stale snapshot", "docs_ref": docs_dir},
                preset_name="plan-review",
            )
            input_contents = _load_inputs(config)
            self.assertIn("A.md", input_contents["docs"])

            branch = "cross-eval/test-docs-ref"
            worktree_path, _ = create_worktree(repo, scratch / "wt", branch)
            try:
                _copy_inputs_to_worktree(config, worktree_path, base_cwd=repo)
                self.assertEqual(config.inputs["docs_ref"], worktree_path / "plans")

                # Edit the worktree copy, then confirm the refreshed docs show it.
                edited_doc = worktree_path / "plans" / "A.md"
                edited_doc.write_text("A v2\n", encoding="utf-8")
                _refresh_inputs(config, input_contents)
                refreshed_docs = input_contents["docs"]
                self.assertIn("A.md", refreshed_docs)
                self.assertIn("A v2", refreshed_docs)
            finally:
                remove_worktree(base_cwd=repo, work_dir=worktree_path)
                # Best-effort branch cleanup: no check=True, failures are ignored.
                subprocess.run(["git", "branch", "-D", branch], cwd=repo, capture_output=True)

    def test_worktree_doc_changes_apply_back_and_commit_in_base_repo(self) -> None:
        """Worktree edits are copied back to the base repo and committed there."""
        with tempfile.TemporaryDirectory() as tmpdir:
            scratch = Path(tmpdir)
            repo = scratch / "repo"
            repo.mkdir()
            _init_git_repo(repo)
            docs_dir = repo / "plans"
            docs_dir.mkdir()
            doc_path = docs_dir / "A.md"
            doc_path.write_text("A v1\n", encoding="utf-8")
            subprocess.run(["git", "add", "."], cwd=repo, capture_output=True, check=True)
            subprocess.run(
                ["git", "commit", "-m", "add docs"], cwd=repo, capture_output=True, check=True
            )

            config = PipelineConfig(inputs={"docs_ref": docs_dir}, preset_name="plan-review")
            original_inputs = {"docs_ref": docs_dir}

            branch = "cross-eval/test-apply-back"
            commit_subject = "cross-eval: plan-review (FAIL)"
            worktree_path, _ = create_worktree(repo, scratch / "wt", branch)
            try:
                _copy_inputs_to_worktree(config, worktree_path, base_cwd=repo)
                worktree_doc = config.inputs["docs_ref"] / "A.md"
                worktree_doc.write_text("A v2\n", encoding="utf-8")

                restored = _apply_worktree_inputs_to_base(config, original_inputs, cwd=repo)
                self.assertEqual(restored, [docs_dir])
                self.assertEqual(doc_path.read_text(encoding="utf-8"), "A v2\n")

                committed = _commit_base_repo_paths(repo, restored, commit_subject)
                self.assertTrue(committed)

                # The newest commit subject in the base repo must be ours.
                last_commit = subprocess.run(
                    ["git", "log", "-1", "--pretty=%s"],
                    cwd=repo,
                    capture_output=True,
                    text=True,
                    check=True,
                )
                self.assertEqual(last_commit.stdout.strip(), commit_subject)
            finally:
                remove_worktree(base_cwd=repo, work_dir=worktree_path)
                # Best-effort branch cleanup: no check=True, failures are ignored.
                subprocess.run(["git", "branch", "-D", branch], cwd=repo, capture_output=True)

    def test_classify_unknown_failure(self) -> None:
        """An unrecognised message is classified ``UNKNOWN`` with an "Inspect" hint."""
        kind, advice = _classify_agent_failure("weird crash")
        self.assertEqual(kind, "UNKNOWN")
        self.assertIn("Inspect", advice)

    def test_build_transcript_includes_cwd_and_duration(self) -> None:
        """The transcript contains working-directory and duration headings."""
        rendered = _build_transcript(
            command_preview="claude -p",
            stdout="ok",
            stderr="",
            exit_code=0,
            duration_seconds=1.2,
            cwd="/tmp/repo",
        )
        for heading in ("## Working Directory", "## Duration: 1.2s"):
            self.assertIn(heading, rendered)

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_timeout_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
        """``TimeoutExpired`` propagates and the spinner is stopped exactly once."""
        mock_run.side_effect = subprocess.TimeoutExpired(cmd=["claude"], timeout=12)
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        with self.assertRaises(subprocess.TimeoutExpired):
            invoke_agent(reviewer, "inspect code", "review", quiet=False, timeout=12)

        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_generic_exception_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
        """An ``OSError`` from the subprocess propagates and the spinner is stopped once."""
        mock_run.side_effect = OSError("boom")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        with self.assertRaises(OSError):
            invoke_agent(reviewer, "inspect code", "review", quiet=False)

        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent.logger.warning")
    @patch("cross_eval.agent.subprocess.run")
    def test_empty_output_logs_warning(self, mock_run: MagicMock, mock_warning: MagicMock) -> None:
        """A successful run with empty stdout returns "" and logs one warning."""
        mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "")
        mock_warning.assert_called_once()

    @patch("cross_eval.agent.subprocess.run")
    def test_print_mode_claude_uses_native_system_prompt_flag(self, mock_run: MagicMock) -> None:
        """With ``-p`` the command gets ``--system-prompt`` and stdin is the bare prompt."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["-p"],
            system_prompt="be strict",
        )

        invoke_agent(reviewer, "review this", "review", quiet=True)

        last_call = mock_run.call_args
        self.assertIn("--system-prompt", last_call[0][0])
        self.assertEqual(last_call.kwargs["input"], "review this")

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_failure_truncates_error_and_removes_output_file(
        self,
        mock_run: MagicMock,
    ) -> None:
        """600 chars of stderr give a 503-char ``raw_error`` and no leftover output file."""
        seen_output_path: Path | None = None

        def _fake_run(cmd: list[str], **kwargs: object) -> MagicMock:
            # Remember which output path the command asked for, then fail.
            nonlocal seen_output_path
            found = re.search(r"Write your complete output to (.+)\.$", cmd[-1])
            self.assertIsNotNone(found)
            assert found is not None
            seen_output_path = Path(found.group(1))
            return MagicMock(returncode=1, stdout="", stderr="x" * 600)

        mock_run.side_effect = _fake_run
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["--model", "opus"])

        with self.assertRaises(AgentInvocationError) as ctx:
            invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(len(ctx.exception.raw_error), 503)
        self.assertIsNotNone(seen_output_path)
        assert seen_output_path is not None
        self.assertFalse(seen_output_path.exists())

    @patch("cross_eval.agent.logger.warning")
    @patch("cross_eval.agent.subprocess.run")
    def test_empty_output_with_stderr_logs_stderr_warning(
        self,
        mock_run: MagicMock,
        mock_warning: MagicMock,
    ) -> None:
        """With empty stdout and non-empty stderr the warning text mentions ``stderr:``."""
        mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="stderr text")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        invoke_agent(reviewer, "inspect code", "review", quiet=True)

        warning_format = mock_warning.call_args[0][0]
        self.assertIn("stderr:", warning_format)
|
|
|
|
|
|
class TestInvokeAgenticRuntime(unittest.TestCase):
    """Exercise ``invoke_agent_agentic`` with the subprocess, diff and spinner mocked."""

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_codex_agentic_adds_reasoning_and_system_wrapper(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        """The codex command contains ``-c``, ends with ``-`` and stdin has ``<system>``."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        coder = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto"],
            system_prompt="strict mode",
            reasoning_effort="high",
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=True)

        last_call = mock_run.call_args
        argv = last_call[0][0]
        self.assertIn("-c", argv)
        self.assertEqual(argv[-1], "-")
        self.assertIn("<system>", last_call.kwargs["input"])

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_claude_success_uses_system_prompt_and_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """``-p`` is dropped, ``--system-prompt`` is added and the output is the diff."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        coder = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["-p", "--print"],
            system_prompt="stay in scope",
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            result = invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=False)

        argv = mock_run.call_args[0][0]
        self.assertNotIn("-p", argv)
        self.assertIn("--system-prompt", argv)
        self.assertEqual(result.output, "diff --git a/file ...")
        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    def test_agentic_timeout_stops_spinner(self, mock_spinner: MagicMock) -> None:
        """``TimeoutExpired`` propagates and the spinner is stopped exactly once."""
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)
        # The patch is created here but only entered after the repository has
        # been initialised with the real ``subprocess.run``.
        run_patch = patch(
            "cross_eval.agent.subprocess.run",
            side_effect=subprocess.TimeoutExpired(cmd=["codex"], timeout=20),
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with run_patch, self.assertRaises(subprocess.TimeoutExpired):
                invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=False, timeout=20)

        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_nonzero_exit_raises_structured_error(self, mock_run: MagicMock) -> None:
        """Exit code 1 with stderr "unauthorized" is classified as ``AUTH``."""
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="unauthorized")
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=True)

        self.assertEqual(ctx.exception.failure_type, "AUTH")

    @patch("cross_eval.agent._Spinner")
    def test_agentic_generic_exception_stops_spinner(
        self,
        mock_spinner: MagicMock,
    ) -> None:
        """An ``OSError`` from the subprocess propagates and the spinner is stopped once."""
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)
        run_patch = patch("cross_eval.agent.subprocess.run", side_effect=OSError("boom"))

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with run_patch, self.assertRaises(OSError):
                invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=False)

        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_failure_truncates_error(
        self,
        mock_run: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """600 chars of stderr give a 503-char ``raw_error``; the spinner stops once."""
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="x" * 600)
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=False)

        self.assertEqual(len(ctx.exception.raw_error), 503)
        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_empty_diff_failure_truncates_error_and_stops_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """Exit 0 with an empty diff raises ``WRITE_FAILURE`` with a bounded ``raw_error``."""
        noisy_stderr = "permission denied " * 300
        mock_run.return_value = MagicMock(returncode=0, stdout="implemented", stderr=noisy_stderr)
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=False)

        raised = ctx.exception
        self.assertLessEqual(len(raised.raw_error), 2003)
        self.assertEqual(raised.failure_type, "WRITE_FAILURE")
        mock_spinner.return_value.stop.assert_called_once()
|
|
|
|
|
|
class TestPipelineHelpers(unittest.TestCase):
    """Tests for the private helper functions imported from ``cross_eval.pipeline``."""

    @staticmethod
    def _review_step(name: str, agent: str, output_key: str, **extra: object) -> StepConfig:
        """Build a review-role step using the ``default:review`` prompt template."""
        return StepConfig(
            name=name,
            agent=agent,
            role="review",
            prompt_template="default:review",
            output_key=output_key,
            **extra,
        )

    @staticmethod
    def _claude_reviewer_config() -> PipelineConfig:
        """Build a pipeline config whose only agent is ``claude-reviewer``."""
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])
        return PipelineConfig(agents={"claude-reviewer": reviewer})

    @patch("cross_eval.worktree.get_current_head", return_value="a" * 40)
    @patch("cross_eval.worktree.commit_worktree", return_value=True)
    def test_commit_iteration_logs_only_when_committed(self, mock_commit: MagicMock, mock_head: MagicMock) -> None:
        """One commit is attempted and the mocked HEAD value is returned."""
        with tempfile.TemporaryDirectory() as tmpdir:
            head_after = _commit_iteration(Path(tmpdir), "review-fix", 2, "PASS")
            mock_commit.assert_called_once()
            self.assertEqual(head_after, "a" * 40)

    def test_snapshot_repo_state_includes_untracked_digest(self) -> None:
        """An untracked file shows up in the snapshot's ``untracked`` entry."""
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            (repo / "scratch.txt").write_text("draft", encoding="utf-8")

            state = _snapshot_repo_state(repo)

            self.assertIn("UNTRACKED scratch.txt", state["untracked"])

    def test_finalize_worktree_deletes_empty_branch(self) -> None:
        """Finalizing a worktree with no new commits returns None and drops the branch."""
        with tempfile.TemporaryDirectory() as tmpdir:
            scratch = Path(tmpdir)
            base = scratch / "repo"
            base.mkdir()
            _init_git_repo(base)
            branch = "cross-eval/empty"
            worktree = scratch / "wt"
            # Create the branch at HEAD and check it out into a fresh worktree.
            for git_args in (
                ["git", "branch", branch, "HEAD"],
                ["git", "worktree", "add", str(worktree), branch],
            ):
                subprocess.run(git_args, cwd=base, capture_output=True, check=True)

            outcome = _finalize_worktree(base, worktree, branch, "review-fix", "PASS")

            self.assertIsNone(outcome)
            listing = subprocess.run(
                ["git", "branch", "--list", branch],
                cwd=base,
                capture_output=True,
                text=True,
                check=True,
            )
            self.assertEqual(listing.stdout.strip(), "")

    def test_format_runtime_error_markdown_for_generic_exception(self) -> None:
        """The rendered markdown has the error heading, the phase name and the message."""
        rendered = _format_runtime_error_markdown(
            RuntimeError("boom"),
            step_name="review",
            agent_name="claude-reviewer",
            phase_name="review_fix",
        )
        for fragment in ("# Agent Error", "review_fix", "boom"):
            self.assertIn(fragment, rendered)

    def test_maybe_save_step_transcript_returns_none_without_transcript(self) -> None:
        """A result built without a transcript yields ``None``."""
        with tempfile.TemporaryDirectory() as tmpdir:
            bare_result = AgentResult(
                output="ok",
                exit_code=0,
                agent_name="claude-reviewer",
                step_name="review",
                duration_seconds=0.1,
            )
            saved = _maybe_save_step_transcript(Path(tmpdir), 1, "review", bare_result)
            self.assertIsNone(saved)

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_step_saves_timeout_markdown(self, mock_invoke: MagicMock) -> None:
        """A timeout becomes ``RuntimeError`` and ``v1/review_error.md`` is written."""
        mock_invoke.side_effect = subprocess.TimeoutExpired(
            cmd=["claude"],
            timeout=45,
            output="partial output",
            stderr="still running",
        )
        step = self._review_step("review", "claude-reviewer", "review_output")
        config = self._claude_reviewer_config()
        step_outputs: dict[str, str] = {}
        step_results: dict[str, AgentResult] = {}

        with tempfile.TemporaryDirectory() as tmpdir:
            run_dir = Path(tmpdir)
            with self.assertRaises(RuntimeError) as ctx:
                _execute_step(
                    step,
                    config,
                    {"plan": "Plan", "checklist": "Checklist"},
                    "",
                    1,
                    3,
                    run_dir,
                    45,
                    False,
                    step_outputs,
                    step_results,
                    run_dir=run_dir,
                    output_iter=1,
                )

            # Inspect the error file while the temporary directory still exists.
            self.assertIn("timed out after 45s", str(ctx.exception))
            error_path = run_dir / "v1" / "review_error.md"
            self.assertTrue(error_path.exists())
            self.assertIn("# Agent Timeout", error_path.read_text(encoding="utf-8"))

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_step_saves_runtime_error_markdown(self, mock_invoke: MagicMock) -> None:
        """``AgentInvocationError`` is re-raised and its details land in the error file."""
        mock_invoke.side_effect = AgentInvocationError(
            agent_name="claude-reviewer",
            step_name="review",
            cmd_preview="claude -p",
            raw_error="api broke",
            failure_type="API_ERROR",
            suggested_action="retry",
        )
        step = self._review_step("review", "claude-reviewer", "review_output")
        config = self._claude_reviewer_config()

        with tempfile.TemporaryDirectory() as tmpdir:
            run_dir = Path(tmpdir)
            with self.assertRaises(AgentInvocationError):
                _execute_step(
                    step,
                    config,
                    {"plan": "Plan", "checklist": "Checklist"},
                    "",
                    1,
                    3,
                    run_dir,
                    45,
                    False,
                    {},
                    {},
                    run_dir=run_dir,
                    output_iter=1,
                )

            error_text = (run_dir / "v1" / "review_error.md").read_text(encoding="utf-8")
            self.assertIn("API_ERROR", error_text)
            self.assertIn("retry", error_text)

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_parallel_batch_saves_success_and_timeout_error(self, mock_invoke: MagicMock) -> None:
        """One step succeeds and one times out: both an output and an error file exist."""

        def _fake_invoke(agent_config: AgentConfig, prompt: str, step_name: str, **kwargs: object) -> AgentResult:
            # Every step except "review_ok" simulates a timed-out agent.
            if step_name != "review_ok":
                raise subprocess.TimeoutExpired(
                    cmd=["codex"],
                    timeout=30,
                    output="halfway",
                    stderr="timeout stderr",
                )
            return AgentResult(
                output="VERDICT: PASS",
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=0.1,
            )

        mock_invoke.side_effect = _fake_invoke
        batch = [
            self._review_step("review_ok", "claude-reviewer", "review_ok", parallel=True),
            self._review_step("review_slow", "codex-reviewer", "review_slow", parallel=True),
        ]
        config = PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(name="claude-reviewer", command="claude", args=["-p"]),
                "codex-reviewer": AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"]),
            },
        )
        step_outputs: dict[str, str] = {}
        step_results: dict[str, AgentResult] = {}

        with tempfile.TemporaryDirectory() as tmpdir:
            run_dir = Path(tmpdir)
            with self.assertRaises(RuntimeError) as ctx:
                _execute_parallel_batch(
                    batch,
                    config,
                    {"plan": "Plan", "checklist": "Checklist"},
                    "",
                    1,
                    3,
                    run_dir,
                    30,
                    False,
                    step_outputs,
                    step_results,
                    run_dir=run_dir,
                    output_iter=1,
                )

            self.assertIn("Successful outputs were saved for: review_ok", str(ctx.exception))
            self.assertEqual(step_outputs["review_ok"], "VERDICT: PASS")
            version_dir = run_dir / "v1"
            self.assertTrue((version_dir / "review_ok.md").exists())
            self.assertTrue((version_dir / "review_slow_error.md").exists())

    @patch("cross_eval.pipeline._execute_step")
    def test_execute_parallel_batch_dry_run_uses_sequential_path(self, mock_step: MagicMock) -> None:
        """With the dry-run flag set, ``_execute_step`` is called once per batch step."""
        batch = [
            self._review_step("review_a", "claude-reviewer", "review_a", parallel=True),
            self._review_step("review_b", "codex-reviewer", "review_b", parallel=True),
        ]
        config = PipelineConfig(agents={})

        with tempfile.TemporaryDirectory() as tmpdir:
            _execute_parallel_batch(
                batch,
                config,
                {"plan": "Plan"},
                "",
                1,
                3,
                Path(tmpdir),
                None,
                True,
                {},
                {},
                run_dir=Path(tmpdir),
                output_iter=1,
            )

        self.assertEqual(mock_step.call_count, 2)

    @patch("cross_eval.pipeline._execute_step")
    def test_execute_parallel_batch_agentic_steps_fall_back_to_sequential(self, mock_step: MagicMock) -> None:
        """With agentic agents and a worktree, ``_execute_step`` runs once per step."""
        batch = [
            self._review_step("review_a", "agentic-a", "review_a", parallel=True),
            self._review_step("review_b", "agentic-b", "review_b", parallel=True),
        ]
        config = PipelineConfig(
            agents={
                "agentic-a": AgentConfig(name="agentic-a", command="claude", agentic=True),
                "agentic-b": AgentConfig(name="agentic-b", command="codex", agentic=True),
            },
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            _execute_parallel_batch(
                batch,
                config,
                {"plan": "Plan"},
                "",
                1,
                3,
                Path(tmpdir),
                None,
                False,
                {},
                {},
                run_dir=Path(tmpdir),
                output_iter=1,
                worktree_path=Path(tmpdir),
            )

        self.assertEqual(mock_step.call_count, 2)

    @patch("cross_eval.worktree.remove_worktree", side_effect=RuntimeError("cleanup failed"))
    @patch("cross_eval.worktree.commit_worktree", side_effect=RuntimeError("commit failed"))
    def test_finalize_worktree_handles_cleanup_failures(
        self,
        mock_commit: MagicMock,
        mock_remove: MagicMock,
    ) -> None:
        """When commit and removal both raise, finalization still returns ``None``."""
        with tempfile.TemporaryDirectory() as tmpdir:
            base = Path(tmpdir)
            branch = _finalize_worktree(base, base / "wt", "cross-eval/fail", "review-fix", "FAIL")

        self.assertIsNone(branch)
|
|
|
|
|
|
class TestRuntimeEnvironmentHelpers(unittest.TestCase):
    """Tests for the helpers imported from ``cross_eval.runtime_env``."""

    def test_parse_dotenv_handles_export_and_quotes(self) -> None:
        """``export`` prefixes and quotes are handled; the two malformed lines add no key."""
        dotenv_text = "export FOO='bar'\nBAR=\"line\\nvalue\"\nINVALID\n=skip\n"
        with tempfile.TemporaryDirectory() as tmpdir:
            env_path = Path(tmpdir) / ".env"
            env_path.write_text(dotenv_text, encoding="utf-8")
            parsed = parse_dotenv(env_path)

        self.assertEqual(parsed["FOO"], "bar")
        self.assertEqual(parsed["BAR"], "line\nvalue")
        self.assertNotIn("INVALID", parsed)

    def test_resolve_env_files_deduplicates_and_filters_missing(self) -> None:
        """Duplicate references collapse to one path and the absent file is left out."""
        with tempfile.TemporaryDirectory() as tmpdir:
            root = Path(tmpdir)
            env_path = root / ".env"
            env_path.write_text("FOO=bar\n", encoding="utf-8")
            # ".env" is named three times (relative, absolute, auto) and
            # ".env.local" is never created.
            execution = ExecutionConfig(
                env_files=[".env", str(env_path)],
                auto_env_files=[".env", ".env.local"],
            )

            found = resolve_env_files(execution, root)

            self.assertEqual(found, [env_path.resolve()])

    def test_summarize_environment_hides_names_when_disabled(self) -> None:
        """With ``expose_env_names=False`` the summary says names are hidden."""
        execution = ExecutionConfig(expose_env_names=False, auto_context_targets=["postgres"])
        env_values = {"DATABASE_URL": "postgres://localhost"}

        summary = summarize_environment(execution, [], env_values, {})

        self.assertIn("names are hidden", summary)
        self.assertIn("Execution targets hinted by the user: postgres", summary)

    def test_build_execution_policy_for_minimal_mode(self) -> None:
        """The policy text names the ``minimal`` command policy and its guidance."""
        execution = ExecutionConfig(mode="agent-decides", command_policy="minimal")

        policy = build_execution_policy(execution)

        for expected in ("Command policy: minimal", "Keep command usage minimal"):
            self.assertIn(expected, policy)
|
|
|
|
|
|
class TestWorktreeFailures(unittest.TestCase):
    """Failure-path tests for ``create_worktree`` and ``remove_worktree`` with git mocked."""

    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_raises_when_branch_creation_fails(self, mock_run: MagicMock) -> None:
        """A failing ``git branch`` surfaces as ``WorktreeError``."""
        branch_failure = subprocess.CalledProcessError(1, ["git", "branch"], stderr="branch failed")
        mock_run.side_effect = [
            MagicMock(returncode=0, stdout="a" * 40),  # first call: git rev-parse HEAD (succeeds)
            branch_failure,  # second call: git branch (fails)
        ]

        with tempfile.TemporaryDirectory() as tmpdir:
            base = Path(tmpdir)
            with self.assertRaises(WorktreeError) as ctx:
                create_worktree(base, base / "wt", "cross-eval/fail")

        self.assertIn("Failed to create branch", str(ctx.exception))

    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_cleans_branch_on_worktree_failure(self, mock_run: MagicMock) -> None:
        """After ``git worktree add`` fails, the final git call is ``git branch -D``."""
        worktree_failure = subprocess.CalledProcessError(
            1,
            ["git", "worktree", "add"],
            stderr="worktree failed",
        )
        mock_run.side_effect = [
            MagicMock(returncode=0, stdout="a" * 40),  # git rev-parse HEAD
            MagicMock(returncode=0),  # git branch
            worktree_failure,  # git worktree add
            MagicMock(returncode=0),  # git branch -D (cleanup)
        ]

        with tempfile.TemporaryDirectory() as tmpdir:
            base = Path(tmpdir)
            with self.assertRaises(WorktreeError):
                create_worktree(base, base / "wt", "cross-eval/fail")

        final_argv = mock_run.call_args_list[-1][0][0]
        self.assertEqual(final_argv[:3], ["git", "branch", "-D"])

    @patch("cross_eval.worktree.shutil.rmtree")
    @patch("cross_eval.worktree.subprocess.run")
    def test_remove_worktree_falls_back_to_prune(self, mock_run: MagicMock, mock_rmtree: MagicMock) -> None:
        """When ``git worktree remove`` fails, the directory is rmtree'd and pruned."""
        mock_run.side_effect = [
            subprocess.CalledProcessError(1, ["git", "worktree", "remove"]),
            MagicMock(returncode=0),
        ]

        with tempfile.TemporaryDirectory() as tmpdir:
            scratch = Path(tmpdir)
            base = scratch / "repo"
            work_dir = scratch / "wt"
            for directory in (base, work_dir):
                directory.mkdir()

            remove_worktree(base, work_dir)

            mock_rmtree.assert_any_call(work_dir.resolve(), ignore_errors=True)
            final_argv = mock_run.call_args_list[-1][0][0]
            self.assertEqual(final_argv, ["git", "worktree", "prune"])
|