continue
This commit is contained in:
@@ -490,6 +490,8 @@ class TestMakeAgenticCodex(unittest.TestCase):
|
||||
def _make_agentic_config(
|
||||
run_dir: Path,
|
||||
agentic_coder: bool = True,
|
||||
*,
|
||||
use_worktree: bool = False,
|
||||
) -> PipelineConfig:
|
||||
"""Build a config with an agentic coder + non-agentic reviewer."""
|
||||
coder = AgentConfig(
|
||||
@@ -521,6 +523,7 @@ def _make_agentic_config(
|
||||
]
|
||||
return PipelineConfig(
|
||||
output_dir=run_dir,
|
||||
use_worktree=use_worktree,
|
||||
max_iterations=2,
|
||||
min_iterations=1,
|
||||
language="en",
|
||||
@@ -551,7 +554,7 @@ class TestSetupWorktreeCalledForAgentic(unittest.TestCase):
|
||||
) -> None:
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
run_dir = Path(td)
|
||||
config = _make_agentic_config(run_dir)
|
||||
config = _make_agentic_config(run_dir, use_worktree=True)
|
||||
|
||||
wt_path = run_dir / "work"
|
||||
wt_path.mkdir()
|
||||
@@ -573,6 +576,44 @@ class TestSetupWorktreeCalledForAgentic(unittest.TestCase):
|
||||
mock_setup.assert_called_once()
|
||||
|
||||
|
||||
class TestDirectAgenticMode(unittest.TestCase):
|
||||
"""Agentic coders run in the current working tree by default."""
|
||||
|
||||
@patch("cross_eval.pipeline._setup_worktree")
|
||||
@patch("cross_eval.pipeline.invoke_agent_agentic")
|
||||
@patch("cross_eval.pipeline.invoke_agent")
|
||||
def test_agentic_uses_current_worktree_by_default(
|
||||
self,
|
||||
mock_invoke: MagicMock,
|
||||
mock_invoke_agentic: MagicMock,
|
||||
mock_setup: MagicMock,
|
||||
) -> None:
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
repo = Path(td)
|
||||
_init_git_repo(repo)
|
||||
run_dir = repo / ".cross-eval" / "output"
|
||||
run_dir.mkdir(parents=True, exist_ok=True)
|
||||
config = _make_agentic_config(run_dir)
|
||||
|
||||
mock_invoke_agentic.return_value = AgentResult(
|
||||
output="diff output", exit_code=0,
|
||||
agent_name="claude-coder", step_name="coding",
|
||||
duration_seconds=0.1,
|
||||
)
|
||||
mock_invoke.return_value = AgentResult(
|
||||
output="VERDICT: PASS", exit_code=0,
|
||||
agent_name="claude-reviewer", step_name="review",
|
||||
duration_seconds=0.1,
|
||||
)
|
||||
|
||||
run_pipeline(config, cwd=repo)
|
||||
|
||||
mock_setup.assert_not_called()
|
||||
self.assertEqual(mock_invoke_agentic.call_args.kwargs["worktree_path"], repo)
|
||||
reviewer_call = mock_invoke.call_args
|
||||
self.assertEqual(reviewer_call.kwargs["cwd"], repo)
|
||||
|
||||
|
||||
class TestSetupWorktreeLocation(unittest.TestCase):
|
||||
"""_setup_worktree places agentic worktrees outside the base repo."""
|
||||
|
||||
@@ -618,7 +659,7 @@ class TestReviewerRunsInWorktreeCwd(unittest.TestCase):
|
||||
) -> None:
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
run_dir = Path(td)
|
||||
config = _make_agentic_config(run_dir)
|
||||
config = _make_agentic_config(run_dir, use_worktree=True)
|
||||
|
||||
wt_path = run_dir / "work"
|
||||
wt_path.mkdir()
|
||||
@@ -660,7 +701,7 @@ class TestCommitIterationCalled(unittest.TestCase):
|
||||
) -> None:
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
run_dir = Path(td)
|
||||
config = _make_agentic_config(run_dir)
|
||||
config = _make_agentic_config(run_dir, use_worktree=True)
|
||||
|
||||
wt_path = run_dir / "work"
|
||||
wt_path.mkdir()
|
||||
@@ -702,7 +743,7 @@ class TestFinalizeWorktreeCalled(unittest.TestCase):
|
||||
) -> None:
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
run_dir = Path(td)
|
||||
config = _make_agentic_config(run_dir)
|
||||
config = _make_agentic_config(run_dir, use_worktree=True)
|
||||
|
||||
wt_path = run_dir / "work"
|
||||
wt_path.mkdir()
|
||||
|
||||
@@ -331,7 +331,7 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:review-fix",
|
||||
"preset:coding-plan-review",
|
||||
["codex-reviewer", "claude-reviewer"],
|
||||
BUILTIN_AGENTS,
|
||||
),
|
||||
@@ -339,7 +339,7 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:review-fix",
|
||||
"preset:coding-plan-review",
|
||||
["claude-reviewer"],
|
||||
BUILTIN_AGENTS,
|
||||
),
|
||||
@@ -347,15 +347,7 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:coding-review-fix",
|
||||
["codex-reviewer"],
|
||||
BUILTIN_AGENTS,
|
||||
),
|
||||
["codex-senior"],
|
||||
)
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:simple",
|
||||
"preset:unknown",
|
||||
["codex-reviewer"],
|
||||
BUILTIN_AGENTS,
|
||||
),
|
||||
@@ -1019,7 +1011,7 @@ class FixPresetBehaviorTest(unittest.TestCase):
|
||||
" checklist: checklist.md\n"
|
||||
"coders: [claude-coder]\n"
|
||||
"reviewers: [claude-reviewer]\n"
|
||||
"pipeline: preset:review-fix\n"
|
||||
"pipeline: preset:coding-plan-review\n"
|
||||
f"max_iterations: {max_iterations}\n"
|
||||
"language: en\n"
|
||||
),
|
||||
@@ -1031,8 +1023,9 @@ class FixPresetBehaviorTest(unittest.TestCase):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
config = load_config(self._write_fix_config(Path(tmpdir), max_iterations=7))
|
||||
|
||||
self.assertEqual(config.preset_name, "review-fix")
|
||||
self.assertEqual(config.phases[0].max_iterations, 7)
|
||||
self.assertEqual(config.preset_name, "coding-plan-review")
|
||||
self.assertEqual(config.phases[0].max_iterations, 1)
|
||||
self.assertEqual(config.phases[1].max_iterations, 7)
|
||||
self.assertTrue(config.agents["claude-coder"].agentic)
|
||||
self.assertNotIn("-p", config.agents["claude-coder"].args)
|
||||
|
||||
@@ -1042,7 +1035,7 @@ class FixPresetBehaviorTest(unittest.TestCase):
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
def _fake_run_pipeline(config, **kwargs):
|
||||
captured["phase_max"] = config.phases[0].max_iterations
|
||||
captured["phase_max"] = config.phases[1].max_iterations
|
||||
captured["agentic"] = config.agents[config.coders[0]].agentic
|
||||
return PipelineResult(
|
||||
iterations=[],
|
||||
@@ -1062,13 +1055,13 @@ class FixPresetBehaviorTest(unittest.TestCase):
|
||||
self.assertEqual(captured["phase_max"], 9)
|
||||
self.assertTrue(captured["agentic"])
|
||||
|
||||
def test_run_preset_review_fix_auto_enables_agentic_without_flag(self) -> None:
|
||||
def test_run_preset_coding_plan_review_auto_enables_agentic_without_flag(self) -> None:
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
def _fake_run_pipeline(config, **kwargs):
|
||||
captured["preset"] = config.preset_name
|
||||
captured["agentic"] = config.agents[config.coders[0]].agentic
|
||||
captured["phase_max"] = config.phases[0].max_iterations
|
||||
captured["phase_max"] = config.phases[1].max_iterations
|
||||
return PipelineResult(
|
||||
iterations=[],
|
||||
final_verdict="PASS",
|
||||
@@ -1076,10 +1069,10 @@ class FixPresetBehaviorTest(unittest.TestCase):
|
||||
)
|
||||
|
||||
with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline):
|
||||
exit_code = main(["run", "--preset", "review-fix", "--dry-run"])
|
||||
exit_code = main(["run", "--preset", "coding-plan-review", "--dry-run"])
|
||||
|
||||
self.assertEqual(exit_code, 0)
|
||||
self.assertEqual(captured["preset"], "review-fix")
|
||||
self.assertEqual(captured["preset"], "coding-plan-review")
|
||||
self.assertTrue(captured["agentic"])
|
||||
self.assertEqual(captured["phase_max"], 3)
|
||||
|
||||
@@ -1089,6 +1082,7 @@ class FixPresetBehaviorTest(unittest.TestCase):
|
||||
def _fake_run_pipeline(config, **kwargs):
|
||||
captured["preset"] = config.preset_name
|
||||
captured["agentic"] = config.agents[config.coders[0]].agentic
|
||||
captured["use_worktree"] = config.use_worktree
|
||||
captured["seniors"] = list(config.seniors)
|
||||
captured["steps"] = [step.name for step in config.pipeline]
|
||||
captured["max_iter"] = config.max_iterations
|
||||
@@ -1104,6 +1098,7 @@ class FixPresetBehaviorTest(unittest.TestCase):
|
||||
self.assertEqual(exit_code, 0)
|
||||
self.assertEqual(captured["preset"], "plan-review")
|
||||
self.assertTrue(captured["agentic"])
|
||||
self.assertFalse(captured["use_worktree"])
|
||||
self.assertEqual(captured["seniors"], ["claude-senior"])
|
||||
self.assertEqual(
|
||||
captured["steps"],
|
||||
@@ -1111,6 +1106,36 @@ class FixPresetBehaviorTest(unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(captured["max_iter"], 3)
|
||||
|
||||
def test_run_worktree_flag_enables_isolated_worktree_mode(self) -> None:
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
def _fake_run_pipeline(config, **kwargs):
|
||||
captured["use_worktree"] = config.use_worktree
|
||||
return PipelineResult(
|
||||
iterations=[],
|
||||
final_verdict="PASS",
|
||||
run_dir=Path(".cross-eval/output"),
|
||||
)
|
||||
|
||||
with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline):
|
||||
exit_code = main(["run", "--preset", "plan-review", "--dry-run", "--worktree"])
|
||||
|
||||
self.assertEqual(exit_code, 0)
|
||||
self.assertTrue(captured["use_worktree"])
|
||||
|
||||
def test_run_dry_run_returns_zero_even_when_not_pass(self) -> None:
|
||||
def _fake_run_pipeline(config, **kwargs):
|
||||
return PipelineResult(
|
||||
iterations=[],
|
||||
final_verdict="MAX_ITERATIONS_REACHED",
|
||||
run_dir=Path(".cross-eval/output"),
|
||||
)
|
||||
|
||||
with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline):
|
||||
exit_code = main(["run", "--preset", "plan-review", "--dry-run"])
|
||||
|
||||
self.assertEqual(exit_code, 0)
|
||||
|
||||
def test_run_senior_model_override_applies_only_to_seniors(self) -> None:
|
||||
captured: dict[str, list[str]] = {}
|
||||
|
||||
@@ -1127,7 +1152,7 @@ class FixPresetBehaviorTest(unittest.TestCase):
|
||||
with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline):
|
||||
exit_code = main([
|
||||
"run",
|
||||
"--preset", "review-fix",
|
||||
"--preset", "coding-plan-review",
|
||||
"--coder", "claude",
|
||||
"--reviewer", "claude",
|
||||
"--senior", "claude",
|
||||
@@ -1155,7 +1180,7 @@ class OutputDirectoryResolutionTest(unittest.TestCase):
|
||||
" plan: plan.md\n"
|
||||
"coders: [claude-coder]\n"
|
||||
"reviewers: [claude-reviewer]\n"
|
||||
"pipeline: preset:simple\n"
|
||||
"pipeline: preset:coding-plan-review\n"
|
||||
"output_dir: .cross-eval/output\n"
|
||||
),
|
||||
encoding="utf-8",
|
||||
|
||||
@@ -55,7 +55,7 @@ class DoctorCheckInstalledTest(unittest.TestCase):
|
||||
config_path = ce_dir / "config.yaml"
|
||||
config_path.write_text(
|
||||
"inputs:\n plan: plan.md\ncoders: [claude-coder]\n"
|
||||
"reviewers: [claude-reviewer]\npipeline: preset:simple\n",
|
||||
"reviewers: [claude-reviewer]\npipeline: preset:coding-plan-review\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
# Also create plan.md so validation passes
|
||||
@@ -137,22 +137,22 @@ class DemoTest(unittest.TestCase):
|
||||
def test_mock_demo_runs_without_error(self) -> None:
|
||||
# Should not raise
|
||||
with patch("sys.stdout"):
|
||||
run_mock_demo(preset="simple")
|
||||
run_mock_demo(preset="coding-plan-review")
|
||||
|
||||
def test_mock_demo_escalate_runs_without_error(self) -> None:
|
||||
with patch("sys.stdout"):
|
||||
run_mock_demo(preset="simple", show_escalate=True)
|
||||
run_mock_demo(preset="coding-plan-review", show_escalate=True)
|
||||
|
||||
def test_cmd_demo_mock_default(self) -> None:
|
||||
with patch("cross_eval.demo.run_mock_demo") as mock:
|
||||
exit_code = main(["demo"])
|
||||
mock.assert_called_once_with(preset="simple", show_escalate=False)
|
||||
mock.assert_called_once_with(preset="coding-plan-review", show_escalate=False)
|
||||
self.assertEqual(exit_code, 0)
|
||||
|
||||
def test_cmd_demo_escalate_flag(self) -> None:
|
||||
with patch("cross_eval.demo.run_mock_demo") as mock:
|
||||
exit_code = main(["demo", "--escalate"])
|
||||
mock.assert_called_once_with(preset="simple", show_escalate=True)
|
||||
mock.assert_called_once_with(preset="coding-plan-review", show_escalate=True)
|
||||
self.assertEqual(exit_code, 0)
|
||||
|
||||
def test_cmd_demo_live_requires_confirmation(self) -> None:
|
||||
|
||||
@@ -16,13 +16,17 @@ from cross_eval.agent import (
|
||||
)
|
||||
from cross_eval.models import AgentConfig, AgentResult, ExecutionConfig, PipelineConfig, StepConfig
|
||||
from cross_eval.pipeline import (
|
||||
_apply_worktree_inputs_to_base,
|
||||
_commit_base_repo_paths,
|
||||
_copy_inputs_to_worktree,
|
||||
_commit_iteration,
|
||||
_execute_parallel_batch,
|
||||
_execute_step,
|
||||
_finalize_worktree,
|
||||
_format_runtime_error_markdown,
|
||||
_load_inputs,
|
||||
_maybe_save_step_transcript,
|
||||
_refresh_inputs,
|
||||
_snapshot_repo_state,
|
||||
)
|
||||
from cross_eval.runtime_env import (
|
||||
@@ -155,6 +159,110 @@ class TestWorktreeInputMapping(unittest.TestCase):
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
def test_plan_review_docs_ref_maps_to_worktree_and_refreshes_docs(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
repo = Path(tmpdir) / "repo"
|
||||
repo.mkdir()
|
||||
_init_git_repo(repo)
|
||||
docs_dir = repo / "plans"
|
||||
docs_dir.mkdir()
|
||||
(docs_dir / "A.md").write_text("A v1\n", encoding="utf-8")
|
||||
subprocess.run(["git", "add", "."], cwd=repo, capture_output=True, check=True)
|
||||
subprocess.run(
|
||||
["git", "commit", "-m", "add docs"],
|
||||
cwd=repo,
|
||||
capture_output=True,
|
||||
check=True,
|
||||
)
|
||||
|
||||
config = PipelineConfig(
|
||||
inputs={
|
||||
"docs": "stale snapshot",
|
||||
"docs_ref": docs_dir,
|
||||
},
|
||||
preset_name="plan-review",
|
||||
)
|
||||
input_contents = _load_inputs(config)
|
||||
self.assertIn("A.md", input_contents["docs"])
|
||||
|
||||
worktree_dir = Path(tmpdir) / "wt"
|
||||
branch = "cross-eval/test-docs-ref"
|
||||
worktree_path, _ = create_worktree(repo, worktree_dir, branch)
|
||||
try:
|
||||
_copy_inputs_to_worktree(config, worktree_path, base_cwd=repo)
|
||||
self.assertEqual(config.inputs["docs_ref"], worktree_path / "plans")
|
||||
|
||||
updated = worktree_path / "plans" / "A.md"
|
||||
updated.write_text("A v2\n", encoding="utf-8")
|
||||
_refresh_inputs(config, input_contents)
|
||||
self.assertIn("A.md", input_contents["docs"])
|
||||
self.assertIn("A v2", input_contents["docs"])
|
||||
finally:
|
||||
remove_worktree(base_cwd=repo, work_dir=worktree_path)
|
||||
subprocess.run(
|
||||
["git", "branch", "-D", branch],
|
||||
cwd=repo,
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
def test_worktree_doc_changes_apply_back_and_commit_in_base_repo(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
repo = Path(tmpdir) / "repo"
|
||||
repo.mkdir()
|
||||
_init_git_repo(repo)
|
||||
docs_dir = repo / "plans"
|
||||
docs_dir.mkdir()
|
||||
doc_path = docs_dir / "A.md"
|
||||
doc_path.write_text("A v1\n", encoding="utf-8")
|
||||
subprocess.run(["git", "add", "."], cwd=repo, capture_output=True, check=True)
|
||||
subprocess.run(
|
||||
["git", "commit", "-m", "add docs"],
|
||||
cwd=repo,
|
||||
capture_output=True,
|
||||
check=True,
|
||||
)
|
||||
|
||||
config = PipelineConfig(
|
||||
inputs={"docs_ref": docs_dir},
|
||||
preset_name="plan-review",
|
||||
)
|
||||
original_inputs = {"docs_ref": docs_dir}
|
||||
|
||||
worktree_dir = Path(tmpdir) / "wt"
|
||||
branch = "cross-eval/test-apply-back"
|
||||
worktree_path, _ = create_worktree(repo, worktree_dir, branch)
|
||||
try:
|
||||
_copy_inputs_to_worktree(config, worktree_path, base_cwd=repo)
|
||||
worktree_doc = config.inputs["docs_ref"] / "A.md"
|
||||
worktree_doc.write_text("A v2\n", encoding="utf-8")
|
||||
|
||||
restored = _apply_worktree_inputs_to_base(
|
||||
config, original_inputs, cwd=repo,
|
||||
)
|
||||
self.assertEqual(restored, [docs_dir])
|
||||
self.assertEqual(doc_path.read_text(encoding="utf-8"), "A v2\n")
|
||||
|
||||
committed = _commit_base_repo_paths(
|
||||
repo, restored, "cross-eval: plan-review (FAIL)",
|
||||
)
|
||||
self.assertTrue(committed)
|
||||
|
||||
log = subprocess.run(
|
||||
["git", "log", "-1", "--pretty=%s"],
|
||||
cwd=repo,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
self.assertEqual(log.stdout.strip(), "cross-eval: plan-review (FAIL)")
|
||||
finally:
|
||||
remove_worktree(base_cwd=repo, work_dir=worktree_path)
|
||||
subprocess.run(
|
||||
["git", "branch", "-D", branch],
|
||||
cwd=repo,
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
def test_classify_unknown_failure(self) -> None:
|
||||
failure_type, suggested_action = _classify_agent_failure("weird crash")
|
||||
self.assertEqual(failure_type, "UNKNOWN")
|
||||
|
||||
Reference in New Issue
Block a user