"""Comprehensive tests for the agentic worktree flow.

Covers:
1. worktree.py unit tests (real temp git repo)
2. agent.py agentic tests (mocking subprocess)
3. config.py _make_agentic tests
4. pipeline integration tests (mock invoke_agent / invoke_agent_agentic)
"""

from __future__ import annotations

import subprocess
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, patch

from cross_eval.agent import AgentInvocationError, invoke_agent_agentic
from cross_eval.config import _make_agentic
from cross_eval.models import (
    AgentConfig,
    AgentResult,
    PipelineConfig,
    StepConfig,
)
from cross_eval.pipeline import (
    _assert_base_repo_isolation,
    _has_agentic_steps,
    _setup_worktree,
    run_pipeline,
)
from cross_eval.worktree import (
    capture_diff,
    commit_worktree,
    create_worktree,
    make_branch_name,
    make_worktree_dir,
    remove_worktree,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _init_git_repo(path: Path) -> None:
    """Turn the existing directory *path* into a git repo holding one commit.

    Runs ``git init``, sets ``user.email`` / ``user.name`` for that repo,
    writes a one-line ``README.md`` and commits it with message ``initial``.
    Every git command runs with ``check=True``, so a failing command raises
    ``subprocess.CalledProcessError``.
    """

    def _git(*argv: str) -> None:
        # Looked up on the module at call time, so a test that patches
        # ``subprocess.run`` also intercepts these invocations.
        subprocess.run(["git", *argv], cwd=path, capture_output=True, check=True)

    _git("init")
    _git("config", "user.email", "test@test.com")
    _git("config", "user.name", "Test")

    readme = path / "README.md"
    readme.write_text("# init\n")

    _git("add", ".")
    _git("commit", "-m", "initial")


# ===================================================================
# 1.
# worktree.py unit tests (real temp git repo)
# ===================================================================


def _make_repo_and_worktree_dir(td: str) -> tuple[Path, Path]:
    """Create an initialised repo under *td* and choose a sibling worktree path.

    Returns ``(base, wt_dir)`` where ``base`` is ``<td>/repo`` after
    ``_init_git_repo`` has run on it and ``wt_dir`` is ``<td>/wt``. The
    worktree directory itself is not created here.
    """
    root = Path(td)
    base = root / "repo"
    base.mkdir()
    _init_git_repo(base)
    return base, root / "wt"


def _list_branch(repo: Path, branch: str) -> str:
    """Return the stdout of ``git branch --list <branch>`` run inside *repo*."""
    # check=True: a failing git invocation should surface as its own error
    # instead of an empty listing that merely fails the membership assertion.
    listing = subprocess.run(
        ["git", "branch", "--list", branch],
        cwd=repo,
        capture_output=True,
        text=True,
        check=True,
    )
    return listing.stdout


class TestCreateWorktree(unittest.TestCase):
    """create_worktree creates a worktree on a named branch."""

    def test_creates_worktree_and_branch(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            base, wt_dir = _make_repo_and_worktree_dir(td)
            branch = "cross-eval/test_branch"

            result_path, base_commit = create_worktree(base, wt_dir, branch)

            # The returned worktree path is present on disk.
            self.assertTrue(result_path.exists())
            # The base commit is reported as a full 40-character SHA.
            self.assertEqual(len(base_commit), 40)
            # The branch is visible from the original repository.
            self.assertIn(branch, _list_branch(base, branch))

            # Clean up
            remove_worktree(base, wt_dir)


class TestCaptureDiff(unittest.TestCase):
    """capture_diff reports both new and modified files of a worktree."""

    def test_captures_new_and_modified_files(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            base, wt_dir = _make_repo_and_worktree_dir(td)
            # The (path, base_commit) return value is not needed here.
            create_worktree(base, wt_dir, "cross-eval/diff_test")

            # One brand-new file plus an edit to the committed README.
            (wt_dir / "new_file.txt").write_text("hello\n")
            (wt_dir / "README.md").write_text("# modified\n")

            diff = capture_diff(wt_dir)
            for expected in ("new_file.txt", "hello", "modified"):
                self.assertIn(expected, diff)

            remove_worktree(base, wt_dir)


class TestCommitWorktree(unittest.TestCase):
    """commit_worktree returns True after committing changes, False when clean."""

    def test_commit_returns_true_on_changes(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            base, wt_dir = _make_repo_and_worktree_dir(td)
            create_worktree(base, wt_dir, "cross-eval/commit_test")

            (wt_dir / "file.txt").write_text("data\n")
            committed = commit_worktree(wt_dir, "test commit")

            self.assertTrue(committed)
            remove_worktree(base, wt_dir)

    def test_commit_returns_false_when_nothing_to_commit(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            base, wt_dir = _make_repo_and_worktree_dir(td)
            create_worktree(base, wt_dir, "cross-eval/empty_commit")

            # Nothing was touched in the worktree, so there is nothing to commit.
            committed = commit_worktree(wt_dir, "empty")

            self.assertFalse(committed)
            remove_worktree(base, wt_dir)


class TestRemoveWorktree(unittest.TestCase):
    """remove_worktree deletes the worktree directory but keeps the branch."""

    def test_branch_survives_worktree_removal(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            base, wt_dir = _make_repo_and_worktree_dir(td)
            branch = "cross-eval/remove_test"
            create_worktree(base, wt_dir, branch)

            remove_worktree(base, wt_dir)

            # Worktree directory should be gone
            self.assertFalse(wt_dir.exists())
            # Branch should still exist in the original repo
            self.assertIn(branch, _list_branch(base, branch))


class TestMakeBranchName(unittest.TestCase):
    """make_branch_name generates ``cross-eval/<preset>_<timestamp>``."""

    def test_format(self) -> None:
        name = make_branch_name("review-fix")
        self.assertTrue(name.startswith("cross-eval/review-fix_"))

        # Everything after the first underscore is the timestamp-like suffix.
        _prefix, separator, ts_part = name.partition("_")
        self.assertEqual(separator, "_")
        # Timestamp portion should be like 20260313_123456
        self.assertEqual(len(ts_part), 15)  # YYYYMMDD_HHMMSS


class TestMakeWorktreeDir(unittest.TestCase):
    """make_worktree_dir chooses a location outside the base repo."""

    def test_uses_tmp_dir_outside_repo(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            base = Path(td) / "repo"
            base.mkdir()

            chosen = str(make_worktree_dir(base, "cross-eval/review-fix_20260313_123456"))

            self.assertIn("cross-eval-worktrees", chosen)
            self.assertNotIn(str(base), chosen)


class TestBaseRepoIsolation(unittest.TestCase):
    """Base repo mutations should fail fast during agentic execution."""

    def test_raises_when_base_repo_state_changes(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            base = Path(td) / "repo"
            worktree = Path(td) / "worktree"
            base.mkdir()
            worktree.mkdir()

            # Baseline has a diff that won't match a non-git directory
            # (which returns {}), triggering the isolation error.
            baseline_state = {
                "diff": "diff --git a/file.py ...\n",
                "untracked": "",
            }

            with self.assertRaises(RuntimeError) as ctx:
                _assert_base_repo_isolation(
                    base,
                    baseline_state,
                    step_name="coding",
                    agent_name="claude-coder",
                    worktree_path=worktree,
                    baseline_status="M file.py",
                )

            self.assertIn("base repository", str(ctx.exception))


# ===================================================================
# 2. agent.py agentic tests (mocking subprocess)
# ===================================================================


def _find_agent_call(mock_run: MagicMock, executable: str) -> tuple:
    """Locate the mocked ``subprocess.run`` call that launched *executable*.

    The command is read from the first positional argument or, failing that,
    from the ``args`` keyword. Returns ``(call, cmd)`` for the first recorded
    call whose command starts with *executable*, or ``(None, None)`` when no
    such call exists. Returning the matched command lets callers assert on it
    without re-indexing the positional arguments, which would raise
    ``IndexError`` for a call that passed the command by keyword.
    """
    for recorded in mock_run.call_args_list:
        cmd = recorded.args[0] if recorded.args else recorded.kwargs.get("args", [])
        if cmd and cmd[0] == executable:
            return recorded, cmd
    return None, None


class TestInvokeAgentAgenticClaude(unittest.TestCase):
    """invoke_agent_agentic builds correct cmd for claude (no -p, prompt via stdin)."""

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("subprocess.run")
    def test_claude_cmd_has_no_dash_p_and_prompt_via_stdin(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        agent = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user", "--dangerously-skip-permissions"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            # subprocess.run is patched, so the helper's git commands are
            # recorded on mock_run too; hence the search by executable below.
            _init_git_repo(wt)
            invoke_agent_agentic(
                agent,
                "implement feature X",
                "coding",
                worktree_path=wt,
                quiet=True,
            )

        # Find the subprocess.run call that actually runs the agent
        agent_call, cmd = _find_agent_call(mock_run, "claude")
        self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'claude'")
        assert agent_call is not None

        # No -p flag
        self.assertNotIn("-p", cmd)

        # Prompt is delivered via stdin (input kwarg), not as a positional arg
        input_data = agent_call.kwargs.get("input")
        self.assertIsNotNone(input_data)
        assert input_data is not None
        self.assertIn("implement feature X", input_data)


class TestInvokeAgentAgenticCodex(unittest.TestCase):
    """invoke_agent_agentic builds correct cmd for codex (stdin mode, - sentinel)."""

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("subprocess.run")
    def test_codex_cmd_uses_stdin_with_dash_sentinel(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        agent = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto", "--skip-git-repo-check"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            # subprocess.run is patched, so the helper's git commands are
            # recorded on mock_run too; hence the search by executable below.
            _init_git_repo(wt)
            invoke_agent_agentic(
                agent,
                "implement feature Y",
                "coding",
                worktree_path=wt,
                quiet=True,
            )

        agent_call, cmd = _find_agent_call(mock_run, "codex")
        self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'codex'")
        assert agent_call is not None

        # Should have "-" sentinel at the end for stdin
        self.assertEqual(cmd[-1], "-")

        # Stdin input should contain the prompt
        input_data = agent_call.kwargs.get("input")
        self.assertIsNotNone(input_data)
        assert input_data is not None
        self.assertIn("implement feature Y", input_data)


class TestTaskFileCleanup(unittest.TestCase):
    """No CROSS_EVAL_TASK.md is left inside the worktree after an agentic run."""

    @patch("cross_eval.worktree.capture_diff", return_value="(no changes)")
    @patch("subprocess.run")
    def test_task_file_in_tmp_not_worktree(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        agent = AgentConfig(
            name="claude-coder",
            command="claude",
            args=[],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            _init_git_repo(wt)
            invoke_agent_agentic(
                agent,
                "do stuff",
                "coding",
                worktree_path=wt,
                quiet=True,
            )

            # Task file should NOT be in the worktree (it's in /tmp).
            # Checked while the temp dir still exists so the assertion is meaningful.
            self.assertFalse((wt / "CROSS_EVAL_TASK.md").exists())


class TestAgenticEmptyDiffDetection(unittest.TestCase):
    """Agentic coders should not succeed when they only claim changes in stdout."""

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_claude_empty_diff_with_change_claim_fails(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        # The agent's stdout claims edits while the captured diff is empty.
        claimed_changes = (
            "All tests pass.\n"
            "Here's a summary of all changes made:\n"
            "- Updated discovery.py\n"
        )
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout=claimed_changes,
            stderr="",
        )
        agent = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            _init_git_repo(wt)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(
                    agent,
                    "implement feature X",
                    "coding",
                    worktree_path=wt,
                    quiet=True,
                )

        self.assertEqual(ctx.exception.failure_type, "EMPTY_DIFF")
        self.assertIn("summary of all changes made", ctx.exception.raw_error.lower())

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_empty_diff_without_change_claim_is_allowed(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="No changes were required; the current implementation already satisfies the task.",
            stderr="",
        )
        agent = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            _init_git_repo(wt)
            result = invoke_agent_agentic(
                agent,
                "check whether any fix is needed",
                "coding",
                worktree_path=wt,
                quiet=True,
            )

        self.assertEqual(result.output, "(no changes)")
# ===================================================================
# 3. config.py tests
# ===================================================================


class TestMakeAgenticClaude(unittest.TestCase):
    """_make_agentic on a claude agent removes -p / --print and sets agentic."""

    def test_strips_dash_p_and_sets_agentic(self) -> None:
        cfg = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["-p", "--setting-sources", "user", "--model", "opus"],
        )
        # A config built without an explicit flag is expected to be non-agentic.
        self.assertFalse(cfg.agentic)

        _make_agentic(cfg)

        self.assertTrue(cfg.agentic)
        self.assertNotIn("-p", cfg.args)
        # Unrelated flags must be left in place.
        self.assertIn("--setting-sources", cfg.args)

    def test_strips_dash_dash_print_alias(self) -> None:
        cfg = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--print", "--setting-sources", "user"],
        )

        _make_agentic(cfg)

        self.assertTrue(cfg.agentic)
        self.assertNotIn("--print", cfg.args)

    def test_idempotent_when_no_dash_p(self) -> None:
        untouched = ["--setting-sources", "user"]
        cfg = AgentConfig(
            name="claude-coder",
            command="claude",
            args=list(untouched),
        )

        _make_agentic(cfg)

        self.assertTrue(cfg.agentic)
        # With no print flag present the argument list comes back unchanged.
        self.assertEqual(cfg.args, untouched)


class TestMakeAgenticCodex(unittest.TestCase):
    """_make_agentic also works on a codex agent, which has no -p to strip."""

    def test_codex_agentic_works(self) -> None:
        cfg = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto", "-"],
        )

        _make_agentic(cfg)

        self.assertTrue(cfg.agentic)
        # -p was never there, so the codex-specific arguments must survive.
        for kept in ("exec", "--full-auto"):
            self.assertIn(kept, cfg.args)


# ===================================================================
# 4.
# pipeline integration tests
# ===================================================================


def _make_agentic_config(
    run_dir: Path,
    agentic_coder: bool = True,
    *,
    use_worktree: bool = False,
) -> PipelineConfig:
    """Assemble a two-step (coding, then review) pipeline config for tests.

    The coder's ``agentic`` flag follows *agentic_coder*; the reviewer is
    always non-agentic and owns the verdict step. *run_dir* becomes the
    config's ``output_dir`` and *use_worktree* is passed through unchanged.
    """
    agents = {
        "claude-coder": AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
            agentic=agentic_coder,
        ),
        "claude-reviewer": AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["-p", "--setting-sources", "user"],
            agentic=False,
        ),
    }
    coding_step = StepConfig(
        name="coding",
        agent="claude-coder",
        role="coding",
        prompt_template="default:coding",
        output_key="coding_output",
    )
    review_step = StepConfig(
        name="review",
        agent="claude-reviewer",
        role="review",
        prompt_template="default:review",
        output_key="review_result",
        verdict=True,
    )
    return PipelineConfig(
        output_dir=run_dir,
        use_worktree=use_worktree,
        max_iterations=2,
        min_iterations=1,
        language="en",
        inputs={"plan": "Test plan", "checklist": "Test checklist"},
        agents=agents,
        coders=["claude-coder"],
        reviewers=["claude-reviewer"],
        pipeline=[coding_step, review_step],
        preset_name="simple",
    )


def _coder_result() -> AgentResult:
    """Canned successful result for the ``coding`` step run by ``claude-coder``."""
    return AgentResult(
        output="diff output",
        exit_code=0,
        agent_name="claude-coder",
        step_name="coding",
        duration_seconds=0.1,
    )


def _reviewer_result(agent_name: str = "claude-reviewer") -> AgentResult:
    """Canned passing verdict for the ``review`` step, attributed to *agent_name*."""
    return AgentResult(
        output="VERDICT: PASS",
        exit_code=0,
        agent_name=agent_name,
        step_name="review",
        duration_seconds=0.1,
    )


def _prime_worktree_mocks(
    run_dir: Path,
    mock_setup: MagicMock,
    mock_invoke_agentic: MagicMock,
    mock_invoke: MagicMock,
) -> Path:
    """Create ``<run_dir>/work`` and wire the three mocks for a one-pass run.

    ``mock_setup`` reports that directory as the worktree on branch
    ``cross-eval/test``; the two invoke mocks return the canned coder and
    reviewer results. Returns the worktree path.
    """
    wt_path = run_dir / "work"
    wt_path.mkdir()
    mock_setup.return_value = (wt_path, "cross-eval/test", "a" * 40)
    mock_invoke_agentic.return_value = _coder_result()
    mock_invoke.return_value = _reviewer_result()
    return wt_path


class TestSetupWorktreeCalledForAgentic(unittest.TestCase):
    """With an agentic coder and use_worktree=True, _setup_worktree is called."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_setup_worktree_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        with tempfile.TemporaryDirectory() as td:
            run_dir = Path(td)
            config = _make_agentic_config(run_dir, use_worktree=True)
            _prime_worktree_mocks(run_dir, mock_setup, mock_invoke_agentic, mock_invoke)

            run_pipeline(config, cwd=Path(td))

            mock_setup.assert_called_once()


class TestDirectAgenticMode(unittest.TestCase):
    """Agentic coders run in the current working tree by default."""

    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_agentic_uses_current_worktree_by_default(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
    ) -> None:
        with tempfile.TemporaryDirectory() as td:
            repo = Path(td)
            _init_git_repo(repo)
            run_dir = repo / ".cross-eval" / "output"
            run_dir.mkdir(parents=True, exist_ok=True)

            # use_worktree is left at the helper's default (False).
            config = _make_agentic_config(run_dir)
            mock_invoke_agentic.return_value = _coder_result()
            mock_invoke.return_value = _reviewer_result()

            run_pipeline(config, cwd=repo)

            mock_setup.assert_not_called()
            coder_kwargs = mock_invoke_agentic.call_args.kwargs
            self.assertEqual(coder_kwargs["worktree_path"], repo)
            reviewer_kwargs = mock_invoke.call_args.kwargs
            self.assertEqual(reviewer_kwargs["cwd"], repo)


class TestSetupWorktreeLocation(unittest.TestCase):
    """_setup_worktree places agentic worktrees outside the base repo."""

    def test_worktree_is_created_outside_repo(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            base = Path(td) / "repo"
            base.mkdir()
            run_dir = base / ".cross-eval" / "output" / "smoke"
            run_dir.mkdir(parents=True)
            _init_git_repo(base)

            worktree_path, branch_name, _base_commit = _setup_worktree(
                base, run_dir, "review-fix"
            )
            # The assertions require the worktree to sit outside the repo, so
            # it is removed explicitly even when an assertion fails.
            try:
                self.assertTrue(worktree_path.exists())
                self.assertNotIn(str(base.resolve()), str(worktree_path.resolve()))

                recorded_path = (run_dir / "worktree_path.txt").read_text(encoding="utf-8")
                self.assertEqual(recorded_path.strip(), str(worktree_path))

                recorded_branch = (run_dir / "worktree_branch.txt").read_text(encoding="utf-8")
                self.assertEqual(recorded_branch.strip(), branch_name)
            finally:
                remove_worktree(base, worktree_path)


class TestReviewerRunsInWorktreeCwd(unittest.TestCase):
    """Reviewer runs with worktree cwd (not original cwd) when worktree exists."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_reviewer_uses_worktree_cwd(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        with tempfile.TemporaryDirectory() as td:
            run_dir = Path(td)
            config = _make_agentic_config(run_dir, use_worktree=True)
            wt_path = _prime_worktree_mocks(
                run_dir, mock_setup, mock_invoke_agentic, mock_invoke
            )

            run_pipeline(config, cwd=Path(td))

            # The reviewer (non-agentic) should have been called with cwd=worktree_path;
            # accept it as a keyword or as the fourth positional argument.
            reviewer_call = mock_invoke.call_args
            observed_cwd = reviewer_call.kwargs.get("cwd") or reviewer_call.args[3]
            self.assertEqual(observed_cwd, wt_path)


class TestCommitIterationCalled(unittest.TestCase):
    """_commit_iteration is called after each iteration when worktree exists."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_commit_iteration_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        with tempfile.TemporaryDirectory() as td:
            run_dir = Path(td)
            config = _make_agentic_config(run_dir, use_worktree=True)
            wt_path = _prime_worktree_mocks(
                run_dir, mock_setup, mock_invoke_agentic, mock_invoke
            )

            run_pipeline(config, cwd=Path(td))

            # The reviewer passes on the first iteration, so exactly one commit.
            mock_commit_iter.assert_called_once()
            first_positional = mock_commit_iter.call_args.args[0]
            self.assertEqual(first_positional, wt_path)


class TestFinalizeWorktreeCalled(unittest.TestCase):
    """_finalize_worktree is invoked once at the end of the run."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_finalize_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        with tempfile.TemporaryDirectory() as td:
            run_dir = Path(td)
            config = _make_agentic_config(run_dir, use_worktree=True)
            wt_path = _prime_worktree_mocks(
                run_dir, mock_setup, mock_invoke_agentic, mock_invoke
            )

            run_pipeline(config, cwd=Path(td))

            mock_finalize.assert_called_once()
            # Should pass cwd, worktree_path, branch_name, preset_name, verdict
            positional = mock_finalize.call_args.args
            self.assertEqual(positional[1], wt_path)
            self.assertEqual(positional[2], "cross-eval/test")


class TestParallelAgenticFallsBackToSequential(unittest.TestCase):
    """Multiple agentic steps in parallel batch fall back to sequential."""

    def test_has_agentic_steps_detects_agentic(self) -> None:
        agents = {
            "claude-coder": AgentConfig(
                name="claude-coder",
                command="claude",
                args=[],
                agentic=True,
            ),
            "claude-reviewer": AgentConfig(
                name="claude-reviewer",
                command="claude",
                args=[],
                agentic=False,
            ),
        }
        config = PipelineConfig(agents=agents)
        # Only the agentic coder's step is passed in.
        steps = [
            StepConfig(
                name="a",
                agent="claude-coder",
                role="coding",
                prompt_template="default:coding",
                output_key="a",
            ),
        ]

        self.assertTrue(_has_agentic_steps(config, steps))

    def test_has_agentic_steps_returns_false_without_agentic(self) -> None:
        config = PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(
                    name="claude-reviewer",
                    command="claude",
                    args=[],
                    agentic=False,
                ),
            },
        )
        steps = [
            StepConfig(
                name="r",
                agent="claude-reviewer",
                role="review",
                prompt_template="default:review",
                output_key="r",
                verdict=True,
            ),
        ]

        self.assertFalse(_has_agentic_steps(config, steps))

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_parallel_agentic_runs_sequentially(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """When multiple agentic steps are parallel, they should run sequentially."""
        with tempfile.TemporaryDirectory() as td:
            run_dir = Path(td)

            agents = {
                "coder-a": AgentConfig(
                    name="coder-a",
                    command="claude",
                    args=[],
                    agentic=True,
                ),
                "coder-b": AgentConfig(
                    name="coder-b",
                    command="claude",
                    args=[],
                    agentic=True,
                ),
                "reviewer": AgentConfig(
                    name="reviewer",
                    command="claude",
                    args=["-p"],
                    agentic=False,
                ),
            }
            # Two coding steps flagged parallel, followed by the verdict step.
            steps = [
                StepConfig(
                    name="code_a",
                    agent="coder-a",
                    role="coding",
                    prompt_template="default:coding",
                    output_key="code_a",
                    parallel=True,
                ),
                StepConfig(
                    name="code_b",
                    agent="coder-b",
                    role="coding",
                    prompt_template="default:coding",
                    output_key="code_b",
                    parallel=True,
                ),
                StepConfig(
                    name="review",
                    agent="reviewer",
                    role="review",
                    prompt_template="default:review",
                    output_key="review_result",
                    verdict=True,
                ),
            ]
            config = PipelineConfig(
                output_dir=run_dir,
                max_iterations=1,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=agents,
                coders=["coder-a", "coder-b"],
                reviewers=["reviewer"],
                pipeline=steps,
                preset_name="custom",
            )

            wt_path = run_dir / "work"
            wt_path.mkdir()
            mock_setup.return_value = (wt_path, "cross-eval/test", "a" * 40)

            call_order: list[str] = []

            def _track_agentic(agent_config, prompt, step_name, **kwargs):
                # Record the step name in invocation order, then succeed.
                call_order.append(step_name)
                return AgentResult(
                    output="diff",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=0.1,
                )

            mock_invoke_agentic.side_effect = _track_agentic
            mock_invoke.return_value = _reviewer_result("reviewer")

            run_pipeline(config, cwd=Path(td))

            # Both agentic steps should have been called (sequentially)
            agentic_calls = [step for step in call_order if step.startswith("code_")]
            self.assertEqual(len(agentic_calls), 2)
            # They should appear in order (sequential, not concurrent)
            self.assertEqual(agentic_calls, ["code_a", "code_b"])


if __name__ == "__main__":
    unittest.main()