Files
cross-eval/tests/test_agentic.py
chungyeong 60c7b07939 fix: capture_diff uses base commit to handle agent self-commits
Claude in agentic mode (interactive, no -p flag) commits its own changes,
advancing HEAD. This made `git diff --cached HEAD` return empty, triggering
false EMPTY_DIFF errors every time. Now capture_diff diffs against the
base commit SHA recorded at worktree creation, so changes are captured
regardless of whether the agent committed them.

Also adds UX_IMPROVEMENT_PLAN.md for guided message improvements.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 23:59:53 +09:00

857 lines
30 KiB
Python

"""Comprehensive tests for the agentic worktree flow.
Covers:
1. worktree.py unit tests (real temp git repo)
2. agent.py agentic tests (mocking subprocess)
3. config.py _make_agentic tests
4. pipeline integration tests (mock invoke_agent / invoke_agent_agentic)
"""
from __future__ import annotations
import subprocess
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, patch
from cross_eval.agent import AgentInvocationError, invoke_agent_agentic
from cross_eval.config import _make_agentic
from cross_eval.models import (
AgentConfig,
AgentResult,
PipelineConfig,
StepConfig,
)
from cross_eval.pipeline import (
_assert_base_repo_isolation,
_has_agentic_steps,
_setup_worktree,
run_pipeline,
)
from cross_eval.worktree import (
capture_diff,
commit_worktree,
create_worktree,
make_branch_name,
make_worktree_dir,
remove_worktree,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _init_git_repo(path: Path) -> None:
    """Initialise a minimal git repo with one commit.

    Runs ``git init`` in *path*, sets a repo-local identity, writes a
    ``README.md`` and records it as the ``initial`` commit.

    Every git command runs with ``check=True``, so a failing step raises
    ``subprocess.CalledProcessError``.  ``subprocess.run`` is looked up at
    call time, so tests that patch it turn the git steps into no-ops and only
    the README write happens.
    """

    def _git(*args: str) -> None:
        # One place for the shared cwd / capture / check settings.
        subprocess.run(["git", *args], cwd=path, capture_output=True, check=True)

    _git("init")
    _git("config", "user.email", "test@test.com")
    _git("config", "user.name", "Test")
    # A global ``commit.gpgsign=true`` would make the commit below try to sign
    # (and fail or prompt); pin it off for this throwaway repository.
    _git("config", "commit.gpgsign", "false")
    # Explicit encoding keeps the fixture independent of the platform default.
    (path / "README.md").write_text("# init\n", encoding="utf-8")
    _git("add", ".")
    _git("commit", "-m", "initial")
# ===================================================================
# 1. worktree.py unit tests (real temp git repo)
# ===================================================================
class TestCreateWorktree(unittest.TestCase):
    """Check that create_worktree yields a worktree on a freshly named branch."""

    def test_creates_worktree_and_branch(self) -> None:
        """The worktree path exists, a 40-char commit id is returned, and the
        branch is listed in the original repository."""
        branch_name = "cross-eval/test_branch"
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            repo = root / "repo"
            repo.mkdir()
            _init_git_repo(repo)
            checkout = root / "wt"

            created_path, base_sha = create_worktree(repo, checkout, branch_name)

            # The returned worktree location must be present on disk.
            self.assertTrue(created_path.exists())
            # The second tuple element is expected to be 40 characters long.
            self.assertEqual(len(base_sha), 40)

            # Ask the original repo (not the worktree) whether the branch exists.
            listing = subprocess.run(
                ["git", "branch", "--list", branch_name],
                cwd=repo,
                capture_output=True,
                text=True,
            )
            self.assertIn(branch_name, listing.stdout)

            # Detach the worktree before the temp directory is deleted.
            remove_worktree(repo, checkout)
class TestCaptureDiff(unittest.TestCase):
    """Check that capture_diff reports both added and edited files."""

    def test_captures_new_and_modified_files(self) -> None:
        """An untracked file and a rewritten README both show up in the diff."""
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            repo = root / "repo"
            repo.mkdir()
            _init_git_repo(repo)
            checkout = root / "wt"
            # The (path, base commit) return value is not needed here.
            create_worktree(repo, checkout, "cross-eval/diff_test")

            # One brand-new file plus one edit to a tracked file.
            new_file = checkout / "new_file.txt"
            new_file.write_text("hello\n")
            readme = checkout / "README.md"
            readme.write_text("# modified\n")

            captured = capture_diff(checkout)

            for expected in ("new_file.txt", "hello", "modified"):
                self.assertIn(expected, captured)

            remove_worktree(repo, checkout)
class TestCommitWorktree(unittest.TestCase):
    """commit_worktree reports True after committing changes, False otherwise."""

    @staticmethod
    def _prepare(tmp: str, branch: str) -> tuple[Path, Path]:
        """Create a repo plus a worktree on *branch* under *tmp*.

        Returns the ``(repo, worktree)`` directory pair.
        """
        root = Path(tmp)
        repo = root / "repo"
        repo.mkdir()
        _init_git_repo(repo)
        checkout = root / "wt"
        create_worktree(repo, checkout, branch)
        return repo, checkout

    def test_commit_returns_true_on_changes(self) -> None:
        """A new file in the worktree makes the commit succeed."""
        with tempfile.TemporaryDirectory() as tmp:
            repo, checkout = self._prepare(tmp, "cross-eval/commit_test")
            (checkout / "file.txt").write_text("data\n")

            committed = commit_worktree(checkout, "test commit")

            self.assertTrue(committed)
            remove_worktree(repo, checkout)

    def test_commit_returns_false_when_nothing_to_commit(self) -> None:
        """An untouched worktree has nothing to commit."""
        with tempfile.TemporaryDirectory() as tmp:
            repo, checkout = self._prepare(tmp, "cross-eval/empty_commit")

            committed = commit_worktree(checkout, "empty")

            self.assertFalse(committed)
            remove_worktree(repo, checkout)
class TestRemoveWorktree(unittest.TestCase):
    """Removing a worktree deletes its directory but keeps its branch."""

    def test_branch_survives_worktree_removal(self) -> None:
        """After remove_worktree the directory is gone and the branch is still listed."""
        branch_name = "cross-eval/remove_test"
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            repo = root / "repo"
            repo.mkdir()
            _init_git_repo(repo)
            checkout = root / "wt"

            create_worktree(repo, checkout, branch_name)
            remove_worktree(repo, checkout)

            # The checkout directory must no longer exist ...
            self.assertFalse(checkout.exists())

            # ... while the branch is still known to the original repository.
            listing = subprocess.run(
                ["git", "branch", "--list", branch_name],
                cwd=repo,
                capture_output=True,
                text=True,
            )
            self.assertIn(branch_name, listing.stdout)
class TestMakeBranchName(unittest.TestCase):
    """Check the shape of names produced by make_branch_name."""

    def test_format(self) -> None:
        """The name is ``cross-eval/<step>_`` followed by a 15-character suffix."""
        generated = make_branch_name("review-fix")
        self.assertTrue(generated.startswith("cross-eval/review-fix_"))

        # Splitting on the first underscore separates the prefix from the
        # timestamp-like tail (e.g. 20260313_123456).
        pieces = generated.split("_", 1)
        self.assertEqual(len(pieces), 2)

        stamp = pieces[1]
        # YYYYMMDD_HHMMSS is 8 + 1 + 6 characters.
        self.assertEqual(len(stamp), 15)
class TestMakeWorktreeDir(unittest.TestCase):
    """make_worktree_dir chooses an external temp location."""

    def test_uses_tmp_dir_outside_repo(self) -> None:
        """The chosen directory mentions ``cross-eval-worktrees`` and is not
        located underneath the base repository."""
        with tempfile.TemporaryDirectory() as td:
            base = Path(td) / "repo"
            base.mkdir()
            path = make_worktree_dir(base, "cross-eval/review-fix_20260313_123456")
            self.assertIn("cross-eval-worktrees", str(path))
            # Compare resolved paths: when the temp dir sits behind a symlink,
            # an unresolved prefix check can never match and would pass even
            # for a directory that really lives inside the repo.
            self.assertNotIn(str(base.resolve()), str(Path(path).resolve()))
class TestBaseRepoIsolation(unittest.TestCase):
    """A changed base repository state must abort agentic execution early."""

    def test_raises_when_base_repo_state_changes(self) -> None:
        """A baseline that differs from the current base-repo state raises RuntimeError."""
        # The recorded baseline contains a diff.  The base directory below is
        # not a git repository (per the original note it then yields {}), so
        # the two states cannot match and the isolation check must fire.
        recorded_state = {
            "diff": "diff --git a/file.py ...\n",
            "untracked": "",
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            repo_dir = root / "repo"
            worktree_dir = root / "worktree"
            repo_dir.mkdir()
            worktree_dir.mkdir()

            with self.assertRaises(RuntimeError) as raised:
                _assert_base_repo_isolation(
                    repo_dir,
                    recorded_state,
                    step_name="coding",
                    agent_name="claude-coder",
                    worktree_path=worktree_dir,
                    baseline_status="M file.py",
                )

            message = str(raised.exception)
            self.assertIn("base repository", message)
# ===================================================================
# 2. agent.py agentic tests (mocking subprocess)
# ===================================================================
class TestInvokeAgentAgenticClaude(unittest.TestCase):
    """invoke_agent_agentic builds correct cmd for claude (no -p, prompt via stdin)."""

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("subprocess.run")
    def test_claude_cmd_has_no_dash_p_and_prompt_via_stdin(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """The claude command line carries no ``-p`` and the prompt is sent as ``input``."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        agent = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user", "--dangerously-skip-permissions"],
            agentic=True,
        )
        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            # subprocess.run is patched, so this only writes README.md; the
            # git commands inside the helper hit the mock.
            _init_git_repo(wt)
            invoke_agent_agentic(
                agent, "implement feature X", "coding",
                worktree_path=wt, quiet=True,
            )

        # Find the subprocess.run call that actually runs the agent.  Keep the
        # command and kwargs found here: the command may have been passed
        # positionally or as ``args=...``, so re-reading it positionally
        # afterwards could raise IndexError.
        agent_cmd = None
        agent_kwargs = None
        for c in mock_run.call_args_list:
            positional, keyword = c[0], c[1]
            cmd = positional[0] if positional else keyword.get("args", [])
            if cmd and cmd[0] == "claude":
                agent_cmd, agent_kwargs = cmd, keyword
                break
        self.assertIsNotNone(agent_cmd, "Expected a subprocess.run call with 'claude'")
        assert agent_cmd is not None and agent_kwargs is not None

        # No -p flag
        self.assertNotIn("-p", agent_cmd)

        # Prompt is delivered via stdin (input kwarg), not as a positional arg
        input_data = agent_kwargs.get("input")
        self.assertIsNotNone(input_data)
        assert input_data is not None
        self.assertIn("implement feature X", input_data)
class TestInvokeAgentAgenticCodex(unittest.TestCase):
    """invoke_agent_agentic builds correct cmd for codex (stdin mode, - sentinel)."""

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("subprocess.run")
    def test_codex_cmd_uses_stdin_with_dash_sentinel(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """The codex command line ends with ``-`` and the prompt is sent as ``input``."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        agent = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto", "--skip-git-repo-check"],
            agentic=True,
        )
        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            # subprocess.run is patched, so this only writes README.md; the
            # git commands inside the helper hit the mock.
            _init_git_repo(wt)
            invoke_agent_agentic(
                agent, "implement feature Y", "coding",
                worktree_path=wt, quiet=True,
            )

        # Keep the command and kwargs located by the search: the command may
        # have been passed positionally or as ``args=...``, so re-reading it
        # positionally afterwards could raise IndexError.
        agent_cmd = None
        agent_kwargs = None
        for c in mock_run.call_args_list:
            positional, keyword = c[0], c[1]
            cmd = positional[0] if positional else keyword.get("args", [])
            if cmd and cmd[0] == "codex":
                agent_cmd, agent_kwargs = cmd, keyword
                break
        self.assertIsNotNone(agent_cmd, "Expected a subprocess.run call with 'codex'")
        assert agent_cmd is not None and agent_kwargs is not None

        # Should have "-" sentinel at the end for stdin
        self.assertEqual(agent_cmd[-1], "-")

        # Stdin input should contain the prompt
        input_data = agent_kwargs.get("input")
        self.assertIsNotNone(input_data)
        assert input_data is not None
        self.assertIn("implement feature Y", input_data)
class TestTaskFileCleanup(unittest.TestCase):
    """No task file is left behind inside the worktree after an agentic run."""

    @patch("cross_eval.worktree.capture_diff", return_value="(no changes)")
    @patch("subprocess.run")
    def test_task_file_in_tmp_not_worktree(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """``CROSS_EVAL_TASK.md`` must not exist in the worktree once the agent returns."""
        fake_process = MagicMock(returncode=0, stdout="ok", stderr="")
        mock_run.return_value = fake_process
        coder = AgentConfig(
            name="claude-coder",
            command="claude",
            args=[],
            agentic=True,
        )
        with tempfile.TemporaryDirectory() as tmp:
            worktree = Path(tmp)
            _init_git_repo(worktree)

            invoke_agent_agentic(
                coder,
                "do stuff",
                "coding",
                worktree_path=worktree,
                quiet=True,
            )

            # Per the original note the task file lives in /tmp, so the
            # worktree must not contain it.
            task_file = worktree / "CROSS_EVAL_TASK.md"
            self.assertFalse(task_file.exists())
class TestAgenticEmptyDiffDetection(unittest.TestCase):
    """An agentic coder that merely *claims* changes in stdout must not pass."""

    @staticmethod
    def _claude_coder() -> AgentConfig:
        """Build the agentic claude coder configuration shared by both tests."""
        return AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
            agentic=True,
        )

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_claude_empty_diff_with_change_claim_fails(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """Empty diff plus a "summary of changes" in stdout raises EMPTY_DIFF."""
        claimed_output = (
            "All tests pass.\n"
            "Here's a summary of all changes made:\n"
            "- Updated discovery.py\n"
        )
        mock_run.return_value = MagicMock(
            returncode=0, stdout=claimed_output, stderr="",
        )
        coder = self._claude_coder()

        with tempfile.TemporaryDirectory() as tmp:
            worktree = Path(tmp)
            _init_git_repo(worktree)
            with self.assertRaises(AgentInvocationError) as raised:
                invoke_agent_agentic(
                    coder,
                    "implement feature X",
                    "coding",
                    worktree_path=worktree,
                    quiet=True,
                )

        error = raised.exception
        self.assertEqual(error.failure_type, "EMPTY_DIFF")
        # The agent's own claim is expected to be echoed in the raw error text.
        self.assertIn("summary of all changes made", error.raw_error.lower())

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_empty_diff_without_change_claim_is_allowed(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """Empty diff with an honest "no changes" message yields "(no changes)"."""
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="No changes were required; the current implementation already satisfies the task.",
            stderr="",
        )
        coder = self._claude_coder()

        with tempfile.TemporaryDirectory() as tmp:
            worktree = Path(tmp)
            _init_git_repo(worktree)
            outcome = invoke_agent_agentic(
                coder,
                "check whether any fix is needed",
                "coding",
                worktree_path=worktree,
                quiet=True,
            )

        self.assertEqual(outcome.output, "(no changes)")
# ===================================================================
# 3. config.py tests
# ===================================================================
class TestMakeAgenticClaude(unittest.TestCase):
    """_make_agentic removes print-mode flags from claude args and flags the agent agentic."""

    @staticmethod
    def _claude(args: list[str]) -> AgentConfig:
        """Return a claude coder config with the given argument list."""
        return AgentConfig(name="claude-coder", command="claude", args=args)

    def test_strips_dash_p_and_sets_agentic(self) -> None:
        """``-p`` disappears, other args stay, and ``agentic`` flips to True."""
        coder = self._claude(["-p", "--setting-sources", "user", "--model", "opus"])
        # Freshly built configs start out non-agentic.
        self.assertFalse(coder.agentic)

        _make_agentic(coder)

        self.assertTrue(coder.agentic)
        self.assertNotIn("-p", coder.args)
        self.assertIn("--setting-sources", coder.args)

    def test_strips_dash_dash_print_alias(self) -> None:
        """The long ``--print`` spelling is removed as well."""
        coder = self._claude(["--print", "--setting-sources", "user"])

        _make_agentic(coder)

        self.assertTrue(coder.agentic)
        self.assertNotIn("--print", coder.args)

    def test_idempotent_when_no_dash_p(self) -> None:
        """Args without a print flag come through unchanged."""
        coder = self._claude(["--setting-sources", "user"])

        _make_agentic(coder)

        self.assertTrue(coder.agentic)
        self.assertEqual(coder.args, ["--setting-sources", "user"])
class TestMakeAgenticCodex(unittest.TestCase):
    """_make_agentic also handles a codex agent, which has no -p flag to drop."""

    def test_codex_agentic_works(self) -> None:
        """The agent becomes agentic and keeps its ``exec`` / ``--full-auto`` args."""
        codex = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto", "-"],
        )

        _make_agentic(codex)

        self.assertTrue(codex.agentic)
        # There was no -p to strip, so the original flags must still be present.
        for kept in ("exec", "--full-auto"):
            self.assertIn(kept, codex.args)
# ===================================================================
# 4. pipeline integration tests
# ===================================================================
def _make_agentic_config(
    run_dir: Path,
    agentic_coder: bool = True,
) -> PipelineConfig:
    """Return a two-step pipeline config: a claude coder followed by a claude reviewer.

    *run_dir* becomes the config's ``output_dir``.  *agentic_coder* sets the
    coder's ``agentic`` flag; the reviewer is always non-agentic and is the
    step that carries the verdict.
    """
    coding_step = StepConfig(
        name="coding",
        agent="claude-coder",
        role="coding",
        prompt_template="default:coding",
        output_key="coding_output",
    )
    review_step = StepConfig(
        name="review",
        agent="claude-reviewer",
        role="review",
        prompt_template="default:review",
        output_key="review_result",
        verdict=True,
    )

    agents_by_name = {
        "claude-coder": AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
            agentic=agentic_coder,
        ),
        "claude-reviewer": AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["-p", "--setting-sources", "user"],
            agentic=False,
        ),
    }

    return PipelineConfig(
        output_dir=run_dir,
        max_iterations=2,
        min_iterations=1,
        language="en",
        inputs={"plan": "Test plan", "checklist": "Test checklist"},
        agents=agents_by_name,
        coders=["claude-coder"],
        reviewers=["claude-reviewer"],
        pipeline=[coding_step, review_step],
        preset_name="simple",
    )
class TestSetupWorktreeCalledForAgentic(unittest.TestCase):
    """run_pipeline asks for a worktree when an agentic agent is configured."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_setup_worktree_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """With every pipeline collaborator mocked, _setup_worktree runs exactly once."""
        # Canned results: the coder "produces" a diff, the reviewer passes it.
        coder_result = AgentResult(
            output="diff output", exit_code=0,
            agent_name="claude-coder", step_name="coding",
            duration_seconds=0.1,
        )
        reviewer_result = AgentResult(
            output="VERDICT: PASS", exit_code=0,
            agent_name="claude-reviewer", step_name="review",
            duration_seconds=0.1,
        )
        mock_invoke_agentic.return_value = coder_result
        mock_invoke.return_value = reviewer_result

        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            pipeline_config = _make_agentic_config(out_dir)
            fake_worktree = out_dir / "work"
            fake_worktree.mkdir()
            # (worktree path, branch name, base commit placeholder)
            mock_setup.return_value = (fake_worktree, "cross-eval/test", "a" * 40)

            run_pipeline(pipeline_config, cwd=Path(tmp))

        mock_setup.assert_called_once()
class TestSetupWorktreeLocation(unittest.TestCase):
    """_setup_worktree must put agentic worktrees outside the base repository."""

    def test_worktree_is_created_outside_repo(self) -> None:
        """The worktree exists outside the repo and its path/branch are recorded
        in ``worktree_path.txt`` / ``worktree_branch.txt`` under the run dir."""
        with tempfile.TemporaryDirectory() as tmp:
            repo = Path(tmp) / "repo"
            output_dir = repo / ".cross-eval" / "output" / "smoke"
            repo.mkdir()
            output_dir.mkdir(parents=True)
            _init_git_repo(repo)

            created, branch, _base_commit = _setup_worktree(repo, output_dir, "review-fix")
            try:
                self.assertTrue(created.exists())
                # Compare resolved paths so symlinked temp dirs cannot hide containment.
                repo_real = str(repo.resolve())
                created_real = str(created.resolve())
                self.assertNotIn(repo_real, created_real)

                recorded_path = (output_dir / "worktree_path.txt").read_text(encoding="utf-8")
                self.assertEqual(recorded_path.strip(), str(created))

                recorded_branch = (output_dir / "worktree_branch.txt").read_text(encoding="utf-8")
                self.assertEqual(recorded_branch.strip(), branch)
            finally:
                # Always detach the worktree, even when an assertion failed.
                remove_worktree(repo, created)
class TestReviewerRunsInWorktreeCwd(unittest.TestCase):
    """Reviewer runs with worktree cwd (not original cwd) when worktree exists."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_reviewer_uses_worktree_cwd(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """The last invoke_agent call receives the worktree path as its cwd."""
        with tempfile.TemporaryDirectory() as td:
            run_dir = Path(td)
            config = _make_agentic_config(run_dir)
            wt_path = run_dir / "work"
            wt_path.mkdir()
            mock_setup.return_value = (wt_path, "cross-eval/test", "a" * 40)
            mock_invoke_agentic.return_value = AgentResult(
                output="diff output", exit_code=0,
                agent_name="claude-coder", step_name="coding",
                duration_seconds=0.1,
            )
            mock_invoke.return_value = AgentResult(
                output="VERDICT: PASS", exit_code=0,
                agent_name="claude-reviewer", step_name="review",
                duration_seconds=0.1,
            )

            run_pipeline(config, cwd=Path(td))

            # Fail with a clear assertion (not a TypeError on ``None``) if the
            # reviewer was never invoked.
            mock_invoke.assert_called()
            reviewer_call = mock_invoke.call_args
            positional, keyword = reviewer_call[0], reviewer_call[1]

            # The reviewer (non-agentic) should have been called with
            # cwd=worktree_path.  Prefer the keyword form and fall back to the
            # fourth positional argument, guarding the index so a missing cwd
            # is reported as an assertion failure instead of an IndexError.
            cwd_used = keyword.get("cwd")
            if not cwd_used and len(positional) > 3:
                cwd_used = positional[3]
            self.assertEqual(cwd_used, wt_path)
class TestCommitIterationCalled(unittest.TestCase):
    """With a worktree in place, run_pipeline commits after the iteration."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_commit_iteration_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """_commit_iteration is invoked once, with the worktree path as first argument."""
        # Canned results: one coding pass that the reviewer accepts right away.
        mock_invoke_agentic.return_value = AgentResult(
            output="diff output", exit_code=0,
            agent_name="claude-coder", step_name="coding",
            duration_seconds=0.1,
        )
        mock_invoke.return_value = AgentResult(
            output="VERDICT: PASS", exit_code=0,
            agent_name="claude-reviewer", step_name="review",
            duration_seconds=0.1,
        )

        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            pipeline_config = _make_agentic_config(out_dir)
            fake_worktree = out_dir / "work"
            fake_worktree.mkdir()
            mock_setup.return_value = (fake_worktree, "cross-eval/test", "a" * 40)

            run_pipeline(pipeline_config, cwd=Path(tmp))

            mock_commit_iter.assert_called_once()
            positional_args = mock_commit_iter.call_args[0]
            self.assertEqual(positional_args[0], fake_worktree)
class TestFinalizeWorktreeCalled(unittest.TestCase):
    """run_pipeline hands the worktree to _finalize_worktree when it finishes."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_finalize_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """_finalize_worktree is invoked once with the worktree path and branch name."""
        branch_name = "cross-eval/test"
        # Canned results: one coding pass that the reviewer accepts right away.
        mock_invoke_agentic.return_value = AgentResult(
            output="diff output", exit_code=0,
            agent_name="claude-coder", step_name="coding",
            duration_seconds=0.1,
        )
        mock_invoke.return_value = AgentResult(
            output="VERDICT: PASS", exit_code=0,
            agent_name="claude-reviewer", step_name="review",
            duration_seconds=0.1,
        )

        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            pipeline_config = _make_agentic_config(out_dir)
            fake_worktree = out_dir / "work"
            fake_worktree.mkdir()
            mock_setup.return_value = (fake_worktree, branch_name, "a" * 40)

            run_pipeline(pipeline_config, cwd=Path(tmp))

            mock_finalize.assert_called_once()
            # Positional layout per the original note:
            # cwd, worktree_path, branch_name, preset_name, verdict.
            positional_args = mock_finalize.call_args[0]
            self.assertEqual(positional_args[1], fake_worktree)
            self.assertEqual(positional_args[2], branch_name)
class TestParallelAgenticFallsBackToSequential(unittest.TestCase):
    """Multiple agentic steps in parallel batch fall back to sequential."""

    def test_has_agentic_steps_detects_agentic(self) -> None:
        """A step bound to an agentic agent makes _has_agentic_steps return True."""
        coder = AgentConfig(
            name="claude-coder", command="claude", args=[], agentic=True,
        )
        reviewer = AgentConfig(
            name="claude-reviewer", command="claude", args=[], agentic=False,
        )
        config = PipelineConfig(
            agents={"claude-coder": coder, "claude-reviewer": reviewer},
        )
        steps = [
            StepConfig(name="a", agent="claude-coder", role="coding",
                       prompt_template="default:coding", output_key="a"),
        ]
        self.assertTrue(_has_agentic_steps(config, steps))

    def test_has_agentic_steps_returns_false_without_agentic(self) -> None:
        """Steps bound only to non-agentic agents yield False."""
        reviewer = AgentConfig(
            name="claude-reviewer", command="claude", args=[], agentic=False,
        )
        config = PipelineConfig(
            agents={"claude-reviewer": reviewer},
        )
        steps = [
            StepConfig(name="r", agent="claude-reviewer", role="review",
                       prompt_template="default:review", output_key="r", verdict=True),
        ]
        self.assertFalse(_has_agentic_steps(config, steps))

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_parallel_agentic_runs_sequentially(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """When multiple agentic steps are parallel, they should run sequentially."""
        # Local import: only this test needs it, and it keeps the edit
        # self-contained.
        import time

        with tempfile.TemporaryDirectory() as td:
            run_dir = Path(td)
            coder_a = AgentConfig(
                name="coder-a", command="claude", args=[], agentic=True,
            )
            coder_b = AgentConfig(
                name="coder-b", command="claude", args=[], agentic=True,
            )
            reviewer = AgentConfig(
                name="reviewer", command="claude", args=["-p"], agentic=False,
            )
            steps = [
                StepConfig(
                    name="code_a", agent="coder-a", role="coding",
                    prompt_template="default:coding", output_key="code_a",
                    parallel=True,
                ),
                StepConfig(
                    name="code_b", agent="coder-b", role="coding",
                    prompt_template="default:coding", output_key="code_b",
                    parallel=True,
                ),
                StepConfig(
                    name="review", agent="reviewer", role="review",
                    prompt_template="default:review", output_key="review_result",
                    verdict=True,
                ),
            ]
            config = PipelineConfig(
                output_dir=run_dir,
                max_iterations=1,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents={
                    "coder-a": coder_a,
                    "coder-b": coder_b,
                    "reviewer": reviewer,
                },
                coders=["coder-a", "coder-b"],
                reviewers=["reviewer"],
                pipeline=steps,
                preset_name="custom",
            )
            wt_path = run_dir / "work"
            wt_path.mkdir()
            mock_setup.return_value = (wt_path, "cross-eval/test", "a" * 40)

            call_order: list[str] = []
            # Call order alone cannot distinguish sequential from concurrent
            # execution, so also track how many agentic calls are in flight
            # at the same time.
            in_flight = 0
            peak_in_flight = 0

            def _track_agentic(agent_config, prompt, step_name, **kwargs):
                nonlocal in_flight, peak_in_flight
                call_order.append(step_name)
                in_flight += 1
                peak_in_flight = max(peak_in_flight, in_flight)
                try:
                    # Hold the call open briefly so an overlapping call
                    # would be observed.
                    time.sleep(0.02)
                finally:
                    in_flight -= 1
                return AgentResult(
                    output="diff", exit_code=0,
                    agent_name=agent_config.name, step_name=step_name,
                    duration_seconds=0.1,
                )

            mock_invoke_agentic.side_effect = _track_agentic
            mock_invoke.return_value = AgentResult(
                output="VERDICT: PASS", exit_code=0,
                agent_name="reviewer", step_name="review",
                duration_seconds=0.1,
            )

            run_pipeline(config, cwd=Path(td))

            # Both agentic steps should have been called (sequentially)
            agentic_calls = [c for c in call_order if c.startswith("code_")]
            self.assertEqual(len(agentic_calls), 2)
            # They should appear in order (sequential, not concurrent)
            self.assertEqual(agentic_calls, ["code_a", "code_b"])
            # At no point may two agentic calls have been active together.
            self.assertEqual(peak_in_flight, 1)
# Allow running this test module directly as a script, not only through a
# test runner.
if __name__ == "__main__":
    unittest.main()