# cross-eval/tests/test_agentic.py

"""Comprehensive tests for the agentic worktree flow.

Covers:
  1. worktree.py unit tests (real temp git repo)
  2. agent.py agentic tests (mocking subprocess)
  3. config.py _make_agentic tests
  4. pipeline integration tests (mock invoke_agent / invoke_agent_agentic)
"""
from __future__ import annotations

import subprocess
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, call, patch

from cross_eval.agent import invoke_agent_agentic
from cross_eval.config import BUILTIN_AGENTS, _make_agentic
from cross_eval.models import (
    AgentConfig,
    AgentResult,
    PipelineConfig,
    StepConfig,
)
from cross_eval.pipeline import (
    _commit_iteration,
    _finalize_worktree,
    _has_agentic_steps,
    _setup_worktree,
    run_pipeline,
)
from cross_eval.worktree import (
    capture_diff,
    commit_worktree,
    create_worktree,
    make_branch_name,
    remove_worktree,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _init_git_repo(path: Path) -> None:
    """Turn *path* into a git repository that holds exactly one commit.

    A user.email / user.name pair is configured first so the commit can be
    created, then ``README.md`` is written, staged and committed with the
    message "initial". Every git command runs with ``check=True``, so any
    failure surfaces as ``subprocess.CalledProcessError``.
    """
    def _git(*argv: str) -> None:
        # All commands share the same cwd / capture / check settings.
        subprocess.run(["git", *argv], cwd=path, capture_output=True, check=True)

    _git("init")
    _git("config", "user.email", "test@test.com")
    _git("config", "user.name", "Test")

    readme = path / "README.md"
    readme.write_text("# init\n")

    _git("add", ".")
    _git("commit", "-m", "initial")


# ===================================================================
# 1. worktree.py unit tests (real temp git repo)
# ===================================================================

class TestCreateWorktree(unittest.TestCase):
    """create_worktree yields an existing directory and a branch of the given name."""

    def test_creates_worktree_and_branch(self) -> None:
        branch_name = "cross-eval/test_branch"

        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            repo = root / "repo"
            repo.mkdir()
            _init_git_repo(repo)
            worktree_dir = root / "wt"

            created = create_worktree(repo, worktree_dir, branch_name)

            # The path handed back by create_worktree must exist on disk.
            self.assertTrue(created.exists())

            # The branch must be visible from the original repository.
            listing = subprocess.run(
                ["git", "branch", "--list", branch_name],
                cwd=repo, capture_output=True, text=True,
            ).stdout
            self.assertIn(branch_name, listing)

            # Detach the worktree again before the temp dir is torn down.
            remove_worktree(repo, worktree_dir)


class TestCaptureDiff(unittest.TestCase):
    """capture_diff reports both brand-new files and edits to tracked files."""

    def test_captures_new_and_modified_files(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            repo = root / "repo"
            repo.mkdir()
            _init_git_repo(repo)

            worktree_dir = root / "wt"
            create_worktree(repo, worktree_dir, "cross-eval/diff_test")

            # One untracked file plus one edit to the file committed by _init_git_repo.
            (worktree_dir / "new_file.txt").write_text("hello\n")
            (worktree_dir / "README.md").write_text("# modified\n")

            captured = capture_diff(worktree_dir)

            # File name, new-file content and modified content must all show up.
            for fragment in ("new_file.txt", "hello", "modified"):
                self.assertIn(fragment, captured)

            remove_worktree(repo, worktree_dir)


class TestCommitWorktree(unittest.TestCase):
    """commit_worktree reports True after committing changes, False on a clean tree."""

    @staticmethod
    def _fresh_worktree(tmp: str, branch_name: str) -> tuple[Path, Path]:
        """Create a one-commit repo under *tmp* plus a worktree on *branch_name*.

        Returns ``(repo_dir, worktree_dir)``.
        """
        root = Path(tmp)
        repo = root / "repo"
        repo.mkdir()
        _init_git_repo(repo)

        worktree_dir = root / "wt"
        create_worktree(repo, worktree_dir, branch_name)
        return repo, worktree_dir

    def test_commit_returns_true_on_changes(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo, worktree_dir = self._fresh_worktree(tmp, "cross-eval/commit_test")

            # An untracked file gives commit_worktree something to commit.
            (worktree_dir / "file.txt").write_text("data\n")
            committed = commit_worktree(worktree_dir, "test commit")
            self.assertTrue(committed)

            remove_worktree(repo, worktree_dir)

    def test_commit_returns_false_when_nothing_to_commit(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo, worktree_dir = self._fresh_worktree(tmp, "cross-eval/empty_commit")

            # Nothing was touched, so there is nothing to commit.
            committed = commit_worktree(worktree_dir, "empty")
            self.assertFalse(committed)

            remove_worktree(repo, worktree_dir)


class TestRemoveWorktree(unittest.TestCase):
    """remove_worktree deletes the worktree directory while keeping its branch."""

    def test_branch_survives_worktree_removal(self) -> None:
        branch_name = "cross-eval/remove_test"

        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            repo = root / "repo"
            repo.mkdir()
            _init_git_repo(repo)

            worktree_dir = root / "wt"
            create_worktree(repo, worktree_dir, branch_name)
            remove_worktree(repo, worktree_dir)

            # The checkout directory must no longer be present.
            self.assertFalse(worktree_dir.exists())

            # The branch itself must still be listed by the original repo.
            listing = subprocess.run(
                ["git", "branch", "--list", branch_name],
                cwd=repo, capture_output=True, text=True,
            ).stdout
            self.assertIn(branch_name, listing)


class TestMakeBranchName(unittest.TestCase):
    """make_branch_name returns ``cross-eval/<label>_<15-char suffix>``."""

    def test_format(self) -> None:
        branch = make_branch_name("review-fix")

        # Fixed prefix derived from the label.
        self.assertTrue(branch.startswith("cross-eval/review-fix_"))

        # Splitting on the first underscore separates prefix from suffix.
        pieces = branch.split("_", 1)
        self.assertEqual(len(pieces), 2)

        # Suffix is expected to look like 20260313_123456 (YYYYMMDD_HHMMSS).
        _, stamp = pieces
        self.assertEqual(len(stamp), 15)


# ===================================================================
# 2. agent.py agentic tests (mocking subprocess)
# ===================================================================

class TestInvokeAgentAgenticClaude(unittest.TestCase):
    """invoke_agent_agentic builds the expected command line for a claude agent.

    The command must not contain ``-p`` and its final argument must be a
    task-file reference rather than the raw prompt.
    """

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("subprocess.run")
    def test_claude_cmd_has_no_dash_p_and_prompt_as_positional(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")

        agent = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user", "--dangerously-skip-permissions"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            # NOTE: subprocess.run is patched here, so the git commands issued by
            # _init_git_repo are recorded on mock_run instead of being executed.
            _init_git_repo(wt)

            invoke_agent_agentic(
                agent, "implement feature X", "coding",
                worktree_path=wt, quiet=True,
            )

        # Locate the recorded subprocess.run call that launched the agent. Keep
        # the command found here: it may have been passed positionally or as
        # ``args=...``, and re-reading it positionally afterwards would raise
        # IndexError for the keyword form.
        agent_cmd = None
        for recorded in mock_run.call_args_list:
            pos_args, kw_args = recorded
            cmd = pos_args[0] if pos_args else kw_args.get("args", [])
            if cmd and cmd[0] == "claude":
                agent_cmd = cmd
                break

        self.assertIsNotNone(agent_cmd, "Expected a subprocess.run call with 'claude'")

        # No -p flag
        self.assertNotIn("-p", agent_cmd)
        # Last arg is a task file reference (not raw prompt — avoids arg length limits)
        self.assertIn("task file", agent_cmd[-1].lower())


class TestInvokeAgentAgenticCodex(unittest.TestCase):
    """invoke_agent_agentic builds the expected command line for a codex agent.

    The command must end with the ``-`` stdin sentinel and the prompt must be
    supplied through the ``input`` keyword of ``subprocess.run``.
    """

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("subprocess.run")
    def test_codex_cmd_uses_stdin_with_dash_sentinel(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")

        agent = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto", "--skip-git-repo-check"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            # NOTE: subprocess.run is patched here, so the git commands issued by
            # _init_git_repo are recorded on mock_run instead of being executed.
            _init_git_repo(wt)

            invoke_agent_agentic(
                agent, "implement feature Y", "coding",
                worktree_path=wt, quiet=True,
            )

        # Locate the recorded subprocess.run call that launched the agent. Keep
        # both the command and the keyword arguments found here: the command may
        # have been passed positionally or as ``args=...``, and re-reading it
        # positionally afterwards would raise IndexError for the keyword form.
        agent_cmd = None
        agent_kwargs = None
        for recorded in mock_run.call_args_list:
            pos_args, kw_args = recorded
            cmd = pos_args[0] if pos_args else kw_args.get("args", [])
            if cmd and cmd[0] == "codex":
                agent_cmd, agent_kwargs = cmd, kw_args
                break

        self.assertIsNotNone(agent_cmd, "Expected a subprocess.run call with 'codex'")

        # Should have "-" sentinel at the end for stdin
        self.assertEqual(agent_cmd[-1], "-")
        # Stdin input should contain the prompt
        stdin_payload = agent_kwargs.get("input")
        self.assertIsNotNone(stdin_payload)
        self.assertIn("implement feature Y", stdin_payload)


class TestTaskFileCleanup(unittest.TestCase):
    """No CROSS_EVAL_TASK.md is left in the worktree after an agentic run."""

    @patch("cross_eval.worktree.capture_diff", return_value="(no changes)")
    @patch("subprocess.run")
    def test_task_file_in_tmp_not_worktree(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        # Every subprocess.run call reports success.
        completed = MagicMock(returncode=0, stdout="ok", stderr="")
        mock_run.return_value = completed

        coder = AgentConfig(
            name="claude-coder", command="claude", args=[], agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmp:
            worktree = Path(tmp)
            _init_git_repo(worktree)

            invoke_agent_agentic(
                coder, "do stuff", "coding",
                worktree_path=worktree, quiet=True,
            )

            # The task file is expected to live in /tmp, never inside the worktree.
            leftover = worktree / "CROSS_EVAL_TASK.md"
            self.assertFalse(leftover.exists())


# ===================================================================
# 3. config.py tests
# ===================================================================

class TestMakeAgenticClaude(unittest.TestCase):
    """_make_agentic removes ``-p`` from a claude agent's args and flags it agentic."""

    def test_strips_dash_p_and_sets_agentic(self) -> None:
        cfg = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["-p", "--setting-sources", "user", "--model", "opus"],
        )
        # Agents are non-agentic until converted.
        self.assertFalse(cfg.agentic)

        _make_agentic(cfg)

        self.assertTrue(cfg.agentic)
        # -p is gone while unrelated flags are kept.
        self.assertNotIn("-p", cfg.args)
        self.assertIn("--setting-sources", cfg.args)

    def test_idempotent_when_no_dash_p(self) -> None:
        cfg = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
        )

        _make_agentic(cfg)

        self.assertTrue(cfg.agentic)
        # With no -p present the argument list must come through untouched.
        self.assertEqual(cfg.args, ["--setting-sources", "user"])


class TestMakeAgenticCodex(unittest.TestCase):
    """_make_agentic also handles a codex agent, whose args never contain ``-p``."""

    def test_codex_agentic_works(self) -> None:
        cfg = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto", "-"],
        )

        _make_agentic(cfg)

        self.assertTrue(cfg.agentic)
        # There was no -p to strip, so the codex flags must still be present.
        for flag in ("exec", "--full-auto"):
            self.assertIn(flag, cfg.args)


# ===================================================================
# 4. pipeline integration tests
# ===================================================================

def _make_agentic_config(
    run_dir: Path,
    agentic_coder: bool = True,
) -> PipelineConfig:
    """Return a two-step (coding, review) PipelineConfig writing to *run_dir*.

    The coder agent's ``agentic`` flag follows *agentic_coder*; the reviewer is
    always non-agentic and its step is the verdict step.
    """
    agents = {
        "claude-coder": AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
            agentic=agentic_coder,
        ),
        "claude-reviewer": AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["-p", "--setting-sources", "user"],
            agentic=False,
        ),
    }

    coding_step = StepConfig(
        name="coding",
        agent="claude-coder",
        role="coding",
        prompt_template="default:coding",
        output_key="coding_output",
    )
    review_step = StepConfig(
        name="review",
        agent="claude-reviewer",
        role="review",
        prompt_template="default:review",
        output_key="review_result",
        verdict=True,
    )

    return PipelineConfig(
        output_dir=run_dir,
        max_iterations=2,
        min_iterations=1,
        language="en",
        inputs={"plan": "Test plan", "checklist": "Test checklist"},
        agents=agents,
        coders=["claude-coder"],
        reviewers=["claude-reviewer"],
        pipeline=[coding_step, review_step],
        preset_name="simple",
    )


class TestSetupWorktreeCalledForAgentic(unittest.TestCase):
    """run_pipeline calls _setup_worktree exactly once for an agentic coder."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_setup_worktree_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        # Canned agent results: the coder emits a diff, the reviewer passes.
        coder_result = AgentResult(
            output="diff output",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=0.1,
        )
        reviewer_result = AgentResult(
            output="VERDICT: PASS",
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=0.1,
        )

        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            pipeline_cfg = _make_agentic_config(out_dir)

            # The patched _setup_worktree hands back a real directory.
            worktree = out_dir / "work"
            worktree.mkdir()
            mock_setup.return_value = (worktree, "cross-eval/test")

            mock_invoke_agentic.return_value = coder_result
            mock_invoke.return_value = reviewer_result

            run_pipeline(pipeline_cfg, cwd=Path(tmp))

            mock_setup.assert_called_once()


class TestReviewerRunsInWorktreeCwd(unittest.TestCase):
    """The non-agentic reviewer is invoked with the worktree as its cwd."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_reviewer_uses_worktree_cwd(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        with tempfile.TemporaryDirectory() as td:
            run_dir = Path(td)
            config = _make_agentic_config(run_dir)

            # The patched _setup_worktree hands back a real directory.
            wt_path = run_dir / "work"
            wt_path.mkdir()
            mock_setup.return_value = (wt_path, "cross-eval/test")

            mock_invoke_agentic.return_value = AgentResult(
                output="diff output", exit_code=0,
                agent_name="claude-coder", step_name="coding",
                duration_seconds=0.1,
            )
            mock_invoke.return_value = AgentResult(
                output="VERDICT: PASS", exit_code=0,
                agent_name="claude-reviewer", step_name="review",
                duration_seconds=0.1,
            )

            run_pipeline(config, cwd=Path(td))

            # Fail with a clear assertion (not a TypeError on ``None``) if the
            # reviewer was never invoked.
            mock_invoke.assert_called()
            pos_args, kw_args = mock_invoke.call_args

            # The reviewer (non-agentic) should have been called with cwd=worktree_path.
            # Accept either a ``cwd=`` keyword or the fourth positional argument
            # (assumes cwd is invoke_agent's 4th parameter — TODO confirm).
            if "cwd" in kw_args:
                actual_cwd = kw_args["cwd"]
            else:
                self.assertGreater(
                    len(pos_args), 3,
                    "invoke_agent was called without a cwd argument",
                )
                actual_cwd = pos_args[3]
            self.assertEqual(actual_cwd, wt_path)


class TestCommitIterationCalled(unittest.TestCase):
    """run_pipeline calls _commit_iteration once, passing the worktree path first."""

    @staticmethod
    def _canned(agent_name: str, step_name: str, output: str) -> AgentResult:
        """Build a successful AgentResult with a fixed 0.1 s duration."""
        return AgentResult(
            output=output, exit_code=0,
            agent_name=agent_name, step_name=step_name,
            duration_seconds=0.1,
        )

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_commit_iteration_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            pipeline_cfg = _make_agentic_config(out_dir)

            # The patched _setup_worktree hands back a real directory.
            worktree = out_dir / "work"
            worktree.mkdir()
            mock_setup.return_value = (worktree, "cross-eval/test")

            mock_invoke_agentic.return_value = self._canned(
                "claude-coder", "coding", "diff output",
            )
            mock_invoke.return_value = self._canned(
                "claude-reviewer", "review", "VERDICT: PASS",
            )

            run_pipeline(pipeline_cfg, cwd=Path(tmp))

            # The reviewer passes at once, so exactly one commit is expected.
            mock_commit_iter.assert_called_once()
            positional = mock_commit_iter.call_args[0]
            self.assertEqual(positional[0], worktree)


class TestFinalizeWorktreeCalled(unittest.TestCase):
    """run_pipeline calls _finalize_worktree once with the worktree path and branch."""

    @staticmethod
    def _canned(agent_name: str, step_name: str, output: str) -> AgentResult:
        """Build a successful AgentResult with a fixed 0.1 s duration."""
        return AgentResult(
            output=output, exit_code=0,
            agent_name=agent_name, step_name=step_name,
            duration_seconds=0.1,
        )

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_finalize_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        branch_name = "cross-eval/test"

        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            pipeline_cfg = _make_agentic_config(out_dir)

            # The patched _setup_worktree hands back a real directory.
            worktree = out_dir / "work"
            worktree.mkdir()
            mock_setup.return_value = (worktree, branch_name)

            mock_invoke_agentic.return_value = self._canned(
                "claude-coder", "coding", "diff output",
            )
            mock_invoke.return_value = self._canned(
                "claude-reviewer", "review", "VERDICT: PASS",
            )

            run_pipeline(pipeline_cfg, cwd=Path(tmp))

            mock_finalize.assert_called_once()
            # Expected positional layout: cwd, worktree_path, branch_name,
            # preset_name, verdict — only the 2nd and 3rd are checked.
            positional = mock_finalize.call_args[0]
            self.assertEqual(positional[1], worktree)
            self.assertEqual(positional[2], branch_name)


class TestParallelAgenticFallsBackToSequential(unittest.TestCase):
    """Agentic steps marked parallel are still invoked one after another, in order."""

    def test_has_agentic_steps_detects_agentic(self) -> None:
        agents = {
            "claude-coder": AgentConfig(
                name="claude-coder", command="claude", args=[], agentic=True,
            ),
            "claude-reviewer": AgentConfig(
                name="claude-reviewer", command="claude", args=[], agentic=False,
            ),
        }
        cfg = PipelineConfig(agents=agents)

        # A single step bound to the agentic coder is enough to trigger detection.
        coding_step = StepConfig(
            name="a", agent="claude-coder", role="coding",
            prompt_template="default:coding", output_key="a",
        )
        self.assertTrue(_has_agentic_steps(cfg, [coding_step]))

    def test_has_agentic_steps_returns_false_without_agentic(self) -> None:
        cfg = PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(
                    name="claude-reviewer", command="claude", args=[], agentic=False,
                ),
            },
        )

        # Only a non-agentic reviewer step is present.
        review_step = StepConfig(
            name="r", agent="claude-reviewer", role="review",
            prompt_template="default:review", output_key="r", verdict=True,
        )
        self.assertFalse(_has_agentic_steps(cfg, [review_step]))

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_parallel_agentic_runs_sequentially(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """Two parallel-flagged agentic steps are both invoked, in declaration order."""
        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)

            # Two agentic coders plus one ordinary reviewer.
            agents = {
                "coder-a": AgentConfig(
                    name="coder-a", command="claude", args=[], agentic=True,
                ),
                "coder-b": AgentConfig(
                    name="coder-b", command="claude", args=[], agentic=True,
                ),
                "reviewer": AgentConfig(
                    name="reviewer", command="claude", args=["-p"], agentic=False,
                ),
            }

            # Both coding steps are flagged parallel; the review step is not.
            pipeline_steps = [
                StepConfig(
                    name="code_a", agent="coder-a", role="coding",
                    prompt_template="default:coding", output_key="code_a",
                    parallel=True,
                ),
                StepConfig(
                    name="code_b", agent="coder-b", role="coding",
                    prompt_template="default:coding", output_key="code_b",
                    parallel=True,
                ),
                StepConfig(
                    name="review", agent="reviewer", role="review",
                    prompt_template="default:review", output_key="review_result",
                    verdict=True,
                ),
            ]

            pipeline_cfg = PipelineConfig(
                output_dir=out_dir,
                max_iterations=1,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=agents,
                coders=["coder-a", "coder-b"],
                reviewers=["reviewer"],
                pipeline=pipeline_steps,
                preset_name="custom",
            )

            # The patched _setup_worktree hands back a real directory.
            worktree = out_dir / "work"
            worktree.mkdir()
            mock_setup.return_value = (worktree, "cross-eval/test")

            invoked_steps: list[str] = []

            def _track_agentic(agent_config, prompt, step_name, **kwargs):
                # Record the step name so invocation order can be checked afterwards.
                invoked_steps.append(step_name)
                return AgentResult(
                    output="diff",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=0.1,
                )

            mock_invoke_agentic.side_effect = _track_agentic
            mock_invoke.return_value = AgentResult(
                output="VERDICT: PASS",
                exit_code=0,
                agent_name="reviewer",
                step_name="review",
                duration_seconds=0.1,
            )

            run_pipeline(pipeline_cfg, cwd=Path(tmp))

            # Both agentic steps must have run ...
            coding_invocations = [
                step for step in invoked_steps if step.startswith("code_")
            ]
            self.assertEqual(len(coding_invocations), 2)
            # ... and in declaration order (sequential, not concurrent).
            self.assertEqual(coding_invocations, ["code_a", "code_b"])


# Allow running this test module directly as a script via unittest's runner.
if __name__ == "__main__":
    unittest.main()