feat: isolate agentic worktrees and surface execution evidence
This commit is contained in:
@@ -23,6 +23,7 @@ from cross_eval.models import (
|
||||
StepConfig,
|
||||
)
|
||||
from cross_eval.pipeline import (
|
||||
_assert_base_repo_isolation,
|
||||
_commit_iteration,
|
||||
_finalize_worktree,
|
||||
_has_agentic_steps,
|
||||
@@ -34,6 +35,7 @@ from cross_eval.worktree import (
|
||||
commit_worktree,
|
||||
create_worktree,
|
||||
make_branch_name,
|
||||
make_worktree_dir,
|
||||
remove_worktree,
|
||||
)
|
||||
|
||||
@@ -191,6 +193,41 @@ class TestMakeBranchName(unittest.TestCase):
|
||||
self.assertEqual(len(ts_part), 15) # YYYYMMDD_HHMMSS
|
||||
|
||||
|
||||
class TestMakeWorktreeDir(unittest.TestCase):
    """make_worktree_dir must pick a location that is not inside the repo."""

    def test_uses_tmp_dir_outside_repo(self) -> None:
        """The chosen path mentions the worktree area and not the repo path."""
        with tempfile.TemporaryDirectory() as scratch:
            repo_root = Path(scratch) / "repo"
            repo_root.mkdir()

            chosen = str(
                make_worktree_dir(repo_root, "cross-eval/review-fix_20260313_123456")
            )

            self.assertIn("cross-eval-worktrees", chosen)
            # Substring check: the repo path must not appear anywhere in it.
            self.assertNotIn(str(repo_root), chosen)
|
||||
|
||||
|
||||
class TestBaseRepoIsolation(unittest.TestCase):
    """The isolation guard must abort with RuntimeError for a dirty base repo."""

    def test_raises_when_base_repo_status_changes(self) -> None:
        """Calling the guard with a modified-file status raises RuntimeError."""
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)
            base = root / "repo"
            worktree = root / "worktree"
            for directory in (base, worktree):
                directory.mkdir()

            # NOTE(review): the positional status and ``baseline_status`` are
            # the same string here; confirm against the helper's signature
            # what actually signals "changed" in this scenario.
            guard_options = {
                "step_name": "coding",
                "agent_name": "claude-coder",
                "worktree_path": worktree,
                "baseline_status": "M cross_eval/agent.py",
            }
            with self.assertRaises(RuntimeError) as ctx:
                _assert_base_repo_isolation(
                    base, "M cross_eval/agent.py", **guard_options
                )

            message = str(ctx.exception)
            self.assertIn("base repository", message)
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# 2. agent.py agentic tests (mocking subprocess)
|
||||
# ===================================================================
|
||||
@@ -513,6 +550,33 @@ class TestSetupWorktreeCalledForAgentic(unittest.TestCase):
|
||||
mock_setup.assert_called_once()
|
||||
|
||||
|
||||
class TestSetupWorktreeLocation(unittest.TestCase):
    """_setup_worktree creates the worktree outside the repo and records it."""

    def test_worktree_is_created_outside_repo(self) -> None:
        """The worktree exists, is external, and its path/branch are recorded."""
        with tempfile.TemporaryDirectory() as scratch:
            base = Path(scratch) / "repo"
            run_dir = base / ".cross-eval" / "output" / "smoke"
            base.mkdir()
            run_dir.mkdir(parents=True)
            _init_git_repo(base)

            def recorded(filename: str) -> str:
                # Marker files are read back from the run directory.
                return (run_dir / filename).read_text(encoding="utf-8").strip()

            worktree_path, branch_name = _setup_worktree(base, run_dir, "review-fix")
            try:
                self.assertTrue(worktree_path.exists())

                resolved_base = str(base.resolve())
                resolved_worktree = str(worktree_path.resolve())
                self.assertNotIn(resolved_base, resolved_worktree)

                self.assertEqual(recorded("worktree_path.txt"), str(worktree_path))
                self.assertEqual(recorded("worktree_branch.txt"), branch_name)
            finally:
                # Always detach the worktree, even when an assertion failed.
                remove_worktree(base, worktree_path)
|
||||
|
||||
|
||||
class TestReviewerRunsInWorktreeCwd(unittest.TestCase):
|
||||
"""Reviewer runs with worktree cwd (not original cwd) when worktree exists."""
|
||||
|
||||
|
||||
395
tests/test_evidence.py
Normal file
395
tests/test_evidence.py
Normal file
@@ -0,0 +1,395 @@
|
||||
"""Regression tests for runtime evidence propagation and report visibility.
|
||||
|
||||
Covers:
|
||||
1. Execution evidence is surfaced in reviewer/senior prompt context.
|
||||
2. Reports include command preview and transcript excerpts.
|
||||
3. Claude agentic failure detection (empty diff, write failure, expanded markers).
|
||||
4. _format_execution_evidence produces expected output.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from cross_eval.agent import (
|
||||
AgentInvocationError,
|
||||
_claims_file_changes,
|
||||
_has_write_failure_indicators,
|
||||
invoke_agent_agentic,
|
||||
)
|
||||
from cross_eval.config import BUILTIN_AGENTS
|
||||
from cross_eval.models import (
|
||||
AgentConfig,
|
||||
AgentResult,
|
||||
IterationResult,
|
||||
PipelineConfig,
|
||||
PipelineResult,
|
||||
ReviewMetrics,
|
||||
StepConfig,
|
||||
)
|
||||
from cross_eval.pipeline import _format_execution_evidence, run_pipeline
|
||||
from cross_eval.report import build_report
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. Execution evidence formatting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFormatExecutionEvidence(unittest.TestCase):
    """Checks the text summary built by _format_execution_evidence."""

    def test_empty_results_returns_placeholder(self) -> None:
        """With no results, a placeholder sentence is returned."""
        rendered = _format_execution_evidence({})
        self.assertIn("no prior execution evidence", rendered)

    def test_single_result_includes_key_fields(self) -> None:
        """Agent, step, exit code, duration, command and transcript appear."""
        coding = AgentResult(
            output="some diff",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=12.3,
            transcript="# Agent Execution Transcript\n\n## Command\nclaude ...",
            command_preview="claude --setting-sources user",
        )

        evidence = _format_execution_evidence({"coding_output": coding})

        expected_fragments = (
            "claude-coder",
            "coding",
            "Exit code: 0",
            "12.3s",
            "claude --setting-sources user",
            "Transcript excerpt",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, evidence)

    def test_multiple_results_separated(self) -> None:
        """Two results are both rendered, with a ``---`` marker present."""
        results = {
            "coding_output": AgentResult(
                output="diff1",
                exit_code=0,
                agent_name="coder",
                step_name="coding",
                duration_seconds=1.0,
                command_preview="cmd1",
            ),
            "review_result": AgentResult(
                output="review text",
                exit_code=0,
                agent_name="reviewer",
                step_name="review",
                duration_seconds=2.0,
                command_preview="cmd2",
            ),
        }

        evidence = _format_execution_evidence(results)

        for fragment in ("coder", "reviewer", "---"):
            self.assertIn(fragment, evidence)

    def test_transcript_truncated_at_2000_chars(self) -> None:
        """A 3000-character transcript is cut and flagged as truncated."""
        long_transcript = "x" * 3000
        oversized = AgentResult(
            output="out",
            exit_code=0,
            agent_name="agent",
            step_name="step",
            duration_seconds=1.0,
            transcript=long_transcript,
        )

        evidence = _format_execution_evidence({"key": oversized})

        self.assertIn("truncated", evidence)
        # The complete transcript must not survive in the summary.
        self.assertNotIn(long_transcript, evidence)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. Evidence in reviewer prompts (integration)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEvidenceInReviewerPrompt(unittest.TestCase):
    """Reviewer prompts include execution evidence from prior coding step."""

    def test_reviewer_receives_evidence(self) -> None:
        """Run a coding -> review pipeline and inspect the review prompt.

        ``cross_eval.pipeline.invoke_agent`` is replaced by a fake that
        records every prompt it receives, so the prompt handed to the review
        step can be checked for the evidence section.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            steps = [
                StepConfig(
                    name="coding", agent="claude-coder", role="coding",
                    prompt_template="default:coding", output_key="coding_output",
                ),
                StepConfig(
                    name="review", agent="claude-reviewer", role="review",
                    prompt_template="default:review", output_key="review_result",
                    verdict=True,
                ),
            ]
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=1,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=steps,
                preset_name="simple",
            )

            captured_prompts: list[dict] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                # Record the prompt first, then answer according to the step.
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                })
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=5.0,
                        transcript="# Transcript\nclaude ran...",
                        command_preview="claude --setting-sources user",
                    )
                return AgentResult(
                    output="VERDICT: PASS",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # The reviewer prompt should contain execution evidence.
            review_prompts = [
                p for p in captured_prompts if p["step_name"] == "review"
            ]
            # assertGreaterEqual reports the actual count when it fails.
            self.assertGreaterEqual(len(review_prompts), 1)
            review_prompt = review_prompts[0]["prompt"]
            # The evidence section should name the agent that ran the coding step.
            self.assertIn("Execution Evidence", review_prompt)
            self.assertIn("claude-coder", review_prompt)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. Report includes evidence
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestReportIncludesEvidence(unittest.TestCase):
    """The built report exposes command previews, transcripts and exit codes."""

    def _make_pipeline_result(self) -> tuple[PipelineConfig, PipelineResult]:
        """Build a one-iteration coding + review run with a PASS verdict."""
        coding_step = StepConfig(
            name="coding", agent="claude-coder", role="coding",
            prompt_template="default:coding", output_key="coding_output",
        )
        review_step = StepConfig(
            name="review", agent="claude-reviewer", role="review",
            prompt_template="default:review", output_key="review_result",
            verdict=True,
        )
        config = PipelineConfig(
            max_iterations=1,
            language="en",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[coding_step, review_step],
            preset_name="simple",
        )

        coding_text = "diff --git a/file ..."
        review_text = "All good.\n\nVERDICT: PASS"

        coding_result = AgentResult(
            output=coding_text,
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude ...\n## Stdout\nok",
            command_preview="claude --setting-sources user",
        )
        review_result = AgentResult(
            output=review_text,
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=5.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude -p ...\n## Stdout\nAll good.",
            command_preview="claude -p --setting-sources user",
        )

        only_iteration = IterationResult(
            iteration=1,
            step_results={
                "coding_output": coding_result,
                "review_result": review_result,
            },
            step_outputs={
                "coding_output": coding_text,
                "review_result": review_text,
            },
            verdict="PASS",
        )
        outcome = PipelineResult(
            iterations=[only_iteration],
            final_verdict="PASS",
            total_duration=15.0,
        )
        return config, outcome

    def _render(self) -> str:
        """Return the report produced for the canned pipeline result."""
        config, outcome = self._make_pipeline_result()
        return build_report(config, outcome)

    def test_report_contains_command_preview(self) -> None:
        """The command preview and its ``**Command**`` label are present."""
        report = self._render()
        self.assertIn("claude --setting-sources user", report)
        self.assertIn("**Command**", report)

    def test_report_contains_transcript_excerpt(self) -> None:
        """The transcript heading and transcript content are present."""
        report = self._render()
        self.assertIn("Execution transcript", report)
        self.assertIn("Agent Execution Transcript", report)

    def test_report_contains_exit_code(self) -> None:
        """The exit code line is present."""
        report = self._render()
        self.assertIn("**Exit code**: 0", report)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 4. Claude agentic hardened failure detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClaimsFileChangesExpanded(unittest.TestCase):
    """_claims_file_changes recognises the wider set of change-claim phrases."""

    def _expect_claim(self, text: str) -> None:
        """Assert that *text* is classified as claiming file changes."""
        self.assertTrue(_claims_file_changes(text))

    def _expect_no_claim(self, text: str) -> None:
        """Assert that *text* is not classified as claiming file changes."""
        self.assertFalse(_claims_file_changes(text))

    def test_ive_implemented(self) -> None:
        self._expect_claim("I've implemented the feature")

    def test_ive_updated(self) -> None:
        self._expect_claim("I've updated the config")

    def test_made_the_following_changes(self) -> None:
        self._expect_claim("I made the following changes to the file")

    def test_applied_the_fix(self) -> None:
        self._expect_claim("Applied the fix for the bug")

    def test_changes_have_been_applied(self) -> None:
        self._expect_claim("Changes have been applied successfully")

    def test_wrote_the_code(self) -> None:
        self._expect_claim("Wrote the code for the new module")

    def test_refactored(self) -> None:
        self._expect_claim("I refactored the pipeline")

    def test_no_changes_still_returns_false(self) -> None:
        self._expect_no_claim("No changes were necessary")

    def test_empty_string_returns_false(self) -> None:
        self._expect_no_claim("")
|
||||
|
||||
|
||||
class TestWriteFailureIndicators(unittest.TestCase):
    """_has_write_failure_indicators flags stderr text that signals a failed write."""

    def _expect_flagged(self, stderr_text: str) -> None:
        """Assert that *stderr_text* is reported as a write failure."""
        self.assertTrue(_has_write_failure_indicators(stderr_text))

    def _expect_clean(self, stderr_text: str) -> None:
        """Assert that *stderr_text* is not reported as a write failure."""
        self.assertFalse(_has_write_failure_indicators(stderr_text))

    def test_permission_denied(self) -> None:
        self._expect_flagged("Error: Permission denied")

    def test_read_only_filesystem(self) -> None:
        self._expect_flagged("read-only file system")

    def test_sandbox_restriction(self) -> None:
        self._expect_flagged("Blocked by sandbox policy")

    def test_eacces(self) -> None:
        self._expect_flagged("EACCES: operation not permitted")

    def test_empty_stderr_returns_false(self) -> None:
        self._expect_clean("")

    def test_normal_stderr_returns_false(self) -> None:
        self._expect_clean("Downloading model...")
|
||||
|
||||
|
||||
class TestAgenticWriteFailureRaisesError(unittest.TestCase):
    """Agentic mode raises AgentInvocationError on stderr write-failure indicators."""

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_write_failure_detected_from_stderr(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """A zero exit code plus "Permission denied" on stderr is WRITE_FAILURE.

        ``subprocess.run`` returns a canned successful process whose stderr
        carries the failure text, and ``capture_diff`` returns an empty diff.
        """
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="Done.",
            stderr="Error: Permission denied writing to /src/main.py",
        )

        agent = AgentConfig(
            name="claude-coder", command="claude",
            args=["--setting-sources", "user"], agentic=True,
        )

        # ``subprocess.run`` is patched for the whole test, so shelling out to
        # git from here would only hit the mock and never create a repository.
        # The worktree is therefore a plain temporary directory with one file.
        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            (wt / "README.md").write_text("# init\n", encoding="utf-8")

            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(
                    agent, "implement feature", "coding",
                    worktree_path=wt, quiet=True,
                )

        self.assertEqual(ctx.exception.failure_type, "WRITE_FAILURE")
        self.assertIn("Permission denied", ctx.exception.raw_error)
|
||||
|
||||
|
||||
class TestAgenticExpandedClaimMarkers(unittest.TestCase):
    """Agentic mode detects expanded claim markers in empty diff scenarios."""

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_ive_implemented_triggers_empty_diff_error(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """An "I've implemented" claim with an empty diff is EMPTY_DIFF.

        ``subprocess.run`` returns a canned successful process whose stdout
        claims changes were made, and ``capture_diff`` returns an empty diff.
        """
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="I've implemented the requested changes to the pipeline.",
            stderr="",
        )

        agent = AgentConfig(
            name="claude-coder", command="claude",
            args=["--setting-sources", "user"], agentic=True,
        )

        # ``subprocess.run`` is patched for the whole test, so shelling out to
        # git from here would only hit the mock and never create a repository.
        # The worktree is therefore a plain temporary directory with one file.
        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            (wt / "README.md").write_text("# init\n", encoding="utf-8")

            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(
                    agent, "implement feature", "coding",
                    worktree_path=wt, quiet=True,
                )

        self.assertEqual(ctx.exception.failure_type, "EMPTY_DIFF")
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user