feat: harden runtime evidence and claude agentic validation
This commit is contained in:
@@ -14,7 +14,7 @@ import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, call, patch
|
||||
|
||||
from cross_eval.agent import invoke_agent_agentic
|
||||
from cross_eval.agent import AgentInvocationError, invoke_agent_agentic
|
||||
from cross_eval.config import BUILTIN_AGENTS, _make_agentic
|
||||
from cross_eval.models import (
|
||||
AgentConfig,
|
||||
@@ -309,6 +309,74 @@ class TestTaskFileCleanup(unittest.TestCase):
|
||||
self.assertFalse((wt / "CROSS_EVAL_TASK.md").exists())
|
||||
|
||||
|
||||
class TestAgenticEmptyDiffDetection(unittest.TestCase):
    """Empty-diff handling for agentic coders.

    An agentic coder that merely *claims* changes in its stdout, while the
    captured worktree diff is empty, must not be treated as a success.
    """

    @staticmethod
    def _claude_coder() -> "AgentConfig":
        """Build the agentic ``claude-coder`` config shared by both tests."""
        return AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
            agentic=True,
        )

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_claude_empty_diff_with_change_claim_fails(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """Stdout claims edits but the diff is patched to be empty.

        Expects ``invoke_agent_agentic`` to raise ``AgentInvocationError``
        with ``failure_type == "EMPTY_DIFF"`` and the claimed summary text
        present in ``raw_error``.
        """
        claimed_summary = (
            "All tests pass.\n"
            "Here's a summary of all changes made:\n"
            "- Updated discovery.py\n"
        )
        completed = MagicMock(returncode=0, stdout=claimed_summary, stderr="")
        mock_run.return_value = completed

        agent = self._claude_coder()

        with tempfile.TemporaryDirectory() as tmp_dir:
            worktree = Path(tmp_dir)
            _init_git_repo(worktree)

            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(
                    agent,
                    "implement feature X",
                    "coding",
                    worktree_path=worktree,
                    quiet=True,
                )

            error = ctx.exception
            self.assertEqual(error.failure_type, "EMPTY_DIFF")
            # Compared case-insensitively: the needle is lowercase, so the
            # reported error text is lowered before the containment check.
            self.assertIn("summary of all changes made", error.raw_error.lower())

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_empty_diff_without_change_claim_is_allowed(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """Stdout says nothing needed changing and the diff is empty.

        Expects the call to return normally with ``output`` equal to
        ``"(no changes)"``.
        """
        completed = MagicMock(
            returncode=0,
            stdout="No changes were required; the current implementation already satisfies the task.",
            stderr="",
        )
        mock_run.return_value = completed

        agent = self._claude_coder()

        with tempfile.TemporaryDirectory() as tmp_dir:
            worktree = Path(tmp_dir)
            _init_git_repo(worktree)

            outcome = invoke_agent_agentic(
                agent,
                "check whether any fix is needed",
                "coding",
                worktree_path=worktree,
                quiet=True,
            )

            self.assertEqual(outcome.output, "(no changes)")
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# 3. config.py tests
|
||||
# ===================================================================
|
||||
|
||||
Reference in New Issue
Block a user