feat: harden runtime evidence and claude agentic validation

2026-03-13 22:29:22 +09:00
parent 28dd794f54
commit 3fb19e90c0
5 changed files with 655 additions and 59 deletions
--- a/cross_eval/agent.py
+++ b/cross_eval/agent.py
@@ -19,6 +19,34 @@ logger = logging.getLogger(__name__)
 # CLI tools that support --system-prompt flag natively
 _SYSTEM_PROMPT_AGENTS = ("claude",)
 _REASONING_EFFORT_AGENTS = ("codex",)
+_NO_CHANGE_ACK_MARKERS = (  # lowercase phrases by which agent output says nothing was changed
+    "no changes",
+    "no code changes",
+    "no file changes",
+    "did not make any changes",
+    "nothing to change",
+    "no modifications were necessary",
+    "no update was necessary",
+    "already satisfied",
+)
+_CHANGE_CLAIM_MARKERS = (  # lowercase phrases by which agent output claims code was edited
+    "summary of all changes made",
+    "here's a summary of all changes made",  # NOTE(review): always contains the marker above
+    "implemented",
+    "i implemented",  # NOTE(review): always contains "implemented"; same for each "i ..." pair
+    "added",
+    "i added",
+    "updated",
+    "i updated",
+    "modified",
+    "i modified",
+    "created",
+    "i created",
+    "fixed",
+    "i fixed",
+    "completed the changes",
+    "finished the changes",
+)


 class AgentInvocationError(RuntimeError):
@@ -106,6 +134,16 @@ def _classify_agent_failure(detail: str) -> tuple[str, str]:
    )


+def _claims_file_changes(output: str) -> bool:
+    """Return True if agent text claims edits; markers must match as whole words."""
+    import re  # function-scope import: the file's import block is not visible in this hunk
+    text = output.lower()
+    def mentions(markers: tuple[str, ...]) -> bool:  # \b keeps "added" from hitting "padded"
+        return any(re.search(rf"\b{re.escape(m)}\b", text) for m in markers)
+    claimed = bool(text.strip()) and mentions(_CHANGE_CLAIM_MARKERS)  # blank output claims nothing
+    return claimed and not mentions(_NO_CHANGE_ACK_MARKERS)  # explicit no-change note wins
+
+
 class _Spinner:
    """Animated spinner for long-running agent calls."""

@@ -302,6 +340,9 @@ def invoke_agent(
        command_preview=cmd_preview,
        stdout=result.stdout,
        stderr=result.stderr,
+        exit_code=result.returncode,
+        duration_seconds=round(duration, 1),
+        cwd=str(cwd) if cwd else "",
    )

    return AgentResult(
@@ -424,6 +465,28 @@ def invoke_agent_agentic(
    diff_output = capture_diff(worktree_path)

    if not diff_output:
+        stdout_excerpt = (result.stdout or "").strip()
+        stderr_excerpt = (result.stderr or "").strip()
+        if _claims_file_changes(stdout_excerpt):
+            if spinner:
+                spinner.stop(f"[{step_name}] FAILED (empty diff)")
+            raw_error = stdout_excerpt or "(stdout empty)"
+            if stderr_excerpt:
+                raw_error = f"{raw_error}\n\n[stderr]\n{stderr_excerpt}"
+            if len(raw_error) > 2000:
+                raw_error = raw_error[:2000] + "..."
+            raise AgentInvocationError(
+                agent_name=agent.name,
+                step_name=step_name,
+                cmd_preview=cmd_preview,
+                raw_error=raw_error,
+                failure_type="EMPTY_DIFF",
+                suggested_action=(
+                    "Agent reported code changes but produced no git diff. "
+                    "Treat this run as failed and require a real worktree diff before continuing."
+                ),
+            )
+
        diff_output = "(no changes)"
        logger.warning(
            "Agent '%s' made no file changes at step '%s'",
@@ -438,6 +501,9 @@ def invoke_agent_agentic(
        command_preview=cmd_preview,
        stdout=result.stdout,
        stderr=result.stderr,
+        exit_code=result.returncode,
+        duration_seconds=round(duration, 1),
+        cwd=str(worktree_path),
    )

    return AgentResult(
@@ -456,6 +522,9 @@ def _build_transcript(
    command_preview: str,
    stdout: str,
    stderr: str,
+    exit_code: int = 0,
+    duration_seconds: float = 0.0,
+    cwd: str = "",
 ) -> str:
    """Build a compact execution transcript for debugging/audit output."""
    sections = [
@@ -466,6 +535,16 @@ def _build_transcript(
        command_preview or "(unknown command)",
        "```",
        "",
+    ]
+    if cwd:
+        sections.extend(["## Working Directory", f"`{cwd}`", ""])
+    sections.extend([
+        f"## Exit Code: {exit_code}",
+        "",
+    ])
+    if duration_seconds > 0:
+        sections.extend([f"## Duration: {duration_seconds}s", ""])
+    sections.extend([
        "## Stdout",
        "```",
        (stdout or "(empty)").strip(),
@@ -476,5 +555,5 @@ def _build_transcript(
        (stderr or "(empty)").strip(),
        "```",
        "",
-    ]
+    ])
    return "\n".join(sections)