feat: isolate agentic worktrees and surface execution evidence

2026-03-13 22:50:46 +09:00
parent 3fb19e90c0
commit b19d174c98
7 changed files with 758 additions and 14 deletions
--- a/cross_eval/agent.py
+++ b/cross_eval/agent.py
@@ -32,20 +32,33 @@ _NO_CHANGE_ACK_MARKERS = (
 _CHANGE_CLAIM_MARKERS = (
    "summary of all changes made",
    "here's a summary of all changes made",
+    "here is a summary of all changes",
    "implemented",
    "i implemented",
+    "i've implemented",
    "added",
    "i added",
+    "i've added",
    "updated",
    "i updated",
+    "i've updated",
    "modified",
    "i modified",
+    "i've modified",
    "created",
    "i created",
+    "i've created",
    "fixed",
    "i fixed",
+    "i've fixed",
    "completed the changes",
    "finished the changes",
+    "made the following changes",
+    "applied the fix",
+    "changes have been applied",
+    "wrote the code",
+    "refactored",
+    "i refactored",
 )


@@ -134,6 +147,29 @@ def _classify_agent_failure(detail: str) -> tuple[str, str]:
    )


+_WRITE_FAILURE_MARKERS = (  # lowercase substrings; matched against lowercased stderr
+    "permission denied",
+    "read-only file system",
+    "read only file system",
+    "operation not permitted",
+    "cannot write",
+    "failed to write",
+    "could not write",
+    "unable to write",
+    "sandbox denied",  # bare "sandbox" would also match benign stderr that merely mentions a sandbox
+    "eacces",  # errno name for "permission denied"
+    "erofs",  # errno name for "read-only file system"
+)
+
+
+def _has_write_failure_indicators(stderr: str) -> bool:
+    """Return True when stderr mentions any known write-failure marker (case-insensitive)."""
+    haystack = stderr.lower()
+    if haystack.strip() == "":
+        return False
+    return any(token in haystack for token in _WRITE_FAILURE_MARKERS)
+
+
 def _claims_file_changes(output: str) -> bool:
    """Heuristic for agent text that claims code changes were made."""
    normalized = output.lower()
@@ -406,7 +442,8 @@ def invoke_agent_agentic(
        # (avoids OS arg length limits for large prompts)
        cmd.append(
            f"Read the task file at {task_file} and execute all instructions in it. "
-            f"Work in the current directory."
+            f"Work only inside the current directory and do not modify files "
+            f"outside it."
        )

    cmd_preview = " ".join(cmd[:6])
@@ -467,7 +504,14 @@ def invoke_agent_agentic(
    if not diff_output:
        stdout_excerpt = (result.stdout or "").strip()
        stderr_excerpt = (result.stderr or "").strip()
-        if _claims_file_changes(stdout_excerpt):
+
+        # Detect two failure modes:
+        # 1. Agent claims changes in stdout but produced no diff
+        # 2. Agent stderr contains permission or write-failure indicators
+        claims_changes = _claims_file_changes(stdout_excerpt)
+        has_write_failure = _has_write_failure_indicators(stderr_excerpt)
+
+        if claims_changes or has_write_failure:
            if spinner:
                spinner.stop(f"[{step_name}] FAILED (empty diff)")
            raw_error = stdout_excerpt or "(stdout empty)"
@@ -475,16 +519,27 @@ def invoke_agent_agentic(
                raw_error = f"{raw_error}\n\n[stderr]\n{stderr_excerpt}"
            if len(raw_error) > 2000:
                raw_error = raw_error[:2000] + "..."
+
+            if has_write_failure:
+                failure_type = "WRITE_FAILURE"
+                suggested_action = (
+                    "Agent encountered file write errors (permission denied, read-only, "
+                    "or sandbox restriction). Check agent permissions and worktree state."
+                )
+            else:
+                failure_type = "EMPTY_DIFF"
+                suggested_action = (
+                    "Agent reported code changes but produced no git diff. "
+                    "Treat this run as failed and require a real worktree diff before continuing."
+                )
+
            raise AgentInvocationError(
                agent_name=agent.name,
                step_name=step_name,
                cmd_preview=cmd_preview,
                raw_error=raw_error,
-                failure_type="EMPTY_DIFF",
-                suggested_action=(
-                    "Agent reported code changes but produced no git diff. "
-                    "Treat this run as failed and require a real worktree diff before continuing."
-                ),
+                failure_type=failure_type,
+                suggested_action=suggested_action,
            )

        diff_output = "(no changes)"