feat: tighten agentic runtime handoffs and quality gates

2026-03-14 10:05:25 +09:00
parent 87bc0ffbfb
commit 7b95233edf
15 changed files with 1148 additions and 167 deletions
--- a/cross_eval/agent.py
+++ b/cross_eval/agent.py
@@ -415,11 +415,7 @@ def invoke_agent_agentic(
    timeout: int | None = None,
    quiet: bool = False,
 ) -> AgentResult:
-    """Invoke an agent in agentic mode (no -p, runs in worktree, captures git diff).
-
-    The agent runs without print mode so it can modify files directly.
-    After the agent exits, git diff (since last commit) is captured as the output.
-    """
+    """Invoke an agent in agentic mode: it edits the worktree, and the resulting git diff is the output."""
    from cross_eval.worktree import capture_diff

    # Write prompt to a temp file (outside worktree, won't appear in diffs)
@@ -433,10 +429,10 @@ def invoke_agent_agentic(
    if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
        cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])

-    # Strip stdin sentinel ("-") from args for agentic mode.
-    # Keep -p/--print: Claude -p mode still has full tool access (Edit, Write,
-    # Bash, etc.) and is the correct mode for non-interactive subprocess use.
-    args = [a for a in agent.args if a != "-"]
+    # Strip print-mode flags and stdin sentinels for agentic mode.
+    # Agentic runs should operate on the worktree and return a real git diff,
+    # not behave as a one-shot text completer.
+    args = [a for a in agent.args if a not in {"-", "-p", "--print"}]
    cmd.extend(args)

    # System prompt via flag if supported
@@ -454,8 +450,8 @@ def invoke_agent_agentic(
        else:
            input_data = prompt
    else:
-        # claude -p: deliver prompt via stdin (same as codex).
-        # -p mode is non-interactive and reads from stdin, then exits.
+        # claude: pass the prompt on stdin; the agent reads and writes files
+        # directly in the worktree, which is the source of truth for results.
        input_data = prompt

    cmd_preview = " ".join(cmd[:6])