initial commit

2026-03-11 21:53:14 +09:00
commit ee4f1a07ef
42 changed files with 4533 additions and 0 deletions
--- a/cross_eval/agent.py
+++ b/cross_eval/agent.py
@@ -0,0 +1,162 @@
+"""Agent invocation via subprocess with live spinner."""
+from __future__ import annotations
+
+import itertools
+import logging
+import subprocess
+import sys
+import threading
+import time
+from pathlib import Path
+from typing import Optional
+
+from cross_eval.models import AgentConfig, AgentResult
+
+# Module-level logger used by invoke_agent for its debug/info/warning messages.
+logger = logging.getLogger(__name__)
+
+# Names of CLI tools that accept a native --system-prompt flag. A configured
+# command qualifies when any of these names occurs in it as a substring.
+_SYSTEM_PROMPT_AGENTS = ("claude",)
+# Names of CLI tools that are given a `-c model_reasoning_effort="..."`
+# override when a reasoning effort is configured; also matched by substring.
+_REASONING_EFFORT_AGENTS = ("codex",)
+
+
+def _supports_system_prompt_flag(command: str) -> bool:
+    """Return True when *command* names a CLI that takes ``--system-prompt``.
+
+    A command qualifies as soon as one entry of ``_SYSTEM_PROMPT_AGENTS``
+    occurs in it as a substring.
+    """
+    for agent_name in _SYSTEM_PROMPT_AGENTS:
+        if agent_name in command:
+            return True
+    return False
+
+
+def _supports_reasoning_effort(command: str) -> bool:
+    """Return True when *command* names a CLI that takes a reasoning effort.
+
+    The check is a substring match of *command* against every entry of
+    ``_REASONING_EFFORT_AGENTS``.
+    """
+    matching = [
+        agent_name
+        for agent_name in _REASONING_EFFORT_AGENTS
+        if agent_name in command
+    ]
+    return bool(matching)
+
+
+class _Spinner:
+    """Animated spinner for long-running agent calls.
+
+    ``start()`` launches a daemon thread that redraws one status line on
+    stderr roughly every ``_FRAME_INTERVAL`` seconds, showing the current
+    frame, the message and the whole seconds elapsed. ``stop()`` ends the
+    animation, blanks the line and prints a final summary line.
+    """
+
+    FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
+    # Carriage return, 160 spaces, carriage return: overwrites the status line.
+    _CLEAR_LINE = "\r" + (" " * 160) + "\r"
+    # Seconds between two animation frames.
+    _FRAME_INTERVAL = 0.1
+
+    def __init__(self, message: str) -> None:
+        """Store *message*; nothing is written until ``start()`` is called."""
+        self.message = message
+        # Set by stop(); an Event (rather than a bool plus sleep) lets the
+        # worker wake up immediately instead of finishing its current pause.
+        self._stop_requested = threading.Event()
+        self._thread: Optional[threading.Thread] = None
+        self._start_time = 0.0
+
+    def start(self) -> None:
+        """Record the start time and begin animating on a daemon thread."""
+        self._stop_requested.clear()
+        self._start_time = time.monotonic()
+        self._thread = threading.Thread(target=self._spin, daemon=True)
+        self._thread.start()
+
+    def _spin(self) -> None:
+        """Worker loop: redraw the status line until a stop is requested."""
+        for frame in itertools.cycle(self.FRAMES):
+            if self._stop_requested.is_set():
+                break
+            elapsed = int(time.monotonic() - self._start_time)
+            sys.stderr.write(f"\r  {frame} {self.message} ({elapsed}s)")
+            sys.stderr.flush()
+            # Returns early as soon as stop() sets the event.
+            self._stop_requested.wait(self._FRAME_INTERVAL)
+
+    def stop(self, final: str) -> None:
+        """Stop animating and print ``final`` with the elapsed time.
+
+        Safe to call without a prior ``start()``; the elapsed time is then
+        reported as 0.0 instead of the raw monotonic clock value.
+        """
+        self._stop_requested.set()
+        if self._thread is None:
+            elapsed = 0.0
+        else:
+            # Bounded wait so a stuck stderr write cannot hang the caller.
+            self._thread.join(timeout=1)
+            elapsed = round(time.monotonic() - self._start_time, 1)
+        sys.stderr.write(self._CLEAR_LINE)
+        sys.stderr.write(f"  \u2713 {final} ({elapsed}s)\n")
+        sys.stderr.flush()
+
+
+def invoke_agent(
+    agent: AgentConfig,
+    prompt: str,
+    step_name: str,
+    cwd: Optional[Path] = None,
+    timeout: int | None = None,
+    quiet: bool = False,
+) -> AgentResult:
+    """Invoke an agent CLI with the given prompt and capture its output.
+
+    The command line is ``agent.command``, an optional reasoning-effort
+    override, then ``agent.args``. The prompt is sent on the child's stdin.
+    A configured system prompt is passed via ``--system-prompt`` when the CLI
+    supports that flag, otherwise it is prepended to the prompt inside a
+    ``<system>`` block.
+
+    Args:
+        agent: Agent configuration (name, command, args, system prompt,
+            reasoning effort).
+        prompt: User prompt sent on stdin.
+        step_name: Label used in spinner text, log messages, error messages
+            and the returned result.
+        cwd: Working directory for the child process.
+        timeout: Seconds to wait for the child; None means no limit.
+        quiet: If True, suppress the spinner and the info-level command log
+            (for parallel execution).
+
+    Returns:
+        AgentResult holding the stripped stdout, the exit code, the agent and
+        step names, and the run duration rounded to 0.1 seconds.
+
+    Raises:
+        subprocess.TimeoutExpired: The child did not finish within *timeout*.
+        RuntimeError: The child exited with a non-zero status; the message
+            includes up to 500 characters of stderr (or stdout if stderr is
+            empty).
+        Exception: Anything else raised while running the child is
+            propagated unchanged.
+    """
+    cmd = [agent.command]
+    if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
+        cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
+    cmd.extend(agent.args)
+
+    # Decide how the system prompt (if any) reaches the agent.
+    input_data = prompt
+    if agent.system_prompt:
+        if _supports_system_prompt_flag(agent.command):
+            # claude: --system-prompt flag supported natively
+            cmd.extend(["--system-prompt", agent.system_prompt])
+        else:
+            # codex, others: no --system-prompt flag, prepend to prompt
+            input_data = (
+                f"<system>\n{agent.system_prompt}\n</system>\n\n"
+                f"{prompt}"
+            )
+
+    # NOTE(review): with few agent.args the previews below can include the
+    # full --system-prompt text, since only the element count is limited.
+    logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")
+
+    spinner: Optional[_Spinner] = None
+    if not quiet:
+        logger.info("  cmd: %s", " ".join(cmd[:6]))
+        spinner = _Spinner(f"[{step_name}] {agent.name} running...")
+        spinner.start()
+
+    try:
+        start = time.monotonic()
+        result = subprocess.run(
+            cmd,
+            input=input_data,
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+            cwd=cwd,
+        )
+        duration = time.monotonic() - start
+    except subprocess.TimeoutExpired:
+        if spinner:
+            spinner.stop(f"[{step_name}] TIMEOUT after {timeout}s")
+        raise
+    except BaseException:
+        # BaseException (not just Exception) so that KeyboardInterrupt and
+        # SystemExit also stop the spinner thread; the exception is always
+        # re-raised, this handler is cleanup only.
+        if spinner:
+            spinner.stop(f"[{step_name}] ERROR")
+        raise
+
+    output = result.stdout.strip()
+    chars = len(output)
+
+    if result.returncode != 0:
+        if spinner:
+            spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
+        # Prefer stderr for the diagnostic, fall back to stdout.
+        err_detail = result.stderr.strip() or result.stdout.strip()
+        if err_detail and len(err_detail) > 500:
+            err_detail = err_detail[:500] + "..."
+        cmd_preview = " ".join(cmd[:6])
+        raise RuntimeError(
+            f"Agent '{agent.name}' failed (exit code {result.returncode}) "
+            f"at step '{step_name}':\n"
+            f"  cmd: {cmd_preview}\n"
+            f"  error: {err_detail or '(no output)'}"
+        )
+
+    if spinner:
+        spinner.stop(f"[{step_name}] done — {chars} chars")
+
+    if not output:
+        logger.warning(
+            "Agent '%s' produced empty output at step '%s'",
+            agent.name, step_name,
+        )
+
+    return AgentResult(
+        output=output,
+        exit_code=result.returncode,
+        agent_name=agent.name,
+        step_name=step_name,
+        duration_seconds=round(duration, 1),
+    )