initial commit
This commit is contained in:
162
cross_eval/agent.py
Normal file
162
cross_eval/agent.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""Agent invocation via subprocess with live spinner."""
|
||||
from __future__ import annotations
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from cross_eval.models import AgentConfig, AgentResult
|
||||
|
||||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
||||
|
||||
# CLI tools that support the --system-prompt flag natively.
_SYSTEM_PROMPT_AGENTS = ("claude",)
# CLI tools that support reasoning-effort overrides.
_REASONING_EFFORT_AGENTS = ("codex",)


def _executable_name(command: str) -> str:
    """Return the final path component of *command*.

    Both ``/`` and ``\\`` are treated as separators so the result does not
    depend on the platform the check runs on. Trailing separators are ignored.
    """
    normalized = command.replace("\\", "/").rstrip("/")
    return normalized.rsplit("/", 1)[-1]


def _supports_system_prompt_flag(command: str) -> bool:
    """Check if the agent CLI supports the --system-prompt flag.

    Only the executable name is inspected, so a directory that happens to
    contain an agent name (e.g. ``/home/claude/bin/codex``) does not match.
    """
    executable = _executable_name(command)
    return any(name in executable for name in _SYSTEM_PROMPT_AGENTS)


def _supports_reasoning_effort(command: str) -> bool:
    """Check if the agent CLI supports reasoning effort overrides.

    Only the executable name is inspected, so a directory that happens to
    contain an agent name (e.g. ``/opt/codex/bin/claude``) does not match.
    """
    executable = _executable_name(command)
    return any(name in executable for name in _REASONING_EFFORT_AGENTS)
|
||||
|
||||
|
||||
class _Spinner:
    """Animated stderr spinner for long-running agent calls.

    A daemon thread redraws one status line (frame, message, elapsed whole
    seconds) until :meth:`stop` is called, which replaces that line with a
    final summary.
    """

    FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
    # Seconds between redraws of the status line.
    _INTERVAL = 0.1

    def __init__(self, message: str) -> None:
        """Store *message*; nothing is drawn until :meth:`start`."""
        self.message = message
        # Set by stop(); an Event lets the worker wake up immediately
        # instead of finishing a fixed sleep.
        self._stop_event = threading.Event()
        self._thread: Optional[threading.Thread] = None
        # None until start() is called, so stop() can tell "never started".
        self._start_time: Optional[float] = None
        # Visible width of the last status line drawn (0 = nothing drawn).
        self._last_width = 0

    def start(self) -> None:
        """Start the animation thread (no-op if it is already running)."""
        if self._thread is not None and self._thread.is_alive():
            return
        self._stop_event.clear()
        self._start_time = time.monotonic()
        self._thread = threading.Thread(target=self._spin, daemon=True)
        self._thread.start()

    def _spin(self) -> None:
        """Worker loop: redraw the status line until asked to stop."""
        for frame in itertools.cycle(self.FRAMES):
            if self._stop_event.is_set():
                break
            elapsed = int(time.monotonic() - self._start_time)
            line = f"\r {frame} {self.message} ({elapsed}s)"
            sys.stderr.write(line)
            sys.stderr.flush()
            # The leading "\r" occupies no column.
            self._last_width = len(line) - 1
            self._stop_event.wait(self._INTERVAL)

    def stop(self, final: str) -> None:
        """Stop animating and print *final* with the elapsed time.

        Safe to call without a prior :meth:`start`; the elapsed time is then
        reported as ``0.0``.
        """
        self._stop_event.set()
        if self._thread:
            self._thread.join(timeout=1)
        if self._start_time is None:
            elapsed = 0.0
        else:
            elapsed = round(time.monotonic() - self._start_time, 1)
        if self._last_width:
            # Blank out exactly what was drawn; a fixed-width run of spaces
            # would wrap on terminals narrower than that width.
            sys.stderr.write("\r" + " " * self._last_width + "\r")
            self._last_width = 0
        sys.stderr.write(f" \u2713 {final} ({elapsed}s)\n")
        sys.stderr.flush()
|
||||
|
||||
|
||||
# Longest single argument shown verbatim in command previews.
_PREVIEW_ARG_LIMIT = 80


def _preview_command(cmd: list[str], max_args: int) -> str:
    """Render the first *max_args* items of *cmd* for logs and error text.

    Arguments longer than ``_PREVIEW_ARG_LIMIT`` characters (typically an
    inline system prompt) are truncated so the preview stays short.
    """
    shown = [
        arg if len(arg) <= _PREVIEW_ARG_LIMIT else arg[:_PREVIEW_ARG_LIMIT] + "..."
        for arg in cmd[:max_args]
    ]
    return " ".join(shown)


def invoke_agent(
    agent: AgentConfig,
    prompt: str,
    step_name: str,
    cwd: Optional[Path] = None,
    timeout: int | None = None,
    quiet: bool = False,
) -> AgentResult:
    """Invoke an agent CLI with the given prompt.

    The command line is ``agent.command``, an optional reasoning-effort
    override, then ``agent.args``. The prompt is sent on stdin. A system
    prompt is passed via ``--system-prompt`` when the CLI supports it and is
    otherwise prepended to the prompt inside ``<system>`` tags.

    Args:
        agent: Agent configuration (command, args, name, system prompt,
            reasoning effort).
        prompt: User prompt written to the agent's stdin.
        step_name: Label used in spinner text, logs, errors and the result.
        cwd: Working directory for the subprocess, if any.
        timeout: Seconds to wait before the subprocess is timed out.
        quiet: If True, suppress spinner (for parallel execution).

    Returns:
        An ``AgentResult`` carrying the stripped stdout, the exit code, the
        agent and step names, and the duration rounded to 0.1 s.

    Raises:
        subprocess.TimeoutExpired: The agent exceeded *timeout*.
        RuntimeError: The agent exited with a non-zero status.
        Any other exception raised by ``subprocess.run`` propagates unchanged.
    """
    cmd = [agent.command]
    if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
        cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
    cmd.extend(agent.args)

    # Build the full prompt (system prompt + user prompt).
    input_data = prompt
    if agent.system_prompt:
        if _supports_system_prompt_flag(agent.command):
            # claude: --system-prompt flag supported natively
            cmd.extend(["--system-prompt", agent.system_prompt])
        else:
            # codex, others: no --system-prompt flag, prepend to prompt
            input_data = f"<system>\n{agent.system_prompt}\n</system>\n\n{prompt}"

    logger.debug(
        "Invoking agent '%s': %s", agent.name, _preview_command(cmd, 5) + " ..."
    )

    spinner: Optional[_Spinner] = None
    if not quiet:
        logger.info(" cmd: %s", _preview_command(cmd, 6))
        spinner = _Spinner(f"[{step_name}] {agent.name} running...")
        spinner.start()

    try:
        start = time.monotonic()
        result = subprocess.run(
            cmd,
            input=input_data,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
        )
        duration = time.monotonic() - start
    except subprocess.TimeoutExpired:
        if spinner:
            spinner.stop(f"[{step_name}] TIMEOUT after {timeout}s")
        raise
    except BaseException:
        # BaseException rather than Exception: on Ctrl-C (KeyboardInterrupt)
        # the spinner thread must also be stopped, or it keeps redrawing over
        # whatever the caller prints next.
        if spinner:
            spinner.stop(f"[{step_name}] ERROR")
        raise

    output = result.stdout.strip()
    chars = len(output)

    if result.returncode != 0:
        if spinner:
            spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
        # Prefer stderr for the diagnostic; fall back to stdout.
        err_detail = result.stderr.strip() or output
        if len(err_detail) > 500:
            err_detail = err_detail[:500] + "..."
        cmd_preview = _preview_command(cmd, 6)
        raise RuntimeError(
            f"Agent '{agent.name}' failed (exit code {result.returncode}) "
            f"at step '{step_name}':\n"
            f" cmd: {cmd_preview}\n"
            f" error: {err_detail or '(no output)'}"
        )

    if spinner:
        spinner.stop(f"[{step_name}] done — {chars} chars")

    if not output:
        logger.warning(
            "Agent '%s' produced empty output at step '%s'",
            agent.name,
            step_name,
        )

    return AgentResult(
        output=output,
        exit_code=result.returncode,
        agent_name=agent.name,
        step_name=step_name,
        duration_seconds=round(duration, 1),
    )
|
||||
Reference in New Issue
Block a user