"""Agent invocation via subprocess with live spinner.""" from __future__ import annotations import itertools import logging import subprocess import sys import threading import time from pathlib import Path from typing import Optional from cross_eval.models import AgentConfig, AgentResult logger = logging.getLogger(__name__) # CLI tools that support --system-prompt flag natively _SYSTEM_PROMPT_AGENTS = ("claude",) _REASONING_EFFORT_AGENTS = ("codex",) def _supports_system_prompt_flag(command: str) -> bool: """Check if the agent CLI supports --system-prompt flag.""" return any(name in command for name in _SYSTEM_PROMPT_AGENTS) def _supports_reasoning_effort(command: str) -> bool: """Check if the agent CLI supports reasoning effort overrides.""" return any(name in command for name in _REASONING_EFFORT_AGENTS) class _Spinner: """Animated spinner for long-running agent calls.""" FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏" _CLEAR_LINE = "\r" + (" " * 160) + "\r" def __init__(self, message: str) -> None: self.message = message self._running = False self._thread: Optional[threading.Thread] = None self._start_time = 0.0 def start(self) -> None: self._running = True self._start_time = time.monotonic() self._thread = threading.Thread(target=self._spin, daemon=True) self._thread.start() def _spin(self) -> None: for frame in itertools.cycle(self.FRAMES): if not self._running: break elapsed = int(time.monotonic() - self._start_time) line = f"\r {frame} {self.message} ({elapsed}s)" sys.stderr.write(line) sys.stderr.flush() time.sleep(0.1) def stop(self, final: str) -> None: self._running = False if self._thread: self._thread.join(timeout=1) elapsed = round(time.monotonic() - self._start_time, 1) sys.stderr.write(self._CLEAR_LINE) sys.stderr.write(f" \u2713 {final} ({elapsed}s)\n") sys.stderr.flush() def invoke_agent( agent: AgentConfig, prompt: str, step_name: str, cwd: Optional[Path] = None, timeout: int | None = None, quiet: bool = False, ) -> AgentResult: """Invoke an agent CLI with the given prompt. Args: quiet: If True, suppress spinner (for parallel execution). """ cmd = [agent.command] if agent.reasoning_effort and _supports_reasoning_effort(agent.command): cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"']) cmd.extend(agent.args) # Build the full prompt (system prompt + user prompt) if agent.system_prompt and _supports_system_prompt_flag(agent.command): # claude: --system-prompt flag supported natively cmd.extend(["--system-prompt", agent.system_prompt]) input_data = prompt elif agent.system_prompt: # codex, others: no --system-prompt flag, prepend to prompt input_data = ( f"\n{agent.system_prompt}\n\n\n" f"{prompt}" ) else: input_data = prompt logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...") spinner: Optional[_Spinner] = None if not quiet: logger.info(" cmd: %s", " ".join(cmd[:6])) spinner = _Spinner(f"[{step_name}] {agent.name} running...") spinner.start() try: start = time.monotonic() result = subprocess.run( cmd, input=input_data, capture_output=True, text=True, timeout=timeout, cwd=cwd, ) duration = time.monotonic() - start except subprocess.TimeoutExpired: if spinner: spinner.stop(f"[{step_name}] TIMEOUT after {timeout}s") raise except Exception: if spinner: spinner.stop(f"[{step_name}] ERROR") raise output = result.stdout.strip() chars = len(output) if result.returncode != 0: if spinner: spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})") err_detail = result.stderr.strip() or result.stdout.strip() if err_detail and len(err_detail) > 500: err_detail = err_detail[:500] + "..." cmd_preview = " ".join(cmd[:6]) raise RuntimeError( f"Agent '{agent.name}' failed (exit code {result.returncode}) " f"at step '{step_name}':\n" f" cmd: {cmd_preview}\n" f" error: {err_detail or '(no output)'}" ) if spinner: spinner.stop(f"[{step_name}] done — {chars} chars") if not output: logger.warning( "Agent '%s' produced empty output at step '%s'", agent.name, step_name, ) return AgentResult( output=output, exit_code=result.returncode, agent_name=agent.name, step_name=step_name, duration_seconds=round(duration, 1), )