cross-eval/cross_eval/agent.py

"""Agent invocation via subprocess with live spinner."""
from __future__ import annotations

import itertools
import logging
import subprocess
import sys
import threading
import time
from pathlib import Path
from typing import Optional

from cross_eval.models import AgentConfig, AgentResult

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Name fragments of CLI tools that accept a native --system-prompt flag.
# Entries are matched as substrings of the configured agent command.
_SYSTEM_PROMPT_AGENTS = ("claude",)
# Name fragments of CLI tools that accept a reasoning-effort override
# (passed as `-c model_reasoning_effort="..."` when the agent config sets one).
# Entries are matched as substrings of the configured agent command.
_REASONING_EFFORT_AGENTS = ("codex",)


def _supports_system_prompt_flag(command: str) -> bool:
    """Return True when *command* names a CLI that accepts ``--system-prompt``.

    The check is a plain substring match of each entry in
    ``_SYSTEM_PROMPT_AGENTS`` against the command string.
    """
    for agent_name in _SYSTEM_PROMPT_AGENTS:
        if agent_name in command:
            return True
    return False


def _supports_reasoning_effort(command: str) -> bool:
    """Return True when *command* names a CLI that takes reasoning-effort overrides.

    The check is a plain substring match of each entry in
    ``_REASONING_EFFORT_AGENTS`` against the command string.
    """
    for agent_name in _REASONING_EFFORT_AGENTS:
        if agent_name in command:
            return True
    return False


class _Spinner:
    """Animated spinner for long-running agent calls."""

    FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
    _CLEAR_LINE = "\r" + (" " * 160) + "\r"

    def __init__(self, message: str) -> None:
        self.message = message
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._start_time = 0.0

    def start(self) -> None:
        self._running = True
        self._start_time = time.monotonic()
        self._thread = threading.Thread(target=self._spin, daemon=True)
        self._thread.start()

    def _spin(self) -> None:
        for frame in itertools.cycle(self.FRAMES):
            if not self._running:
                break
            elapsed = int(time.monotonic() - self._start_time)
            line = f"\r  {frame} {self.message} ({elapsed}s)"
            sys.stderr.write(line)
            sys.stderr.flush()
            time.sleep(0.1)

    def stop(self, final: str) -> None:
        self._running = False
        if self._thread:
            self._thread.join(timeout=1)
        elapsed = round(time.monotonic() - self._start_time, 1)
        sys.stderr.write(self._CLEAR_LINE)
        sys.stderr.write(f"  \u2713 {final} ({elapsed}s)\n")
        sys.stderr.flush()


def _build_invocation(agent: AgentConfig, prompt: str) -> tuple[list[str], str]:
    """Return the argv list and the stdin payload for one agent call."""
    cmd = [agent.command]
    if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
        cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
    cmd.extend(agent.args)

    if not agent.system_prompt:
        return cmd, prompt
    if _supports_system_prompt_flag(agent.command):
        # claude: --system-prompt flag supported natively
        cmd.extend(["--system-prompt", agent.system_prompt])
        return cmd, prompt
    # codex, others: no --system-prompt flag, prepend to prompt
    return cmd, f"<system>\n{agent.system_prompt}\n</system>\n\n{prompt}"


def invoke_agent(
    agent: AgentConfig,
    prompt: str,
    step_name: str,
    cwd: Optional[Path] = None,
    timeout: int | None = None,
    quiet: bool = False,
) -> AgentResult:
    """Invoke an agent CLI with the given prompt and return its output.

    The prompt (optionally prefixed with a ``<system>`` section when the CLI
    has no native system-prompt flag) is written to the process's stdin;
    stdout and stderr are captured as text.

    Args:
        agent: Agent configuration; ``command``, ``args``, ``name``,
            ``system_prompt`` and ``reasoning_effort`` are read.
        prompt: User prompt sent to the agent.
        step_name: Label used in spinner text, log lines and error messages.
        cwd: Working directory for the subprocess (``None`` = inherit).
        timeout: Passed to ``subprocess.run``; ``None`` means no limit.
        quiet: If True, suppress spinner (for parallel execution).

    Returns:
        An ``AgentResult`` holding the stripped stdout, the exit code, the
        agent and step names, and the run duration rounded to 0.1s.

    Raises:
        subprocess.TimeoutExpired: The process exceeded ``timeout``.
        RuntimeError: The process exited with a non-zero code.
        Exception: Anything else raised by ``subprocess.run`` propagates
            unchanged.
    """
    cmd, input_data = _build_invocation(agent, prompt)

    # NOTE(review): the command previews below take the first few argv
    # items, which can include the system prompt text when few args precede
    # it -- confirm that is acceptable for logs.
    logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")

    spinner: Optional[_Spinner] = None
    if not quiet:
        logger.info("  cmd: %s", " ".join(cmd[:6]))
        spinner = _Spinner(f"[{step_name}] {agent.name} running...")
        spinner.start()

    def finish(status: str) -> None:
        """Stop the spinner (if any) with a step-prefixed status line."""
        if spinner:
            spinner.stop(f"[{step_name}] {status}")

    start = time.monotonic()
    try:
        result = subprocess.run(
            cmd,
            input=input_data,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
        )
    except subprocess.TimeoutExpired:
        finish(f"TIMEOUT after {timeout}s")
        raise
    except KeyboardInterrupt:
        # Not an Exception subclass: without this branch Ctrl-C would leave
        # the spinner thread drawing over whatever the caller prints next.
        finish("INTERRUPTED")
        raise
    except BaseException:
        # Also covers SystemExit so the spinner never outlives the call.
        finish("ERROR")
        raise
    duration = time.monotonic() - start

    output = result.stdout.strip()

    if result.returncode != 0:
        finish(f"FAILED (exit {result.returncode})")
        # Prefer stderr for diagnostics; fall back to stdout if it is empty.
        err_detail = result.stderr.strip() or output
        if len(err_detail) > 500:
            err_detail = err_detail[:500] + "..."
        cmd_preview = " ".join(cmd[:6])
        raise RuntimeError(
            f"Agent '{agent.name}' failed (exit code {result.returncode}) "
            f"at step '{step_name}':\n"
            f"  cmd: {cmd_preview}\n"
            f"  error: {err_detail or '(no output)'}"
        )

    finish(f"done — {len(output)} chars")

    if not output:
        logger.warning(
            "Agent '%s' produced empty output at step '%s'",
            agent.name, step_name,
        )

    return AgentResult(
        output=output,
        exit_code=result.returncode,
        agent_name=agent.name,
        step_name=step_name,
        duration_seconds=round(duration, 1),
    )