release: cut 0.2.0 baseline

This commit is contained in:
chungyeong
2026-03-13 21:47:54 +09:00
parent 204e071b74
commit 941304398d
15 changed files with 1930 additions and 270 deletions

View File

@@ -3,8 +3,10 @@ from __future__ import annotations
import itertools
import logging
import os
import subprocess
import sys
import tempfile
import threading
import time
from pathlib import Path
@@ -142,11 +144,17 @@ class _Spinner:
sys.stderr.flush()
def _is_print_mode(args: list[str]) -> bool:
"""Check if the agent args include -p / --print flag."""
return "-p" in args or "--print" in args
def invoke_agent(
agent: AgentConfig,
prompt: str,
step_name: str,
cwd: Optional[Path] = None,
env: Optional[dict[str, str]] = None,
timeout: int | None = None,
quiet: bool = False,
) -> AgentResult:
@@ -155,30 +163,67 @@ def invoke_agent(
Args:
quiet: If True, suppress spinner (for parallel execution).
"""
is_claude = "claude" in agent.command
is_interactive = is_claude and not _is_print_mode(agent.args)
cmd = [agent.command]
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
cmd.extend(agent.args)
# Build the full prompt (system prompt + user prompt)
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
# claude: --system-prompt flag supported natively
cmd.extend(["--system-prompt", agent.system_prompt])
input_data = prompt
elif agent.system_prompt:
# codex, others: no --system-prompt flag, prepend to prompt
input_data = (
f"<system>\n{agent.system_prompt}\n</system>\n\n"
f"{prompt}"
# --- Temp files for interactive (non -p) claude ---
task_file: Optional[Path] = None
output_file: Optional[Path] = None
if is_interactive:
# Write prompt + output instruction to temp task file
task_fd, task_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_task_")
task_file = Path(task_path)
os.close(task_fd)
out_fd, out_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_out_")
output_file = Path(out_path)
os.close(out_fd)
# Clear the output file so we can detect if agent wrote to it
output_file.write_text("", encoding="utf-8")
wrapped_prompt = (
f"{prompt}\n\n"
f"---\n"
f"IMPORTANT: Write your COMPLETE response to this file: {output_file}\n"
f"Do NOT modify any other files in the project."
)
task_file.write_text(wrapped_prompt, encoding="utf-8")
# System prompt via flag
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
cmd.extend(["--system-prompt", agent.system_prompt])
# Positional arg: point claude to the task file
cmd.append(
f"Read the task file at {task_file} and follow all instructions in it. "
f"Write your complete output to {output_file}."
)
input_data: str | None = None
else:
input_data = prompt
# Print mode (-p) or non-claude: deliver prompt via stdin
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
cmd.extend(["--system-prompt", agent.system_prompt])
input_data = prompt
elif agent.system_prompt:
input_data = (
f"<system>\n{agent.system_prompt}\n</system>\n\n"
f"{prompt}"
)
else:
input_data = prompt
logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")
spinner: Optional[_Spinner] = None
if not quiet:
logger.info(" cmd: %s", " ".join(cmd[:6]))
mode_label = "interactive" if is_interactive else ""
logger.info(" cmd: %s %s", " ".join(cmd[:6]), f"({mode_label})" if mode_label else "")
spinner = _Spinner(f"[{step_name}] {agent.name} running...")
spinner.start()
@@ -191,6 +236,7 @@ def invoke_agent(
text=True,
timeout=timeout,
cwd=cwd,
env=env,
)
duration = time.monotonic() - start
except subprocess.TimeoutExpired:
@@ -201,10 +247,154 @@ def invoke_agent(
if spinner:
spinner.stop(f"[{step_name}] ERROR")
raise
finally:
if task_file:
task_file.unlink(missing_ok=True)
if result.returncode != 0:
if spinner:
spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
if output_file:
output_file.unlink(missing_ok=True)
err_detail = result.stderr.strip() or result.stdout.strip()
if err_detail and len(err_detail) > 500:
err_detail = err_detail[:500] + "..."
cmd_preview = " ".join(cmd[:6])
failure_type, suggested_action = _classify_agent_failure(err_detail or "")
raise AgentInvocationError(
agent_name=agent.name,
step_name=step_name,
cmd_preview=cmd_preview,
raw_error=err_detail or "(no output)",
failure_type=failure_type,
suggested_action=suggested_action,
)
# --- Capture output ---
if output_file:
output = output_file.read_text(encoding="utf-8").strip()
output_file.unlink(missing_ok=True)
if not output:
# Fallback to stdout if agent didn't write to the file
output = result.stdout.strip()
else:
output = result.stdout.strip()
output = result.stdout.strip()
chars = len(output)
if spinner:
spinner.stop(f"[{step_name}] done — {chars} chars")
if not output:
stderr_info = result.stderr.strip()
if stderr_info:
logger.warning(
"Agent '%s' produced empty output at step '%s'. stderr: %s",
agent.name, step_name, stderr_info[:500],
)
else:
logger.warning(
"Agent '%s' produced empty output at step '%s' (no stderr either)",
agent.name, step_name,
)
return AgentResult(
output=output,
exit_code=result.returncode,
agent_name=agent.name,
step_name=step_name,
duration_seconds=round(duration, 1),
)
def invoke_agent_agentic(
agent: AgentConfig,
prompt: str,
step_name: str,
worktree_path: Path,
env: Optional[dict[str, str]] = None,
timeout: int | None = None,
quiet: bool = False,
) -> AgentResult:
"""Invoke an agent in agentic mode (no -p, runs in worktree, captures git diff).
The agent runs without print mode so it can modify files directly.
After the agent exits, git diff (since last commit) is captured as the output.
"""
from cross_eval.worktree import capture_diff
# Write prompt to a temp file (outside worktree, won't appear in diffs)
import tempfile
task_fd, task_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_task_")
task_file = Path(task_path)
task_file.write_text(prompt, encoding="utf-8")
os.close(task_fd)
cmd = [agent.command]
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
# Strip stdin sentinel ("-") from args for agentic mode
args = [a for a in agent.args if a != "-"]
cmd.extend(args)
# System prompt via flag if supported
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
cmd.extend(["--system-prompt", agent.system_prompt])
# Deliver the prompt differently per agent type
is_codex = "codex" in agent.command
input_data: str | None = None
if is_codex:
# codex: stdin mode
cmd.append("-")
if agent.system_prompt and not _supports_system_prompt_flag(agent.command):
input_data = f"<system>\n{agent.system_prompt}\n</system>\n\n{prompt}"
else:
input_data = prompt
else:
# claude: use positional arg with a pointer to the task file
# (avoids OS arg length limits for large prompts)
cmd.append(
f"Read the task file at {task_file} and execute all instructions in it. "
f"Work in the current directory."
)
logger.debug(
"Invoking agent '%s' (agentic) in worktree: %s",
agent.name, worktree_path,
)
spinner: Optional[_Spinner] = None
if not quiet:
logger.info(" cmd: %s (agentic)", " ".join(cmd[:6]))
spinner = _Spinner(f"[{step_name}] {agent.name} (agentic) running...")
spinner.start()
try:
start = time.monotonic()
result = subprocess.run(
cmd,
input=input_data,
capture_output=True,
text=True,
timeout=timeout,
cwd=worktree_path,
env=env,
)
duration = time.monotonic() - start
except subprocess.TimeoutExpired:
if spinner:
spinner.stop(f"[{step_name}] TIMEOUT after {timeout}s")
raise
except Exception:
if spinner:
spinner.stop(f"[{step_name}] ERROR")
raise
finally:
# Clean up temp task file (it's in /tmp, not in worktree)
task_file.unlink(missing_ok=True)
if result.returncode != 0:
if spinner:
spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
@@ -222,17 +412,22 @@ def invoke_agent(
suggested_action=suggested_action,
)
if spinner:
spinner.stop(f"[{step_name}] done — {chars} chars")
# Capture git diff as the output (changes since last commit on the branch)
diff_output = capture_diff(worktree_path)
if not output:
if not diff_output:
diff_output = "(no changes)"
logger.warning(
"Agent '%s' produced empty output at step '%s'",
"Agent '%s' made no file changes at step '%s'",
agent.name, step_name,
)
chars = len(diff_output)
if spinner:
spinner.stop(f"[{step_name}] done — {chars} chars (agentic)")
return AgentResult(
output=output,
output=diff_output,
exit_code=result.returncode,
agent_name=agent.name,
step_name=step_name,