feat: add runtime discovery and execution traces

This commit is contained in:
chungyeong
2026-03-13 21:52:13 +09:00
parent 941304398d
commit 28dd794f54
35 changed files with 376 additions and 88 deletions

7
.gitignore vendored Normal file
View File

@@ -0,0 +1,7 @@
__pycache__/
*.py[cod]
.pytest_cache/
.idea/
output/
.cross-eval/output/
cross_eval.egg-info/

10
.idea/.gitignore generated vendored
View File

@@ -1,10 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml
# Ignored default folder with query files
/queries/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

14
.idea/cross-eval.iml generated
View File

@@ -1,14 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.12 (cross-eval)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>

View File

@@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
</profile>
</component>

View File

@@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml generated
View File

@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.12 (cross-eval)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (cross-eval)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml generated
View File

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/cross-eval.iml" filepath="$PROJECT_DIR$/.idea/cross-eval.iml" />
</modules>
</component>
</project>

View File

@@ -1,6 +0,0 @@
Metadata-Version: 2.4
Name: cross-eval
Version: 0.2.0
Summary: AI agent cross-evaluation CLI tool
Requires-Python: >=3.9
Requires-Dist: pyyaml>=6.0

View File

@@ -1,24 +0,0 @@
README.md
pyproject.toml
cross_eval/__init__.py
cross_eval/agent.py
cross_eval/cli.py
cross_eval/config.py
cross_eval/demo.py
cross_eval/doctor.py
cross_eval/models.py
cross_eval/pipeline.py
cross_eval/prompts.py
cross_eval/report.py
cross_eval/runtime_env.py
cross_eval/worktree.py
cross_eval.egg-info/PKG-INFO
cross_eval.egg-info/SOURCES.txt
cross_eval.egg-info/dependency_links.txt
cross_eval.egg-info/entry_points.txt
cross_eval.egg-info/requires.txt
cross_eval.egg-info/top_level.txt
tests/test_agentic.py
tests/test_config.py
tests/test_onboarding.py
tests/test_pipeline_integration.py

View File

@@ -1 +0,0 @@

View File

@@ -1,2 +0,0 @@
[console_scripts]
cross-eval = cross_eval.cli:main

View File

@@ -1 +0,0 @@
pyyaml>=6.0

View File

@@ -1 +0,0 @@
cross_eval

View File

@@ -218,6 +218,7 @@ def invoke_agent(
else: else:
input_data = prompt input_data = prompt
cmd_preview = " ".join(cmd[:6])
logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...") logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")
spinner: Optional[_Spinner] = None spinner: Optional[_Spinner] = None
@@ -259,7 +260,6 @@ def invoke_agent(
err_detail = result.stderr.strip() or result.stdout.strip() err_detail = result.stderr.strip() or result.stdout.strip()
if err_detail and len(err_detail) > 500: if err_detail and len(err_detail) > 500:
err_detail = err_detail[:500] + "..." err_detail = err_detail[:500] + "..."
cmd_preview = " ".join(cmd[:6])
failure_type, suggested_action = _classify_agent_failure(err_detail or "") failure_type, suggested_action = _classify_agent_failure(err_detail or "")
raise AgentInvocationError( raise AgentInvocationError(
agent_name=agent.name, agent_name=agent.name,
@@ -298,12 +298,20 @@ def invoke_agent(
agent.name, step_name, agent.name, step_name,
) )
transcript = _build_transcript(
command_preview=cmd_preview,
stdout=result.stdout,
stderr=result.stderr,
)
return AgentResult( return AgentResult(
output=output, output=output,
exit_code=result.returncode, exit_code=result.returncode,
agent_name=agent.name, agent_name=agent.name,
step_name=step_name, step_name=step_name,
duration_seconds=round(duration, 1), duration_seconds=round(duration, 1),
transcript=transcript,
command_preview=cmd_preview,
) )
@@ -360,6 +368,7 @@ def invoke_agent_agentic(
f"Work in the current directory." f"Work in the current directory."
) )
cmd_preview = " ".join(cmd[:6])
logger.debug( logger.debug(
"Invoking agent '%s' (agentic) in worktree: %s", "Invoking agent '%s' (agentic) in worktree: %s",
agent.name, worktree_path, agent.name, worktree_path,
@@ -401,7 +410,6 @@ def invoke_agent_agentic(
err_detail = result.stderr.strip() or result.stdout.strip() err_detail = result.stderr.strip() or result.stdout.strip()
if err_detail and len(err_detail) > 500: if err_detail and len(err_detail) > 500:
err_detail = err_detail[:500] + "..." err_detail = err_detail[:500] + "..."
cmd_preview = " ".join(cmd[:6])
failure_type, suggested_action = _classify_agent_failure(err_detail or "") failure_type, suggested_action = _classify_agent_failure(err_detail or "")
raise AgentInvocationError( raise AgentInvocationError(
agent_name=agent.name, agent_name=agent.name,
@@ -426,10 +434,47 @@ def invoke_agent_agentic(
if spinner: if spinner:
spinner.stop(f"[{step_name}] done — {chars} chars (agentic)") spinner.stop(f"[{step_name}] done — {chars} chars (agentic)")
transcript = _build_transcript(
command_preview=cmd_preview,
stdout=result.stdout,
stderr=result.stderr,
)
return AgentResult( return AgentResult(
output=diff_output, output=diff_output,
exit_code=result.returncode, exit_code=result.returncode,
agent_name=agent.name, agent_name=agent.name,
step_name=step_name, step_name=step_name,
duration_seconds=round(duration, 1), duration_seconds=round(duration, 1),
transcript=transcript,
command_preview=cmd_preview,
) )
def _build_transcript(
    *,
    command_preview: str,
    stdout: str,
    stderr: str,
) -> str:
    """Build a compact execution transcript for debugging/audit output.

    The transcript is a small Markdown document with three fenced sections:
    the command preview, the captured stdout and the captured stderr.

    Args:
        command_preview: Short, human-readable rendering of the command.
            An empty value is rendered as ``(unknown command)``.
        stdout: Captured standard output of the process.
        stderr: Captured standard error of the process.

    Returns:
        The Markdown transcript, terminated by a trailing newline.
    """

    def _body(stream: str) -> str:
        # Strip *before* applying the placeholder: a stream that only holds
        # whitespace (e.g. a lone trailing newline) is truthy, and would
        # otherwise be rendered as an empty fence instead of "(empty)".
        return (stream or "").strip() or "(empty)"

    sections = [
        "# Agent Execution Transcript",
        "",
        "## Command",
        "```",
        command_preview or "(unknown command)",
        "```",
        "",
        "## Stdout",
        "```",
        _body(stdout),
        "```",
        "",
        "## Stderr",
        "```",
        _body(stderr),
        "```",
        "",  # yields the trailing newline after the final fence
    ]
    return "\n".join(sections)

167
cross_eval/discovery.py Normal file
View File

@@ -0,0 +1,167 @@
"""Repository/service discovery helpers for autonomous execution prompts."""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from pathlib import Path
@dataclass
class RepoDiscovery:
    """Accumulator for stack signals gathered while scanning a repository.

    Every field starts out empty and gets its own fresh container per
    instance.
    """

    # Programming languages/runtimes, e.g. "python" or "node".
    languages: set[str] = field(default_factory=set)
    # Package manager names, e.g. "pip" or "npm".
    package_managers: set[str] = field(default_factory=set)
    # Database names inferred from manifests or environment variable names.
    databases: set[str] = field(default_factory=set)
    # Service names inferred from compose files.
    services: set[str] = field(default_factory=set)
    # Free-form, human-readable notes appended verbatim to the summary.
    hints: list[str] = field(default_factory=list)
def _read_text(path: Path) -> str:
    """Return the UTF-8 text of *path*, or ``""`` when it cannot be read.

    Both I/O failures and invalid UTF-8 are treated as "no content" so that
    callers can scan optional files without handling errors themselves.
    """
    try:
        content = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        content = ""
    return content
def _add_if_contains(target: set[str], content: str, mapping: dict[str, str]) -> None:
    """Add to *target* the mapped name of every needle occurring in *content*.

    Only *content* is lower-cased; needles are compared as written, so the
    keys of *mapping* are expected to be lower-case. Matching is by plain
    substring. *target* is mutated in place.
    """
    haystack = content.lower()
    target.update(label for needle, label in mapping.items() if needle in haystack)
def discover_repo(project_root: Path, env_names: set[str] | None = None) -> RepoDiscovery:
    """Infer runtime-relevant stack hints from common manifest/config files.

    Only a fixed set of files directly under *project_root* is inspected
    (plus ``prisma/schema.prisma``); detection is a best-effort substring
    heuristic and unreadable or malformed files are treated as empty.

    Args:
        project_root: Directory whose manifests are inspected.
        env_names: Names of known environment variables. Only the names are
            examined, case-insensitively. ``None`` means "no variables".

    Returns:
        A ``RepoDiscovery`` holding the detected languages, package
        managers, databases, container services and free-form hints.
    """
    discovery = RepoDiscovery()
    upper_env_names = {name.upper() for name in (env_names or set())}

    file_map = {
        "pyproject": project_root / "pyproject.toml",
        "requirements": project_root / "requirements.txt",
        "package": project_root / "package.json",
        "docker_compose": project_root / "docker-compose.yml",
        "docker_compose_alt": project_root / "docker-compose.yaml",
        "compose": project_root / "compose.yaml",
        # ``compose.yml`` is also a supported Compose file name.
        "compose_alt": project_root / "compose.yml",
        "prisma": project_root / "prisma" / "schema.prisma",
    }
    compose_keys = ("docker_compose", "docker_compose_alt", "compose", "compose_alt")

    # Read each existing manifest exactly once; everything below works on
    # this snapshot instead of going back to the file system.
    manifests = {
        name: _read_text(path)
        for name, path in file_map.items()
        if path.exists()
    }

    if "pyproject" in manifests or "requirements" in manifests:
        discovery.languages.add("python")
        # requirements.txt is a pip manifest too, so it implies pip just as
        # pyproject.toml does.
        discovery.package_managers.add("pip")

    # Parse package.json once. Valid JSON that is not an object (e.g. a list)
    # is treated like an empty manifest rather than crashing on ``.get``.
    package_json: dict = {}
    if "package" in manifests:
        discovery.languages.add("node")
        try:
            parsed = json.loads(manifests["package"] or "{}")
        except json.JSONDecodeError:
            parsed = {}
        if isinstance(parsed, dict):
            package_json = parsed

        # ``packageManager`` looks like "pnpm@8.6.0"; keep the part before
        # the version and fall back to npm when nothing usable remains.
        declared = package_json.get("packageManager")
        manager = declared.split("@", 1)[0].strip() if isinstance(declared, str) else ""
        discovery.package_managers.add(manager or "npm")

    combined = "\n".join(manifests.values())
    _add_if_contains(
        discovery.databases,
        combined,
        {
            "psycopg": "postgresql",
            "asyncpg": "postgresql",
            "postgres": "postgresql",
            "mysql": "mysql",
            "pymongo": "mongodb",
            "mongodb": "mongodb",
            "mongoengine": "mongodb",
            "clickhouse": "clickhouse",
            "clickhouse-driver": "clickhouse",
            "clickhouse_connect": "clickhouse",
            "redis": "redis",
        },
    )

    if "package" in manifests:
        dep_names: list[str] = []
        for section in ("dependencies", "devDependencies"):
            entries = package_json.get(section)
            # Ignore malformed sections (null, list, string, ...).
            if isinstance(entries, dict):
                dep_names.extend(entries)
        # NOTE(review): matching is by substring, so "pg" also fires for any
        # dependency name that merely contains "pg", and "prisma" is mapped
        # to postgresql regardless of the configured provider.
        _add_if_contains(
            discovery.databases,
            "\n".join(dep_names),
            {
                "pg": "postgresql",
                "mysql": "mysql",
                "mongoose": "mongodb",
                "mongodb": "mongodb",
                "@clickhouse/client": "clickhouse",
                "redis": "redis",
                "prisma": "postgresql",
            },
        )

    for env_name in upper_env_names:
        if "CLICKHOUSE" in env_name or env_name.startswith("CH_"):
            discovery.databases.add("clickhouse")
        if "POSTGRES" in env_name or env_name.startswith("PG") or env_name == "DATABASE_URL":
            discovery.databases.add("postgresql")
        if "MYSQL" in env_name:
            discovery.databases.add("mysql")
        if "MONGO" in env_name:
            discovery.databases.add("mongodb")
        if "REDIS" in env_name:
            discovery.databases.add("redis")

    compose_blob = "\n".join(manifests.get(key, "") for key in compose_keys)
    _add_if_contains(
        discovery.services,
        compose_blob,
        {
            "clickhouse": "clickhouse",
            "postgres": "postgresql",
            "mysql": "mysql",
            "mongo": "mongodb",
            "redis": "redis",
        },
    )

    if "prisma" in manifests:
        discovery.hints.append("Prisma schema detected.")
    if (project_root / "alembic.ini").exists():
        discovery.hints.append("Alembic migration config detected.")
    if (project_root / "docker").exists() or discovery.services:
        discovery.hints.append("Containerized services may be available for local verification.")

    return discovery
def format_repo_discovery(discovery: RepoDiscovery) -> str:
    """Render discovery results into a compact prompt summary.

    Each non-empty category becomes one line with its entries sorted and
    comma-separated; hints follow verbatim, one per line. When nothing at
    all was detected a fixed fallback sentence is returned instead.
    """
    labelled = (
        ("Detected languages: ", discovery.languages),
        ("Likely package managers: ", discovery.package_managers),
        ("Detected databases/services in code or env: ", discovery.databases),
        ("Detected local service containers: ", discovery.services),
    )
    summary = [
        prefix + ", ".join(sorted(values))
        for prefix, values in labelled
        if values
    ]
    if discovery.hints:
        summary.extend(discovery.hints)

    if summary:
        return "\n".join(summary)
    return "No strong runtime/service signals were detected from repository manifests."

View File

@@ -88,6 +88,8 @@ class AgentResult:
agent_name: str agent_name: str
step_name: str step_name: str
duration_seconds: float duration_seconds: float
transcript: str = ""
command_preview: str = ""
@dataclass @dataclass

View File

@@ -13,6 +13,7 @@ from pathlib import Path
from cross_eval.agent import AgentInvocationError, invoke_agent, invoke_agent_agentic from cross_eval.agent import AgentInvocationError, invoke_agent, invoke_agent_agentic
from cross_eval.worktree import WorktreeError from cross_eval.worktree import WorktreeError
from cross_eval.config import try_reload_config from cross_eval.config import try_reload_config
from cross_eval.discovery import discover_repo, format_repo_discovery
from cross_eval.models import ( from cross_eval.models import (
AgentConfig, AgentConfig,
AgentResult, AgentResult,
@@ -804,6 +805,7 @@ def _execute_step(
# 8. Save to disk # 8. Save to disk
_save_step_output(run_dir, output_iter, step.name, result.output) _save_step_output(run_dir, output_iter, step.name, result.output)
_maybe_save_step_transcript(run_dir, output_iter, step.name, result)
def _execute_parallel_batch( def _execute_parallel_batch(
@@ -929,6 +931,7 @@ def _execute_parallel_batch(
step.name, r.duration_seconds, len(r.output), step.name, r.duration_seconds, len(r.output),
) )
_save_step_output(run_dir, output_iter, step.name, r.output) _save_step_output(run_dir, output_iter, step.name, r.output)
_maybe_save_step_transcript(run_dir, output_iter, step.name, r)
if errors: if errors:
spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)") spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
@@ -1001,10 +1004,12 @@ def _build_runtime_inputs(
) -> dict[str, str]: ) -> dict[str, str]:
"""Load runtime env and expose safe execution hints to prompts.""" """Load runtime env and expose safe execution hints to prompts."""
env, loaded_files, loaded_values = build_runtime_environment(config.execution, cwd) env, loaded_files, loaded_values = build_runtime_environment(config.execution, cwd)
discovery = discover_repo(cwd, set(loaded_values) | set(env))
input_contents["execution_policy"] = build_execution_policy(config.execution) input_contents["execution_policy"] = build_execution_policy(config.execution)
input_contents["environment_context"] = summarize_environment( input_contents["environment_context"] = summarize_environment(
config.execution, loaded_files, env, loaded_values, config.execution, loaded_files, env, loaded_values,
) )
input_contents["repo_discovery"] = format_repo_discovery(discovery)
return env return env
@@ -1018,6 +1023,8 @@ def _augment_prompt_with_runtime_context(
extras.append("## Execution Policy\n" + context["execution_policy"]) extras.append("## Execution Policy\n" + context["execution_policy"])
if context.get("environment_context"): if context.get("environment_context"):
extras.append("## Environment Context\n" + context["environment_context"]) extras.append("## Environment Context\n" + context["environment_context"])
if context.get("repo_discovery"):
extras.append("## Repository Discovery\n" + context["repo_discovery"])
if not extras: if not extras:
return prompt return prompt
return prompt.rstrip() + "\n\n" + "\n\n".join(extras) + "\n" return prompt.rstrip() + "\n\n" + "\n\n".join(extras) + "\n"
@@ -1198,6 +1205,20 @@ def _save_step_output(
return path return path
def _maybe_save_step_transcript(
    run_dir: Path,
    iteration: int,
    step_name: str,
    result: AgentResult,
) -> Path | None:
    """Write the result's transcript next to the step output, if there is one.

    Returns whatever ``_save_step_output`` returns for the transcript, or
    ``None`` when the result carries no transcript.
    """
    transcript = result.transcript
    if transcript:
        # Stored under a derived step name with the "_transcript" suffix.
        return _save_step_output(
            run_dir, iteration, f"{step_name}_transcript", transcript,
        )
    return None
def _format_runtime_error_markdown( def _format_runtime_error_markdown(
exc: Exception, exc: Exception,
*, *,

View File

@@ -0,0 +1,132 @@
from __future__ import annotations
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch
from cross_eval.agent import invoke_agent
from cross_eval.config import BUILTIN_AGENTS
from cross_eval.discovery import discover_repo, format_repo_discovery
from cross_eval.models import AgentConfig, AgentResult, PipelineConfig
from cross_eval.pipeline import run_pipeline
from cross_eval.prompts import _build_simple_preset
from cross_eval.runtime_env import build_runtime_environment, summarize_environment
class RuntimeEnvTest(unittest.TestCase):
    """Checks for dotenv loading and the rendered environment summary."""

    def test_build_runtime_environment_loads_dotenv_values(self) -> None:
        """Values from a ``.env`` file show up in the env and loaded values."""
        dotenv_body = "CLICKHOUSE_URL=http://localhost:8123\nDATABASE_URL=postgres://db\n"
        with tempfile.TemporaryDirectory() as tmpdir:
            project_dir = Path(tmpdir)
            dotenv_path = project_dir / ".env"
            dotenv_path.write_text(dotenv_body, encoding="utf-8")

            execution = PipelineConfig().execution
            env, loaded_files, loaded_values = build_runtime_environment(execution, project_dir)

            first_loaded = loaded_files[0]
            self.assertEqual(first_loaded.name, ".env")
            self.assertEqual(loaded_values["CLICKHOUSE_URL"], "http://localhost:8123")
            self.assertEqual(env["DATABASE_URL"], "postgres://db")

    def test_summarize_environment_mentions_clickhouse_from_env(self) -> None:
        """The summary names the variable and flags it as ClickHouse-related."""
        execution = PipelineConfig().execution
        loaded_files = [Path("/tmp/.env")]
        env = {"CLICKHOUSE_URL": "http://localhost:8123"}
        loaded_values = {"CLICKHOUSE_URL": "http://localhost:8123"}

        summary = summarize_environment(execution, loaded_files, env, loaded_values)

        for expected in ("CLICKHOUSE_URL", "ClickHouse-related"):
            self.assertIn(expected, summary)
class RepoDiscoveryTest(unittest.TestCase):
    """Checks for manifest-based repository discovery."""

    def test_discover_repo_detects_python_postgres_and_clickhouse(self) -> None:
        """A pyproject plus a compose file yield language, DB and service hits."""
        pyproject_body = (
            '[project]\nname = "svc"\ndependencies = ["psycopg", "clickhouse-driver"]\n'
        )
        compose_body = (
            "services:\n db:\n image: postgres:16\n ch:\n image: clickhouse/clickhouse-server:latest\n"
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            project_dir = Path(tmpdir)
            (project_dir / "pyproject.toml").write_text(pyproject_body, encoding="utf-8")
            (project_dir / "docker-compose.yml").write_text(compose_body, encoding="utf-8")

            found = discover_repo(project_dir, {"DATABASE_URL", "CLICKHOUSE_URL"})
            rendered = format_repo_discovery(found)

            self.assertIn("python", found.languages)
            self.assertIn("postgresql", found.databases)
            self.assertIn("clickhouse", found.databases)
            self.assertIn("Detected local service containers", rendered)
class PromptContextTest(unittest.TestCase):
    """Checks that runtime context sections reach the agent prompts."""

    def test_run_pipeline_injects_env_and_discovery_context_into_prompt(self) -> None:
        """Run the simple preset with a stubbed agent and inspect the prompts."""
        with tempfile.TemporaryDirectory() as tmpdir:
            project_dir = Path(tmpdir)
            (project_dir / ".env").write_text(
                "CLICKHOUSE_URL=http://localhost:8123\n", encoding="utf-8",
            )

            preset_steps = _build_simple_preset(["claude-coder"], ["claude-reviewer"], [])
            config = PipelineConfig(
                output_dir=project_dir / "out",
                max_iterations=1,
                language="en",
                inputs={"plan": "Plan", "checklist": "Checklist"},
                agents=dict(BUILTIN_AGENTS.items()),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=preset_steps,
                preset_name="simple",
            )

            seen_prompts: list[str] = []

            def _record_and_reply(agent_config, prompt, step_name, **kwargs):
                # Capture each prompt and answer with a canned result so no
                # real agent process is started.
                seen_prompts.append(prompt)
                if step_name == "review":
                    reply = "VERDICT: PASS"
                else:
                    reply = "coding output"
                return AgentResult(
                    output=reply,
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=0.1,
                    transcript="# Agent Execution Transcript",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_record_and_reply):
                run_pipeline(config, cwd=project_dir)

            all_prompts = "\n".join(seen_prompts)
            expected_fragments = (
                "Execution Policy",
                "Environment Context",
                "Repository Discovery",
                "ClickHouse-related environment variables are available",
            )
            for fragment in expected_fragments:
                self.assertIn(fragment, all_prompts)
            # Checked inside the ``with`` block: the directory is removed on exit.
            self.assertTrue((project_dir / "out").exists())
class AgentTranscriptTest(unittest.TestCase):
    """Checks that ``invoke_agent`` attaches an execution transcript."""

    def test_invoke_agent_records_transcript(self) -> None:
        """Stdout and stderr of the (stubbed) process end up in the transcript."""

        class _Completed:
            # Minimal stand-in exposing the attributes read from the
            # subprocess result in this scenario.
            returncode = 0
            stdout = "hello"
            stderr = "warn"

        def _stub_run(cmd, **kwargs):
            return _Completed()

        reviewer = AgentConfig(
            name="codex-reviewer",
            command="codex",
            args=["exec", "--model", "gpt-5.4", "-"],
        )

        with patch("subprocess.run", side_effect=_stub_run):
            outcome = invoke_agent(reviewer, "prompt", "review", quiet=True)

        transcript = outcome.transcript
        for expected in ("## Command", "hello", "warn"):
            self.assertIn(expected, transcript)
# Run the test cases above when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()