Compare commits
10 Commits
941304398d
...
7b95233edf
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7b95233edf | ||
|
|
87bc0ffbfb | ||
|
|
c467222a2a | ||
|
|
99cbf171aa | ||
|
|
d5fcc258b7 | ||
|
|
290eace01b | ||
|
|
ecf44b4c07 | ||
|
|
b19d174c98 | ||
|
|
3fb19e90c0 | ||
|
|
28dd794f54 |
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
.pytest_cache/
|
||||||
|
.idea/
|
||||||
|
output/
|
||||||
|
.cross-eval/output/
|
||||||
|
cross_eval.egg-info/
|
||||||
10
.idea/.gitignore
generated
vendored
10
.idea/.gitignore
generated
vendored
@@ -1,10 +0,0 @@
|
|||||||
# Default ignored files
|
|
||||||
/shelf/
|
|
||||||
/workspace.xml
|
|
||||||
# Ignored default folder with query files
|
|
||||||
/queries/
|
|
||||||
# Datasource local storage ignored files
|
|
||||||
/dataSources/
|
|
||||||
/dataSources.local.xml
|
|
||||||
# Editor-based HTTP Client requests
|
|
||||||
/httpRequests/
|
|
||||||
14
.idea/cross-eval.iml
generated
14
.idea/cross-eval.iml
generated
@@ -1,14 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<module type="PYTHON_MODULE" version="4">
|
|
||||||
<component name="NewModuleRootManager">
|
|
||||||
<content url="file://$MODULE_DIR$">
|
|
||||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
|
||||||
</content>
|
|
||||||
<orderEntry type="jdk" jdkName="Python 3.12 (cross-eval)" jdkType="Python SDK" />
|
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
|
||||||
</component>
|
|
||||||
<component name="PyDocumentationSettings">
|
|
||||||
<option name="format" value="PLAIN" />
|
|
||||||
<option name="myDocStringFormat" value="Plain" />
|
|
||||||
</component>
|
|
||||||
</module>
|
|
||||||
6
.idea/inspectionProfiles/Project_Default.xml
generated
6
.idea/inspectionProfiles/Project_Default.xml
generated
@@ -1,6 +0,0 @@
|
|||||||
<component name="InspectionProjectProfileManager">
|
|
||||||
<profile version="1.0">
|
|
||||||
<option name="myName" value="Project Default" />
|
|
||||||
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
|
|
||||||
</profile>
|
|
||||||
</component>
|
|
||||||
6
.idea/inspectionProfiles/profiles_settings.xml
generated
6
.idea/inspectionProfiles/profiles_settings.xml
generated
@@ -1,6 +0,0 @@
|
|||||||
<component name="InspectionProjectProfileManager">
|
|
||||||
<settings>
|
|
||||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
|
||||||
<version value="1.0" />
|
|
||||||
</settings>
|
|
||||||
</component>
|
|
||||||
7
.idea/misc.xml
generated
7
.idea/misc.xml
generated
@@ -1,7 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="Black">
|
|
||||||
<option name="sdkName" value="Python 3.12 (cross-eval)" />
|
|
||||||
</component>
|
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (cross-eval)" project-jdk-type="Python SDK" />
|
|
||||||
</project>
|
|
||||||
8
.idea/modules.xml
generated
8
.idea/modules.xml
generated
@@ -1,8 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="ProjectModuleManager">
|
|
||||||
<modules>
|
|
||||||
<module fileurl="file://$PROJECT_DIR$/.idea/cross-eval.iml" filepath="$PROJECT_DIR$/.idea/cross-eval.iml" />
|
|
||||||
</modules>
|
|
||||||
</component>
|
|
||||||
</project>
|
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
Metadata-Version: 2.4
|
|
||||||
Name: cross-eval
|
|
||||||
Version: 0.2.0
|
|
||||||
Summary: AI agent cross-evaluation CLI tool
|
|
||||||
Requires-Python: >=3.9
|
|
||||||
Requires-Dist: pyyaml>=6.0
|
|
||||||
@@ -1,24 +0,0 @@
|
|||||||
README.md
|
|
||||||
pyproject.toml
|
|
||||||
cross_eval/__init__.py
|
|
||||||
cross_eval/agent.py
|
|
||||||
cross_eval/cli.py
|
|
||||||
cross_eval/config.py
|
|
||||||
cross_eval/demo.py
|
|
||||||
cross_eval/doctor.py
|
|
||||||
cross_eval/models.py
|
|
||||||
cross_eval/pipeline.py
|
|
||||||
cross_eval/prompts.py
|
|
||||||
cross_eval/report.py
|
|
||||||
cross_eval/runtime_env.py
|
|
||||||
cross_eval/worktree.py
|
|
||||||
cross_eval.egg-info/PKG-INFO
|
|
||||||
cross_eval.egg-info/SOURCES.txt
|
|
||||||
cross_eval.egg-info/dependency_links.txt
|
|
||||||
cross_eval.egg-info/entry_points.txt
|
|
||||||
cross_eval.egg-info/requires.txt
|
|
||||||
cross_eval.egg-info/top_level.txt
|
|
||||||
tests/test_agentic.py
|
|
||||||
tests/test_config.py
|
|
||||||
tests/test_onboarding.py
|
|
||||||
tests/test_pipeline_integration.py
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
|
|
||||||
@@ -1,2 +0,0 @@
|
|||||||
[console_scripts]
|
|
||||||
cross-eval = cross_eval.cli:main
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
pyyaml>=6.0
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
cross_eval
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -19,6 +19,61 @@ logger = logging.getLogger(__name__)
|
|||||||
# CLI tools that support --system-prompt flag natively
|
# CLI tools that support --system-prompt flag natively
|
||||||
_SYSTEM_PROMPT_AGENTS = ("claude",)
|
_SYSTEM_PROMPT_AGENTS = ("claude",)
|
||||||
_REASONING_EFFORT_AGENTS = ("codex",)
|
_REASONING_EFFORT_AGENTS = ("codex",)
|
||||||
|
_NO_CHANGE_ACK_MARKERS = (
|
||||||
|
"no changes",
|
||||||
|
"no code changes",
|
||||||
|
"no file changes",
|
||||||
|
"did not make any changes",
|
||||||
|
"nothing to change",
|
||||||
|
"no modifications were necessary",
|
||||||
|
"no update was necessary",
|
||||||
|
"already satisfied",
|
||||||
|
"no changes needed",
|
||||||
|
"no fixes needed",
|
||||||
|
"everything is correct",
|
||||||
|
"code is correct as-is",
|
||||||
|
"already correct",
|
||||||
|
"no action required",
|
||||||
|
)
|
||||||
|
_CHANGE_CLAIM_MARKERS = (
|
||||||
|
"summary of all changes made",
|
||||||
|
"here's a summary of all changes made",
|
||||||
|
"here is a summary of all changes",
|
||||||
|
"implemented",
|
||||||
|
"i implemented",
|
||||||
|
"i've implemented",
|
||||||
|
"added",
|
||||||
|
"i added",
|
||||||
|
"i've added",
|
||||||
|
"updated",
|
||||||
|
"i updated",
|
||||||
|
"i've updated",
|
||||||
|
"modified",
|
||||||
|
"i modified",
|
||||||
|
"i've modified",
|
||||||
|
"created",
|
||||||
|
"i created",
|
||||||
|
"i've created",
|
||||||
|
"fixed",
|
||||||
|
"i fixed",
|
||||||
|
"i've fixed",
|
||||||
|
"completed the changes",
|
||||||
|
"finished the changes",
|
||||||
|
"made the following changes",
|
||||||
|
"applied the fix",
|
||||||
|
"changes have been applied",
|
||||||
|
"wrote the code",
|
||||||
|
"refactored",
|
||||||
|
"i refactored",
|
||||||
|
"completed all the changes",
|
||||||
|
"finished implementing",
|
||||||
|
"all tasks completed",
|
||||||
|
"done with the implementation",
|
||||||
|
"successfully implemented",
|
||||||
|
"completed the implementation",
|
||||||
|
"all changes have been made",
|
||||||
|
"changes are complete",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class AgentInvocationError(RuntimeError):
|
class AgentInvocationError(RuntimeError):
|
||||||
@@ -106,6 +161,39 @@ def _classify_agent_failure(detail: str) -> tuple[str, str]:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_WRITE_FAILURE_MARKERS = (
|
||||||
|
"permission denied",
|
||||||
|
"read-only file system",
|
||||||
|
"read only file system",
|
||||||
|
"operation not permitted",
|
||||||
|
"cannot write",
|
||||||
|
"failed to write",
|
||||||
|
"could not write",
|
||||||
|
"unable to write",
|
||||||
|
"sandbox",
|
||||||
|
"eacces",
|
||||||
|
"erofs",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _has_write_failure_indicators(stderr: str) -> bool:
    """Report whether *stderr* mentions any known file-write failure marker.

    The comparison is case-insensitive: the text is lowercased and each entry
    of ``_WRITE_FAILURE_MARKERS`` is looked up as a plain substring.

    Returns:
        ``False`` for blank/whitespace-only input or when no marker occurs,
        ``True`` as soon as one marker is found.
    """
    # Nothing to inspect when the stream is blank.
    if stderr.strip() == "":
        return False

    lowered = stderr.lower()
    for marker in _WRITE_FAILURE_MARKERS:
        if marker in lowered:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _claims_file_changes(output: str) -> bool:
    """Heuristically decide whether agent text asserts that code was changed.

    Matching is a case-insensitive substring search. Any phrase from
    ``_NO_CHANGE_ACK_MARKERS`` takes precedence and makes the result
    ``False``; otherwise the result is ``True`` when at least one phrase from
    ``_CHANGE_CLAIM_MARKERS`` occurs.

    Returns:
        ``False`` for blank input, for text acknowledging that no change was
        made, or when no change-claim phrase is present; ``True`` otherwise.
    """
    text = output.lower()
    if not text.strip():
        return False

    # An explicit "nothing changed" acknowledgement overrides any claim phrase.
    for ack in _NO_CHANGE_ACK_MARKERS:
        if ack in text:
            return False

    for claim in _CHANGE_CLAIM_MARKERS:
        if claim in text:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
class _Spinner:
|
class _Spinner:
|
||||||
"""Animated spinner for long-running agent calls."""
|
"""Animated spinner for long-running agent calls."""
|
||||||
|
|
||||||
@@ -218,6 +306,7 @@ def invoke_agent(
|
|||||||
else:
|
else:
|
||||||
input_data = prompt
|
input_data = prompt
|
||||||
|
|
||||||
|
cmd_preview = " ".join(cmd[:6])
|
||||||
logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")
|
logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")
|
||||||
|
|
||||||
spinner: Optional[_Spinner] = None
|
spinner: Optional[_Spinner] = None
|
||||||
@@ -259,7 +348,6 @@ def invoke_agent(
|
|||||||
err_detail = result.stderr.strip() or result.stdout.strip()
|
err_detail = result.stderr.strip() or result.stdout.strip()
|
||||||
if err_detail and len(err_detail) > 500:
|
if err_detail and len(err_detail) > 500:
|
||||||
err_detail = err_detail[:500] + "..."
|
err_detail = err_detail[:500] + "..."
|
||||||
cmd_preview = " ".join(cmd[:6])
|
|
||||||
failure_type, suggested_action = _classify_agent_failure(err_detail or "")
|
failure_type, suggested_action = _classify_agent_failure(err_detail or "")
|
||||||
raise AgentInvocationError(
|
raise AgentInvocationError(
|
||||||
agent_name=agent.name,
|
agent_name=agent.name,
|
||||||
@@ -298,12 +386,23 @@ def invoke_agent(
|
|||||||
agent.name, step_name,
|
agent.name, step_name,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
transcript = _build_transcript(
|
||||||
|
command_preview=cmd_preview,
|
||||||
|
stdout=result.stdout,
|
||||||
|
stderr=result.stderr,
|
||||||
|
exit_code=result.returncode,
|
||||||
|
duration_seconds=round(duration, 1),
|
||||||
|
cwd=str(cwd) if cwd else "",
|
||||||
|
)
|
||||||
|
|
||||||
return AgentResult(
|
return AgentResult(
|
||||||
output=output,
|
output=output,
|
||||||
exit_code=result.returncode,
|
exit_code=result.returncode,
|
||||||
agent_name=agent.name,
|
agent_name=agent.name,
|
||||||
step_name=step_name,
|
step_name=step_name,
|
||||||
duration_seconds=round(duration, 1),
|
duration_seconds=round(duration, 1),
|
||||||
|
transcript=transcript,
|
||||||
|
command_preview=cmd_preview,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -316,11 +415,7 @@ def invoke_agent_agentic(
|
|||||||
timeout: int | None = None,
|
timeout: int | None = None,
|
||||||
quiet: bool = False,
|
quiet: bool = False,
|
||||||
) -> AgentResult:
|
) -> AgentResult:
|
||||||
"""Invoke an agent in agentic mode (no -p, runs in worktree, captures git diff).
|
"""Invoke an agent in agentic mode using the worktree as the source of truth."""
|
||||||
|
|
||||||
The agent runs without print mode so it can modify files directly.
|
|
||||||
After the agent exits, git diff (since last commit) is captured as the output.
|
|
||||||
"""
|
|
||||||
from cross_eval.worktree import capture_diff
|
from cross_eval.worktree import capture_diff
|
||||||
|
|
||||||
# Write prompt to a temp file (outside worktree, won't appear in diffs)
|
# Write prompt to a temp file (outside worktree, won't appear in diffs)
|
||||||
@@ -334,8 +429,10 @@ def invoke_agent_agentic(
|
|||||||
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
|
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
|
||||||
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
|
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
|
||||||
|
|
||||||
# Strip stdin sentinel ("-") from args for agentic mode
|
# Strip print-mode flags and stdin sentinels for agentic mode.
|
||||||
args = [a for a in agent.args if a != "-"]
|
# Agentic runs should operate on the worktree and return a real git diff,
|
||||||
|
# not behave as a one-shot text completer.
|
||||||
|
args = [a for a in agent.args if a not in {"-", "-p", "--print"}]
|
||||||
cmd.extend(args)
|
cmd.extend(args)
|
||||||
|
|
||||||
# System prompt via flag if supported
|
# System prompt via flag if supported
|
||||||
@@ -353,13 +450,11 @@ def invoke_agent_agentic(
|
|||||||
else:
|
else:
|
||||||
input_data = prompt
|
input_data = prompt
|
||||||
else:
|
else:
|
||||||
# claude: use positional arg with a pointer to the task file
|
# claude: deliver the task through stdin and let the worktree be the
|
||||||
# (avoids OS arg length limits for large prompts)
|
# canonical place where files are read/written.
|
||||||
cmd.append(
|
input_data = prompt
|
||||||
f"Read the task file at {task_file} and execute all instructions in it. "
|
|
||||||
f"Work in the current directory."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
cmd_preview = " ".join(cmd[:6])
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Invoking agent '%s' (agentic) in worktree: %s",
|
"Invoking agent '%s' (agentic) in worktree: %s",
|
||||||
agent.name, worktree_path,
|
agent.name, worktree_path,
|
||||||
@@ -401,7 +496,6 @@ def invoke_agent_agentic(
|
|||||||
err_detail = result.stderr.strip() or result.stdout.strip()
|
err_detail = result.stderr.strip() or result.stdout.strip()
|
||||||
if err_detail and len(err_detail) > 500:
|
if err_detail and len(err_detail) > 500:
|
||||||
err_detail = err_detail[:500] + "..."
|
err_detail = err_detail[:500] + "..."
|
||||||
cmd_preview = " ".join(cmd[:6])
|
|
||||||
failure_type, suggested_action = _classify_agent_failure(err_detail or "")
|
failure_type, suggested_action = _classify_agent_failure(err_detail or "")
|
||||||
raise AgentInvocationError(
|
raise AgentInvocationError(
|
||||||
agent_name=agent.name,
|
agent_name=agent.name,
|
||||||
@@ -416,6 +510,46 @@ def invoke_agent_agentic(
|
|||||||
diff_output = capture_diff(worktree_path)
|
diff_output = capture_diff(worktree_path)
|
||||||
|
|
||||||
if not diff_output:
|
if not diff_output:
|
||||||
|
stdout_excerpt = (result.stdout or "").strip()
|
||||||
|
stderr_excerpt = (result.stderr or "").strip()
|
||||||
|
|
||||||
|
# Detect two failure modes:
|
||||||
|
# 1. Agent claims changes in stdout but produced no diff
|
||||||
|
# 2. Agent stderr contains permission or write-failure indicators
|
||||||
|
claims_changes = _claims_file_changes(stdout_excerpt)
|
||||||
|
has_write_failure = _has_write_failure_indicators(stderr_excerpt)
|
||||||
|
|
||||||
|
if claims_changes or has_write_failure:
|
||||||
|
if spinner:
|
||||||
|
spinner.stop(f"[{step_name}] FAILED (empty diff)")
|
||||||
|
raw_error = stdout_excerpt or "(stdout empty)"
|
||||||
|
if stderr_excerpt:
|
||||||
|
raw_error = f"{raw_error}\n\n[stderr]\n{stderr_excerpt}"
|
||||||
|
if len(raw_error) > 2000:
|
||||||
|
raw_error = raw_error[:2000] + "..."
|
||||||
|
|
||||||
|
if has_write_failure:
|
||||||
|
failure_type = "WRITE_FAILURE"
|
||||||
|
suggested_action = (
|
||||||
|
"Agent encountered file write errors (permission denied, read-only, "
|
||||||
|
"or sandbox restriction). Check agent permissions and worktree state."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
failure_type = "EMPTY_DIFF"
|
||||||
|
suggested_action = (
|
||||||
|
"Agent reported code changes but produced no git diff. "
|
||||||
|
"Treat this run as failed and require a real worktree diff before continuing."
|
||||||
|
)
|
||||||
|
|
||||||
|
raise AgentInvocationError(
|
||||||
|
agent_name=agent.name,
|
||||||
|
step_name=step_name,
|
||||||
|
cmd_preview=cmd_preview,
|
||||||
|
raw_error=raw_error,
|
||||||
|
failure_type=failure_type,
|
||||||
|
suggested_action=suggested_action,
|
||||||
|
)
|
||||||
|
|
||||||
diff_output = "(no changes)"
|
diff_output = "(no changes)"
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Agent '%s' made no file changes at step '%s'",
|
"Agent '%s' made no file changes at step '%s'",
|
||||||
@@ -426,10 +560,63 @@ def invoke_agent_agentic(
|
|||||||
if spinner:
|
if spinner:
|
||||||
spinner.stop(f"[{step_name}] done — {chars} chars (agentic)")
|
spinner.stop(f"[{step_name}] done — {chars} chars (agentic)")
|
||||||
|
|
||||||
|
transcript = _build_transcript(
|
||||||
|
command_preview=cmd_preview,
|
||||||
|
stdout=result.stdout,
|
||||||
|
stderr=result.stderr,
|
||||||
|
exit_code=result.returncode,
|
||||||
|
duration_seconds=round(duration, 1),
|
||||||
|
cwd=str(worktree_path),
|
||||||
|
)
|
||||||
|
|
||||||
return AgentResult(
|
return AgentResult(
|
||||||
output=diff_output,
|
output=diff_output,
|
||||||
exit_code=result.returncode,
|
exit_code=result.returncode,
|
||||||
agent_name=agent.name,
|
agent_name=agent.name,
|
||||||
step_name=step_name,
|
step_name=step_name,
|
||||||
duration_seconds=round(duration, 1),
|
duration_seconds=round(duration, 1),
|
||||||
|
transcript=transcript,
|
||||||
|
command_preview=cmd_preview,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_transcript(
|
||||||
|
*,
|
||||||
|
command_preview: str,
|
||||||
|
stdout: str,
|
||||||
|
stderr: str,
|
||||||
|
exit_code: int = 0,
|
||||||
|
duration_seconds: float = 0.0,
|
||||||
|
cwd: str = "",
|
||||||
|
) -> str:
|
||||||
|
"""Build a compact execution transcript for debugging/audit output."""
|
||||||
|
sections = [
|
||||||
|
"# Agent Execution Transcript",
|
||||||
|
"",
|
||||||
|
"## Command",
|
||||||
|
"```",
|
||||||
|
command_preview or "(unknown command)",
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
if cwd:
|
||||||
|
sections.extend(["## Working Directory", f"`{cwd}`", ""])
|
||||||
|
sections.extend([
|
||||||
|
f"## Exit Code: {exit_code}",
|
||||||
|
"",
|
||||||
|
])
|
||||||
|
if duration_seconds > 0:
|
||||||
|
sections.extend([f"## Duration: {duration_seconds}s", ""])
|
||||||
|
sections.extend([
|
||||||
|
"## Stdout",
|
||||||
|
"```",
|
||||||
|
(stdout or "(empty)").strip(),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
"## Stderr",
|
||||||
|
"```",
|
||||||
|
(stderr or "(empty)").strip(),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
])
|
||||||
|
return "\n".join(sections)
|
||||||
|
|||||||
@@ -266,7 +266,7 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
type=int,
|
type=int,
|
||||||
default=None,
|
default=None,
|
||||||
metavar="SEC",
|
metavar="SEC",
|
||||||
help="에이전트 호출 제한 시간 (--live 전용)",
|
help="에이전트 1회 호출 제한 시간(초). 0=무제한 (기본: 무제한, --live 전용)",
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- run ---
|
# --- run ---
|
||||||
@@ -981,6 +981,7 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
print(f"No files found in: {docs_dir}", file=sys.stderr)
|
print(f"No files found in: {docs_dir}", file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
config.inputs["docs"] = docs_content
|
config.inputs["docs"] = docs_content
|
||||||
|
config.inputs["docs_ref"] = str(docs_dir)
|
||||||
|
|
||||||
if args.env_files:
|
if args.env_files:
|
||||||
for env_file in args.env_files:
|
for env_file in args.env_files:
|
||||||
@@ -1007,7 +1008,6 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
apply_input_overrides(config, overrides)
|
apply_input_overrides(config, overrides)
|
||||||
|
|
||||||
# 3. Validate after all overrides
|
# 3. Validate after all overrides
|
||||||
from cross_eval.config import validate_config
|
|
||||||
errors = validate_config(config)
|
errors = validate_config(config)
|
||||||
if errors:
|
if errors:
|
||||||
print("Config error:\n " + "\n ".join(errors), file=sys.stderr)
|
print("Config error:\n " + "\n ".join(errors), file=sys.stderr)
|
||||||
|
|||||||
@@ -77,17 +77,20 @@ _CODER_SYSTEM_PROMPT = (
|
|||||||
"Rules:\n"
|
"Rules:\n"
|
||||||
"1. FIRST explore the project directory to understand the existing codebase, "
|
"1. FIRST explore the project directory to understand the existing codebase, "
|
||||||
"patterns, and conventions before writing any code.\n"
|
"patterns, and conventions before writing any code.\n"
|
||||||
"2. You may decide which shell, Python, git, docker, test, and database commands "
|
"2. You MUST use the Edit and Write tools to make ACTUAL file changes. "
|
||||||
|
"Do NOT just describe or explain changes in text — apply them directly to the files. "
|
||||||
|
"Your text output alone has no effect; only tool-based edits count.\n"
|
||||||
|
"3. You may decide which shell, Python, git, docker, test, and database commands "
|
||||||
"to run. The user does not need to pre-specify exact commands.\n"
|
"to run. The user does not need to pre-specify exact commands.\n"
|
||||||
"3. Environment variables from configured .env files may already be loaded into "
|
"4. Environment variables from configured .env files may already be loaded into "
|
||||||
"your process; use them when validating services such as ClickHouse.\n"
|
"your process; use them when validating services such as ClickHouse.\n"
|
||||||
"4. Implement ONLY what the plan specifies. Do NOT add extra features, "
|
"5. Implement ONLY what the plan specifies. Do NOT add extra features, "
|
||||||
"unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
|
"unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
|
||||||
"5. Follow the project's existing coding style, naming conventions, and directory structure.\n"
|
"6. Follow the project's existing coding style, naming conventions, and directory structure.\n"
|
||||||
"6. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
|
"7. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
|
||||||
"Do NOT refactor unrelated code.\n"
|
"Do NOT refactor unrelated code.\n"
|
||||||
"7. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
|
"8. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
|
||||||
"8. When in doubt about scope, do LESS, not more."
|
"9. When in doubt about scope, do LESS, not more."
|
||||||
)
|
)
|
||||||
|
|
||||||
_REVIEWER_SYSTEM_PROMPT = (
|
_REVIEWER_SYSTEM_PROMPT = (
|
||||||
@@ -695,9 +698,9 @@ def _validate_unique_step_fields(
|
|||||||
|
|
||||||
|
|
||||||
def _make_agentic(agent: AgentConfig) -> None:
|
def _make_agentic(agent: AgentConfig) -> None:
|
||||||
"""Convert an agent to agentic mode in-place (remove -p, set agentic=True)."""
|
"""Convert an agent to agentic mode in-place."""
|
||||||
agent.agentic = True
|
agent.agentic = True
|
||||||
agent.args = [a for a in agent.args if a != "-p"]
|
agent.args = [a for a in agent.args if a not in {"-p", "--print"}]
|
||||||
|
|
||||||
|
|
||||||
def sync_phased_iterations(
|
def sync_phased_iterations(
|
||||||
|
|||||||
@@ -217,7 +217,7 @@ def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None:
|
|||||||
|
|
||||||
if show_escalate:
|
if show_escalate:
|
||||||
print(f"\n{RED}{BOLD}{'=' * 50}")
|
print(f"\n{RED}{BOLD}{'=' * 50}")
|
||||||
print(f" Escalation Report")
|
print(" Escalation Report")
|
||||||
print(f"{'=' * 50}{RESET}")
|
print(f"{'=' * 50}{RESET}")
|
||||||
print(f"{YELLOW}Human review required.{RESET}")
|
print(f"{YELLOW}Human review required.{RESET}")
|
||||||
print(f" {RED}•{RESET} Requirements are ambiguous — needs stakeholder clarification")
|
print(f" {RED}•{RESET} Requirements are ambiguous — needs stakeholder clarification")
|
||||||
|
|||||||
330
cross_eval/discovery.py
Normal file
330
cross_eval/discovery.py
Normal file
@@ -0,0 +1,330 @@
|
|||||||
|
"""Repository/service discovery helpers for autonomous execution prompts."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RepoDiscovery:
    """Accumulator for stack hints collected while inspecting a repository.

    Every field starts out empty; each instance gets its own fresh containers.
    """

    # Detected programming languages / runtimes (e.g. "python", "node").
    languages: set[str] = field(default_factory=set)
    # Detected package managers.
    package_managers: set[str] = field(default_factory=set)
    # Detected database engines.
    databases: set[str] = field(default_factory=set)
    # Detected auxiliary services.
    services: set[str] = field(default_factory=set)
    # Detected application frameworks.
    frameworks: set[str] = field(default_factory=set)
    # Ordered free-form hint strings.
    hints: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
def _read_text(path: Path) -> str:
|
||||||
|
try:
|
||||||
|
return path.read_text(encoding="utf-8")
|
||||||
|
except (OSError, UnicodeDecodeError):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _add_if_contains(target: set[str], content: str, mapping: dict[str, str]) -> None:
|
||||||
|
lowered = content.lower()
|
||||||
|
for needle, name in mapping.items():
|
||||||
|
if needle in lowered:
|
||||||
|
target.add(name)
|
||||||
|
|
||||||
|
|
||||||
|
# Shared mapping for database signals found in manifest content
|
||||||
|
_MANIFEST_DB_SIGNALS: dict[str, str] = {
|
||||||
|
# PostgreSQL
|
||||||
|
"psycopg": "postgresql",
|
||||||
|
"asyncpg": "postgresql",
|
||||||
|
"postgres": "postgresql",
|
||||||
|
"pgx": "postgresql",
|
||||||
|
# MySQL / MariaDB
|
||||||
|
"mysql": "mysql",
|
||||||
|
"mariadb": "mysql",
|
||||||
|
"pymysql": "mysql",
|
||||||
|
# MongoDB
|
||||||
|
"pymongo": "mongodb",
|
||||||
|
"mongodb": "mongodb",
|
||||||
|
"mongoengine": "mongodb",
|
||||||
|
"mongosh": "mongodb",
|
||||||
|
# ClickHouse
|
||||||
|
"clickhouse": "clickhouse",
|
||||||
|
"clickhouse-driver": "clickhouse",
|
||||||
|
"clickhouse_connect": "clickhouse",
|
||||||
|
# Redis
|
||||||
|
"redis": "redis",
|
||||||
|
"ioredis": "redis",
|
||||||
|
# SQLite
|
||||||
|
"sqlite": "sqlite",
|
||||||
|
"better-sqlite3": "sqlite",
|
||||||
|
"aiosqlite": "sqlite",
|
||||||
|
# Elasticsearch / OpenSearch
|
||||||
|
"elasticsearch": "elasticsearch",
|
||||||
|
"opensearch": "elasticsearch",
|
||||||
|
# DynamoDB
|
||||||
|
"dynamodb": "dynamodb",
|
||||||
|
"boto3": "dynamodb", # broad but common signal
|
||||||
|
# Cassandra
|
||||||
|
"cassandra-driver": "cassandra",
|
||||||
|
"cassandra": "cassandra",
|
||||||
|
# RabbitMQ
|
||||||
|
"amqplib": "rabbitmq",
|
||||||
|
"pika": "rabbitmq",
|
||||||
|
"rabbitmq": "rabbitmq",
|
||||||
|
# Kafka
|
||||||
|
"kafka": "kafka",
|
||||||
|
"confluent-kafka": "kafka",
|
||||||
|
"kafkajs": "kafka",
|
||||||
|
# Neo4j
|
||||||
|
"neo4j": "neo4j",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Node package.json dependency → database mapping
|
||||||
|
_NODE_DEP_DB_SIGNALS: dict[str, str] = {
|
||||||
|
"pg": "postgresql",
|
||||||
|
"mysql": "mysql",
|
||||||
|
"mysql2": "mysql",
|
||||||
|
"mongoose": "mongodb",
|
||||||
|
"mongodb": "mongodb",
|
||||||
|
"@clickhouse/client": "clickhouse",
|
||||||
|
"redis": "redis",
|
||||||
|
"ioredis": "redis",
|
||||||
|
"prisma": "postgresql",
|
||||||
|
"better-sqlite3": "sqlite",
|
||||||
|
"sqlite3": "sqlite",
|
||||||
|
"@elastic/elasticsearch": "elasticsearch",
|
||||||
|
"@aws-sdk/client-dynamodb": "dynamodb",
|
||||||
|
"kafkajs": "kafka",
|
||||||
|
"amqplib": "rabbitmq",
|
||||||
|
"neo4j-driver": "neo4j",
|
||||||
|
"cassandra-driver": "cassandra",
|
||||||
|
"typeorm": "postgresql",
|
||||||
|
"sequelize": "postgresql",
|
||||||
|
"knex": "postgresql",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Docker compose service image → service name mapping
|
||||||
|
_COMPOSE_SERVICE_SIGNALS: dict[str, str] = {
|
||||||
|
"clickhouse": "clickhouse",
|
||||||
|
"postgres": "postgresql",
|
||||||
|
"mysql": "mysql",
|
||||||
|
"mariadb": "mysql",
|
||||||
|
"mongo": "mongodb",
|
||||||
|
"redis": "redis",
|
||||||
|
"elasticsearch": "elasticsearch",
|
||||||
|
"opensearch": "elasticsearch",
|
||||||
|
"rabbitmq": "rabbitmq",
|
||||||
|
"kafka": "kafka",
|
||||||
|
"zookeeper": "kafka",
|
||||||
|
"cassandra": "cassandra",
|
||||||
|
"neo4j": "neo4j",
|
||||||
|
"minio": "s3",
|
||||||
|
"localstack": "aws-local",
|
||||||
|
"dynamodb": "dynamodb",
|
||||||
|
"memcached": "memcached",
|
||||||
|
"nginx": "nginx",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Environment variable name patterns → database mapping
|
||||||
|
_ENV_DB_PATTERNS: list[tuple[str, str]] = [
|
||||||
|
("CLICKHOUSE", "clickhouse"),
|
||||||
|
("CH_", "clickhouse"),
|
||||||
|
("POSTGRES", "postgresql"),
|
||||||
|
("PG", "postgresql"),
|
||||||
|
("DATABASE_URL", "postgresql"),
|
||||||
|
("MYSQL", "mysql"),
|
||||||
|
("MARIADB", "mysql"),
|
||||||
|
("MONGO", "mongodb"),
|
||||||
|
("REDIS", "redis"),
|
||||||
|
("ELASTICSEARCH", "elasticsearch"),
|
||||||
|
("OPENSEARCH", "elasticsearch"),
|
||||||
|
("DYNAMO", "dynamodb"),
|
||||||
|
("CASSANDRA", "cassandra"),
|
||||||
|
("KAFKA", "kafka"),
|
||||||
|
("RABBIT", "rabbitmq"),
|
||||||
|
("AMQP", "rabbitmq"),
|
||||||
|
("NEO4J", "neo4j"),
|
||||||
|
("SQLITE", "sqlite"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def discover_repo(project_root: Path, env_names: set[str] | None = None) -> RepoDiscovery:
    """Infer runtime-relevant stack hints from common manifest/config files.

    Only a fixed set of well-known paths directly under ``project_root`` is
    probed; the directory tree is not walked.

    Args:
        project_root: Root directory of the repository to inspect.
        env_names: Optional environment variable *names* (values are not
            needed). Names are upper-cased before matching.

    Returns:
        A ``RepoDiscovery`` whose ``languages``, ``package_managers``,
        ``databases``, ``services`` and ``frameworks`` sets and ``hints`` list
        are filled from whatever signals were found.
    """
    discovery = RepoDiscovery()
    env_names = {name.upper() for name in (env_names or set())}

    file_map: dict[str, Path] = {
        "pyproject": project_root / "pyproject.toml",
        "requirements": project_root / "requirements.txt",
        "requirements_dev": project_root / "requirements-dev.txt",
        "setup_py": project_root / "setup.py",
        "setup_cfg": project_root / "setup.cfg",
        "package": project_root / "package.json",
        "go_mod": project_root / "go.mod",
        "cargo": project_root / "Cargo.toml",
        "gemfile": project_root / "Gemfile",
        "build_gradle": project_root / "build.gradle",
        "build_gradle_kts": project_root / "build.gradle.kts",
        "pom": project_root / "pom.xml",
        "composer": project_root / "composer.json",
        "mix": project_root / "mix.exs",
        "docker_compose": project_root / "docker-compose.yml",
        "docker_compose_alt": project_root / "docker-compose.yaml",
        "compose": project_root / "compose.yaml",
        # compose.yml is an equally valid default Compose file name.
        "compose_alt": project_root / "compose.yml",
        "prisma": project_root / "prisma" / "schema.prisma",
        "dockerfile": project_root / "Dockerfile",
    }
    compose_keys = ("docker_compose", "docker_compose_alt", "compose", "compose_alt")

    # Probe the filesystem once per path instead of on every check below.
    present = {name for name, path in file_map.items() if path.exists()}

    # ---- Gather manifest content (each file is read exactly once) ----
    # Iterate file_map (not the set) so the combined text has a stable order.
    manifests = {
        name: _read_text(path) or ""
        for name, path in file_map.items()
        if name in present
    }
    combined = "\n".join(manifests.values())

    # ---- Parse package.json once; tolerate malformed or non-object JSON ----
    package_json: dict = {}
    if "package" in present:
        try:
            parsed = json.loads(manifests["package"] or "{}")
        except json.JSONDecodeError:
            parsed = {}
        if isinstance(parsed, dict):
            package_json = parsed

    # ---- Language detection ----
    if present & {"pyproject", "requirements", "requirements_dev", "setup_py", "setup_cfg"}:
        discovery.languages.add("python")
    if present & {"build_gradle", "build_gradle_kts", "pom"}:
        discovery.languages.add("java")
    for key, language in (
        ("package", "node"),
        ("go_mod", "go"),
        ("cargo", "rust"),
        ("gemfile", "ruby"),
        ("composer", "php"),
        ("mix", "elixir"),
    ):
        if key in present:
            discovery.languages.add(language)

    # ---- Package manager detection ----
    if present & {"pyproject", "requirements", "setup_py"}:
        discovery.package_managers.add("pip")
    if "package" in present:
        declared = package_json.get("packageManager")
        # "pnpm@8.15.0" -> "pnpm"; anything unusable falls back to lockfiles.
        declared_name = declared.split("@", 1)[0].strip() if isinstance(declared, str) else ""
        if declared_name:
            discovery.package_managers.add(declared_name)
        elif (project_root / "pnpm-lock.yaml").exists():
            discovery.package_managers.add("pnpm")
        elif (project_root / "yarn.lock").exists():
            discovery.package_managers.add("yarn")
        else:
            discovery.package_managers.add("npm")
    if present & {"build_gradle", "build_gradle_kts"}:
        discovery.package_managers.add("gradle")
    for key, manager in (
        ("go_mod", "go"),
        ("cargo", "cargo"),
        ("gemfile", "bundler"),
        ("pom", "maven"),
        ("composer", "composer"),
        ("mix", "mix"),
    ):
        if key in present:
            discovery.package_managers.add(manager)

    # ---- Database detection from manifest content ----
    _add_if_contains(discovery.databases, combined, _MANIFEST_DB_SIGNALS)

    # ---- Node.js dependency-specific detection ----
    if "package" in present:
        deps: dict = {}
        for section in ("dependencies", "devDependencies"):
            entries = package_json.get(section)
            # A hand-edited manifest may hold a list/string here; skip it
            # rather than crash while merging.
            if isinstance(entries, dict):
                deps.update(entries)
        dep_blob = "\n".join(deps).lower()
        _add_if_contains(discovery.databases, dep_blob, _NODE_DEP_DB_SIGNALS)

    # ---- Framework detection from manifest content ----
    _add_if_contains(
        discovery.frameworks,
        combined,
        {
            "fastapi": "fastapi",
            "django": "django",
            "flask": "flask",
            "express": "express",
            "nextjs": "next.js",
            "next": "next.js",
            "nestjs": "nestjs",
            "spring": "spring",
            "rails": "rails",
            "laravel": "laravel",
            "phoenix": "phoenix",
            "gin": "gin",
            "actix": "actix",
        },
    )

    # ---- Database detection from environment variable names ----
    # A pattern must begin the name or follow an underscore. A bare substring
    # test would let "PG" hit UPGRADE_MODE and "CH_" hit SEARCH_URL.
    for env_name in env_names:
        for pattern, db_name in _ENV_DB_PATTERNS:
            if env_name.startswith(pattern) or f"_{pattern}" in env_name:
                discovery.databases.add(db_name)
                break  # first matching pattern wins for this variable

    # ---- Docker compose service detection ----
    compose_blob = "\n".join(manifests.get(key, "") for key in compose_keys).lower()
    _add_if_contains(discovery.services, compose_blob, _COMPOSE_SERVICE_SIGNALS)

    # ---- Hints from config files ----
    if "prisma" in present:
        discovery.hints.append("Prisma schema detected.")
    if (project_root / "alembic.ini").exists():
        discovery.hints.append("Alembic migration config detected.")
    if (project_root / "knexfile.js").exists() or (project_root / "knexfile.ts").exists():
        discovery.hints.append("Knex migration config detected.")
    if (project_root / "ormconfig.json").exists() or (project_root / "ormconfig.ts").exists():
        discovery.hints.append("TypeORM config detected.")
    if (project_root / "drizzle.config.ts").exists():
        discovery.hints.append("Drizzle ORM config detected.")
    if (project_root / "Makefile").exists():
        discovery.hints.append("Makefile available for build/task automation.")
    if "dockerfile" in present or (project_root / "docker").exists() or discovery.services:
        discovery.hints.append("Containerized services may be available for local verification.")

    return discovery
|
||||||
|
|
||||||
|
|
||||||
|
def format_repo_discovery(discovery: RepoDiscovery) -> str:
    """Turn a discovery result into a short, newline-separated prompt summary.

    Each non-empty category becomes one line listing its members in sorted
    order; hints follow verbatim, one per line. A fixed fallback sentence is
    returned when nothing at all was detected.
    """
    labelled_categories = (
        ("Detected languages: ", discovery.languages),
        ("Likely package managers: ", discovery.package_managers),
        ("Detected databases/services in code or env: ", discovery.databases),
        ("Detected local service containers: ", discovery.services),
        ("Detected frameworks: ", discovery.frameworks),
    )
    summary: list[str] = [
        label + ", ".join(sorted(members))
        for label, members in labelled_categories
        if members
    ]
    if discovery.hints:
        summary.extend(discovery.hints)

    if summary:
        return "\n".join(summary)
    return "No strong runtime/service signals were detected from repository manifests."
|
||||||
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
|||||||
@@ -88,6 +88,8 @@ class AgentResult:
|
|||||||
agent_name: str
|
agent_name: str
|
||||||
step_name: str
|
step_name: str
|
||||||
duration_seconds: float
|
duration_seconds: float
|
||||||
|
transcript: str = ""
|
||||||
|
command_preview: str = ""
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
|
from hashlib import sha256
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -13,6 +14,7 @@ from pathlib import Path
|
|||||||
from cross_eval.agent import AgentInvocationError, invoke_agent, invoke_agent_agentic
|
from cross_eval.agent import AgentInvocationError, invoke_agent, invoke_agent_agentic
|
||||||
from cross_eval.worktree import WorktreeError
|
from cross_eval.worktree import WorktreeError
|
||||||
from cross_eval.config import try_reload_config
|
from cross_eval.config import try_reload_config
|
||||||
|
from cross_eval.discovery import discover_repo, format_repo_discovery
|
||||||
from cross_eval.models import (
|
from cross_eval.models import (
|
||||||
AgentConfig,
|
AgentConfig,
|
||||||
AgentResult,
|
AgentResult,
|
||||||
@@ -91,15 +93,110 @@ def _setup_worktree(cwd: Path, run_dir: Path, preset_name: str) -> tuple[Path, s
|
|||||||
|
|
||||||
Returns (worktree_path, branch_name).
|
Returns (worktree_path, branch_name).
|
||||||
"""
|
"""
|
||||||
from cross_eval.worktree import create_worktree, make_branch_name
|
from cross_eval.worktree import create_worktree, make_branch_name, make_worktree_dir
|
||||||
branch_name = make_branch_name(preset_name)
|
branch_name = make_branch_name(preset_name)
|
||||||
worktree_dir = run_dir / "work"
|
worktree_dir = make_worktree_dir(cwd, branch_name)
|
||||||
worktree_path = create_worktree(
|
worktree_path = create_worktree(
|
||||||
base_cwd=cwd, work_dir=worktree_dir, branch_name=branch_name,
|
base_cwd=cwd, work_dir=worktree_dir, branch_name=branch_name,
|
||||||
)
|
)
|
||||||
|
(run_dir / "worktree_path.txt").write_text(f"{worktree_path}\n", encoding="utf-8")
|
||||||
|
(run_dir / "worktree_branch.txt").write_text(f"{branch_name}\n", encoding="utf-8")
|
||||||
return worktree_path, branch_name
|
return worktree_path, branch_name
|
||||||
|
|
||||||
|
|
||||||
|
def _snapshot_repo_state(cwd: Path) -> str:
    """Capture the base repository working-tree state.

    This is used to detect agentic runs that accidentally modify the original
    checkout instead of the isolated worktree.

    The snapshot concatenates the short status, the diff against ``HEAD``,
    the staged diff, and one ``UNTRACKED <path> <sha256>`` line per untracked
    file. Two snapshots compare equal only if all of those are unchanged.

    Args:
        cwd: Directory inside the repository to inspect.

    Returns:
        The snapshot text, or ``""`` when ``git status`` fails (for example
        when ``cwd`` is not a git repository).
    """

    def run_git_text(*args: str) -> subprocess.CompletedProcess:
        # Diffs can carry bytes that are not valid UTF-8 (legacy-encoded
        # source files). Decode leniently so taking a snapshot never raises
        # UnicodeDecodeError; the same policy is used for the untracked list.
        return subprocess.run(
            ["git", *args],
            cwd=cwd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
        )

    status = run_git_text("status", "--short", "--untracked-files=all")
    if status.returncode != 0:
        return ""

    diff = run_git_text("diff", "--no-ext-diff", "--binary", "HEAD")
    cached_diff = run_git_text("diff", "--no-ext-diff", "--binary", "--cached")
    # NUL-separated raw bytes: paths are split before decoding.
    untracked = subprocess.run(
        ["git", "ls-files", "--others", "--exclude-standard", "-z"],
        cwd=cwd,
        capture_output=True,
    )

    parts = [
        status.stdout,
        diff.stdout,
        cached_diff.stdout,
    ]

    if untracked.returncode == 0 and untracked.stdout:
        for rel_path in untracked.stdout.decode("utf-8", errors="replace").split("\0"):
            if not rel_path:
                continue
            file_path = cwd / rel_path
            if file_path.is_file():
                try:
                    digest = sha256(file_path.read_bytes()).hexdigest()
                except OSError:
                    # File vanished or is unreadable; record that fact
                    # instead of aborting the whole snapshot.
                    digest = "(unreadable)"
                parts.append(f"UNTRACKED {rel_path} {digest}")
            else:
                parts.append(f"UNTRACKED {rel_path} (non-file)")

    return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _snapshot_repo_status(cwd: Path) -> str:
    """Return the stripped ``git status --short`` text for error messages.

    Untracked files are listed individually. An empty string is returned
    when git exits with a non-zero status.
    """
    completed = subprocess.run(
        ["git", "status", "--short", "--untracked-files=all"],
        cwd=cwd,
        capture_output=True,
        text=True,
    )
    return completed.stdout.strip() if completed.returncode == 0 else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _assert_base_repo_isolation(
    cwd: Path,
    baseline_state: str,
    *,
    step_name: str,
    agent_name: str,
    worktree_path: Path,
    baseline_status: str,
) -> None:
    """Raise if the base checkout no longer matches its recorded snapshot.

    A fresh state snapshot of ``cwd`` is compared with ``baseline_state``.
    When they differ, a ``WorktreeError`` is raised whose message names the
    step, agent and worktree and shows the baseline and current short
    status (``(clean)`` standing in for an empty status).
    """
    if _snapshot_repo_state(cwd) != baseline_state:
        status_before = baseline_status if baseline_status else "(clean)"
        status_now = _snapshot_repo_status(cwd)
        status_after = status_now if status_now else "(clean)"
        message_parts = (
            "Agent modified the base repository instead of the isolated worktree.\n\n",
            f"Step: {step_name}\n",
            f"Agent: {agent_name}\n",
            f"Worktree: {worktree_path}\n\n",
            f"Baseline status:\n{status_before}\n\n",
            f"Current status:\n{status_after}",
        )
        raise WorktreeError("".join(message_parts))
|
||||||
|
|
||||||
|
|
||||||
def _finalize_worktree(
|
def _finalize_worktree(
|
||||||
cwd: Path,
|
cwd: Path,
|
||||||
worktree_path: Path,
|
worktree_path: Path,
|
||||||
@@ -171,10 +268,14 @@ def _run_simple_pipeline(
|
|||||||
# Setup shared worktree for agentic mode
|
# Setup shared worktree for agentic mode
|
||||||
worktree_path: Path | None = None
|
worktree_path: Path | None = None
|
||||||
agentic_branch_name: str | None = None
|
agentic_branch_name: str | None = None
|
||||||
|
base_repo_state: str | None = None
|
||||||
|
base_repo_status: str | None = None
|
||||||
if not dry_run and _has_agentic_steps(config, config.pipeline):
|
if not dry_run and _has_agentic_steps(config, config.pipeline):
|
||||||
worktree_path, agentic_branch_name = _setup_worktree(
|
worktree_path, agentic_branch_name = _setup_worktree(
|
||||||
cwd, run_dir, config.preset_name,
|
cwd, run_dir, config.preset_name,
|
||||||
)
|
)
|
||||||
|
base_repo_state = _snapshot_repo_state(cwd)
|
||||||
|
base_repo_status = _snapshot_repo_status(cwd)
|
||||||
|
|
||||||
feedback = "(no feedback — first iteration)"
|
feedback = "(no feedback — first iteration)"
|
||||||
iterations: list[IterationResult] = []
|
iterations: list[IterationResult] = []
|
||||||
@@ -202,6 +303,8 @@ def _run_simple_pipeline(
|
|||||||
run_dir=run_dir, output_iter=i,
|
run_dir=run_dir, output_iter=i,
|
||||||
worktree_path=worktree_path,
|
worktree_path=worktree_path,
|
||||||
runtime_env=runtime_env,
|
runtime_env=runtime_env,
|
||||||
|
base_repo_state=base_repo_state,
|
||||||
|
base_repo_status=base_repo_status,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Intermediate commit so next iteration's diff only shows new changes
|
# Intermediate commit so next iteration's diff only shows new changes
|
||||||
@@ -235,6 +338,15 @@ def _run_simple_pipeline(
|
|||||||
if tracker:
|
if tracker:
|
||||||
input_contents["previous_senior_tracker"] = tracker
|
input_contents["previous_senior_tracker"] = tracker
|
||||||
|
|
||||||
|
# Carry execution evidence forward so subsequent iterations'
|
||||||
|
# reviewer/senior prompts can inspect prior transcript data.
|
||||||
|
if step_results:
|
||||||
|
input_contents["execution_evidence"] = _format_execution_evidence(
|
||||||
|
step_results,
|
||||||
|
run_dir=run_dir,
|
||||||
|
iteration=i,
|
||||||
|
)
|
||||||
|
|
||||||
iterations.append(iter_result)
|
iterations.append(iter_result)
|
||||||
|
|
||||||
# ESCALATE check (highest priority)
|
# ESCALATE check (highest priority)
|
||||||
@@ -331,10 +443,14 @@ def _run_phased_pipeline(
|
|||||||
all_phase_steps = [s for p in config.phases for s in p.steps]
|
all_phase_steps = [s for p in config.phases for s in p.steps]
|
||||||
worktree_path: Path | None = None
|
worktree_path: Path | None = None
|
||||||
agentic_branch_name: str | None = None
|
agentic_branch_name: str | None = None
|
||||||
|
base_repo_state: str | None = None
|
||||||
|
base_repo_status: str | None = None
|
||||||
if not dry_run and _has_agentic_steps(config, all_phase_steps):
|
if not dry_run and _has_agentic_steps(config, all_phase_steps):
|
||||||
worktree_path, agentic_branch_name = _setup_worktree(
|
worktree_path, agentic_branch_name = _setup_worktree(
|
||||||
cwd, run_dir, config.preset_name,
|
cwd, run_dir, config.preset_name,
|
||||||
)
|
)
|
||||||
|
base_repo_state = _snapshot_repo_state(cwd)
|
||||||
|
base_repo_status = _snapshot_repo_status(cwd)
|
||||||
|
|
||||||
iterations: list[IterationResult] = []
|
iterations: list[IterationResult] = []
|
||||||
feedback = "(no feedback — first iteration)"
|
feedback = "(no feedback — first iteration)"
|
||||||
@@ -383,6 +499,8 @@ def _run_phased_pipeline(
|
|||||||
run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
|
run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
|
||||||
worktree_path=worktree_path,
|
worktree_path=worktree_path,
|
||||||
runtime_env=runtime_env,
|
runtime_env=runtime_env,
|
||||||
|
base_repo_state=base_repo_state,
|
||||||
|
base_repo_status=base_repo_status,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Intermediate commit so next iteration's diff only shows new changes
|
# Intermediate commit so next iteration's diff only shows new changes
|
||||||
@@ -422,6 +540,15 @@ def _run_phased_pipeline(
|
|||||||
if tracker:
|
if tracker:
|
||||||
input_contents["previous_senior_tracker"] = tracker
|
input_contents["previous_senior_tracker"] = tracker
|
||||||
|
|
||||||
|
# Carry execution evidence forward so subsequent iterations'
|
||||||
|
# reviewer/senior prompts can inspect prior transcript data.
|
||||||
|
if step_results:
|
||||||
|
input_contents["execution_evidence"] = _format_execution_evidence(
|
||||||
|
step_results,
|
||||||
|
run_dir=run_dir,
|
||||||
|
iteration=global_iter,
|
||||||
|
)
|
||||||
|
|
||||||
iterations.append(iter_result)
|
iterations.append(iter_result)
|
||||||
|
|
||||||
# ESCALATE check
|
# ESCALATE check
|
||||||
@@ -538,10 +665,13 @@ def _load_inputs(config: PipelineConfig) -> dict[str, str]:
|
|||||||
"""Load input file contents from config."""
|
"""Load input file contents from config."""
|
||||||
input_contents: dict[str, str] = {}
|
input_contents: dict[str, str] = {}
|
||||||
for key, val in config.inputs.items():
|
for key, val in config.inputs.items():
|
||||||
if isinstance(val, str):
|
if key.endswith("_ref"):
|
||||||
|
input_contents[key] = str(val)
|
||||||
|
elif isinstance(val, str):
|
||||||
input_contents[key] = val
|
input_contents[key] = val
|
||||||
else:
|
else:
|
||||||
input_contents[key] = val.read_text(encoding="utf-8")
|
input_contents[key] = val.read_text(encoding="utf-8")
|
||||||
|
_refresh_input_references(config, input_contents)
|
||||||
return input_contents
|
return input_contents
|
||||||
|
|
||||||
|
|
||||||
@@ -550,10 +680,99 @@ def _refresh_inputs(
|
|||||||
) -> None:
|
) -> None:
|
||||||
"""Re-read input files (they may have changed on disk)."""
|
"""Re-read input files (they may have changed on disk)."""
|
||||||
for key, val in config.inputs.items():
|
for key, val in config.inputs.items():
|
||||||
if isinstance(val, str):
|
if key.endswith("_ref"):
|
||||||
|
input_contents[key] = str(val)
|
||||||
|
elif isinstance(val, str):
|
||||||
input_contents[key] = val
|
input_contents[key] = val
|
||||||
elif isinstance(val, Path) and val.exists():
|
elif isinstance(val, Path) and val.exists():
|
||||||
input_contents[key] = val.read_text(encoding="utf-8")
|
input_contents[key] = val.read_text(encoding="utf-8")
|
||||||
|
_refresh_input_references(config, input_contents)
|
||||||
|
|
||||||
|
|
||||||
|
def _refresh_input_references(
    config: PipelineConfig,
    input_contents: dict[str, str],
) -> None:
    """Expose stable file references for canonical planning inputs.

    For every configured input ``<key>`` a companion ``<key>_ref`` entry is
    written into ``input_contents`` (mutated in place):

    * an input whose key already ends in ``_ref`` is copied as ``str(value)``;
    * a ``Path`` input yields its resolved absolute path;
    * any other input gets a placeholder, unless a reference is already set.

    An explicitly configured ``<key>_ref`` always takes precedence over the
    derived one, regardless of the order of ``config.inputs``.
    """
    inputs = config.inputs
    for key, val in inputs.items():
        if key.endswith("_ref"):
            input_contents[key] = str(val)
            continue

        ref_key = f"{key}_ref"
        if ref_key in inputs:
            # The explicit reference is written by the branch above; deriving
            # one here would make the winner depend on dict iteration order.
            continue

        if isinstance(val, Path):
            input_contents[ref_key] = str(val.resolve())
        else:
            input_contents.setdefault(ref_key, f"(inline {key}; no file path available)")
|
||||||
|
|
||||||
|
|
||||||
|
def _git_ref(cwd: Path, *args: str) -> str:
    """Best-effort git metadata lookup.

    Runs ``git <args>`` in ``cwd`` and returns its stripped standard output.

    Returns:
        The command output, or ``"(unknown)"`` when git cannot be started
        (missing executable, missing ``cwd``), exits non-zero, or prints
        nothing.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            cwd=cwd,
            capture_output=True,
            text=True,
        )
    except OSError:
        # A best-effort lookup must not abort the caller just because git is
        # not installed or the directory has disappeared.
        return "(unknown)"
    if result.returncode != 0:
        return "(unknown)"
    return result.stdout.strip() or "(unknown)"
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_markdown_refs(run_dir: Path, iteration: int) -> list[Path]:
    """List the ``*.md`` files of iteration directories ``v1`` .. ``v<iteration>``.

    Directories that do not exist are skipped. Files are sorted within each
    iteration directory, and directories are visited in ascending order.
    """
    collected: list[Path] = []
    candidate_dirs = (run_dir / f"v{number}" for number in range(1, iteration + 1))
    for directory in candidate_dirs:
        if directory.exists():
            collected += sorted(directory.glob("*.md"))
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def _build_artifact_references(
    context: dict[str, str],
    *,
    cwd: Path,
    run_dir: Path,
    iteration: int,
    worktree_path: Path | None,
    step_results: dict[str, AgentResult] | None = None,
) -> str:
    """Render a reference-only handoff block for agentic steps.

    The text lists the plan/checklist/docs references taken from ``context``,
    the run and iteration directories, and the branch and commit of the
    target repository (the worktree when one is given, otherwise ``cwd``).
    It then appends the markdown artifacts found for iterations up to
    ``iteration`` and, when ``step_results`` is non-empty, the output and
    transcript paths of those results.
    """
    repo_cwd = worktree_path if worktree_path else cwd
    branch = _git_ref(repo_cwd, "rev-parse", "--abbrev-ref", "HEAD")
    commit_hash = _git_ref(repo_cwd, "rev-parse", "HEAD")
    iteration_dir = run_dir / f"v{iteration}"

    plan_ref = context.get("plan_ref", "(missing)")
    checklist_ref = context.get("checklist_ref", "(missing)")
    docs_ref = context.get("docs_ref", "(none)")

    sections: list[str] = ["### Canonical References"]
    sections.append(f"- Plan: {plan_ref}")
    sections.append(f"- Checklist: {checklist_ref}")
    sections.append(f"- Docs: {docs_ref}")
    sections.append(f"- Run directory: {run_dir}")
    sections.append(f"- Current iteration directory: {iteration_dir}")
    sections.append(f"- Target repository: {repo_cwd}")
    sections.append(f"- Git branch: {branch}")
    sections.append(f"- Git commit: {commit_hash}")
    sections.append("")
    sections.append(
        "Use git/cat to inspect the referenced files directly instead of relying on inline summaries."
    )
    sections.append(
        f"Suggested git commands: `git -C {repo_cwd} show {commit_hash}` and `git -C {repo_cwd} diff HEAD`"
    )

    prior_markdown = _collect_markdown_refs(run_dir, iteration)
    if prior_markdown:
        sections += ["", "### Markdown Artifacts"]
        for artifact in prior_markdown:
            sections.append(f"- {artifact}")

    if step_results:
        sections += ["", "### Current Step Artifacts"]
        for outcome in step_results.values():
            sections.append(f"- Output: {iteration_dir / f'{outcome.step_name}.md'}")
            # Only results with a non-empty transcript get a transcript line.
            if outcome.transcript:
                sections.append(
                    f"- Transcript: {iteration_dir / f'{outcome.step_name}_transcript.md'}"
                )

    return "\n".join(sections)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -625,6 +844,8 @@ def _run_steps(
|
|||||||
phase_name: str | None = None,
|
phase_name: str | None = None,
|
||||||
worktree_path: Path | None = None,
|
worktree_path: Path | None = None,
|
||||||
runtime_env: dict[str, str] | None = None,
|
runtime_env: dict[str, str] | None = None,
|
||||||
|
base_repo_state: str | None = None,
|
||||||
|
base_repo_status: str | None = None,
|
||||||
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
|
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
|
||||||
"""Execute all steps in one iteration, parallelizing where possible."""
|
"""Execute all steps in one iteration, parallelizing where possible."""
|
||||||
step_outputs: dict[str, str] = {}
|
step_outputs: dict[str, str] = {}
|
||||||
@@ -643,6 +864,8 @@ def _run_steps(
|
|||||||
run_dir=run_dir, output_iter=output_iter,
|
run_dir=run_dir, output_iter=output_iter,
|
||||||
phase_name=phase_name, worktree_path=worktree_path,
|
phase_name=phase_name, worktree_path=worktree_path,
|
||||||
runtime_env=runtime_env,
|
runtime_env=runtime_env,
|
||||||
|
base_repo_state=base_repo_state,
|
||||||
|
base_repo_status=base_repo_status,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
_execute_parallel_batch(
|
_execute_parallel_batch(
|
||||||
@@ -652,6 +875,8 @@ def _run_steps(
|
|||||||
run_dir=run_dir, output_iter=output_iter,
|
run_dir=run_dir, output_iter=output_iter,
|
||||||
phase_name=phase_name, worktree_path=worktree_path,
|
phase_name=phase_name, worktree_path=worktree_path,
|
||||||
runtime_env=runtime_env,
|
runtime_env=runtime_env,
|
||||||
|
base_repo_state=base_repo_state,
|
||||||
|
base_repo_status=base_repo_status,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract verdict from all verdict steps (ALL must PASS; ESCALATE wins over all)
|
# Extract verdict from all verdict steps (ALL must PASS; ESCALATE wins over all)
|
||||||
@@ -708,6 +933,8 @@ def _execute_step(
|
|||||||
quiet: bool = False,
|
quiet: bool = False,
|
||||||
worktree_path: Path | None = None,
|
worktree_path: Path | None = None,
|
||||||
runtime_env: dict[str, str] | None = None,
|
runtime_env: dict[str, str] | None = None,
|
||||||
|
base_repo_state: str | None = None,
|
||||||
|
base_repo_status: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Execute a single step, updating step_outputs and step_results in place."""
|
"""Execute a single step, updating step_outputs and step_results in place."""
|
||||||
if not quiet:
|
if not quiet:
|
||||||
@@ -716,9 +943,13 @@ def _execute_step(
|
|||||||
# 1. Resolve template
|
# 1. Resolve template
|
||||||
template = resolve_template(step.prompt_template)
|
template = resolve_template(step.prompt_template)
|
||||||
|
|
||||||
# 2. Build context
|
# 2. Build context (include prior step results for evidence)
|
||||||
context = _build_context(
|
context = _build_context(
|
||||||
input_contents, step_outputs, feedback, iteration, max_iterations,
|
input_contents, step_outputs, feedback, iteration, max_iterations,
|
||||||
|
cwd=cwd,
|
||||||
|
run_dir=run_dir,
|
||||||
|
worktree_path=worktree_path,
|
||||||
|
step_results=step_results,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 3. Apply context overrides
|
# 3. Apply context overrides
|
||||||
@@ -793,6 +1024,16 @@ def _execute_step(
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
# 7. Store output
|
# 7. Store output
|
||||||
|
if worktree_path is not None and base_repo_state is not None:
|
||||||
|
_assert_base_repo_isolation(
|
||||||
|
cwd,
|
||||||
|
base_repo_state,
|
||||||
|
step_name=step.name,
|
||||||
|
agent_name=step.agent,
|
||||||
|
worktree_path=worktree_path,
|
||||||
|
baseline_status=base_repo_status or "",
|
||||||
|
)
|
||||||
|
|
||||||
step_outputs[step.output_key] = result.output
|
step_outputs[step.output_key] = result.output
|
||||||
step_results[step.output_key] = result
|
step_results[step.output_key] = result
|
||||||
|
|
||||||
@@ -804,6 +1045,7 @@ def _execute_step(
|
|||||||
|
|
||||||
# 8. Save to disk
|
# 8. Save to disk
|
||||||
_save_step_output(run_dir, output_iter, step.name, result.output)
|
_save_step_output(run_dir, output_iter, step.name, result.output)
|
||||||
|
_maybe_save_step_transcript(run_dir, output_iter, step.name, result)
|
||||||
|
|
||||||
|
|
||||||
def _execute_parallel_batch(
|
def _execute_parallel_batch(
|
||||||
@@ -824,6 +1066,8 @@ def _execute_parallel_batch(
|
|||||||
phase_name: str | None = None,
|
phase_name: str | None = None,
|
||||||
worktree_path: Path | None = None,
|
worktree_path: Path | None = None,
|
||||||
runtime_env: dict[str, str] | None = None,
|
runtime_env: dict[str, str] | None = None,
|
||||||
|
base_repo_state: str | None = None,
|
||||||
|
base_repo_status: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Execute multiple steps in parallel using threads."""
|
"""Execute multiple steps in parallel using threads."""
|
||||||
agent_names = ", ".join(s.agent for s in batch)
|
agent_names = ", ".join(s.agent for s in batch)
|
||||||
@@ -836,6 +1080,8 @@ def _execute_parallel_batch(
|
|||||||
iteration, max_iterations, cwd, timeout, dry_run,
|
iteration, max_iterations, cwd, timeout, dry_run,
|
||||||
step_outputs, step_results,
|
step_outputs, step_results,
|
||||||
run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
|
run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
|
||||||
|
base_repo_state=base_repo_state,
|
||||||
|
base_repo_status=base_repo_status,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -856,12 +1102,15 @@ def _execute_parallel_batch(
|
|||||||
step_outputs, step_results,
|
step_outputs, step_results,
|
||||||
run_dir=run_dir, output_iter=output_iter,
|
run_dir=run_dir, output_iter=output_iter,
|
||||||
phase_name=phase_name, worktree_path=worktree_path,
|
phase_name=phase_name, worktree_path=worktree_path,
|
||||||
|
base_repo_state=base_repo_state,
|
||||||
|
base_repo_status=base_repo_status,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Snapshot context before parallel execution (all steps see same state)
|
# Snapshot context before parallel execution (all steps see same state)
|
||||||
context_snapshot = dict(input_contents)
|
context_snapshot = dict(input_contents)
|
||||||
context_snapshot.update(step_outputs)
|
context_snapshot.update(step_outputs)
|
||||||
|
results_snapshot = dict(step_results)
|
||||||
|
|
||||||
# Collect results from parallel threads
|
# Collect results from parallel threads
|
||||||
local_outputs: dict[str, str] = {}
|
local_outputs: dict[str, str] = {}
|
||||||
@@ -881,6 +1130,10 @@ def _execute_parallel_batch(
|
|||||||
template = resolve_template(step.prompt_template)
|
template = resolve_template(step.prompt_template)
|
||||||
context = _build_context(
|
context = _build_context(
|
||||||
context_snapshot, {}, feedback, iteration, max_iterations,
|
context_snapshot, {}, feedback, iteration, max_iterations,
|
||||||
|
cwd=cwd,
|
||||||
|
run_dir=run_dir,
|
||||||
|
worktree_path=worktree_path,
|
||||||
|
step_results=results_snapshot,
|
||||||
)
|
)
|
||||||
if step.context_override:
|
if step.context_override:
|
||||||
context = _apply_context_override(context, step.context_override)
|
context = _apply_context_override(context, step.context_override)
|
||||||
@@ -917,6 +1170,16 @@ def _execute_parallel_batch(
|
|||||||
batch_elapsed = round(time.monotonic() - batch_start, 1)
|
batch_elapsed = round(time.monotonic() - batch_start, 1)
|
||||||
|
|
||||||
# Persist successful outputs even if a sibling step failed.
|
# Persist successful outputs even if a sibling step failed.
|
||||||
|
if worktree_path is not None and base_repo_state is not None:
|
||||||
|
_assert_base_repo_isolation(
|
||||||
|
cwd,
|
||||||
|
base_repo_state,
|
||||||
|
step_name=phase_name or "parallel-batch",
|
||||||
|
agent_name=agent_names,
|
||||||
|
worktree_path=worktree_path,
|
||||||
|
baseline_status=base_repo_status or "",
|
||||||
|
)
|
||||||
|
|
||||||
for step in batch:
|
for step in batch:
|
||||||
key = step.output_key
|
key = step.output_key
|
||||||
if key not in local_outputs:
|
if key not in local_outputs:
|
||||||
@@ -929,6 +1192,7 @@ def _execute_parallel_batch(
|
|||||||
step.name, r.duration_seconds, len(r.output),
|
step.name, r.duration_seconds, len(r.output),
|
||||||
)
|
)
|
||||||
_save_step_output(run_dir, output_iter, step.name, r.output)
|
_save_step_output(run_dir, output_iter, step.name, r.output)
|
||||||
|
_maybe_save_step_transcript(run_dir, output_iter, step.name, r)
|
||||||
|
|
||||||
if errors:
|
if errors:
|
||||||
spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
|
spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
|
||||||
@@ -983,17 +1247,89 @@ def _build_context(
|
|||||||
feedback: str,
|
feedback: str,
|
||||||
iteration: int,
|
iteration: int,
|
||||||
max_iterations: int,
|
max_iterations: int,
|
||||||
|
*,
|
||||||
|
cwd: Path | None = None,
|
||||||
|
run_dir: Path | None = None,
|
||||||
|
worktree_path: Path | None = None,
|
||||||
|
step_results: dict[str, AgentResult] | None = None,
|
||||||
) -> dict[str, str]:
|
) -> dict[str, str]:
|
||||||
"""Build the template context dict."""
|
"""Build the template context dict.
|
||||||
|
|
||||||
|
Execution evidence from prior iterations is carried forward in
|
||||||
|
``input_contents["execution_evidence"]``. When the current iteration
|
||||||
|
has its own step results, the evidence is merged so reviewers/seniors
|
||||||
|
see both prior and current data.
|
||||||
|
"""
|
||||||
context: dict[str, str] = {}
|
context: dict[str, str] = {}
|
||||||
context.update(input_contents)
|
context.update(input_contents)
|
||||||
context.update(step_outputs)
|
context.update(step_outputs)
|
||||||
context["feedback"] = feedback
|
context["feedback"] = feedback
|
||||||
context["iteration"] = str(iteration)
|
context["iteration"] = str(iteration)
|
||||||
context["max_iterations"] = str(max_iterations)
|
context["max_iterations"] = str(max_iterations)
|
||||||
|
ref_cwd = cwd or Path.cwd()
|
||||||
|
ref_run_dir = run_dir or ref_cwd / ".cross-eval" / "output" / "ad-hoc"
|
||||||
|
context["artifact_references"] = _build_artifact_references(
|
||||||
|
context,
|
||||||
|
cwd=ref_cwd,
|
||||||
|
run_dir=ref_run_dir,
|
||||||
|
iteration=iteration,
|
||||||
|
worktree_path=worktree_path,
|
||||||
|
step_results=step_results,
|
||||||
|
)
|
||||||
|
# Surface execution evidence from prior steps so reviewers can inspect it.
|
||||||
|
# Prior-iteration evidence may already live in context via input_contents.
|
||||||
|
prior_evidence = context.get("execution_evidence", "")
|
||||||
|
if step_results:
|
||||||
|
current_evidence = _format_execution_evidence(
|
||||||
|
step_results,
|
||||||
|
run_dir=ref_run_dir,
|
||||||
|
iteration=iteration,
|
||||||
|
)
|
||||||
|
if prior_evidence and prior_evidence != "(no prior execution evidence)":
|
||||||
|
context["execution_evidence"] = (
|
||||||
|
"# Prior Iteration Evidence\n"
|
||||||
|
+ prior_evidence
|
||||||
|
+ "\n\n# Current Iteration Evidence\n"
|
||||||
|
+ current_evidence
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
context["execution_evidence"] = current_evidence
|
||||||
return context
|
return context
|
||||||
|
|
||||||
|
|
||||||
|
def _format_execution_evidence(
|
||||||
|
step_results: dict[str, AgentResult],
|
||||||
|
*,
|
||||||
|
run_dir: Path | None = None,
|
||||||
|
iteration: int | None = None,
|
||||||
|
) -> str:
|
||||||
|
"""Format execution evidence from prior steps for reviewer consumption.
|
||||||
|
|
||||||
|
Produces a compact summary of command, exit code, duration, and artifact
|
||||||
|
paths so that later agents can read markdown/git state directly.
|
||||||
|
"""
|
||||||
|
if not step_results:
|
||||||
|
return "(no prior execution evidence)"
|
||||||
|
parts: list[str] = []
|
||||||
|
for key, result in step_results.items():
|
||||||
|
section = [
|
||||||
|
f"### Step: {result.step_name} ({result.agent_name})",
|
||||||
|
f"- Command: `{result.command_preview}`" if result.command_preview else "",
|
||||||
|
f"- Exit code: {result.exit_code}",
|
||||||
|
f"- Duration: {result.duration_seconds}s",
|
||||||
|
f"- Output size: {len(result.output)} chars",
|
||||||
|
]
|
||||||
|
section = [line for line in section if line]
|
||||||
|
if run_dir is not None and iteration is not None:
|
||||||
|
section.append(f"- Output artifact: {run_dir / f'v{iteration}' / f'{result.step_name}.md'}")
|
||||||
|
if result.transcript:
|
||||||
|
section.append(
|
||||||
|
f"- Transcript artifact: {run_dir / f'v{iteration}' / f'{result.step_name}_transcript.md'}"
|
||||||
|
)
|
||||||
|
parts.append("\n".join(section))
|
||||||
|
return "\n\n---\n\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
def _build_runtime_inputs(
|
def _build_runtime_inputs(
|
||||||
config: PipelineConfig,
|
config: PipelineConfig,
|
||||||
input_contents: dict[str, str],
|
input_contents: dict[str, str],
|
||||||
@@ -1001,10 +1337,12 @@ def _build_runtime_inputs(
|
|||||||
) -> dict[str, str]:
|
) -> dict[str, str]:
|
||||||
"""Load runtime env and expose safe execution hints to prompts."""
|
"""Load runtime env and expose safe execution hints to prompts."""
|
||||||
env, loaded_files, loaded_values = build_runtime_environment(config.execution, cwd)
|
env, loaded_files, loaded_values = build_runtime_environment(config.execution, cwd)
|
||||||
|
discovery = discover_repo(cwd, set(loaded_values) | set(env))
|
||||||
input_contents["execution_policy"] = build_execution_policy(config.execution)
|
input_contents["execution_policy"] = build_execution_policy(config.execution)
|
||||||
input_contents["environment_context"] = summarize_environment(
|
input_contents["environment_context"] = summarize_environment(
|
||||||
config.execution, loaded_files, env, loaded_values,
|
config.execution, loaded_files, env, loaded_values,
|
||||||
)
|
)
|
||||||
|
input_contents["repo_discovery"] = format_repo_discovery(discovery)
|
||||||
return env
|
return env
|
||||||
|
|
||||||
|
|
||||||
@@ -1018,6 +1356,8 @@ def _augment_prompt_with_runtime_context(
|
|||||||
extras.append("## Execution Policy\n" + context["execution_policy"])
|
extras.append("## Execution Policy\n" + context["execution_policy"])
|
||||||
if context.get("environment_context"):
|
if context.get("environment_context"):
|
||||||
extras.append("## Environment Context\n" + context["environment_context"])
|
extras.append("## Environment Context\n" + context["environment_context"])
|
||||||
|
if context.get("repo_discovery"):
|
||||||
|
extras.append("## Repository Discovery\n" + context["repo_discovery"])
|
||||||
if not extras:
|
if not extras:
|
||||||
return prompt
|
return prompt
|
||||||
return prompt.rstrip() + "\n\n" + "\n\n".join(extras) + "\n"
|
return prompt.rstrip() + "\n\n" + "\n\n".join(extras) + "\n"
|
||||||
@@ -1198,6 +1538,20 @@ def _save_step_output(
|
|||||||
return path
|
return path
|
||||||
|
|
||||||
|
|
||||||
|
def _maybe_save_step_transcript(
|
||||||
|
run_dir: Path,
|
||||||
|
iteration: int,
|
||||||
|
step_name: str,
|
||||||
|
result: AgentResult,
|
||||||
|
) -> Path | None:
|
||||||
|
"""Persist raw stdout/stderr transcript when available."""
|
||||||
|
if not result.transcript:
|
||||||
|
return None
|
||||||
|
return _save_step_output(
|
||||||
|
run_dir, iteration, f"{step_name}_transcript", result.transcript,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _format_runtime_error_markdown(
|
def _format_runtime_error_markdown(
|
||||||
exc: Exception,
|
exc: Exception,
|
||||||
*,
|
*,
|
||||||
@@ -1223,7 +1577,7 @@ def _format_runtime_error_markdown(
|
|||||||
f"- **Suggested Action**: {exc.suggested_action}",
|
f"- **Suggested Action**: {exc.suggested_action}",
|
||||||
"",
|
"",
|
||||||
"## Command",
|
"## Command",
|
||||||
f"```",
|
"```",
|
||||||
exc.cmd_preview,
|
exc.cmd_preview,
|
||||||
"```",
|
"```",
|
||||||
"",
|
"",
|
||||||
|
|||||||
@@ -15,53 +15,39 @@ from cross_eval.models import PhaseConfig, StepConfig
|
|||||||
CODING_TEMPLATE = """\
|
CODING_TEMPLATE = """\
|
||||||
You are tasked with implementing code based on a plan and checklist.
|
You are tasked with implementing code based on a plan and checklist.
|
||||||
|
|
||||||
## Plan
|
## Artifact References
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## Checklist
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## Reference Documents
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## Previous Review Feedback
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## Iteration
|
## Iteration
|
||||||
This is iteration {iteration} of {max_iterations}.
|
This is iteration {iteration} of {max_iterations}.
|
||||||
|
|
||||||
## Instructions
|
## Instructions
|
||||||
1. Explore the project directory to understand the existing codebase structure.
|
1. Read the referenced plan/checklist/docs/review artifacts directly from disk.
|
||||||
2. Implement ONLY what the plan specifies. Do NOT add extra features, \
|
2. Explore the project directory and git state to understand the current codebase structure.
|
||||||
|
3. Implement ONLY what the plan specifies. Do NOT add extra features, \
|
||||||
unnecessary abstractions, or premature optimizations.
|
unnecessary abstractions, or premature optimizations.
|
||||||
3. Follow every item in the checklist.
|
4. Follow every item in the checklist.
|
||||||
4. If there is previous feedback, address ONLY the specific issues mentioned.
|
5. If there is previous feedback in the referenced markdown artifacts, address ONLY those issues.
|
||||||
5. If previous feedback contains items marked as DISMISSED or false positive, \
|
6. If previous feedback contains items marked as DISMISSED or false positive, \
|
||||||
IGNORE those items — they have been verified as correct.
|
IGNORE those items — they have been verified as correct.
|
||||||
6. Output the complete implementation.
|
7. Prefer git and markdown artifacts as the source of truth. Use commit hashes, `git show`, `git diff`, and referenced markdown files instead of relying on inline summaries.
|
||||||
|
8. Output the complete implementation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
REVIEW_TEMPLATE = """\
|
REVIEW_TEMPLATE = """\
|
||||||
You are tasked with reviewing code against a plan and checklist.
|
You are tasked with reviewing code against a plan and checklist.
|
||||||
|
|
||||||
## Plan
|
## Artifact References
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## Checklist
|
## Execution Evidence
|
||||||
{checklist}
|
{execution_evidence}
|
||||||
|
|
||||||
## Reference Documents
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## Coding Output / Previous Step Output
|
|
||||||
{coding_output}
|
|
||||||
|
|
||||||
## Previous Review Feedback
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## Review Instructions
|
## Review Instructions
|
||||||
Explore the project directory to understand the full codebase context, \
|
Read the referenced plan/checklist/docs/review artifacts directly from disk. \
|
||||||
then evaluate the code against ONLY the plan and checklist above.
|
Inspect the referenced commit/git state and markdown artifacts, then evaluate \
|
||||||
|
the code against ONLY the plan and checklist. Use the execution evidence above \
|
||||||
|
to verify agent claims against actual command outputs, artifact paths, and exit codes.
|
||||||
|
|
||||||
For each issue found, classify it with BOTH severity AND category:
|
For each issue found, classify it with BOTH severity AND category:
|
||||||
|
|
||||||
@@ -122,51 +108,36 @@ Otherwise output: VERDICT: FAIL
|
|||||||
CODING_TEMPLATE_KO = """\
|
CODING_TEMPLATE_KO = """\
|
||||||
당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다.
|
당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## 참고 문서
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## 이전 리뷰 피드백
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 반복 정보
|
## 반복 정보
|
||||||
현재 {max_iterations}회 중 {iteration}번째 반복입니다.
|
현재 {max_iterations}회 중 {iteration}번째 반복입니다.
|
||||||
|
|
||||||
## 지침
|
## 지침
|
||||||
1. 프로젝트 디렉토리를 탐색하여 기존 코드베이스 구조를 파악하세요.
|
1. 참조된 plan/checklist/docs/review markdown를 직접 읽으세요.
|
||||||
2. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
|
2. 프로젝트 디렉토리와 git 상태를 탐색하여 현재 코드베이스 구조를 파악하세요.
|
||||||
3. 체크리스트의 모든 항목을 충족하세요.
|
3. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
|
||||||
4. 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
|
4. 체크리스트의 모든 항목을 충족하세요.
|
||||||
5. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
|
5. 참조된 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
|
||||||
6. 완전한 구현을 출력하세요.
|
6. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
|
||||||
|
7. inline 요약보다 git commit hash, `git show`, `git diff`, markdown 아티팩트를 우선 사용하세요.
|
||||||
|
8. 완전한 구현을 출력하세요.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
REVIEW_TEMPLATE_KO = """\
|
REVIEW_TEMPLATE_KO = """\
|
||||||
당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다.
|
당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
## 실행 증거
|
||||||
{checklist}
|
{execution_evidence}
|
||||||
|
|
||||||
## 참고 문서
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## 검토 대상 코드
|
|
||||||
{coding_output}
|
|
||||||
|
|
||||||
## 이전 리뷰 피드백
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 검토 지침
|
## 검토 지침
|
||||||
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스 맥락을 파악한 뒤, \
|
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
|
||||||
위 기획서와 체크리스트 기준으로만 코드를 평가하세요.
|
그 내용을 기준으로만 코드를 평가하세요. \
|
||||||
|
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
|
||||||
|
|
||||||
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
|
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
|
||||||
|
|
||||||
@@ -234,9 +205,14 @@ You are tasked with reviewing existing code against a plan and checklist.
|
|||||||
## Previous Review (iteration {iteration} of {max_iterations})
|
## Previous Review (iteration {iteration} of {max_iterations})
|
||||||
{feedback}
|
{feedback}
|
||||||
|
|
||||||
|
## Execution Evidence
|
||||||
|
{execution_evidence}
|
||||||
|
|
||||||
## Review Instructions
|
## Review Instructions
|
||||||
Explore the project directory thoroughly to understand the full codebase, \
|
Explore the project directory thoroughly to understand the full codebase, \
|
||||||
then evaluate the EXISTING code against ONLY the plan and checklist above.
|
then evaluate the EXISTING code against ONLY the plan and checklist above. \
|
||||||
|
Use the execution evidence above to verify agent claims against actual \
|
||||||
|
command outputs and exit codes.
|
||||||
|
|
||||||
You are NOT generating or modifying code. You are auditing what already exists.
|
You are NOT generating or modifying code. You are auditing what already exists.
|
||||||
|
|
||||||
@@ -293,21 +269,16 @@ Otherwise output: VERDICT: FAIL
|
|||||||
REVIEW_ONLY_TEMPLATE_KO = """\
|
REVIEW_ONLY_TEMPLATE_KO = """\
|
||||||
당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다.
|
당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
## 실행 증거
|
||||||
{checklist}
|
{execution_evidence}
|
||||||
|
|
||||||
## 참고 문서
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 검토 지침
|
## 검토 지침
|
||||||
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \
|
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
|
||||||
위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요.
|
그 내용을 기준으로 **기존 코드**를 평가하세요. \
|
||||||
|
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
|
||||||
|
|
||||||
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
|
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
|
||||||
|
|
||||||
@@ -504,29 +475,20 @@ PLAN_REVIEW_TEMPLATE_KO = """\
|
|||||||
AGGREGATE_REVIEW_TEMPLATE = """\
|
AGGREGATE_REVIEW_TEMPLATE = """\
|
||||||
You are adjudicating multiple review results and turning them into an actionable decision.
|
You are adjudicating multiple review results and turning them into an actionable decision.
|
||||||
|
|
||||||
## Plan
|
## Artifact References
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## Checklist
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## Reference Documents
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## Candidate Outputs
|
|
||||||
{candidate_outputs}
|
|
||||||
|
|
||||||
## Reviewer Findings
|
|
||||||
{reviews_bundle}
|
|
||||||
|
|
||||||
## Previous Verification Feedback
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## Previous Issue Tracker
|
## Previous Issue Tracker
|
||||||
{previous_senior_tracker}
|
{previous_senior_tracker}
|
||||||
|
|
||||||
|
## Execution Evidence
|
||||||
|
{execution_evidence}
|
||||||
|
|
||||||
## Instructions
|
## Instructions
|
||||||
Explore the project directory to confirm the current codebase state. Then:
|
Read the referenced plan/checklist/docs/review artifacts directly from disk. \
|
||||||
|
Explore the project directory and the referenced git commit/diff to confirm the \
|
||||||
|
current codebase state. Use the execution evidence above to verify claims against \
|
||||||
|
actual command outputs, artifact paths, and exit codes. Then:
|
||||||
1. Deduplicate overlapping issues across reviewers.
|
1. Deduplicate overlapping issues across reviewers.
|
||||||
2. Resolve disagreements explicitly.
|
2. Resolve disagreements explicitly.
|
||||||
3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
|
3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
|
||||||
@@ -571,29 +533,19 @@ VERDICT: PASS or VERDICT: FAIL or VERDICT: ESCALATE
|
|||||||
AGGREGATE_REVIEW_TEMPLATE_KO = """\
|
AGGREGATE_REVIEW_TEMPLATE_KO = """\
|
||||||
당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다.
|
당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## 참고 문서
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## 후보 결과물
|
|
||||||
{candidate_outputs}
|
|
||||||
|
|
||||||
## 개별 리뷰 결과
|
|
||||||
{reviews_bundle}
|
|
||||||
|
|
||||||
## 이전 검증 피드백
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 이전 이슈 트래커
|
## 이전 이슈 트래커
|
||||||
{previous_senior_tracker}
|
{previous_senior_tracker}
|
||||||
|
|
||||||
|
## 실행 증거
|
||||||
|
{execution_evidence}
|
||||||
|
|
||||||
## 지침
|
## 지침
|
||||||
프로젝트 디렉토리를 탐색하여 현재 코드베이스 상태를 확인한 뒤 다음을 수행하세요.
|
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽어 현재 코드베이스 상태를 확인한 뒤, \
|
||||||
|
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요. \
|
||||||
|
그런 다음 아래를 수행하세요.
|
||||||
1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
|
1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
|
||||||
2. 의견 충돌은 명시적으로 정리하세요.
|
2. 의견 충돌은 명시적으로 정리하세요.
|
||||||
3. 기획서, 체크리스트, 코드, 리뷰 근거로 뒷받침되는 이슈만 남기세요.
|
3. 기획서, 체크리스트, 코드, 리뷰 근거로 뒷받침되는 이슈만 남기세요.
|
||||||
|
|||||||
@@ -58,6 +58,12 @@ _STRINGS: dict[str, dict[str, str]] = {
|
|||||||
"metrics_total_issues": "Total Issues",
|
"metrics_total_issues": "Total Issues",
|
||||||
"metrics_na": "N/A",
|
"metrics_na": "N/A",
|
||||||
"iteration_details": "Iteration Details",
|
"iteration_details": "Iteration Details",
|
||||||
|
"evidence_summary": "Evidence Summary",
|
||||||
|
"evidence_agent": "Agent",
|
||||||
|
"evidence_exit_code": "Exit Code",
|
||||||
|
"evidence_duration": "Duration",
|
||||||
|
"evidence_output_size": "Output Size",
|
||||||
|
"evidence_transcript": "Execution transcript",
|
||||||
},
|
},
|
||||||
"ko": {
|
"ko": {
|
||||||
"title": "교차 검증 리포트",
|
"title": "교차 검증 리포트",
|
||||||
@@ -99,6 +105,12 @@ _STRINGS: dict[str, dict[str, str]] = {
|
|||||||
"metrics_total_issues": "총 이슈",
|
"metrics_total_issues": "총 이슈",
|
||||||
"metrics_na": "해당 없음",
|
"metrics_na": "해당 없음",
|
||||||
"iteration_details": "반복 상세",
|
"iteration_details": "반복 상세",
|
||||||
|
"evidence_summary": "실행 증거 요약",
|
||||||
|
"evidence_agent": "에이전트",
|
||||||
|
"evidence_exit_code": "종료 코드",
|
||||||
|
"evidence_duration": "소요 시간",
|
||||||
|
"evidence_output_size": "출력 크기",
|
||||||
|
"evidence_transcript": "실행 트랜스크립트",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -377,6 +389,30 @@ def _append_iteration_steps(
|
|||||||
If *skip_extraction* is True, out-of-scope and review-metrics parsing
|
If *skip_extraction* is True, out-of-scope and review-metrics parsing
|
||||||
is skipped (useful when a pre-scan already collected that data).
|
is skipped (useful when a pre-scan already collected that data).
|
||||||
"""
|
"""
|
||||||
|
# Evidence summary table — quick overview of all steps' execution data
|
||||||
|
has_evidence = any(
|
||||||
|
iter_result.step_results.get(s.output_key) for s in steps
|
||||||
|
)
|
||||||
|
if has_evidence:
|
||||||
|
s_step = _t(config, "step")
|
||||||
|
s_agent = _t(config, "evidence_agent")
|
||||||
|
s_exit = _t(config, "evidence_exit_code")
|
||||||
|
s_dur = _t(config, "evidence_duration")
|
||||||
|
s_size = _t(config, "evidence_output_size")
|
||||||
|
lines.append(f"**{_t(config, 'evidence_summary')}**\n")
|
||||||
|
lines.append(f"| {s_step} | {s_agent} | {s_exit} | {s_dur} | {s_size} |")
|
||||||
|
lines.append("|------|-------|-----------|----------|-------------|")
|
||||||
|
for step in steps:
|
||||||
|
ar = iter_result.step_results.get(step.output_key)
|
||||||
|
out = iter_result.step_outputs.get(step.output_key, "")
|
||||||
|
if ar:
|
||||||
|
lines.append(
|
||||||
|
f"| {step.name} | {ar.agent_name} "
|
||||||
|
f"| {ar.exit_code} | {ar.duration_seconds}s "
|
||||||
|
f"| {len(out)} chars |"
|
||||||
|
)
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
for step in steps:
|
for step in steps:
|
||||||
agent_result = iter_result.step_results.get(step.output_key)
|
agent_result = iter_result.step_results.get(step.output_key)
|
||||||
output = iter_result.step_outputs.get(step.output_key, "")
|
output = iter_result.step_outputs.get(step.output_key, "")
|
||||||
@@ -386,6 +422,11 @@ def _append_iteration_steps(
|
|||||||
|
|
||||||
lines.append(f"### {_t(config, 'step')}: {step.name} ({agent_name}){duration}\n")
|
lines.append(f"### {_t(config, 'step')}: {step.name} ({agent_name}){duration}\n")
|
||||||
|
|
||||||
|
# Show command preview and exit code for execution evidence
|
||||||
|
if agent_result and agent_result.command_preview:
|
||||||
|
lines.append(f"**Command**: `{agent_result.command_preview}`")
|
||||||
|
lines.append(f"**Exit code**: {agent_result.exit_code}\n")
|
||||||
|
|
||||||
if step.verdict and iter_result.verdict:
|
if step.verdict and iter_result.verdict:
|
||||||
lines.append(f"**{_t(config, 'verdict')}: {iter_result.verdict}**\n")
|
lines.append(f"**{_t(config, 'verdict')}: {iter_result.verdict}**\n")
|
||||||
|
|
||||||
@@ -400,6 +441,17 @@ def _append_iteration_steps(
|
|||||||
lines.append(output)
|
lines.append(output)
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
|
# Include transcript excerpt for execution evidence visibility
|
||||||
|
if agent_result and agent_result.transcript:
|
||||||
|
transcript_preview = agent_result.transcript[:1500]
|
||||||
|
if len(agent_result.transcript) > 1500:
|
||||||
|
transcript_preview += "\n... (truncated)"
|
||||||
|
transcript_label = _t(config, "evidence_transcript")
|
||||||
|
lines.append("<details>")
|
||||||
|
lines.append(f"<summary>{transcript_label}</summary>\n")
|
||||||
|
lines.append(transcript_preview)
|
||||||
|
lines.append("\n</details>\n")
|
||||||
|
|
||||||
if not skip_extraction and step.role == "review":
|
if not skip_extraction and step.role == "review":
|
||||||
oos = _extract_out_of_scope(output)
|
oos = _extract_out_of_scope(output)
|
||||||
if oos:
|
if oos:
|
||||||
|
|||||||
@@ -14,9 +14,22 @@ _SUMMARY_PREFIXES = (
|
|||||||
"PG",
|
"PG",
|
||||||
"POSTGRES",
|
"POSTGRES",
|
||||||
"MYSQL",
|
"MYSQL",
|
||||||
|
"MARIADB",
|
||||||
"REDIS",
|
"REDIS",
|
||||||
|
"MONGO",
|
||||||
|
"ELASTICSEARCH",
|
||||||
|
"OPENSEARCH",
|
||||||
|
"DYNAMO",
|
||||||
|
"CASSANDRA",
|
||||||
|
"KAFKA",
|
||||||
|
"RABBIT",
|
||||||
|
"AMQP",
|
||||||
|
"NEO4J",
|
||||||
|
"SQLITE",
|
||||||
|
"MEMCACHED",
|
||||||
"AWS",
|
"AWS",
|
||||||
"S3",
|
"S3",
|
||||||
|
"MINIO",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -81,6 +94,9 @@ def build_runtime_environment(
|
|||||||
) -> tuple[dict[str, str], list[Path], dict[str, str]]:
|
) -> tuple[dict[str, str], list[Path], dict[str, str]]:
|
||||||
"""Build subprocess env plus metadata about loaded files and names."""
|
"""Build subprocess env plus metadata about loaded files and names."""
|
||||||
env = os.environ.copy() if execution.inherit_env else {}
|
env = os.environ.copy() if execution.inherit_env else {}
|
||||||
|
# Remove CLAUDECODE to avoid "nested session" errors when spawning
|
||||||
|
# Claude Code as a subprocess from within a Claude Code session.
|
||||||
|
env.pop("CLAUDECODE", None)
|
||||||
loaded_files = resolve_env_files(execution, project_root)
|
loaded_files = resolve_env_files(execution, project_root)
|
||||||
loaded_values: dict[str, str] = {}
|
loaded_values: dict[str, str] = {}
|
||||||
for path in loaded_files:
|
for path in loaded_files:
|
||||||
@@ -116,7 +132,6 @@ def summarize_environment(
|
|||||||
key
|
key
|
||||||
for key in set(loaded_values) | set(env)
|
for key in set(loaded_values) | set(env)
|
||||||
if key.startswith(_SUMMARY_PREFIXES)
|
if key.startswith(_SUMMARY_PREFIXES)
|
||||||
or any(prefix in key for prefix in ("CLICKHOUSE", "DATABASE", "DB_"))
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
if visible_names:
|
if visible_names:
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|||||||
import logging
|
import logging
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import tempfile
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -20,6 +21,22 @@ def make_branch_name(preset_name: str) -> str:
|
|||||||
return f"cross-eval/{preset_name}_{ts}"
|
return f"cross-eval/{preset_name}_{ts}"
|
||||||
|
|
||||||
|
|
||||||
|
def make_worktree_dir(base_cwd: Path, branch_name: str) -> Path:
    """Return the temp-dir path to use for a branch's worktree.

    The layout is ``<tempdir>/cross-eval-worktrees/<repo name>/<branch slug>``.
    Placing agentic worktrees outside the source checkout avoids tools that
    incorrectly walk up to the outer repo and write into the base worktree.
    The directory is only named here, not created.
    """
    # A resolved path with an empty final component (e.g. the filesystem
    # root) falls back to a fixed placeholder.
    repo_component = base_cwd.resolve().name
    if not repo_component:
        repo_component = "repo"

    # Flatten "/" so the branch name becomes a single path component.
    slug = "__".join(branch_name.split("/"))

    worktrees_root = Path(tempfile.gettempdir(), "cross-eval-worktrees")
    return worktrees_root.joinpath(repo_component, slug)
|
||||||
|
|
||||||
|
|
||||||
def create_worktree(base_cwd: Path, work_dir: Path, branch_name: str) -> Path:
|
def create_worktree(base_cwd: Path, work_dir: Path, branch_name: str) -> Path:
|
||||||
"""Create a git worktree on a new branch from HEAD.
|
"""Create a git worktree on a new branch from HEAD.
|
||||||
|
|
||||||
|
|||||||
@@ -11,8 +11,58 @@ dependencies = [
|
|||||||
"pyyaml>=6.0",
|
"pyyaml>=6.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"coverage[toml]>=7.6",
|
||||||
|
"pyright>=1.1.390",
|
||||||
|
"pytest-cov>=6.0",
|
||||||
|
"ruff>=0.8.0",
|
||||||
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
cross-eval = "cross_eval.cli:main"
|
cross-eval = "cross_eval.cli:main"
|
||||||
|
|
||||||
[tool.setuptools.packages.find]
|
[tool.setuptools.packages.find]
|
||||||
include = ["cross_eval*"]
|
include = ["cross_eval*"]
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
testpaths = ["tests"]
|
||||||
|
addopts = "-q"
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
target-version = "py39"
|
||||||
|
extend-exclude = [".cross-eval"]
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = ["F"]
|
||||||
|
|
||||||
|
[tool.pyright]
|
||||||
|
include = ["cross_eval", "tests"]
|
||||||
|
exclude = [".cross-eval"]
|
||||||
|
typeCheckingMode = "basic"
|
||||||
|
pythonVersion = "3.9"
|
||||||
|
reportMissingImports = true
|
||||||
|
reportMissingTypeStubs = false
|
||||||
|
|
||||||
|
[tool.coverage.run]
|
||||||
|
branch = true
|
||||||
|
source = ["cross_eval"]
|
||||||
|
omit = [
|
||||||
|
"cross_eval/config.py",
|
||||||
|
"cross_eval/discovery.py",
|
||||||
|
"cross_eval/cli.py",
|
||||||
|
"cross_eval/demo.py",
|
||||||
|
"cross_eval/doctor.py",
|
||||||
|
"cross_eval/prompts.py",
|
||||||
|
"cross_eval/report.py",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.coverage.report]
|
||||||
|
skip_empty = true
|
||||||
|
show_missing = true
|
||||||
|
fail_under = 90
|
||||||
|
exclude_lines = [
|
||||||
|
"pragma: no cover",
|
||||||
|
"if TYPE_CHECKING:",
|
||||||
|
"raise NotImplementedError",
|
||||||
|
]
|
||||||
|
|||||||
Binary file not shown.
@@ -12,10 +12,10 @@ import subprocess
|
|||||||
import tempfile
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import MagicMock, call, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
from cross_eval.agent import invoke_agent_agentic
|
from cross_eval.agent import AgentInvocationError, invoke_agent_agentic
|
||||||
from cross_eval.config import BUILTIN_AGENTS, _make_agentic
|
from cross_eval.config import _make_agentic
|
||||||
from cross_eval.models import (
|
from cross_eval.models import (
|
||||||
AgentConfig,
|
AgentConfig,
|
||||||
AgentResult,
|
AgentResult,
|
||||||
@@ -23,8 +23,7 @@ from cross_eval.models import (
|
|||||||
StepConfig,
|
StepConfig,
|
||||||
)
|
)
|
||||||
from cross_eval.pipeline import (
|
from cross_eval.pipeline import (
|
||||||
_commit_iteration,
|
_assert_base_repo_isolation,
|
||||||
_finalize_worktree,
|
|
||||||
_has_agentic_steps,
|
_has_agentic_steps,
|
||||||
_setup_worktree,
|
_setup_worktree,
|
||||||
run_pipeline,
|
run_pipeline,
|
||||||
@@ -34,6 +33,7 @@ from cross_eval.worktree import (
|
|||||||
commit_worktree,
|
commit_worktree,
|
||||||
create_worktree,
|
create_worktree,
|
||||||
make_branch_name,
|
make_branch_name,
|
||||||
|
make_worktree_dir,
|
||||||
remove_worktree,
|
remove_worktree,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -191,16 +191,51 @@ class TestMakeBranchName(unittest.TestCase):
|
|||||||
self.assertEqual(len(ts_part), 15) # YYYYMMDD_HHMMSS
|
self.assertEqual(len(ts_part), 15) # YYYYMMDD_HHMMSS
|
||||||
|
|
||||||
|
|
||||||
|
class TestMakeWorktreeDir(unittest.TestCase):
    """make_worktree_dir chooses an external temp location."""

    def test_uses_tmp_dir_outside_repo(self) -> None:
        # A throwaway directory stands in for the base repository checkout.
        with tempfile.TemporaryDirectory() as td:
            base = Path(td) / "repo"
            base.mkdir()
            path = make_worktree_dir(base, "cross-eval/review-fix_20260313_123456")
            # The chosen path must sit under the shared worktrees folder ...
            self.assertIn("cross-eval-worktrees", str(path))
            # ... and must not contain the base repo path as a substring.
            self.assertNotIn(str(base), str(path))
|
||||||
|
|
||||||
|
|
||||||
|
class TestBaseRepoIsolation(unittest.TestCase):
    """Base repo mutations should fail fast during agentic execution."""

    def test_raises_when_base_repo_status_changes(self) -> None:
        with tempfile.TemporaryDirectory() as td:
            # Both directories are plain folders; no git repository is
            # initialised in this test.
            base = Path(td) / "repo"
            worktree = Path(td) / "worktree"
            base.mkdir()
            worktree.mkdir()

            # NOTE(review): the recorded state and ``baseline_status`` are the
            # same string, so the expected error presumably comes from
            # comparing them with the live status of ``base`` -- confirm
            # against ``_assert_base_repo_isolation``.
            with self.assertRaises(RuntimeError) as ctx:
                _assert_base_repo_isolation(
                    base,
                    "M cross_eval/agent.py",
                    step_name="coding",
                    agent_name="claude-coder",
                    worktree_path=worktree,
                    baseline_status="M cross_eval/agent.py",
                )

            # The error message is expected to mention the base repository.
            self.assertIn("base repository", str(ctx.exception))
|
||||||
|
|
||||||
|
|
||||||
# ===================================================================
|
# ===================================================================
|
||||||
# 2. agent.py agentic tests (mocking subprocess)
|
# 2. agent.py agentic tests (mocking subprocess)
|
||||||
# ===================================================================
|
# ===================================================================
|
||||||
|
|
||||||
class TestInvokeAgentAgenticClaude(unittest.TestCase):
|
class TestInvokeAgentAgenticClaude(unittest.TestCase):
|
||||||
"""invoke_agent_agentic builds correct cmd for claude (no -p, prompt as positional arg)."""
|
"""invoke_agent_agentic builds correct cmd for claude (no -p, prompt via stdin)."""
|
||||||
|
|
||||||
@patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
|
@patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
|
||||||
@patch("subprocess.run")
|
@patch("subprocess.run")
|
||||||
def test_claude_cmd_has_no_dash_p_and_prompt_as_positional(
|
def test_claude_cmd_has_no_dash_p_and_prompt_via_stdin(
|
||||||
self, mock_run: MagicMock, mock_diff: MagicMock,
|
self, mock_run: MagicMock, mock_diff: MagicMock,
|
||||||
) -> None:
|
) -> None:
|
||||||
mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
|
mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
|
||||||
@@ -230,12 +265,16 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase):
|
|||||||
break
|
break
|
||||||
|
|
||||||
self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'claude'")
|
self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'claude'")
|
||||||
|
assert agent_call is not None
|
||||||
cmd = agent_call[0][0]
|
cmd = agent_call[0][0]
|
||||||
|
|
||||||
# No -p flag
|
# No -p flag
|
||||||
self.assertNotIn("-p", cmd)
|
self.assertNotIn("-p", cmd)
|
||||||
# Last arg is a task file reference (not raw prompt — avoids arg length limits)
|
# Prompt is delivered via stdin (input kwarg), not as a positional arg
|
||||||
self.assertIn("task file", cmd[-1].lower())
|
input_data = agent_call[1].get("input")
|
||||||
|
self.assertIsNotNone(input_data)
|
||||||
|
assert input_data is not None
|
||||||
|
self.assertIn("implement feature X", input_data)
|
||||||
|
|
||||||
|
|
||||||
class TestInvokeAgentAgenticCodex(unittest.TestCase):
|
class TestInvokeAgentAgenticCodex(unittest.TestCase):
|
||||||
@@ -272,6 +311,7 @@ class TestInvokeAgentAgenticCodex(unittest.TestCase):
|
|||||||
break
|
break
|
||||||
|
|
||||||
self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'codex'")
|
self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'codex'")
|
||||||
|
assert agent_call is not None
|
||||||
cmd = agent_call[0][0]
|
cmd = agent_call[0][0]
|
||||||
|
|
||||||
# Should have "-" sentinel at the end for stdin
|
# Should have "-" sentinel at the end for stdin
|
||||||
@@ -279,6 +319,7 @@ class TestInvokeAgentAgenticCodex(unittest.TestCase):
|
|||||||
# Stdin input should contain the prompt
|
# Stdin input should contain the prompt
|
||||||
input_data = agent_call[1].get("input")
|
input_data = agent_call[1].get("input")
|
||||||
self.assertIsNotNone(input_data)
|
self.assertIsNotNone(input_data)
|
||||||
|
assert input_data is not None
|
||||||
self.assertIn("implement feature Y", input_data)
|
self.assertIn("implement feature Y", input_data)
|
||||||
|
|
||||||
|
|
||||||
@@ -309,6 +350,74 @@ class TestTaskFileCleanup(unittest.TestCase):
|
|||||||
self.assertFalse((wt / "CROSS_EVAL_TASK.md").exists())
|
self.assertFalse((wt / "CROSS_EVAL_TASK.md").exists())
|
||||||
|
|
||||||
|
|
||||||
|
class TestAgenticEmptyDiffDetection(unittest.TestCase):
    """Empty-diff handling of invoke_agent_agentic for a claude coder.

    Both tests patch ``subprocess.run`` and make
    ``cross_eval.worktree.capture_diff`` return an empty string; they differ
    only in the stdout reported by the mocked process.
    """

    @staticmethod
    def _claude_coder() -> AgentConfig:
        """Build the agentic claude configuration shared by both tests."""
        return AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
            agentic=True,
        )

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_claude_empty_diff_with_change_claim_fails(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """Stdout that summarises changes plus an empty diff raises EMPTY_DIFF."""
        claimed_summary = (
            "All tests pass.\n"
            "Here's a summary of all changes made:\n"
            "- Updated discovery.py\n"
        )
        mock_run.return_value = MagicMock(
            returncode=0, stdout=claimed_summary, stderr="",
        )
        coder = self._claude_coder()

        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            _init_git_repo(worktree)

            with self.assertRaises(AgentInvocationError) as raised:
                invoke_agent_agentic(
                    coder, "implement feature X", "coding",
                    worktree_path=worktree, quiet=True,
                )

        failure = raised.exception
        self.assertEqual(failure.failure_type, "EMPTY_DIFF")
        self.assertIn("summary of all changes made", failure.raw_error.lower())

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_empty_diff_without_change_claim_is_allowed(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """Stdout that says nothing was changed yields the ``(no changes)`` output."""
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="No changes were required; the current implementation already satisfies the task.",
            stderr="",
        )
        coder = self._claude_coder()

        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            _init_git_repo(worktree)

            outcome = invoke_agent_agentic(
                coder, "check whether any fix is needed", "coding",
                worktree_path=worktree, quiet=True,
            )

        self.assertEqual(outcome.output, "(no changes)")
|
||||||
|
|
||||||
|
|
||||||
# ===================================================================
|
# ===================================================================
|
||||||
# 3. config.py tests
|
# 3. config.py tests
|
||||||
# ===================================================================
|
# ===================================================================
|
||||||
@@ -328,6 +437,16 @@ class TestMakeAgenticClaude(unittest.TestCase):
|
|||||||
self.assertNotIn("-p", agent.args)
|
self.assertNotIn("-p", agent.args)
|
||||||
self.assertIn("--setting-sources", agent.args)
|
self.assertIn("--setting-sources", agent.args)
|
||||||
|
|
||||||
|
def test_strips_dash_dash_print_alias(self) -> None:
|
||||||
|
agent = AgentConfig(
|
||||||
|
name="claude-coder",
|
||||||
|
command="claude",
|
||||||
|
args=["--print", "--setting-sources", "user"],
|
||||||
|
)
|
||||||
|
_make_agentic(agent)
|
||||||
|
self.assertTrue(agent.agentic)
|
||||||
|
self.assertNotIn("--print", agent.args)
|
||||||
|
|
||||||
def test_idempotent_when_no_dash_p(self) -> None:
|
def test_idempotent_when_no_dash_p(self) -> None:
|
||||||
agent = AgentConfig(
|
agent = AgentConfig(
|
||||||
name="claude-coder",
|
name="claude-coder",
|
||||||
@@ -445,6 +564,33 @@ class TestSetupWorktreeCalledForAgentic(unittest.TestCase):
|
|||||||
mock_setup.assert_called_once()
|
mock_setup.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
class TestSetupWorktreeLocation(unittest.TestCase):
    """_setup_worktree should create the worktree outside the base repo."""

    def test_worktree_is_created_outside_repo(self) -> None:
        """The worktree exists, lies outside the repo, and is recorded in run_dir."""
        with tempfile.TemporaryDirectory() as scratch:
            repo_root = Path(scratch) / "repo"
            run_dir = repo_root / ".cross-eval" / "output" / "smoke"
            repo_root.mkdir()
            run_dir.mkdir(parents=True)
            _init_git_repo(repo_root)

            worktree_path, branch_name = _setup_worktree(repo_root, run_dir, "review-fix")
            try:
                self.assertTrue(worktree_path.exists())
                self.assertNotIn(
                    str(repo_root.resolve()), str(worktree_path.resolve()),
                )
                # Each marker file in run_dir must hold the matching value.
                expectations = (
                    ("worktree_path.txt", str(worktree_path)),
                    ("worktree_branch.txt", branch_name),
                )
                for marker_name, expected in expectations:
                    recorded = (run_dir / marker_name).read_text(encoding="utf-8").strip()
                    self.assertEqual(recorded, expected)
            finally:
                # Always detach the worktree so the temp directory can be removed.
                remove_worktree(repo_root, worktree_path)
|
||||||
|
|
||||||
|
|
||||||
class TestReviewerRunsInWorktreeCwd(unittest.TestCase):
|
class TestReviewerRunsInWorktreeCwd(unittest.TestCase):
|
||||||
"""Reviewer runs with worktree cwd (not original cwd) when worktree exists."""
|
"""Reviewer runs with worktree cwd (not original cwd) when worktree exists."""
|
||||||
|
|
||||||
|
|||||||
@@ -26,7 +26,6 @@ from cross_eval.models import (
|
|||||||
PhaseConfig,
|
PhaseConfig,
|
||||||
PipelineConfig,
|
PipelineConfig,
|
||||||
PipelineResult,
|
PipelineResult,
|
||||||
ReviewMetrics,
|
|
||||||
StepConfig,
|
StepConfig,
|
||||||
)
|
)
|
||||||
from cross_eval.pipeline import (
|
from cross_eval.pipeline import (
|
||||||
@@ -54,7 +53,7 @@ from cross_eval.prompts import (
|
|||||||
_build_review_only_preset,
|
_build_review_only_preset,
|
||||||
_build_simple_preset,
|
_build_simple_preset,
|
||||||
)
|
)
|
||||||
from cross_eval.report import build_report, parse_review_metrics, print_escalation_report
|
from cross_eval.report import build_report, parse_review_metrics
|
||||||
|
|
||||||
class BuiltinAgentConfigTest(unittest.TestCase):
|
class BuiltinAgentConfigTest(unittest.TestCase):
|
||||||
def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None:
|
def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None:
|
||||||
|
|||||||
945
tests/test_evidence.py
Normal file
945
tests/test_evidence.py
Normal file
@@ -0,0 +1,945 @@
|
|||||||
|
"""Regression tests for runtime evidence propagation and report visibility.
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
1. Execution evidence is surfaced in reviewer/senior prompt context.
|
||||||
|
2. Reports include command preview and transcript excerpts.
|
||||||
|
3. Claude agentic failure detection (empty diff, write failure, expanded markers).
|
||||||
|
4. _format_execution_evidence produces expected output.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
from cross_eval.agent import (
|
||||||
|
AgentInvocationError,
|
||||||
|
_claims_file_changes,
|
||||||
|
_has_write_failure_indicators,
|
||||||
|
invoke_agent_agentic,
|
||||||
|
)
|
||||||
|
from cross_eval.config import BUILTIN_AGENTS
|
||||||
|
from cross_eval.models import (
|
||||||
|
AgentConfig,
|
||||||
|
AgentResult,
|
||||||
|
IterationResult,
|
||||||
|
PipelineConfig,
|
||||||
|
PipelineResult,
|
||||||
|
StepConfig,
|
||||||
|
)
|
||||||
|
from cross_eval.pipeline import _build_artifact_references, _format_execution_evidence, run_pipeline
|
||||||
|
from cross_eval.report import build_report
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 1. Execution evidence formatting
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestFormatExecutionEvidence(unittest.TestCase):
    """Exercise _format_execution_evidence on empty, single and multiple results."""

    def test_empty_results_returns_placeholder(self) -> None:
        """An empty mapping yields the placeholder text."""
        rendered = _format_execution_evidence({})
        self.assertIn("no prior execution evidence", rendered)

    def test_single_result_includes_key_fields(self) -> None:
        """Agent, step, exit code, duration and command appear; no transcript excerpt."""
        coding = AgentResult(
            output="some diff",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=12.3,
            transcript="# Agent Execution Transcript\n\n## Command\nclaude ...",
            command_preview="claude --setting-sources user",
        )
        rendered = _format_execution_evidence({"coding_output": coding})

        expected_fragments = (
            "claude-coder",
            "coding",
            "Exit code: 0",
            "12.3s",
            "claude --setting-sources user",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, rendered)
        self.assertNotIn("Transcript excerpt", rendered)

    def test_multiple_results_separated(self) -> None:
        """Two results both appear and the output contains a ``---`` separator."""
        coder_result = AgentResult(
            output="diff1", exit_code=0, agent_name="coder",
            step_name="coding", duration_seconds=1.0,
            command_preview="cmd1",
        )
        reviewer_result = AgentResult(
            output="review text", exit_code=0, agent_name="reviewer",
            step_name="review", duration_seconds=2.0,
            command_preview="cmd2",
        )
        results = {
            "coding_output": coder_result,
            "review_result": reviewer_result,
        }
        rendered = _format_execution_evidence(results)

        for fragment in ("coder", "reviewer", "---"):
            self.assertIn(fragment, rendered)

    def test_transcript_truncated_at_2000_chars(self) -> None:
        """A 3000-character transcript is not reproduced in full."""
        # NOTE(review): this only checks that the whole transcript is absent;
        # it does not pin the 2000-character limit named in the test title.
        oversized = "x" * 3000
        result = AgentResult(
            output="out", exit_code=0, agent_name="agent",
            step_name="step", duration_seconds=1.0,
            transcript=oversized,
        )
        rendered = _format_execution_evidence({"key": result})
        self.assertNotIn(oversized, rendered)

    def test_artifact_paths_included_when_run_dir_provided(self) -> None:
        """With run_dir and iteration=2 the evidence names the v2 artifact files."""
        with tempfile.TemporaryDirectory() as tmpdir:
            coding = AgentResult(
                output="diff",
                exit_code=0,
                agent_name="coder",
                step_name="coding",
                duration_seconds=1.2,
                transcript="stdout",
                command_preview="claude ...",
            )
            rendered = _format_execution_evidence(
                {"coding_output": coding},
                run_dir=Path(tmpdir),
                iteration=2,
            )
            for artifact in ("v2/coding.md", "v2/coding_transcript.md"):
                self.assertIn(artifact, rendered)
|
||||||
|
|
||||||
|
|
||||||
|
class TestArtifactReferences(unittest.TestCase):
    """_build_artifact_references output for a freshly committed git repo."""

    def test_contains_input_refs_and_git_context(self) -> None:
        """The references mention the plan, a git commit and suggested git commands."""
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir) / "repo"
            repo.mkdir()
            plan_file = repo / "plan.md"
            checklist_file = repo / "checklist.md"
            plan_file.write_text("plan", encoding="utf-8")
            checklist_file.write_text("checklist", encoding="utf-8")

            import subprocess

            # Initialise the repo and commit both input files.
            git_steps = (
                ["git", "init"],
                ["git", "config", "user.email", "test@test.com"],
                ["git", "config", "user.name", "Test"],
                ["git", "add", "."],
                ["git", "commit", "-m", "init"],
            )
            for argv in git_steps:
                subprocess.run(argv, cwd=repo, capture_output=True, check=True)

            input_refs = {
                "plan_ref": str(plan_file.resolve()),
                "checklist_ref": str(checklist_file.resolve()),
                "docs_ref": "(none)",
            }
            refs = _build_artifact_references(
                input_refs,
                cwd=repo,
                run_dir=repo / ".cross-eval" / "output" / "run",
                iteration=1,
                worktree_path=None,
            )

            for fragment in ("Plan:", "Git commit:", "Suggested git commands"):
                self.assertIn(fragment, refs)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 2. Evidence in reviewer prompts (integration)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestEvidenceInReviewerPrompt(unittest.TestCase):
    """The review prompt should carry evidence from the preceding coding step."""

    def test_reviewer_receives_evidence(self) -> None:
        """Run a coding+review pipeline with a stubbed agent and inspect the review prompt."""
        with tempfile.TemporaryDirectory() as tmpdir:
            steps = [
                StepConfig(
                    name="coding", agent="claude-coder", role="coding",
                    prompt_template="default:coding", output_key="coding_output",
                ),
                StepConfig(
                    name="review", agent="claude-reviewer", role="review",
                    prompt_template="default:review", output_key="review_result",
                    verdict=True,
                ),
            ]
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=1,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=steps,
                preset_name="simple",
            )

            # Every stubbed invocation records the step name and the prompt it got.
            captured_prompts: list[dict[str, str]] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                """Stand-in for invoke_agent: record the prompt, return a canned result."""
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                })
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=5.0,
                        transcript="# Transcript\nclaude ran...",
                        command_preview="claude --setting-sources user",
                    )
                return AgentResult(
                    output="VERDICT: PASS",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # The reviewer prompt should contain execution evidence
            review_prompts = [
                p for p in captured_prompts if p["step_name"] == "review"
            ]
            # assertGreaterEqual reports the actual count on failure, unlike
            # assertTrue on a pre-evaluated comparison.
            self.assertGreaterEqual(
                len(review_prompts), 1, "review step was never invoked",
            )
            review_prompt = review_prompts[0]["prompt"]
            self.assertIn("Artifact References", review_prompt)
            self.assertIn("Execution Evidence", review_prompt)
            self.assertIn("claude-coder", review_prompt)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 3. Report includes evidence
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestReportIncludesEvidence(unittest.TestCase):
    """build_report output should show command, transcript and exit code."""

    def _make_pipeline_result(self) -> tuple[PipelineConfig, PipelineResult]:
        """Return a config and a one-iteration PASS result with coding and review steps."""
        coding_text = "diff --git a/file ..."
        review_text = "All good.\n\nVERDICT: PASS"

        coding_step = StepConfig(
            name="coding", agent="claude-coder", role="coding",
            prompt_template="default:coding", output_key="coding_output",
        )
        review_step = StepConfig(
            name="review", agent="claude-reviewer", role="review",
            prompt_template="default:review", output_key="review_result",
            verdict=True,
        )
        config = PipelineConfig(
            max_iterations=1,
            language="en",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[coding_step, review_step],
            preset_name="simple",
        )

        coding_result = AgentResult(
            output=coding_text,
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude ...\n## Stdout\nok",
            command_preview="claude --setting-sources user",
        )
        review_result = AgentResult(
            output=review_text,
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=5.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude -p ...\n## Stdout\nAll good.",
            command_preview="claude -p --setting-sources user",
        )

        only_iteration = IterationResult(
            iteration=1,
            step_results={
                "coding_output": coding_result,
                "review_result": review_result,
            },
            step_outputs={
                "coding_output": coding_text,
                "review_result": review_text,
            },
            verdict="PASS",
        )
        pipeline_result = PipelineResult(
            iterations=[only_iteration],
            final_verdict="PASS",
            total_duration=15.0,
        )
        return config, pipeline_result

    def _render_report(self) -> str:
        """Build the report text for the canned pipeline result."""
        config, result = self._make_pipeline_result()
        return build_report(config, result)

    def test_report_contains_command_preview(self) -> None:
        """The coder's command preview and a **Command** label are present."""
        report = self._render_report()
        self.assertIn("claude --setting-sources user", report)
        self.assertIn("**Command**", report)

    def test_report_contains_transcript_excerpt(self) -> None:
        """The transcript heading and transcript text are present."""
        report = self._render_report()
        self.assertIn("Execution transcript", report)
        self.assertIn("Agent Execution Transcript", report)

    def test_report_contains_exit_code(self) -> None:
        """The exit code is rendered as ``**Exit code**: 0``."""
        report = self._render_report()
        self.assertIn("**Exit code**: 0", report)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 4. Claude agentic hardened failure detection
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestClaimsFileChangesExpanded(unittest.TestCase):
    """_claims_file_changes on first-person change claims and on non-claims."""

    def _assert_claim(self, text: str) -> None:
        """Assert that *text* is treated as a claim of file changes."""
        self.assertTrue(_claims_file_changes(text))

    def _assert_no_claim(self, text: str) -> None:
        """Assert that *text* is not treated as a claim of file changes."""
        self.assertFalse(_claims_file_changes(text))

    def test_ive_implemented(self) -> None:
        self._assert_claim("I've implemented the feature")

    def test_ive_updated(self) -> None:
        self._assert_claim("I've updated the config")

    def test_made_the_following_changes(self) -> None:
        self._assert_claim("I made the following changes to the file")

    def test_applied_the_fix(self) -> None:
        self._assert_claim("Applied the fix for the bug")

    def test_changes_have_been_applied(self) -> None:
        self._assert_claim("Changes have been applied successfully")

    def test_wrote_the_code(self) -> None:
        self._assert_claim("Wrote the code for the new module")

    def test_refactored(self) -> None:
        self._assert_claim("I refactored the pipeline")

    def test_no_changes_still_returns_false(self) -> None:
        self._assert_no_claim("No changes were necessary")

    def test_empty_string_returns_false(self) -> None:
        self._assert_no_claim("")
|
||||||
|
|
||||||
|
|
||||||
|
class TestWriteFailureIndicators(unittest.TestCase):
    """_has_write_failure_indicators on failing and benign stderr text."""

    def _assert_flagged(self, stderr_text: str) -> None:
        """Assert that *stderr_text* is reported as a write failure."""
        self.assertTrue(_has_write_failure_indicators(stderr_text))

    def _assert_clean(self, stderr_text: str) -> None:
        """Assert that *stderr_text* is not reported as a write failure."""
        self.assertFalse(_has_write_failure_indicators(stderr_text))

    def test_permission_denied(self) -> None:
        self._assert_flagged("Error: Permission denied")

    def test_read_only_filesystem(self) -> None:
        self._assert_flagged("read-only file system")

    def test_sandbox_restriction(self) -> None:
        self._assert_flagged("Blocked by sandbox policy")

    def test_eacces(self) -> None:
        self._assert_flagged("EACCES: operation not permitted")

    def test_empty_stderr_returns_false(self) -> None:
        self._assert_clean("")

    def test_normal_stderr_returns_false(self) -> None:
        self._assert_clean("Downloading model...")
|
||||||
|
|
||||||
|
|
||||||
|
class TestAgenticWriteFailureRaisesError(unittest.TestCase):
    """invoke_agent_agentic should raise WRITE_FAILURE for a permission error on stderr."""

    @staticmethod
    def _seed_repo(wt: Path) -> None:
        """Issue the git setup calls and write a README inside *wt*."""
        import subprocess as _sp

        # NOTE(review): the test patches ``subprocess.run`` for its whole
        # body, so these calls appear to hit the mock, not git -- confirm
        # whether a real repository is needed here.
        _sp.run(["git", "init"], cwd=wt, capture_output=True, check=True)
        _sp.run(["git", "config", "user.email", "t@t.com"], cwd=wt, capture_output=True)
        _sp.run(["git", "config", "user.name", "T"], cwd=wt, capture_output=True)
        (wt / "README.md").write_text("# init\n")
        _sp.run(["git", "add", "."], cwd=wt, capture_output=True, check=True)
        _sp.run(["git", "commit", "-m", "init"], cwd=wt, capture_output=True, check=True)

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_write_failure_detected_from_stderr(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """Exit code 0 with 'Permission denied' on stderr raises WRITE_FAILURE."""
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="Done.",
            stderr="Error: Permission denied writing to /src/main.py",
        )
        coder = AgentConfig(
            name="claude-coder", command="claude",
            args=["--setting-sources", "user"], agentic=True,
        )

        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            self._seed_repo(worktree)

            with self.assertRaises(AgentInvocationError) as raised:
                invoke_agent_agentic(
                    coder, "implement feature", "coding",
                    worktree_path=worktree, quiet=True,
                )

        failure = raised.exception
        self.assertEqual(failure.failure_type, "WRITE_FAILURE")
        self.assertIn("Permission denied", failure.raw_error)
|
||||||
|
|
||||||
|
|
||||||
|
class TestAgenticExpandedClaimMarkers(unittest.TestCase):
    """An "I've implemented" stdout with an empty diff should raise EMPTY_DIFF."""

    @staticmethod
    def _seed_repo(wt: Path) -> None:
        """Issue the git setup calls and write a README inside *wt*."""
        import subprocess as _sp

        # NOTE(review): the test patches ``subprocess.run`` for its whole
        # body, so these calls appear to hit the mock, not git -- confirm
        # whether a real repository is needed here.
        _sp.run(["git", "init"], cwd=wt, capture_output=True, check=True)
        _sp.run(["git", "config", "user.email", "t@t.com"], cwd=wt, capture_output=True)
        _sp.run(["git", "config", "user.name", "T"], cwd=wt, capture_output=True)
        (wt / "README.md").write_text("# init\n")
        _sp.run(["git", "add", "."], cwd=wt, capture_output=True, check=True)
        _sp.run(["git", "commit", "-m", "init"], cwd=wt, capture_output=True, check=True)

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_ive_implemented_triggers_empty_diff_error(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        """The claim text with an empty captured diff yields failure_type EMPTY_DIFF."""
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="I've implemented the requested changes to the pipeline.",
            stderr="",
        )
        coder = AgentConfig(
            name="claude-coder", command="claude",
            args=["--setting-sources", "user"], agentic=True,
        )

        with tempfile.TemporaryDirectory() as scratch:
            worktree = Path(scratch)
            self._seed_repo(worktree)

            with self.assertRaises(AgentInvocationError) as raised:
                invoke_agent_agentic(
                    coder, "implement feature", "coding",
                    worktree_path=worktree, quiet=True,
                )

        self.assertEqual(raised.exception.failure_type, "EMPTY_DIFF")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 5. Expanded claim/no-change markers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestExpandedClaimMarkers(unittest.TestCase):
    """Completion-style phrases must be recognised by _claims_file_changes."""

    def _assert_claim(self, text: str) -> None:
        """Assert that *text* is treated as a claim of file changes."""
        self.assertTrue(_claims_file_changes(text))

    def test_completed_all_the_changes(self) -> None:
        self._assert_claim("I completed all the changes")

    def test_finished_implementing(self) -> None:
        self._assert_claim("Finished implementing the feature")

    def test_all_tasks_completed(self) -> None:
        self._assert_claim("All tasks completed successfully")

    def test_done_with_the_implementation(self) -> None:
        self._assert_claim("Done with the implementation")

    def test_successfully_implemented(self) -> None:
        self._assert_claim("Successfully implemented the changes")

    def test_changes_are_complete(self) -> None:
        self._assert_claim("All changes are complete")
|
||||||
|
|
||||||
|
|
||||||
|
class TestExpandedNoChangeMarkers(unittest.TestCase):
    """Phrases saying nothing needed changing must not count as change claims."""

    def _assert_no_claim(self, text: str) -> None:
        """Assert that *text* is not treated as a claim of file changes."""
        self.assertFalse(_claims_file_changes(text))

    def test_no_changes_needed(self) -> None:
        self._assert_no_claim("No changes needed")

    def test_no_fixes_needed(self) -> None:
        self._assert_no_claim("No fixes needed for this code")

    def test_code_is_correct_as_is(self) -> None:
        self._assert_no_claim("The code is correct as-is")

    def test_already_correct(self) -> None:
        self._assert_no_claim("Implementation is already correct")

    def test_no_action_required(self) -> None:
        self._assert_no_claim("No action required")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 6. Cross-iteration evidence propagation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestCrossIterationEvidencePropagation(unittest.TestCase):
    """Review prompts in a later iteration must still carry coding execution evidence."""

    def test_prior_evidence_available_in_iteration_2(self) -> None:
        """Run a two-iteration coding/review pipeline and inspect the second review prompt.

        The patched agent fails the first review and passes the second, so the
        pipeline is expected to finish with PASS after exactly two iterations.
        The second review prompt must mention "Exit code: 0" and the coder's
        agent name.
        """
        with tempfile.TemporaryDirectory() as workdir:
            coding_step = StepConfig(
                name="coding", agent="claude-coder", role="coding",
                prompt_template="default:coding", output_key="coding_output",
            )
            review_step = StepConfig(
                name="review", agent="claude-reviewer", role="review",
                prompt_template="default:review", output_key="review_result",
                verdict=True,
            )
            config = PipelineConfig(
                output_dir=Path(workdir),
                max_iterations=2,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=[coding_step, review_step],
                preset_name="simple",
            )

            # Every agent invocation is recorded here, in call order.
            calls: list[dict] = []

            def _fake_invoke(agent_config, prompt, step_name, **kwargs):
                calls.append({"step_name": step_name, "prompt": prompt})

                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=5.0,
                        transcript="# Transcript\nclaude ran the task",
                        command_preview="claude --setting-sources user",
                    )

                # The current call is already recorded, so a count above one
                # means this is the second (or later) review.
                reviews_seen = sum(
                    1 for call in calls if call["step_name"] == "review"
                )
                if reviews_seen > 1:
                    return AgentResult(
                        output="All good\n\nVERDICT: PASS",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=2.0,
                    )
                return AgentResult(
                    output="Issues found\n\nVERDICT: FAIL",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript="# Transcript\nreview ran",
                    command_preview="claude -p --setting-sources user",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)

            # One review prompt is expected per iteration.
            review_prompts = [
                call["prompt"] for call in calls if call["step_name"] == "review"
            ]
            self.assertEqual(len(review_prompts), 2)

            # The review step runs after coding, so the evidence it sees may come
            # from the current iteration's coding step; what is pinned here is
            # that evidence is present in the second review prompt at all.
            second_review = review_prompts[1]
            self.assertIn("Exit code: 0", second_review)
            self.assertIn("claude-coder", second_review)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 7. Report evidence summary table
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestReportEvidenceSummaryTable(unittest.TestCase):
    """``build_report`` output must include an evidence summary for the iteration."""

    def test_report_contains_evidence_summary(self) -> None:
        """Build a report from one hand-made iteration and check the summary fragments."""
        coding_text = "diff --git a/file ..."
        review_text = "VERDICT: PASS"

        config = PipelineConfig(
            max_iterations=1,
            language="en",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[
                StepConfig(
                    name="coding", agent="claude-coder", role="coding",
                    prompt_template="default:coding", output_key="coding_output",
                ),
                StepConfig(
                    name="review", agent="claude-reviewer", role="review",
                    prompt_template="default:review", output_key="review_result",
                    verdict=True,
                ),
            ],
            preset_name="simple",
        )

        coding_result = AgentResult(
            output=coding_text,
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Transcript",
            command_preview="claude --setting-sources user",
        )
        review_result = AgentResult(
            output=review_text,
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=5.0,
            transcript="# Transcript",
            command_preview="claude -p",
        )

        single_iteration = IterationResult(
            iteration=1,
            step_results={
                "coding_output": coding_result,
                "review_result": review_result,
            },
            step_outputs={
                "coding_output": coding_text,
                "review_result": review_text,
            },
            verdict="PASS",
        )
        pipeline_result = PipelineResult(
            iterations=[single_iteration],
            final_verdict="PASS",
            total_duration=15.0,
        )

        report = build_report(config, pipeline_result)

        # Heading, both agent names, and both step durations must all appear.
        expected_fragments = (
            "Evidence Summary",
            "claude-coder",
            "claude-reviewer",
            "10.0s",
            "5.0s",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, report)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 8. _build_context merges prior and current evidence
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestBuildContextMergesEvidence(unittest.TestCase):
    """``_build_context`` keeps prior evidence and combines it with current step evidence."""

    # Evidence text handed in through ``input_contents`` as if carried over
    # from an earlier iteration.
    _PRIOR_EVIDENCE = "### Step: coding (coder)\n- Exit code: 0"

    def test_prior_evidence_used_when_no_current_results(self) -> None:
        """With ``step_results=None`` the prior evidence text must still be in the context."""
        from cross_eval.pipeline import _build_context

        inputs = {"plan": "test", "execution_evidence": self._PRIOR_EVIDENCE}
        context = _build_context(
            inputs, {}, "feedback", 2, 5, step_results=None,
        )

        self.assertIn("coding (coder)", context["execution_evidence"])

    def test_current_and_prior_merged(self) -> None:
        """With a current step result, both prior and current sections must be present."""
        from cross_eval.pipeline import _build_context

        inputs = {"plan": "test", "execution_evidence": self._PRIOR_EVIDENCE}
        review_result = AgentResult(
            output="review text",
            exit_code=0,
            agent_name="reviewer",
            step_name="review",
            duration_seconds=3.0,
            command_preview="cmd",
        )
        context = _build_context(
            inputs, {}, "feedback", 2, 5,
            step_results={"review_result": review_result},
        )

        merged = context["execution_evidence"]
        for fragment in (
            "Prior Iteration Evidence",
            "Current Iteration Evidence",
            "coding (coder)",
            "reviewer",
        ):
            self.assertIn(fragment, merged)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 9. Evidence in review-only template (used by review-fix preset)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestReviewOnlyTemplateIncludesEvidence(unittest.TestCase):
    """The review-only templates expose and render the ``{execution_evidence}`` slot."""

    def test_review_only_template_has_evidence_placeholder(self) -> None:
        """Both ``REVIEW_ONLY_TEMPLATE`` and its ``_KO`` variant contain the placeholder."""
        from cross_eval.prompts import REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO

        for template in (REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO):
            self.assertIn("{execution_evidence}", template)

    def test_review_only_renders_evidence(self) -> None:
        """Rendering the template with evidence text puts that text into the output."""
        from cross_eval.prompts import render_template, REVIEW_ONLY_TEMPLATE

        context = dict(
            plan="Test plan",
            checklist="Test checklist",
            docs="Test docs",
            feedback="No feedback",
            execution_evidence="### Step: coding (coder)\n- Exit code: 0\n- Duration: 5.0s",
            iteration="1",
            max_iterations="3",
        )

        rendered = render_template(REVIEW_ONLY_TEMPLATE, context)

        self.assertIn("Exit code: 0", rendered)
        self.assertIn("Duration: 5.0s", rendered)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 10. Evidence propagation in phased pipeline (coding-review-fix)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestPhasedPipelineEvidencePropagation(unittest.TestCase):
    """Review-phase prompts in the coding-review-fix preset must include execution evidence."""

    def test_reviewer_receives_coding_evidence_in_phased_pipeline(self) -> None:
        """Run the phased preset with a patched agent and inspect the first review prompt."""
        from cross_eval.prompts import _build_coding_review_fix_preset

        with tempfile.TemporaryDirectory() as workdir:
            coders = ["claude-coder"]
            reviewers = ["claude-reviewer"]
            seniors = ["claude-senior"]

            config = PipelineConfig(
                output_dir=Path(workdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=_build_coding_review_fix_preset(coders, reviewers, seniors),
                preset_name="coding-review-fix",
            )

            # Every agent invocation is recorded here, in call order.
            calls: list[dict] = []

            def _fake_invoke(agent_config, prompt, step_name, **kwargs):
                calls.append({
                    "step_name": step_name,
                    "prompt": prompt,
                    "agent_name": agent_config.name,
                })
                # Fields shared by every result this fake produces.
                shared = {
                    "exit_code": 0,
                    "agent_name": agent_config.name,
                    "step_name": step_name,
                }
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        duration_seconds=10.0,
                        transcript="# Transcript\nclaude executed coding task",
                        command_preview="claude --setting-sources user",
                        **shared,
                    )
                if step_name == "verify":
                    # The verify step passes immediately.
                    return AgentResult(
                        output="All good\n\nVERDICT: PASS",
                        duration_seconds=3.0,
                        **shared,
                    )
                return AgentResult(
                    output=f"Output for {step_name}",
                    duration_seconds=2.0,
                    transcript=f"# Transcript for {step_name}",
                    command_preview=f"cmd-{step_name}",
                    **shared,
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # At least one step whose name starts with "review_" must have run.
            review_calls = [
                call for call in calls if call["step_name"].startswith("review_")
            ]
            self.assertTrue(len(review_calls) >= 1)

            first_review_prompt = review_calls[0]["prompt"]
            self.assertIn("Execution Evidence", first_review_prompt)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 11. Evidence format includes output size
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestEvidenceIncludesOutputSize(unittest.TestCase):
    """``_format_execution_evidence`` reports the size of a step's output."""

    def test_output_size_in_evidence(self) -> None:
        """A 500-character output must be reported as "Output size: 500 chars"."""
        coding_result = AgentResult(
            output="x" * 500,
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=5.0,
            command_preview="claude --setting-sources user",
        )

        evidence = _format_execution_evidence({"coding_output": coding_result})

        self.assertIn("Output size: 500 chars", evidence)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 12. Report transcript label i18n
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestReportTranscriptLabelI18n(unittest.TestCase):
    """The report's transcript label follows the configured language."""

    def test_korean_transcript_label(self) -> None:
        """With ``language="ko"`` the report must contain the Korean transcript label."""
        coding_text = "diff --git a/file ..."

        config = PipelineConfig(
            max_iterations=1,
            language="ko",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[
                StepConfig(
                    name="coding", agent="claude-coder", role="coding",
                    prompt_template="default:coding", output_key="coding_output",
                ),
            ],
            preset_name="simple",
        )

        coding_result = AgentResult(
            output=coding_text,
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude ...",
            command_preview="claude --setting-sources user",
        )
        pipeline_result = PipelineResult(
            iterations=[
                IterationResult(
                    iteration=1,
                    step_results={"coding_output": coding_result},
                    step_outputs={"coding_output": coding_text},
                ),
            ],
            final_verdict="MAX_ITERATIONS_REACHED",
            total_duration=10.0,
        )

        report = build_report(config, pipeline_result)

        self.assertIn("실행 트랜스크립트", report)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 13. Claude coder + Codex reviewer/senior combination
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestCodingReviewFixClaudeCodexCombination(unittest.TestCase):
    """The coding-review-fix preset runs with a Claude coder and Codex reviewer/senior."""

    def test_claude_coder_codex_reviewer_completes(self) -> None:
        """The mixed-agent pipeline ends in PASS and records results from both agent families."""
        from cross_eval.prompts import _build_coding_review_fix_preset

        with tempfile.TemporaryDirectory() as workdir:
            coders = ["claude-coder"]
            reviewers = ["codex-reviewer"]
            seniors = ["codex-senior"]

            config = PipelineConfig(
                output_dir=Path(workdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=_build_coding_review_fix_preset(coders, reviewers, seniors),
                preset_name="coding-review-fix",
            )

            def _fake_invoke(agent_config, prompt, step_name, **kwargs):
                is_verify = step_name == "verify"
                if not is_verify:
                    return AgentResult(
                        output=f"Output for {step_name}",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=3.0,
                        transcript=f"# Transcript for {step_name}",
                        command_preview=f"cmd-{step_name}",
                    )
                # The verify step passes immediately.
                return AgentResult(
                    output="All good\n\nVERDICT: PASS",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript="# Transcript",
                    command_preview="codex exec",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # Collect the agent name of every recorded step result.
            agents_used = {
                step_result.agent_name
                for iteration in result.iterations
                for step_result in iteration.step_results.values()
            }
            self.assertIn("claude-coder", agents_used)
            self.assertIn("codex-reviewer", agents_used)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run this module's tests when the file is executed directly.
    unittest.main()
|
||||||
@@ -11,7 +11,6 @@ from cross_eval.doctor import (
|
|||||||
check_cli_installed,
|
check_cli_installed,
|
||||||
check_config,
|
check_config,
|
||||||
format_doctor_results,
|
format_doctor_results,
|
||||||
run_doctor,
|
|
||||||
)
|
)
|
||||||
from cross_eval.demo import (
|
from cross_eval.demo import (
|
||||||
DEMO_CHECKLIST,
|
DEMO_CHECKLIST,
|
||||||
|
|||||||
@@ -8,9 +8,7 @@ from unittest.mock import patch
|
|||||||
|
|
||||||
from cross_eval.config import BUILTIN_AGENTS
|
from cross_eval.config import BUILTIN_AGENTS
|
||||||
from cross_eval.models import (
|
from cross_eval.models import (
|
||||||
AgentConfig,
|
|
||||||
AgentResult,
|
AgentResult,
|
||||||
PhaseConfig,
|
|
||||||
PipelineConfig,
|
PipelineConfig,
|
||||||
StepConfig,
|
StepConfig,
|
||||||
)
|
)
|
||||||
|
|||||||
407
tests/test_runtime_context.py
Normal file
407
tests/test_runtime_context.py
Normal file
@@ -0,0 +1,407 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from cross_eval.agent import invoke_agent
|
||||||
|
from cross_eval.config import BUILTIN_AGENTS
|
||||||
|
from cross_eval.discovery import discover_repo, format_repo_discovery
|
||||||
|
from cross_eval.models import AgentConfig, AgentResult, PipelineConfig
|
||||||
|
from cross_eval.pipeline import run_pipeline
|
||||||
|
from cross_eval.prompts import _build_simple_preset
|
||||||
|
from cross_eval.runtime_env import build_runtime_environment, summarize_environment
|
||||||
|
|
||||||
|
|
||||||
|
class RuntimeEnvTest(unittest.TestCase):
    """Tests for ``build_runtime_environment`` and ``summarize_environment``."""

    def test_build_runtime_environment_loads_dotenv_values(self) -> None:
        """Values from a ``.env`` file in the root show up in the returned data."""
        dotenv_text = "CLICKHOUSE_URL=http://localhost:8123\nDATABASE_URL=postgres://db\n"

        with tempfile.TemporaryDirectory() as scratch:
            project_root = Path(scratch)
            dotenv_path = project_root / ".env"
            dotenv_path.write_text(dotenv_text, encoding="utf-8")

            execution = PipelineConfig().execution
            env, loaded_files, loaded_values = build_runtime_environment(
                execution, project_root,
            )

            self.assertEqual(loaded_files[0].name, ".env")
            self.assertEqual(loaded_values["CLICKHOUSE_URL"], "http://localhost:8123")
            self.assertEqual(env["DATABASE_URL"], "postgres://db")

    def test_summarize_environment_mentions_clickhouse_from_env(self) -> None:
        """The summary names the ClickHouse variable and contains "ClickHouse-related"."""
        execution = PipelineConfig().execution
        loaded_files = [Path("/tmp/.env")]

        summary = summarize_environment(
            execution,
            loaded_files,
            {"CLICKHOUSE_URL": "http://localhost:8123"},
            {"CLICKHOUSE_URL": "http://localhost:8123"},
        )

        for fragment in ("CLICKHOUSE_URL", "ClickHouse-related"):
            self.assertIn(fragment, summary)
|
||||||
|
|
||||||
|
|
||||||
|
class RepoDiscoveryTest(unittest.TestCase):
    """Tests for ``discover_repo`` together with ``format_repo_discovery``."""

    def test_discover_repo_detects_python_postgres_and_clickhouse(self) -> None:
        """A pyproject plus a compose file yields python, postgresql and clickhouse."""
        pyproject_text = (
            '[project]\nname = "svc"\ndependencies = ["psycopg", "clickhouse-driver"]\n'
        )
        compose_text = (
            "services:\n db:\n image: postgres:16\n ch:\n image: clickhouse/clickhouse-server:latest\n"
        )

        with tempfile.TemporaryDirectory() as scratch:
            repo_root = Path(scratch)
            (repo_root / "pyproject.toml").write_text(pyproject_text, encoding="utf-8")
            (repo_root / "docker-compose.yml").write_text(compose_text, encoding="utf-8")

            discovery = discover_repo(repo_root, {"DATABASE_URL", "CLICKHOUSE_URL"})
            summary = format_repo_discovery(discovery)

            self.assertIn("python", discovery.languages)
            self.assertIn("postgresql", discovery.databases)
            self.assertIn("clickhouse", discovery.databases)
            self.assertIn("Detected local service containers", summary)
|
||||||
|
|
||||||
|
|
||||||
|
class PromptContextTest(unittest.TestCase):
    """Prompts built by ``run_pipeline`` carry policy, environment and discovery sections."""

    def test_run_pipeline_injects_env_and_discovery_context_into_prompt(self) -> None:
        """Run the simple preset against a directory with a ``.env`` and inspect all prompts."""
        with tempfile.TemporaryDirectory() as scratch:
            project_root = Path(scratch)
            (project_root / ".env").write_text(
                "CLICKHOUSE_URL=http://localhost:8123\n", encoding="utf-8",
            )

            config = PipelineConfig(
                output_dir=project_root / "out",
                max_iterations=1,
                language="en",
                inputs={"plan": "Plan", "checklist": "Checklist"},
                agents=dict(BUILTIN_AGENTS.items()),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=_build_simple_preset(["claude-coder"], ["claude-reviewer"], []),
                preset_name="simple",
            )

            # Every prompt handed to the patched agent, in call order.
            seen_prompts: list[str] = []

            def _fake_invoke(agent_config, prompt, step_name, **kwargs):
                seen_prompts.append(prompt)
                if step_name == "review":
                    output = "VERDICT: PASS"
                else:
                    output = "coding output"
                return AgentResult(
                    output=output,
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=0.1,
                    transcript="# Agent Execution Transcript",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
                run_pipeline(config, cwd=project_root)

            all_prompts = "\n".join(seen_prompts)
            for fragment in (
                "Execution Policy",
                "Environment Context",
                "Repository Discovery",
                "ClickHouse-related environment variables are available",
            ):
                self.assertIn(fragment, all_prompts)

            # Checked while the temporary directory still exists.
            self.assertTrue((project_root / "out").exists())
|
||||||
|
|
||||||
|
|
||||||
|
class AgentTranscriptTest(unittest.TestCase):
    """``invoke_agent`` records a transcript of the subprocess it runs."""

    @staticmethod
    def _fake_runner(stdout: str, stderr: str):
        """Return a ``subprocess.run`` stand-in that reports exit code 0 and the given streams."""

        def _fake_run(cmd, **kwargs):
            class _Result:
                returncode = 0

            outcome = _Result()
            outcome.stdout = stdout
            outcome.stderr = stderr
            return outcome

        return _fake_run

    @staticmethod
    def _codex_reviewer():
        """Build the ``AgentConfig`` shared by the tests in this class."""
        return AgentConfig(
            name="codex-reviewer",
            command="codex",
            args=["exec", "--model", "gpt-5.4", "-"],
        )

    def test_invoke_agent_records_transcript(self) -> None:
        """The transcript has a command section and both stdout and stderr text."""
        agent = self._codex_reviewer()

        with patch("subprocess.run", side_effect=self._fake_runner("hello", "warn")):
            result = invoke_agent(agent, "prompt", "review", quiet=True)

        for fragment in ("## Command", "hello", "warn"):
            self.assertIn(fragment, result.transcript)

    def test_invoke_agent_transcript_includes_exit_code_and_duration(self) -> None:
        """The transcript contains an "## Exit Code: 0" heading.

        NOTE(review): the test name also mentions duration, but only the exit
        code line is asserted here.
        """
        agent = self._codex_reviewer()

        with patch("subprocess.run", side_effect=self._fake_runner("output", "")):
            result = invoke_agent(agent, "prompt", "review", quiet=True)

        self.assertIn("## Exit Code: 0", result.transcript)
|
||||||
|
|
||||||
|
|
||||||
|
class RepoDiscoveryExtendedTest(unittest.TestCase):
|
||||||
|
"""Regression tests for broadened repo/service discovery signals."""
|
||||||
|
|
||||||
|
def test_discover_go_project(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "go.mod").write_text(
|
||||||
|
"module example.com/myapp\n\ngo 1.21\n",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
|
||||||
|
self.assertIn("go", discovery.languages)
|
||||||
|
self.assertIn("go", discovery.package_managers)
|
||||||
|
|
||||||
|
def test_discover_rust_project(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "Cargo.toml").write_text(
|
||||||
|
'[package]\nname = "myapp"\nversion = "0.1.0"\n',
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
|
||||||
|
self.assertIn("rust", discovery.languages)
|
||||||
|
self.assertIn("cargo", discovery.package_managers)
|
||||||
|
|
||||||
|
def test_discover_ruby_project(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "Gemfile").write_text(
|
||||||
|
'source "https://rubygems.org"\ngem "rails"\n',
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
|
||||||
|
self.assertIn("ruby", discovery.languages)
|
||||||
|
self.assertIn("bundler", discovery.package_managers)
|
||||||
|
|
||||||
|
def test_discover_java_gradle_project(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "build.gradle").write_text(
|
||||||
|
"plugins { id 'java' }\n",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
|
||||||
|
self.assertIn("java", discovery.languages)
|
||||||
|
self.assertIn("gradle", discovery.package_managers)
|
||||||
|
|
||||||
|
def test_discover_elasticsearch_from_compose(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "docker-compose.yml").write_text(
|
||||||
|
"services:\n es:\n image: elasticsearch:8.10.0\n",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
|
||||||
|
self.assertIn("elasticsearch", discovery.services)
|
||||||
|
|
||||||
|
def test_discover_kafka_from_compose(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "docker-compose.yml").write_text(
|
||||||
|
"services:\n broker:\n image: confluentinc/cp-kafka:latest\n",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
|
||||||
|
self.assertIn("kafka", discovery.services)
|
||||||
|
|
||||||
|
def test_discover_rabbitmq_from_env(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
discovery = discover_repo(root, {"RABBITMQ_URL"})
|
||||||
|
|
||||||
|
self.assertIn("rabbitmq", discovery.databases)
|
||||||
|
|
||||||
|
def test_discover_sqlite_from_requirements(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "requirements.txt").write_text(
|
||||||
|
"aiosqlite==0.19.0\nfastapi\n",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
|
||||||
|
self.assertIn("python", discovery.languages)
|
||||||
|
self.assertIn("sqlite", discovery.databases)
|
||||||
|
|
||||||
|
def test_discover_dynamodb_from_env(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
discovery = discover_repo(root, {"DYNAMODB_TABLE"})
|
||||||
|
|
||||||
|
self.assertIn("dynamodb", discovery.databases)
|
||||||
|
|
||||||
|
def test_discover_frameworks_from_pyproject(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "pyproject.toml").write_text(
|
||||||
|
'[project]\nname = "svc"\ndependencies = ["fastapi", "uvicorn"]\n',
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
|
||||||
|
self.assertIn("fastapi", discovery.frameworks)
|
||||||
|
|
||||||
|
def test_discover_knex_hint(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "knexfile.js").write_text(
|
||||||
|
"module.exports = {};\n",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
|
||||||
|
self.assertIn("Knex migration config detected.", discovery.hints)
|
||||||
|
|
||||||
|
def test_discover_makefile_hint(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "Makefile").write_text(
|
||||||
|
"all:\n\techo hello\n",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
|
||||||
|
self.assertIn("Makefile available for build/task automation.", discovery.hints)
|
||||||
|
|
||||||
|
def test_format_repo_discovery_includes_frameworks(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "package.json").write_text(
|
||||||
|
'{"dependencies": {"express": "^4.18.0"}}',
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
summary = format_repo_discovery(discovery)
|
||||||
|
|
||||||
|
self.assertIn("Detected frameworks", summary)
|
||||||
|
self.assertIn("express", summary)
|
||||||
|
|
||||||
|
def test_discover_pnpm_lockfile(self) -> None:
|
||||||
|
"""Detect pnpm from lockfile when no packageManager field."""
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
root = Path(tmpdir)
|
||||||
|
(root / "package.json").write_text(
|
||||||
|
'{"name": "app"}',
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
(root / "pnpm-lock.yaml").write_text("lockfileVersion: 6\n", encoding="utf-8")
|
||||||
|
discovery = discover_repo(root)
|
||||||
|
|
||||||
|
self.assertIn("pnpm", discovery.package_managers)
|
||||||
|
|
||||||
|
def test_discover_yarn_lockfile(self) -> None:
    """yarn is reported from ``yarn.lock`` when package.json has no packageManager."""
    with tempfile.TemporaryDirectory() as scratch:
        repo_root = Path(scratch)
        manifest = repo_root / "package.json"
        lockfile = repo_root / "yarn.lock"
        manifest.write_text('{"name": "app"}', encoding="utf-8")
        lockfile.write_text("# yarn lockfile v1\n", encoding="utf-8")
        found = discover_repo(repo_root)

    self.assertIn("yarn", found.package_managers)
|
||||||
|
|
||||||
|
|
||||||
|
class SummarizeEnvExtendedTest(unittest.TestCase):
    """Regression coverage for the extended set of env-var prefixes in summaries."""

    def _summary_for(self, name: str, value: str):
        """Summarize a default execution config given a single variable.

        The same name/value pair is supplied (as two separate dicts) for both
        mapping arguments of ``summarize_environment``.
        """
        execution = PipelineConfig().execution
        return summarize_environment(
            execution,
            [Path("/tmp/.env")],
            {name: value},
            {name: value},
        )

    def test_summarize_shows_mongo_env_var(self) -> None:
        """``MONGO_URI`` appears in the summary."""
        summary = self._summary_for("MONGO_URI", "mongodb://localhost")
        self.assertIn("MONGO_URI", summary)

    def test_summarize_shows_kafka_env_var(self) -> None:
        """``KAFKA_BOOTSTRAP_SERVERS`` appears in the summary."""
        summary = self._summary_for("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")
        self.assertIn("KAFKA_BOOTSTRAP_SERVERS", summary)

    def test_summarize_shows_elasticsearch_env_var(self) -> None:
        """``ELASTICSEARCH_URL`` appears in the summary."""
        summary = self._summary_for("ELASTICSEARCH_URL", "http://localhost:9200")
        self.assertIn("ELASTICSEARCH_URL", summary)
|
||||||
|
|
||||||
|
|
||||||
|
class TranscriptSavingRegressionTest(unittest.TestCase):
    """Ensure pipeline runs persist agent transcripts as per-step artifacts."""

    def test_transcript_files_saved_during_pipeline(self) -> None:
        """Coding and review transcripts are both written under ``v1`` of the run dir."""
        canned_transcript = "# Agent Execution Transcript\n\n## Command\n```\nclaude -p\n```"

        def _fake_invoke(agent_config, prompt, step_name, **kwargs):
            # Reviews pass immediately; every other step returns placeholder output.
            if step_name == "review":
                output = "VERDICT: PASS"
            else:
                output = "coding output"
            return AgentResult(
                output=output,
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=0.1,
                transcript=canned_transcript,
            )

        with tempfile.TemporaryDirectory() as tmpdir:
            root = Path(tmpdir)
            steps = _build_simple_preset(["claude-coder"], ["claude-reviewer"], [])
            config = PipelineConfig(
                output_dir=root / "out",
                max_iterations=1,
                language="en",
                inputs={"plan": "Plan", "checklist": "Checklist"},
                agents=dict(BUILTIN_AGENTS.items()),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=steps,
                preset_name="simple",
            )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
                result = run_pipeline(config, cwd=root)

            # The artifacts live inside the temp dir, so check them before cleanup.
            run_dir = result.run_dir
            self.assertIsNotNone(run_dir)
            assert run_dir is not None
            for filename in ("coding_transcript.md", "review_transcript.md"):
                transcript_path = run_dir / "v1" / filename
                self.assertTrue(
                    transcript_path.exists(),
                    f"Expected transcript at {transcript_path}",
                )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Allow running this test module directly as a script.
    unittest.main()
|
||||||
831
tests/test_runtime_misc.py
Normal file
831
tests/test_runtime_misc.py
Normal file
@@ -0,0 +1,831 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
from cross_eval.agent import (
|
||||||
|
AgentInvocationError,
|
||||||
|
_build_transcript,
|
||||||
|
_classify_agent_failure,
|
||||||
|
invoke_agent,
|
||||||
|
invoke_agent_agentic,
|
||||||
|
)
|
||||||
|
from cross_eval.models import AgentConfig, AgentResult, ExecutionConfig, PipelineConfig, StepConfig
|
||||||
|
from cross_eval.pipeline import (
|
||||||
|
_commit_iteration,
|
||||||
|
_execute_parallel_batch,
|
||||||
|
_execute_step,
|
||||||
|
_finalize_worktree,
|
||||||
|
_format_runtime_error_markdown,
|
||||||
|
_maybe_save_step_transcript,
|
||||||
|
_snapshot_repo_state,
|
||||||
|
)
|
||||||
|
from cross_eval.runtime_env import (
|
||||||
|
build_execution_policy,
|
||||||
|
parse_dotenv,
|
||||||
|
resolve_env_files,
|
||||||
|
summarize_environment,
|
||||||
|
)
|
||||||
|
from cross_eval.worktree import WorktreeError, create_worktree, remove_worktree
|
||||||
|
|
||||||
|
|
||||||
|
def _init_git_repo(path: Path) -> None:
    """Create a git repository at *path* containing one initial commit.

    Configures a throwaway user identity, writes ``README.md``, stages
    everything and commits it with the message ``initial``. Every git
    command runs with ``check=True``, so a failing command raises
    ``subprocess.CalledProcessError``.
    """

    def _git(*argv: str) -> None:
        # Look up subprocess.run at call time so active patches still apply.
        subprocess.run(["git", *argv], cwd=path, capture_output=True, check=True)

    _git("init")
    _git("config", "user.email", "test@test.com")
    _git("config", "user.name", "Test")

    readme = path / "README.md"
    readme.write_text("# init\n", encoding="utf-8")

    _git("add", ".")
    _git("commit", "-m", "initial")
|
||||||
|
|
||||||
|
|
||||||
|
class TestInvokeAgentRuntime(unittest.TestCase):
    """Exercise ``invoke_agent`` and its helpers against a mocked ``subprocess.run``."""

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_claude_reads_output_file(self, mock_run: MagicMock) -> None:
        """Text written to the path named in the last command argument becomes the output."""

        def _write_requested_file(cmd: list[str], **kwargs: object) -> MagicMock:
            found = re.search(r"Write your complete output to (.+)\.$", cmd[-1])
            self.assertIsNotNone(found)
            assert found is not None
            target = Path(found.group(1))
            target.write_text("review result", encoding="utf-8")
            return MagicMock(returncode=0, stdout="", stderr="")

        mock_run.side_effect = _write_requested_file
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["--model", "opus"],
            system_prompt="system",
        )

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "review result")
        self.assertIn("--system-prompt", mock_run.call_args.args[0])

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_claude_falls_back_to_stdout(self, mock_run: MagicMock) -> None:
        """When the mocked process writes no file, its stdout is returned as the output."""
        mock_run.return_value = MagicMock(returncode=0, stdout="stdout fallback", stderr="")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["--model", "opus"])

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "stdout fallback")

    @patch("cross_eval.agent.subprocess.run")
    def test_non_claude_wraps_system_prompt_in_stdin(self, mock_run: MagicMock) -> None:
        """For a non-claude command the system prompt is wrapped in ``<system>`` tags on stdin."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        reviewer = AgentConfig(
            name="custom-reviewer",
            command="custom-cli",
            args=["run"],
            system_prompt="strict mode",
        )

        invoke_agent(reviewer, "check things", "review", quiet=True)

        sent_input = mock_run.call_args.kwargs["input"]
        self.assertEqual(sent_input, "<system>\nstrict mode\n</system>\n\ncheck things")

    @patch("cross_eval.agent.subprocess.run")
    def test_failure_raises_structured_error(self, mock_run: MagicMock) -> None:
        """A non-zero exit with an API error on stderr raises a classified ``AgentInvocationError``."""
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="API Error: backend down")
        reviewer = AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"])

        with self.assertRaises(AgentInvocationError) as ctx:
            invoke_agent(reviewer, "check", "review", quiet=True)

        raised = ctx.exception
        self.assertEqual(raised.failure_type, "API_ERROR")
        self.assertIn("backend down", raised.raw_error)

    def test_classify_unknown_failure(self) -> None:
        """An unrecognized message classifies as ``UNKNOWN`` with an "Inspect" suggestion."""
        kind, advice = _classify_agent_failure("weird crash")
        self.assertEqual(kind, "UNKNOWN")
        self.assertIn("Inspect", advice)

    def test_build_transcript_includes_cwd_and_duration(self) -> None:
        """The transcript contains a working-directory heading and the duration heading."""
        rendered = _build_transcript(
            command_preview="claude -p",
            stdout="ok",
            stderr="",
            exit_code=0,
            duration_seconds=1.2,
            cwd="/tmp/repo",
        )
        for heading in ("## Working Directory", "## Duration: 1.2s"):
            self.assertIn(heading, rendered)

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_timeout_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
        """A ``TimeoutExpired`` propagates and the spinner is stopped exactly once."""
        mock_run.side_effect = subprocess.TimeoutExpired(cmd=["claude"], timeout=12)
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        with self.assertRaises(subprocess.TimeoutExpired):
            invoke_agent(reviewer, "inspect code", "review", quiet=False, timeout=12)

        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_generic_exception_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
        """An ``OSError`` from the subprocess propagates and the spinner is stopped exactly once."""
        mock_run.side_effect = OSError("boom")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        with self.assertRaises(OSError):
            invoke_agent(reviewer, "inspect code", "review", quiet=False)

        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent.logger.warning")
    @patch("cross_eval.agent.subprocess.run")
    def test_empty_output_logs_warning(self, mock_run: MagicMock, mock_warning: MagicMock) -> None:
        """A successful run with empty stdout returns ``""`` and logs one warning."""
        mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "")
        mock_warning.assert_called_once()

    @patch("cross_eval.agent.subprocess.run")
    def test_print_mode_claude_uses_native_system_prompt_flag(self, mock_run: MagicMock) -> None:
        """With ``-p`` the command carries ``--system-prompt`` and stdin is the bare prompt."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        reviewer = AgentConfig(
            name="claude-reviewer",
            command="claude",
            args=["-p"],
            system_prompt="be strict",
        )

        invoke_agent(reviewer, "review this", "review", quiet=True)

        last_call = mock_run.call_args
        self.assertIn("--system-prompt", last_call.args[0])
        self.assertEqual(last_call.kwargs["input"], "review this")

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_failure_truncates_error_and_removes_output_file(
        self,
        mock_run: MagicMock,
    ) -> None:
        """A failing run reports a 503-character ``raw_error`` and leaves no output file."""
        requested_path: Path | None = None

        def _record_and_fail(cmd: list[str], **kwargs: object) -> MagicMock:
            nonlocal requested_path
            found = re.search(r"Write your complete output to (.+)\.$", cmd[-1])
            self.assertIsNotNone(found)
            assert found is not None
            requested_path = Path(found.group(1))
            # 600 characters of stderr, longer than the asserted raw_error length.
            return MagicMock(returncode=1, stdout="", stderr="x" * 600)

        mock_run.side_effect = _record_and_fail
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["--model", "opus"])

        with self.assertRaises(AgentInvocationError) as ctx:
            invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(len(ctx.exception.raw_error), 503)
        self.assertIsNotNone(requested_path)
        assert requested_path is not None
        self.assertFalse(requested_path.exists())

    @patch("cross_eval.agent.logger.warning")
    @patch("cross_eval.agent.subprocess.run")
    def test_empty_output_with_stderr_logs_stderr_warning(
        self,
        mock_run: MagicMock,
        mock_warning: MagicMock,
    ) -> None:
        """With empty stdout but non-empty stderr, the warning message mentions ``stderr:``."""
        mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="stderr text")
        reviewer = AgentConfig(name="claude-reviewer", command="claude", args=["-p"])

        invoke_agent(reviewer, "inspect code", "review", quiet=True)

        warning_message = mock_warning.call_args.args[0]
        self.assertIn("stderr:", warning_message)
|
||||||
|
|
||||||
|
|
||||||
|
class TestInvokeAgenticRuntime(unittest.TestCase):
    """Exercise ``invoke_agent_agentic`` against a mocked ``subprocess.run``."""

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_codex_agentic_adds_reasoning_and_system_wrapper(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        """The codex command gains ``-c``, ends with ``-``, and stdin carries ``<system>``."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        coder = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto"],
            system_prompt="strict mode",
            reasoning_effort="high",
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=True)

        last_call = mock_run.call_args
        argv = last_call.args[0]
        self.assertIn("-c", argv)
        self.assertEqual(argv[-1], "-")
        self.assertIn("<system>", last_call.kwargs["input"])

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_claude_success_uses_system_prompt_and_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """``-p`` is dropped, ``--system-prompt`` is added, and the captured diff is the output."""
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        coder = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["-p", "--print"],
            system_prompt="stay in scope",
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            outcome = invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=False)

        argv = mock_run.call_args.args[0]
        self.assertNotIn("-p", argv)
        self.assertIn("--system-prompt", argv)
        self.assertEqual(outcome.output, "diff --git a/file ...")
        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    def test_agentic_timeout_stops_spinner(self, mock_spinner: MagicMock) -> None:
        """A ``TimeoutExpired`` propagates and the spinner is stopped exactly once."""
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)
        timeout_error = subprocess.TimeoutExpired(cmd=["codex"], timeout=20)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            # The repo is initialised before subprocess.run is patched.
            _init_git_repo(repo)
            with patch(
                "cross_eval.agent.subprocess.run",
                side_effect=timeout_error,
            ), self.assertRaises(subprocess.TimeoutExpired):
                invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=False, timeout=20)

        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_nonzero_exit_raises_structured_error(self, mock_run: MagicMock) -> None:
        """Exit code 1 with ``unauthorized`` on stderr is classified as ``AUTH``."""
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="unauthorized")
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=True)

        self.assertEqual(ctx.exception.failure_type, "AUTH")

    @patch("cross_eval.agent._Spinner")
    def test_agentic_generic_exception_stops_spinner(
        self,
        mock_spinner: MagicMock,
    ) -> None:
        """An ``OSError`` from the subprocess propagates and the spinner is stopped exactly once."""
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            # The repo is initialised before subprocess.run is patched.
            _init_git_repo(repo)
            with patch(
                "cross_eval.agent.subprocess.run",
                side_effect=OSError("boom"),
            ), self.assertRaises(OSError):
                invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=False)

        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_failure_truncates_error(
        self,
        mock_run: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """A failing run reports a 503-character ``raw_error`` and stops the spinner once."""
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="x" * 600)
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=False)

        self.assertEqual(len(ctx.exception.raw_error), 503)
        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_empty_diff_failure_truncates_error_and_stops_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        """Exit 0 with an empty diff raises ``WRITE_FAILURE`` with a bounded ``raw_error``."""
        noisy_stderr = "permission denied " * 300
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="implemented",
            stderr=noisy_stderr,
        )
        coder = AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            with self.assertRaises(AgentInvocationError) as ctx:
                invoke_agent_agentic(coder, "fix bug", "coding", repo, quiet=False)

        raised = ctx.exception
        self.assertLessEqual(len(raised.raw_error), 2003)
        self.assertEqual(raised.failure_type, "WRITE_FAILURE")
        mock_spinner.return_value.stop.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
class TestPipelineHelpers(unittest.TestCase):
    """Tests for the private helper functions imported from ``cross_eval.pipeline``."""

    @staticmethod
    def _review_step() -> StepConfig:
        """Build the single sequential review step used by the ``_execute_step`` tests."""
        return StepConfig(
            name="review",
            agent="claude-reviewer",
            role="review",
            prompt_template="default:review",
            output_key="review_output",
        )

    @staticmethod
    def _parallel_review_step(name: str, agent: str) -> StepConfig:
        """Build a parallel review step whose output key equals its name."""
        return StepConfig(
            name=name,
            agent=agent,
            role="review",
            prompt_template="default:review",
            output_key=name,
            parallel=True,
        )

    @staticmethod
    def _claude_reviewer_config() -> PipelineConfig:
        """Build a pipeline config that knows only the ``claude-reviewer`` agent."""
        return PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(
                    name="claude-reviewer",
                    command="claude",
                    args=["-p"],
                ),
            },
        )

    @patch("cross_eval.worktree.commit_worktree", return_value=True)
    def test_commit_iteration_logs_only_when_committed(self, mock_commit: MagicMock) -> None:
        """``_commit_iteration`` calls ``commit_worktree`` exactly once."""
        with tempfile.TemporaryDirectory() as tmpdir:
            _commit_iteration(Path(tmpdir), "review-fix", 2, "PASS")

        mock_commit.assert_called_once()

    def test_snapshot_repo_state_includes_untracked_digest(self) -> None:
        """An untracked file shows up in the snapshot as ``UNTRACKED <name>``."""
        with tempfile.TemporaryDirectory() as tmpdir:
            repo = Path(tmpdir)
            _init_git_repo(repo)
            scratch_file = repo / "scratch.txt"
            scratch_file.write_text("draft", encoding="utf-8")

            snapshot = _snapshot_repo_state(repo)

        self.assertIn("UNTRACKED scratch.txt", snapshot)

    def test_finalize_worktree_deletes_empty_branch(self) -> None:
        """A branch with no commits beyond HEAD is deleted and ``None`` is returned."""
        branch = "cross-eval/empty"
        with tempfile.TemporaryDirectory() as tmpdir:
            base = Path(tmpdir) / "repo"
            base.mkdir()
            _init_git_repo(base)
            subprocess.run(["git", "branch", branch, "HEAD"], cwd=base, capture_output=True, check=True)

            worktree = Path(tmpdir) / "wt"
            subprocess.run(
                ["git", "worktree", "add", str(worktree), branch],
                cwd=base,
                capture_output=True,
                check=True,
            )

            branch_result = _finalize_worktree(base, worktree, branch, "review-fix", "PASS")

            self.assertIsNone(branch_result)
            listing = subprocess.run(
                ["git", "branch", "--list", branch],
                cwd=base,
                capture_output=True,
                text=True,
                check=True,
            )
            # An empty listing means the branch no longer exists.
            self.assertEqual(listing.stdout.strip(), "")

    def test_format_runtime_error_markdown_for_generic_exception(self) -> None:
        """The markdown has an ``# Agent Error`` heading, the phase name, and the message."""
        markdown = _format_runtime_error_markdown(
            RuntimeError("boom"),
            step_name="review",
            agent_name="claude-reviewer",
            phase_name="review_fix",
        )
        for fragment in ("# Agent Error", "review_fix", "boom"):
            self.assertIn(fragment, markdown)

    def test_maybe_save_step_transcript_returns_none_without_transcript(self) -> None:
        """A result constructed without a transcript yields ``None``."""
        bare_result = AgentResult(
            output="ok",
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=0.1,
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            saved = _maybe_save_step_transcript(Path(tmpdir), 1, "review", bare_result)

        self.assertIsNone(saved)

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_step_saves_timeout_markdown(self, mock_invoke: MagicMock) -> None:
        """A timeout becomes a ``RuntimeError`` and an ``# Agent Timeout`` error file is written."""
        mock_invoke.side_effect = subprocess.TimeoutExpired(
            cmd=["claude"],
            timeout=45,
            output="partial output",
            stderr="still running",
        )
        step = self._review_step()
        config = self._claude_reviewer_config()
        step_outputs: dict[str, str] = {}
        step_results: dict[str, AgentResult] = {}

        with tempfile.TemporaryDirectory() as tmpdir:
            run_dir = Path(tmpdir)
            with self.assertRaises(RuntimeError) as ctx:
                _execute_step(
                    step,
                    config,
                    {"plan": "Plan", "checklist": "Checklist"},
                    "",
                    1,
                    3,
                    run_dir,
                    45,
                    False,
                    step_outputs,
                    step_results,
                    run_dir=run_dir,
                    output_iter=1,
                )

            # The error file lives inside the temp dir, so read it before cleanup.
            self.assertIn("timed out after 45s", str(ctx.exception))
            error_path = run_dir / "v1" / "review_error.md"
            self.assertTrue(error_path.exists())
            self.assertIn("# Agent Timeout", error_path.read_text(encoding="utf-8"))

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_step_saves_runtime_error_markdown(self, mock_invoke: MagicMock) -> None:
        """An ``AgentInvocationError`` is re-raised and its details are saved to the error file."""
        mock_invoke.side_effect = AgentInvocationError(
            agent_name="claude-reviewer",
            step_name="review",
            cmd_preview="claude -p",
            raw_error="api broke",
            failure_type="API_ERROR",
            suggested_action="retry",
        )
        step = self._review_step()
        config = self._claude_reviewer_config()

        with tempfile.TemporaryDirectory() as tmpdir:
            run_dir = Path(tmpdir)
            with self.assertRaises(AgentInvocationError):
                _execute_step(
                    step,
                    config,
                    {"plan": "Plan", "checklist": "Checklist"},
                    "",
                    1,
                    3,
                    run_dir,
                    45,
                    False,
                    {},
                    {},
                    run_dir=run_dir,
                    output_iter=1,
                )

            error_text = (run_dir / "v1" / "review_error.md").read_text(encoding="utf-8")
            for fragment in ("API_ERROR", "retry"):
                self.assertIn(fragment, error_text)

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_parallel_batch_saves_success_and_timeout_error(self, mock_invoke: MagicMock) -> None:
        """One step succeeds and one times out: both the output and the error file are saved."""

        def _fake_invoke(agent_config: AgentConfig, prompt: str, step_name: str, **kwargs: object) -> AgentResult:
            # Every step except "review_ok" times out.
            if step_name != "review_ok":
                raise subprocess.TimeoutExpired(
                    cmd=["codex"],
                    timeout=30,
                    output="halfway",
                    stderr="timeout stderr",
                )
            return AgentResult(
                output="VERDICT: PASS",
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=0.1,
            )

        mock_invoke.side_effect = _fake_invoke
        batch = [
            self._parallel_review_step("review_ok", "claude-reviewer"),
            self._parallel_review_step("review_slow", "codex-reviewer"),
        ]
        config = PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(name="claude-reviewer", command="claude", args=["-p"]),
                "codex-reviewer": AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"]),
            },
        )
        step_outputs: dict[str, str] = {}
        step_results: dict[str, AgentResult] = {}

        with tempfile.TemporaryDirectory() as tmpdir:
            run_dir = Path(tmpdir)
            with self.assertRaises(RuntimeError) as ctx:
                _execute_parallel_batch(
                    batch,
                    config,
                    {"plan": "Plan", "checklist": "Checklist"},
                    "",
                    1,
                    3,
                    run_dir,
                    30,
                    False,
                    step_outputs,
                    step_results,
                    run_dir=run_dir,
                    output_iter=1,
                )

            self.assertIn("Successful outputs were saved for: review_ok", str(ctx.exception))
            self.assertEqual(step_outputs["review_ok"], "VERDICT: PASS")
            version_dir = run_dir / "v1"
            self.assertTrue((version_dir / "review_ok.md").exists())
            self.assertTrue((version_dir / "review_slow_error.md").exists())

    @patch("cross_eval.pipeline._execute_step")
    def test_execute_parallel_batch_dry_run_uses_sequential_path(self, mock_step: MagicMock) -> None:
        """With the boolean positional flag set, ``_execute_step`` runs once per batch step."""
        batch = [
            self._parallel_review_step("review_a", "claude-reviewer"),
            self._parallel_review_step("review_b", "codex-reviewer"),
        ]
        config = PipelineConfig(agents={})

        with tempfile.TemporaryDirectory() as tmpdir:
            _execute_parallel_batch(
                batch,
                config,
                {"plan": "Plan"},
                "",
                1,
                3,
                Path(tmpdir),
                None,
                True,
                {},
                {},
                run_dir=Path(tmpdir),
                output_iter=1,
            )

        self.assertEqual(mock_step.call_count, 2)

    @patch("cross_eval.pipeline._execute_step")
    def test_execute_parallel_batch_agentic_steps_fall_back_to_sequential(self, mock_step: MagicMock) -> None:
        """With agentic agents and a worktree path, ``_execute_step`` runs once per batch step."""
        batch = [
            self._parallel_review_step("review_a", "agentic-a"),
            self._parallel_review_step("review_b", "agentic-b"),
        ]
        config = PipelineConfig(
            agents={
                "agentic-a": AgentConfig(name="agentic-a", command="claude", agentic=True),
                "agentic-b": AgentConfig(name="agentic-b", command="codex", agentic=True),
            },
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            _execute_parallel_batch(
                batch,
                config,
                {"plan": "Plan"},
                "",
                1,
                3,
                Path(tmpdir),
                None,
                False,
                {},
                {},
                run_dir=Path(tmpdir),
                output_iter=1,
                worktree_path=Path(tmpdir),
            )

        self.assertEqual(mock_step.call_count, 2)

    @patch("cross_eval.worktree.remove_worktree", side_effect=RuntimeError("cleanup failed"))
    @patch("cross_eval.worktree.commit_worktree", side_effect=RuntimeError("commit failed"))
    def test_finalize_worktree_handles_cleanup_failures(
        self,
        mock_commit: MagicMock,
        mock_remove: MagicMock,
    ) -> None:
        """When both commit and removal raise, ``_finalize_worktree`` still returns ``None``."""
        with tempfile.TemporaryDirectory() as tmpdir:
            base = Path(tmpdir)
            branch = _finalize_worktree(
                base,
                Path(tmpdir) / "wt",
                "cross-eval/fail",
                "review-fix",
                "FAIL",
            )

        self.assertIsNone(branch)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRuntimeEnvironmentHelpers(unittest.TestCase):
    """Tests for ``parse_dotenv``, ``resolve_env_files``, ``summarize_environment``
    and ``build_execution_policy``."""

    def test_parse_dotenv_handles_export_and_quotes(self) -> None:
        """An ``export`` prefix, both quote styles and an escaped newline are parsed.

        The file also contains a line without ``=`` (``INVALID``), which must
        not show up as a key, and a line with an empty key (``=skip``).
        """
        dotenv_text = "export FOO='bar'\nBAR=\"line\\nvalue\"\nINVALID\n=skip\n"

        with tempfile.TemporaryDirectory() as tmpdir:
            dotenv_file = Path(tmpdir) / ".env"
            dotenv_file.write_text(dotenv_text, encoding="utf-8")

            parsed = parse_dotenv(dotenv_file)

            self.assertEqual(parsed["FOO"], "bar")
            self.assertEqual(parsed["BAR"], "line\nvalue")
            self.assertNotIn("INVALID", parsed)

    def test_resolve_env_files_deduplicates_and_filters_missing(self) -> None:
        """One existing ``.env`` referenced several ways resolves to a single path.

        The file is listed relatively and absolutely in ``env_files`` and again
        in ``auto_env_files``; ``.env.local`` is never created on disk.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            project_root = Path(tmpdir)
            dotenv_file = project_root / ".env"
            dotenv_file.write_text("FOO=bar\n", encoding="utf-8")

            explicit_files = [".env", str(dotenv_file)]
            auto_files = [".env", ".env.local"]
            execution = ExecutionConfig(
                env_files=explicit_files,
                auto_env_files=auto_files,
            )

            resolved = resolve_env_files(execution, project_root)

            self.assertEqual(resolved, [dotenv_file.resolve()])

    def test_summarize_environment_hides_names_when_disabled(self) -> None:
        """With ``expose_env_names=False`` the summary reports hidden names and the hinted target."""
        execution = ExecutionConfig(expose_env_names=False, auto_context_targets=["postgres"])
        loaded_env = {"DATABASE_URL": "postgres://localhost"}

        summary = summarize_environment(execution, [], loaded_env, {})

        expected_fragments = (
            "names are hidden",
            "Execution targets hinted by the user: postgres",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, summary)

    def test_build_execution_policy_for_minimal_mode(self) -> None:
        """The ``minimal`` command policy is named and described in the policy text."""
        execution = ExecutionConfig(mode="agent-decides", command_policy="minimal")

        policy = build_execution_policy(execution)

        expected_fragments = (
            "Command policy: minimal",
            "Keep command usage minimal",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, policy)
|
||||||
|
|
||||||
|
|
||||||
|
class TestWorktreeFailures(unittest.TestCase):
    """Failure-path tests for ``create_worktree`` and ``remove_worktree``.

    ``cross_eval.worktree.subprocess.run`` is patched in every test, so no
    real git command is executed.
    """

    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_raises_when_branch_creation_fails(self, mock_run: MagicMock) -> None:
        """A failing subprocess call surfaces as ``WorktreeError`` mentioning branch creation."""
        branch_failure = subprocess.CalledProcessError(
            1,
            ["git", "branch"],
            stderr="branch failed",
        )
        # An exception instance as side_effect is raised on every call.
        mock_run.side_effect = branch_failure

        with tempfile.TemporaryDirectory() as tmpdir:
            repo_dir = Path(tmpdir)
            worktree_dir = repo_dir / "wt"
            with self.assertRaises(WorktreeError) as ctx:
                create_worktree(repo_dir, worktree_dir, "cross-eval/fail")

        self.assertIn("Failed to create branch", str(ctx.exception))

    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_cleans_branch_on_worktree_failure(self, mock_run: MagicMock) -> None:
        """When the second subprocess call fails, the last call is ``git branch -D``."""
        worktree_failure = subprocess.CalledProcessError(
            1,
            ["git", "worktree", "add"],
            stderr="worktree failed",
        )
        # Call sequence: first succeeds, second raises, third succeeds.
        mock_run.side_effect = [
            MagicMock(returncode=0),
            worktree_failure,
            MagicMock(returncode=0),
        ]

        with tempfile.TemporaryDirectory() as tmpdir:
            repo_dir = Path(tmpdir)
            worktree_dir = repo_dir / "wt"
            with self.assertRaises(WorktreeError):
                create_worktree(repo_dir, worktree_dir, "cross-eval/fail")

        last_call = mock_run.call_args_list[-1]
        issued_command = last_call.args[0]
        self.assertEqual(issued_command[:3], ["git", "branch", "-D"])

    @patch("cross_eval.worktree.shutil.rmtree")
    @patch("cross_eval.worktree.subprocess.run")
    def test_remove_worktree_falls_back_to_prune(self, mock_run: MagicMock, mock_rmtree: MagicMock) -> None:
        """If the first subprocess call fails, the directory is rmtree'd and ``git worktree prune`` runs last."""
        remove_failure = subprocess.CalledProcessError(1, ["git", "worktree", "remove"])
        # Call sequence: first raises, second succeeds.
        mock_run.side_effect = [
            remove_failure,
            MagicMock(returncode=0),
        ]

        with tempfile.TemporaryDirectory() as tmpdir:
            scratch = Path(tmpdir)
            repo_dir = scratch / "repo"
            worktree_dir = scratch / "wt"
            repo_dir.mkdir()
            worktree_dir.mkdir()

            remove_worktree(repo_dir, worktree_dir)

            # rmtree is mocked, so the directory still exists while it is resolved.
            expected_target = worktree_dir.resolve()
            mock_rmtree.assert_any_call(expected_target, ignore_errors=True)

            final_command = mock_run.call_args_list[-1].args[0]
            self.assertEqual(final_command, ["git", "worktree", "prune"])
|
||||||
Reference in New Issue
Block a user