Compare commits
14 Commits
main
...
0858675076
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0858675076 | ||
|
|
cc8d583914 | ||
|
|
7b95233edf | ||
|
|
87bc0ffbfb | ||
|
|
c467222a2a | ||
|
|
99cbf171aa | ||
|
|
d5fcc258b7 | ||
|
|
290eace01b | ||
|
|
ecf44b4c07 | ||
|
|
b19d174c98 | ||
|
|
3fb19e90c0 | ||
|
|
28dd794f54 | ||
|
|
941304398d | ||
|
|
204e071b74 |
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
.pytest_cache/
|
||||||
|
.idea/
|
||||||
|
output/
|
||||||
|
.cross-eval/output/
|
||||||
|
cross_eval.egg-info/
|
||||||
10
.idea/.gitignore
generated
vendored
10
.idea/.gitignore
generated
vendored
@@ -1,10 +0,0 @@
|
|||||||
# Default ignored files
|
|
||||||
/shelf/
|
|
||||||
/workspace.xml
|
|
||||||
# Ignored default folder with query files
|
|
||||||
/queries/
|
|
||||||
# Datasource local storage ignored files
|
|
||||||
/dataSources/
|
|
||||||
/dataSources.local.xml
|
|
||||||
# Editor-based HTTP Client requests
|
|
||||||
/httpRequests/
|
|
||||||
14
.idea/cross-eval.iml
generated
14
.idea/cross-eval.iml
generated
@@ -1,14 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<module type="PYTHON_MODULE" version="4">
|
|
||||||
<component name="NewModuleRootManager">
|
|
||||||
<content url="file://$MODULE_DIR$">
|
|
||||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
|
||||||
</content>
|
|
||||||
<orderEntry type="jdk" jdkName="Python 3.12 (cross-eval)" jdkType="Python SDK" />
|
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
|
||||||
</component>
|
|
||||||
<component name="PyDocumentationSettings">
|
|
||||||
<option name="format" value="PLAIN" />
|
|
||||||
<option name="myDocStringFormat" value="Plain" />
|
|
||||||
</component>
|
|
||||||
</module>
|
|
||||||
6
.idea/inspectionProfiles/Project_Default.xml
generated
6
.idea/inspectionProfiles/Project_Default.xml
generated
@@ -1,6 +0,0 @@
|
|||||||
<component name="InspectionProjectProfileManager">
|
|
||||||
<profile version="1.0">
|
|
||||||
<option name="myName" value="Project Default" />
|
|
||||||
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
|
|
||||||
</profile>
|
|
||||||
</component>
|
|
||||||
6
.idea/inspectionProfiles/profiles_settings.xml
generated
6
.idea/inspectionProfiles/profiles_settings.xml
generated
@@ -1,6 +0,0 @@
|
|||||||
<component name="InspectionProjectProfileManager">
|
|
||||||
<settings>
|
|
||||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
|
||||||
<version value="1.0" />
|
|
||||||
</settings>
|
|
||||||
</component>
|
|
||||||
7
.idea/misc.xml
generated
7
.idea/misc.xml
generated
@@ -1,7 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="Black">
|
|
||||||
<option name="sdkName" value="Python 3.12 (cross-eval)" />
|
|
||||||
</component>
|
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (cross-eval)" project-jdk-type="Python SDK" />
|
|
||||||
</project>
|
|
||||||
8
.idea/modules.xml
generated
8
.idea/modules.xml
generated
@@ -1,8 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="ProjectModuleManager">
|
|
||||||
<modules>
|
|
||||||
<module fileurl="file://$PROJECT_DIR$/.idea/cross-eval.iml" filepath="$PROJECT_DIR$/.idea/cross-eval.iml" />
|
|
||||||
</modules>
|
|
||||||
</component>
|
|
||||||
</project>
|
|
||||||
@@ -41,7 +41,7 @@ inputs:
|
|||||||
checklist: checklist.md
|
checklist: checklist.md
|
||||||
|
|
||||||
agents:
|
agents:
|
||||||
generator:
|
coder:
|
||||||
command: claude
|
command: claude
|
||||||
args: ["-p", "--model", "sonnet", "--permission-mode", "auto"]
|
args: ["-p", "--model", "sonnet", "--permission-mode", "auto"]
|
||||||
system_prompt: "You are a senior software engineer. Follow the plan precisely."
|
system_prompt: "You are a senior software engineer. Follow the plan precisely."
|
||||||
@@ -53,14 +53,16 @@ agents:
|
|||||||
# 방법 1: 프리셋 사용 (사용자가 pipeline YAML 직접 작성할 필요 없음)
|
# 방법 1: 프리셋 사용 (사용자가 pipeline YAML 직접 작성할 필요 없음)
|
||||||
pipeline: preset:simple # "A 생성 → B 리뷰" (기본값)
|
pipeline: preset:simple # "A 생성 → B 리뷰" (기본값)
|
||||||
# pipeline: preset:cross-review # "둘 다 생성 → 서로 리뷰"
|
# pipeline: preset:cross-review # "둘 다 생성 → 서로 리뷰"
|
||||||
|
# pipeline: preset:plan-review # "구현 전 문서/기획 검토"
|
||||||
|
# pipeline: preset:coding-review-fix # "초기 코딩 1회 → 리뷰/수정 반복"
|
||||||
|
|
||||||
# 방법 2: 직접 커스텀 (고급 사용자용)
|
# 방법 2: 직접 커스텀 (고급 사용자용)
|
||||||
# pipeline:
|
# pipeline:
|
||||||
# - name: generate
|
# - name: coding
|
||||||
# agent: generator
|
# agent: coder
|
||||||
# role: generate
|
# role: coding
|
||||||
# prompt_template: "default:generate"
|
# prompt_template: "default:coding"
|
||||||
# output_key: generated_code
|
# output_key: coding_output
|
||||||
# - name: review
|
# - name: review
|
||||||
# agent: reviewer
|
# agent: reviewer
|
||||||
# role: review
|
# role: review
|
||||||
@@ -73,8 +75,10 @@ pipeline: preset:simple # "A 생성 → B 리뷰" (기본값)
|
|||||||
|
|
||||||
| 프리셋 | 설명 | 자동 생성되는 steps |
|
| 프리셋 | 설명 | 자동 생성되는 steps |
|
||||||
|--------|------|-------------------|
|
|--------|------|-------------------|
|
||||||
| `simple` | A 생성 → B 리뷰 | generate(agent1) → review(agent2) |
|
| `simple` | A 코딩 → B 리뷰 | coding(agent1) → review(agent2) |
|
||||||
| `cross-review` | 둘 다 생성, 서로 리뷰 | gen_a → gen_b → review_of_b(agent_a) → review_of_a(agent_b) |
|
| `cross-review` | 둘 다 코딩, 서로 리뷰 | coding_a → coding_b → review_of_b(agent_a) → review_of_a(agent_b) |
|
||||||
|
| `plan-review` | 구현 전 문서 검토 | parallel plan_review_* → senior_review(optional) |
|
||||||
|
| `coding-review-fix` | 초기 코딩 후 리뷰/수정 반복 | initial_coding(coding) → review_fix(review* → aggregate → coding → verify) |
|
||||||
|
|
||||||
프리셋은 내부적으로 적절한 pipeline steps + context_override를 자동 구성한다. agents에 정의된 순서대로 agent1, agent2가 배정된다. 프리셋이 불충분하면 직접 steps를 작성할 수 있다.
|
프리셋은 내부적으로 적절한 pipeline steps + context_override를 자동 구성한다. agents에 정의된 순서대로 agent1, agent2가 배정된다. 프리셋이 불충분하면 직접 steps를 작성할 수 있다.
|
||||||
|
|
||||||
@@ -109,11 +113,11 @@ cross_eval/
|
|||||||
- verdict_pattern 유효한 정규식인지
|
- verdict_pattern 유효한 정규식인지
|
||||||
|
|
||||||
**prompts.py** — 기본 프롬프트 2종 + 파이프라인 프리셋 정의:
|
**prompts.py** — 기본 프롬프트 2종 + 파이프라인 프리셋 정의:
|
||||||
- `default:generate` — "기획서에 명시된 것만 구현하라, 과최적화 금지" + plan/checklist/feedback + **"프로젝트 디렉토리의 기존 코드를 탐색하여 컨텍스트를 파악하라"** 지시
|
- `default:coding` — "기획서에 명시된 것만 구현하라, 과최적화 금지" + plan/checklist/feedback + **"프로젝트 디렉토리의 기존 코드를 탐색하여 컨텍스트를 파악하라"** 지시
|
||||||
- `default:review` — 과최적화/오탐/누락 3기준 검토 + `VERDICT: PASS|FAIL` 출력 + **"프로젝트 디렉토리를 직접 탐색하여 코드를 검증하라"** 지시
|
- `default:review` — 과최적화/오탐/누락 3기준 검토 + `VERDICT: PASS|FAIL` 출력 + **"프로젝트 디렉토리를 직접 탐색하여 코드를 검증하라"** 지시
|
||||||
- `{variable}` 플레이스홀더, 누락 시 `(no {key} provided)` 출력
|
- `{variable}` 플레이스홀더, 누락 시 `(no {key} provided)` 출력
|
||||||
- 사용자가 커스텀 .md 파일로 오버라이드 가능
|
- 사용자가 커스텀 .md 파일로 오버라이드 가능
|
||||||
- `PIPELINE_PRESETS` dict: `simple`, `cross-review` 등 프리셋별 StepConfig 리스트 정의
|
- `PIPELINE_PRESETS` dict: `simple`, `cross-review`, `plan-review` 등 프리셋별 StepConfig 리스트 정의
|
||||||
|
|
||||||
**agent.py** — `invoke_agent(agent_config, prompt, cwd)`:
|
**agent.py** — `invoke_agent(agent_config, prompt, cwd)`:
|
||||||
- `cwd` 파라미터로 프로젝트 디렉토리 지정 → 에이전트가 해당 디렉토리에서 파일 탐색 가능
|
- `cwd` 파라미터로 프로젝트 디렉토리 지정 → 에이전트가 해당 디렉토리에서 파일 탐색 가능
|
||||||
@@ -141,7 +145,7 @@ final-report.md 생성
|
|||||||
- 최종 판정
|
- 최종 판정
|
||||||
|
|
||||||
**cli.py** — 서브커맨드:
|
**cli.py** — 서브커맨드:
|
||||||
- `cross-eval init [--dir .] [--preset simple|cross-review]` — 스캐폴딩 (기존 파일 안 덮어씀)
|
- `cross-eval init [--dir .] [--preset simple|cross-review|plan-review]` — 스캐폴딩 (기존 파일 안 덮어씀)
|
||||||
- `cross-eval run [-c config] [--max-iter N] [--dry-run] [--output-dir path] [--input key=path ...]`
|
- `cross-eval run [-c config] [--max-iter N] [--dry-run] [--output-dir path] [--input key=path ...]`
|
||||||
- `--input key=path`: config의 inputs 오버라이드/추가
|
- `--input key=path`: config의 inputs 오버라이드/추가
|
||||||
- `--dry-run`: 에이전트 호출 없이 렌더링된 프롬프트만 출력
|
- `--dry-run`: 에이전트 호출 없이 렌더링된 프롬프트만 출력
|
||||||
@@ -167,3 +171,17 @@ final-report.md 생성
|
|||||||
3. `cross-eval run --dry-run` 로 프롬프트 렌더링 확인 (에이전트 호출 없이)
|
3. `cross-eval run --dry-run` 로 프롬프트 렌더링 확인 (에이전트 호출 없이)
|
||||||
4. plan.md/checklist.md에 간단한 내용 넣고 `cross-eval run --max-iter 2` 로 실제 실행
|
4. plan.md/checklist.md에 간단한 내용 넣고 `cross-eval run --max-iter 2` 로 실제 실행
|
||||||
5. `output/` 디렉토리에 v1/, final-report.md 생성 확인
|
5. `output/` 디렉토리에 v1/, final-report.md 생성 확인
|
||||||
|
|
||||||
|
|
||||||
|
cross-eval run \
|
||||||
|
--docs /Users/chungyeong/Desktop/Dev/new-alpha-foundry/plans/TO_CLICKHOUSE \
|
||||||
|
--preset coding-review-fix \
|
||||||
|
--coder claude \
|
||||||
|
--reviewer codex \
|
||||||
|
--reviewer codex \
|
||||||
|
--reviewer codex \
|
||||||
|
--senior codex \
|
||||||
|
--coder-effort high \
|
||||||
|
--reviewer-effort high \
|
||||||
|
--senior-effort xhigh \
|
||||||
|
--max-iter 10
|
||||||
|
|||||||
19
README.md
19
README.md
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
AI 에이전트 간 교차 검증을 자동화하는 CLI 도구.
|
AI 에이전트 간 교차 검증을 자동화하는 CLI 도구.
|
||||||
|
|
||||||
기획서와 체크리스트를 기반으로 "생성 → 리뷰 → 피드백 → 재생성" 루프를 자동으로 돌려서,
|
기획서와 체크리스트를 기반으로 "코딩 → 리뷰 → 피드백 → 재코딩" 루프를 자동으로 돌려서,
|
||||||
**과최적화 / 오탐 / 누락** 문제를 잡아냅니다.
|
**과최적화 / 오탐 / 누락** 문제를 잡아냅니다.
|
||||||
|
|
||||||
## 설치
|
## 설치
|
||||||
@@ -51,7 +51,7 @@ cp .cross-eval/checklist-sample.md .cross-eval/checklist.md
|
|||||||
### 3. 실행
|
### 3. 실행
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 기본 실행 (생성 → 리뷰, 최대 3회 반복)
|
# 기본 실행 (코딩 → 리뷰, 최대 3회 반복)
|
||||||
cross-eval run
|
cross-eval run
|
||||||
|
|
||||||
# 프롬프트만 확인 (에이전트 호출 없이, 비용 절약)
|
# 프롬프트만 확인 (에이전트 호출 없이, 비용 절약)
|
||||||
@@ -72,10 +72,10 @@ cross-eval run --config .cross-eval/config.yaml
|
|||||||
```
|
```
|
||||||
output/
|
output/
|
||||||
├── v1/
|
├── v1/
|
||||||
│ ├── generate.md # 에이전트 생성 결과
|
│ ├── coding.md # 에이전트 코딩 결과
|
||||||
│ └── review.md # 에이전트 리뷰 결과
|
│ └── review.md # 에이전트 리뷰 결과
|
||||||
├── v2/
|
├── v2/
|
||||||
│ ├── generate.md
|
│ ├── coding.md
|
||||||
│ └── review.md
|
│ └── review.md
|
||||||
└── final-report.md # 전체 요약 리포트
|
└── final-report.md # 전체 요약 리포트
|
||||||
```
|
```
|
||||||
@@ -92,7 +92,7 @@ inputs:
|
|||||||
checklist: checklist.md
|
checklist: checklist.md
|
||||||
|
|
||||||
agents:
|
agents:
|
||||||
generator:
|
coder:
|
||||||
command: claude
|
command: claude
|
||||||
args: ["-p", "--model", "sonnet", "--permission-mode", "auto"]
|
args: ["-p", "--model", "sonnet", "--permission-mode", "auto"]
|
||||||
system_prompt: "You are a senior software engineer."
|
system_prompt: "You are a senior software engineer."
|
||||||
@@ -110,11 +110,16 @@ pipeline: preset:simple
|
|||||||
|
|
||||||
| 프리셋 | 설명 |
|
| 프리셋 | 설명 |
|
||||||
|--------|------|
|
|--------|------|
|
||||||
| `simple` | Agent A가 생성, Agent B가 리뷰 (기본값) |
|
| `simple` | Agent A가 코딩, Agent B가 리뷰 (기본값) |
|
||||||
| `cross-review` | 둘 다 생성, 서로 교차 리뷰 |
|
| `cross-review` | 둘 다 코딩, 서로 교차 리뷰 |
|
||||||
|
| `plan-review` | 구현 전 기획서/체크리스트/참고문서를 검토하고 필요시 현재 코드베이스와의 정합성도 확인 |
|
||||||
|
| `review-only` | 기존 코드만 감사 용도로 검토 |
|
||||||
|
| `review-fix` | 리뷰 결과를 취합한 뒤 자동 수정과 재검증까지 반복 |
|
||||||
|
| `coding-review-fix` | 초기 코딩 1회 후 리뷰 결과를 취합해 자동 수정과 재검증을 반복 |
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 초기화 옵션
|
# 초기화 옵션
|
||||||
cross-eval init --preset cross-review # 교차 리뷰 프리셋
|
cross-eval init --preset cross-review # 교차 리뷰 프리셋
|
||||||
|
cross-eval init --preset plan-review # 구현 전 문서 검토 프리셋
|
||||||
cross-eval init --lang en # 영어 템플릿
|
cross-eval init --lang en # 영어 템플릿
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -1,6 +0,0 @@
|
|||||||
Metadata-Version: 2.4
|
|
||||||
Name: cross-eval
|
|
||||||
Version: 0.1.0
|
|
||||||
Summary: AI agent cross-evaluation CLI tool
|
|
||||||
Requires-Python: >=3.9
|
|
||||||
Requires-Dist: pyyaml>=6.0
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
README.md
|
|
||||||
pyproject.toml
|
|
||||||
cross_eval/__init__.py
|
|
||||||
cross_eval/agent.py
|
|
||||||
cross_eval/cli.py
|
|
||||||
cross_eval/config.py
|
|
||||||
cross_eval/models.py
|
|
||||||
cross_eval/pipeline.py
|
|
||||||
cross_eval/prompts.py
|
|
||||||
cross_eval/report.py
|
|
||||||
cross_eval.egg-info/PKG-INFO
|
|
||||||
cross_eval.egg-info/SOURCES.txt
|
|
||||||
cross_eval.egg-info/dependency_links.txt
|
|
||||||
cross_eval.egg-info/entry_points.txt
|
|
||||||
cross_eval.egg-info/requires.txt
|
|
||||||
cross_eval.egg-info/top_level.txt
|
|
||||||
tests/test_config.py
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
|
|
||||||
@@ -1,2 +0,0 @@
|
|||||||
[console_scripts]
|
|
||||||
cross-eval = cross_eval.cli:main
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
pyyaml>=6.0
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
cross_eval
|
|
||||||
@@ -1 +1 @@
|
|||||||
__version__ = "0.1.0"
|
__version__ = "0.2.0"
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -3,8 +3,10 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import tempfile
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -17,6 +19,89 @@ logger = logging.getLogger(__name__)
|
|||||||
# CLI tools that support --system-prompt flag natively
|
# CLI tools that support --system-prompt flag natively
|
||||||
_SYSTEM_PROMPT_AGENTS = ("claude",)
|
_SYSTEM_PROMPT_AGENTS = ("claude",)
|
||||||
_REASONING_EFFORT_AGENTS = ("codex",)
|
_REASONING_EFFORT_AGENTS = ("codex",)
|
||||||
|
_NO_CHANGE_ACK_MARKERS = (
|
||||||
|
"no changes",
|
||||||
|
"no code changes",
|
||||||
|
"no file changes",
|
||||||
|
"did not make any changes",
|
||||||
|
"nothing to change",
|
||||||
|
"no modifications were necessary",
|
||||||
|
"no update was necessary",
|
||||||
|
"already satisfied",
|
||||||
|
"no changes needed",
|
||||||
|
"no fixes needed",
|
||||||
|
"everything is correct",
|
||||||
|
"code is correct as-is",
|
||||||
|
"already correct",
|
||||||
|
"no action required",
|
||||||
|
)
|
||||||
|
_CHANGE_CLAIM_MARKERS = (
|
||||||
|
"summary of all changes made",
|
||||||
|
"here's a summary of all changes made",
|
||||||
|
"here is a summary of all changes",
|
||||||
|
"implemented",
|
||||||
|
"i implemented",
|
||||||
|
"i've implemented",
|
||||||
|
"added",
|
||||||
|
"i added",
|
||||||
|
"i've added",
|
||||||
|
"updated",
|
||||||
|
"i updated",
|
||||||
|
"i've updated",
|
||||||
|
"modified",
|
||||||
|
"i modified",
|
||||||
|
"i've modified",
|
||||||
|
"created",
|
||||||
|
"i created",
|
||||||
|
"i've created",
|
||||||
|
"fixed",
|
||||||
|
"i fixed",
|
||||||
|
"i've fixed",
|
||||||
|
"completed the changes",
|
||||||
|
"finished the changes",
|
||||||
|
"made the following changes",
|
||||||
|
"applied the fix",
|
||||||
|
"changes have been applied",
|
||||||
|
"wrote the code",
|
||||||
|
"refactored",
|
||||||
|
"i refactored",
|
||||||
|
"completed all the changes",
|
||||||
|
"finished implementing",
|
||||||
|
"all tasks completed",
|
||||||
|
"done with the implementation",
|
||||||
|
"successfully implemented",
|
||||||
|
"completed the implementation",
|
||||||
|
"all changes have been made",
|
||||||
|
"changes are complete",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class AgentInvocationError(RuntimeError):
    """Structured error for agent CLI failures.

    Every constructor argument is stored as an attribute of the same name,
    so callers can inspect the failure without parsing the message text.
    """

    def __init__(
        self,
        *,
        agent_name: str,
        step_name: str,
        cmd_preview: str,
        raw_error: str,
        failure_type: str,
        suggested_action: str,
        exit_code: int | None = None,
    ) -> None:
        """Build the error and its multi-line, human-readable message.

        Args:
            agent_name: Name of the agent whose CLI failed.
            step_name: Step during which the failure happened.
            cmd_preview: Preview of the command line that was run.
            raw_error: Error text from the CLI; an empty value is rendered
                as ``(no output)`` in the message.
            failure_type: Classification label for the failure.
            suggested_action: Hint telling the user what to do next.
            exit_code: Optional process exit code. When omitted, the
                message keeps the generic ``exit code != 0`` wording.
        """
        self.agent_name = agent_name
        self.step_name = step_name
        self.cmd_preview = cmd_preview
        self.raw_error = raw_error
        self.failure_type = failure_type
        self.suggested_action = suggested_action
        self.exit_code = exit_code

        # Report the real exit code when the caller supplied it; otherwise
        # keep the original wording so existing messages are unchanged.
        exit_label = "!= 0" if exit_code is None else str(exit_code)
        super().__init__(
            f"Agent '{agent_name}' failed (exit code {exit_label}) at step '{step_name}':\n"
            f" type: {failure_type}\n"
            f" cmd: {cmd_preview}\n"
            f" error: {raw_error or '(no output)'}\n"
            f" action: {suggested_action}"
        )
|
||||||
|
|
||||||
|
|
||||||
def _supports_system_prompt_flag(command: str) -> bool:
|
def _supports_system_prompt_flag(command: str) -> bool:
|
||||||
@@ -29,6 +114,86 @@ def _supports_reasoning_effort(command: str) -> bool:
|
|||||||
return any(name in command for name in _REASONING_EFFORT_AGENTS)
|
return any(name in command for name in _REASONING_EFFORT_AGENTS)
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_agent_failure(detail: str) -> tuple[str, str]:
|
||||||
|
"""Classify a failed agent invocation into a user-actionable bucket."""
|
||||||
|
normalized = detail.lower()
|
||||||
|
|
||||||
|
auth_markers = (
|
||||||
|
"not logged in",
|
||||||
|
"please run /login",
|
||||||
|
"auth",
|
||||||
|
"authentication",
|
||||||
|
"invalid api key",
|
||||||
|
"api key",
|
||||||
|
"unauthorized",
|
||||||
|
"forbidden",
|
||||||
|
)
|
||||||
|
usage_limit_markers = (
|
||||||
|
"quota",
|
||||||
|
"rate limit",
|
||||||
|
"credits",
|
||||||
|
"credit balance",
|
||||||
|
"budget",
|
||||||
|
"insufficient funds",
|
||||||
|
"usage limit",
|
||||||
|
"token limit",
|
||||||
|
"billing",
|
||||||
|
)
|
||||||
|
|
||||||
|
if any(marker in normalized for marker in auth_markers):
|
||||||
|
return (
|
||||||
|
"AUTH",
|
||||||
|
"Agent CLI authentication is missing or expired. Re-authenticate the CLI, then rerun.",
|
||||||
|
)
|
||||||
|
if any(marker in normalized for marker in usage_limit_markers):
|
||||||
|
return (
|
||||||
|
"USAGE_LIMIT",
|
||||||
|
"Agent CLI hit a quota, billing, or token budget limit. Refill or raise the limit, then rerun.",
|
||||||
|
)
|
||||||
|
if "api error" in normalized:
|
||||||
|
return (
|
||||||
|
"API_ERROR",
|
||||||
|
"Agent CLI returned an API error. Inspect the saved error file for the raw response.",
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
"UNKNOWN",
|
||||||
|
"Agent CLI failed for an unknown reason. Inspect the saved error file for details.",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_WRITE_FAILURE_MARKERS = (
|
||||||
|
"permission denied",
|
||||||
|
"read-only file system",
|
||||||
|
"read only file system",
|
||||||
|
"operation not permitted",
|
||||||
|
"cannot write",
|
||||||
|
"failed to write",
|
||||||
|
"could not write",
|
||||||
|
"unable to write",
|
||||||
|
"sandbox",
|
||||||
|
"eacces",
|
||||||
|
"erofs",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _has_write_failure_indicators(stderr: str) -> bool:
|
||||||
|
"""Detect stderr patterns indicating the agent could not write files."""
|
||||||
|
if not stderr.strip():
|
||||||
|
return False
|
||||||
|
normalized = stderr.lower()
|
||||||
|
return any(marker in normalized for marker in _WRITE_FAILURE_MARKERS)
|
||||||
|
|
||||||
|
|
||||||
|
def _claims_file_changes(output: str) -> bool:
    """Heuristic for agent text that claims code changes were made.

    Returns False for blank output and for output containing any
    ``_NO_CHANGE_ACK_MARKERS`` phrase; otherwise returns True when any
    ``_CHANGE_CLAIM_MARKERS`` phrase is present. Matching is
    case-insensitive and anchored to whole words, so "padded", "prefixed",
    "unfixed" or "unimplemented" no longer count as "added", "fixed" or
    "implemented".
    """
    # Local import: `re` is not among the module-level imports visible here.
    import re

    normalized = output.lower()
    if not normalized.strip():
        return False

    def mentions_any(markers: tuple[str, ...]) -> bool:
        """Return True when any marker occurs as a whole word or phrase."""
        if not markers:
            # An empty alternation would match everywhere; mirror any(()).
            return False
        alternation = "|".join(re.escape(marker) for marker in markers)
        # Lookarounds, not \b, so a marker that begins or ends with a
        # non-word character would still be anchored correctly.
        pattern = rf"(?<!\w)(?:{alternation})(?!\w)"
        return re.search(pattern, normalized) is not None

    if mentions_any(_NO_CHANGE_ACK_MARKERS):
        return False
    return mentions_any(_CHANGE_CLAIM_MARKERS)
|
||||||
|
|
||||||
|
|
||||||
class _Spinner:
|
class _Spinner:
|
||||||
"""Animated spinner for long-running agent calls."""
|
"""Animated spinner for long-running agent calls."""
|
||||||
|
|
||||||
@@ -67,11 +232,17 @@ class _Spinner:
|
|||||||
sys.stderr.flush()
|
sys.stderr.flush()
|
||||||
|
|
||||||
|
|
||||||
|
def _is_print_mode(args: list[str]) -> bool:
|
||||||
|
"""Check if the agent args include -p / --print flag."""
|
||||||
|
return "-p" in args or "--print" in args
|
||||||
|
|
||||||
|
|
||||||
def invoke_agent(
|
def invoke_agent(
|
||||||
agent: AgentConfig,
|
agent: AgentConfig,
|
||||||
prompt: str,
|
prompt: str,
|
||||||
step_name: str,
|
step_name: str,
|
||||||
cwd: Optional[Path] = None,
|
cwd: Optional[Path] = None,
|
||||||
|
env: Optional[dict[str, str]] = None,
|
||||||
timeout: int | None = None,
|
timeout: int | None = None,
|
||||||
quiet: bool = False,
|
quiet: bool = False,
|
||||||
) -> AgentResult:
|
) -> AgentResult:
|
||||||
@@ -80,30 +251,68 @@ def invoke_agent(
|
|||||||
Args:
|
Args:
|
||||||
quiet: If True, suppress spinner (for parallel execution).
|
quiet: If True, suppress spinner (for parallel execution).
|
||||||
"""
|
"""
|
||||||
|
is_claude = "claude" in agent.command
|
||||||
|
is_interactive = is_claude and not _is_print_mode(agent.args)
|
||||||
|
|
||||||
cmd = [agent.command]
|
cmd = [agent.command]
|
||||||
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
|
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
|
||||||
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
|
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
|
||||||
cmd.extend(agent.args)
|
cmd.extend(agent.args)
|
||||||
|
|
||||||
# Build the full prompt (system prompt + user prompt)
|
# --- Temp files for interactive (non -p) claude ---
|
||||||
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
|
task_file: Optional[Path] = None
|
||||||
# claude: --system-prompt flag supported natively
|
output_file: Optional[Path] = None
|
||||||
cmd.extend(["--system-prompt", agent.system_prompt])
|
|
||||||
input_data = prompt
|
|
||||||
elif agent.system_prompt:
|
|
||||||
# codex, others: no --system-prompt flag, prepend to prompt
|
|
||||||
input_data = (
|
|
||||||
f"<system>\n{agent.system_prompt}\n</system>\n\n"
|
|
||||||
f"{prompt}"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
input_data = prompt
|
|
||||||
|
|
||||||
|
if is_interactive:
|
||||||
|
# Write prompt + output instruction to temp task file
|
||||||
|
task_fd, task_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_task_")
|
||||||
|
task_file = Path(task_path)
|
||||||
|
os.close(task_fd)
|
||||||
|
|
||||||
|
out_fd, out_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_out_")
|
||||||
|
output_file = Path(out_path)
|
||||||
|
os.close(out_fd)
|
||||||
|
# Clear the output file so we can detect if agent wrote to it
|
||||||
|
output_file.write_text("", encoding="utf-8")
|
||||||
|
|
||||||
|
wrapped_prompt = (
|
||||||
|
f"{prompt}\n\n"
|
||||||
|
f"---\n"
|
||||||
|
f"IMPORTANT: Write your COMPLETE response to this file: {output_file}\n"
|
||||||
|
f"Do NOT modify any other files in the project."
|
||||||
|
)
|
||||||
|
task_file.write_text(wrapped_prompt, encoding="utf-8")
|
||||||
|
|
||||||
|
# System prompt via flag
|
||||||
|
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
|
||||||
|
cmd.extend(["--system-prompt", agent.system_prompt])
|
||||||
|
|
||||||
|
# Positional arg: point claude to the task file
|
||||||
|
cmd.append(
|
||||||
|
f"Read the task file at {task_file} and follow all instructions in it. "
|
||||||
|
f"Write your complete output to {output_file}."
|
||||||
|
)
|
||||||
|
input_data: str | None = None
|
||||||
|
else:
|
||||||
|
# Print mode (-p) or non-claude: deliver prompt via stdin
|
||||||
|
if agent.system_prompt and _supports_system_prompt_flag(agent.command):
|
||||||
|
cmd.extend(["--system-prompt", agent.system_prompt])
|
||||||
|
input_data = prompt
|
||||||
|
elif agent.system_prompt:
|
||||||
|
input_data = (
|
||||||
|
f"<system>\n{agent.system_prompt}\n</system>\n\n"
|
||||||
|
f"{prompt}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
input_data = prompt
|
||||||
|
|
||||||
|
cmd_preview = " ".join(cmd[:6])
|
||||||
logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")
|
logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")
|
||||||
|
|
||||||
spinner: Optional[_Spinner] = None
|
spinner: Optional[_Spinner] = None
|
||||||
if not quiet:
|
if not quiet:
|
||||||
logger.info(" cmd: %s", " ".join(cmd[:6]))
|
mode_label = "interactive" if is_interactive else ""
|
||||||
|
logger.info(" cmd: %s %s", " ".join(cmd[:6]), f"({mode_label})" if mode_label else "")
|
||||||
spinner = _Spinner(f"[{step_name}] {agent.name} running...")
|
spinner = _Spinner(f"[{step_name}] {agent.name} running...")
|
||||||
spinner.start()
|
spinner.start()
|
||||||
|
|
||||||
@@ -116,6 +325,7 @@ def invoke_agent(
|
|||||||
text=True,
|
text=True,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
cwd=cwd,
|
cwd=cwd,
|
||||||
|
env=env,
|
||||||
)
|
)
|
||||||
duration = time.monotonic() - start
|
duration = time.monotonic() - start
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
@@ -126,32 +336,64 @@ def invoke_agent(
|
|||||||
if spinner:
|
if spinner:
|
||||||
spinner.stop(f"[{step_name}] ERROR")
|
spinner.stop(f"[{step_name}] ERROR")
|
||||||
raise
|
raise
|
||||||
|
finally:
|
||||||
output = result.stdout.strip()
|
if task_file:
|
||||||
chars = len(output)
|
task_file.unlink(missing_ok=True)
|
||||||
|
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
if spinner:
|
if spinner:
|
||||||
spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
|
spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
|
||||||
|
if output_file:
|
||||||
|
output_file.unlink(missing_ok=True)
|
||||||
err_detail = result.stderr.strip() or result.stdout.strip()
|
err_detail = result.stderr.strip() or result.stdout.strip()
|
||||||
if err_detail and len(err_detail) > 500:
|
if err_detail and len(err_detail) > 500:
|
||||||
err_detail = err_detail[:500] + "..."
|
err_detail = err_detail[:500] + "..."
|
||||||
cmd_preview = " ".join(cmd[:6])
|
failure_type, suggested_action = _classify_agent_failure(err_detail or "")
|
||||||
raise RuntimeError(
|
raise AgentInvocationError(
|
||||||
f"Agent '{agent.name}' failed (exit code {result.returncode}) "
|
agent_name=agent.name,
|
||||||
f"at step '{step_name}':\n"
|
step_name=step_name,
|
||||||
f" cmd: {cmd_preview}\n"
|
cmd_preview=cmd_preview,
|
||||||
f" error: {err_detail or '(no output)'}"
|
raw_error=err_detail or "(no output)",
|
||||||
|
failure_type=failure_type,
|
||||||
|
suggested_action=suggested_action,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# --- Capture output ---
|
||||||
|
if output_file:
|
||||||
|
output = output_file.read_text(encoding="utf-8").strip()
|
||||||
|
output_file.unlink(missing_ok=True)
|
||||||
|
if not output:
|
||||||
|
# Fallback to stdout if agent didn't write to the file
|
||||||
|
output = result.stdout.strip()
|
||||||
|
else:
|
||||||
|
output = result.stdout.strip()
|
||||||
|
|
||||||
|
chars = len(output)
|
||||||
|
|
||||||
if spinner:
|
if spinner:
|
||||||
spinner.stop(f"[{step_name}] done — {chars} chars")
|
spinner.stop(f"[{step_name}] done — {chars} chars")
|
||||||
|
|
||||||
if not output:
|
if not output:
|
||||||
logger.warning(
|
stderr_info = result.stderr.strip()
|
||||||
"Agent '%s' produced empty output at step '%s'",
|
if stderr_info:
|
||||||
agent.name, step_name,
|
logger.warning(
|
||||||
)
|
"Agent '%s' produced empty output at step '%s'. stderr: %s",
|
||||||
|
agent.name, step_name, stderr_info[:500],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
"Agent '%s' produced empty output at step '%s' (no stderr either)",
|
||||||
|
agent.name, step_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
transcript = _build_transcript(
|
||||||
|
command_preview=cmd_preview,
|
||||||
|
stdout=result.stdout,
|
||||||
|
stderr=result.stderr,
|
||||||
|
exit_code=result.returncode,
|
||||||
|
duration_seconds=round(duration, 1),
|
||||||
|
cwd=str(cwd) if cwd else "",
|
||||||
|
)
|
||||||
|
|
||||||
return AgentResult(
|
return AgentResult(
|
||||||
output=output,
|
output=output,
|
||||||
@@ -159,4 +401,222 @@ def invoke_agent(
|
|||||||
agent_name=agent.name,
|
agent_name=agent.name,
|
||||||
step_name=step_name,
|
step_name=step_name,
|
||||||
duration_seconds=round(duration, 1),
|
duration_seconds=round(duration, 1),
|
||||||
|
transcript=transcript,
|
||||||
|
command_preview=cmd_preview,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def invoke_agent_agentic(
    agent: AgentConfig,
    prompt: str,
    step_name: str,
    worktree_path: Path,
    env: Optional[dict[str, str]] = None,
    timeout: int | None = None,
    quiet: bool = False,
) -> AgentResult:
    """Run an agent inside a worktree and report the resulting git diff.

    The agent process is started with ``worktree_path`` as its working
    directory and receives the prompt on stdin. After it exits, the diff
    returned by ``capture_diff(worktree_path)`` becomes the result output.

    Args:
        agent: Agent configuration (command, args, optional system prompt
            and reasoning effort).
        prompt: Task text delivered to the agent on stdin.
        step_name: Pipeline step label used in log and spinner messages.
        worktree_path: Directory the agent runs in and whose diff is captured.
        env: Environment passed to ``subprocess.run`` (``None`` inherits).
        timeout: Per-invocation timeout in seconds (``None`` means no limit).
        quiet: When true, no spinner is shown and the command is not logged
            at info level.

    Returns:
        An ``AgentResult`` whose ``output`` is the captured diff, or the
        literal ``"(no changes)"`` when the diff is empty and no failure
        indicator was detected.

    Raises:
        subprocess.TimeoutExpired: Re-raised when the agent exceeds ``timeout``.
        AgentInvocationError: When the agent exits non-zero, or when the diff
            is empty while ``_claims_file_changes`` / ``_has_write_failure_indicators``
            flag the captured stdout / stderr.
    """
    import tempfile

    from cross_eval.worktree import capture_diff

    # Write prompt to a temp file (outside worktree, won't appear in diffs).
    # NOTE(review): nothing below passes this path to the agent -- the prompt
    # travels over stdin. The file is kept to preserve existing side effects;
    # confirm whether it is still needed.
    task_fd, task_path = tempfile.mkstemp(suffix=".md", prefix="cross_eval_task_")
    task_file = Path(task_path)
    try:
        # Writing through the descriptor closes it even when the write fails,
        # and the outer ``finally`` removes the file on every exit path.
        with os.fdopen(task_fd, "w", encoding="utf-8") as task_handle:
            task_handle.write(prompt)

        cmd = [agent.command]
        if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
            cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])

        # Strip print-mode flags and stdin sentinels for agentic mode.
        # Agentic runs should operate on the worktree and return a real git diff,
        # not behave as a one-shot text completer.
        args = [a for a in agent.args if a not in {"-", "-p", "--print"}]
        cmd.extend(args)

        # System prompt via flag if supported
        if agent.system_prompt and _supports_system_prompt_flag(agent.command):
            cmd.extend(["--system-prompt", agent.system_prompt])

        # Deliver the prompt differently per agent type
        is_codex = "codex" in agent.command
        input_data: str | None = None
        if is_codex:
            # codex: stdin mode
            cmd.append("-")
            if agent.system_prompt and not _supports_system_prompt_flag(agent.command):
                # No flag available: inline the system prompt ahead of the task.
                input_data = f"<system>\n{agent.system_prompt}\n</system>\n\n{prompt}"
            else:
                input_data = prompt
        else:
            # claude: deliver the task through stdin and let the worktree be the
            # canonical place where files are read/written.
            input_data = prompt

        # Only the first few tokens are kept so logs and errors stay short.
        cmd_preview = " ".join(cmd[:6])
        logger.debug(
            "Invoking agent '%s' (agentic) in worktree: %s",
            agent.name, worktree_path,
        )

        spinner: Optional[_Spinner] = None
        if not quiet:
            logger.info(" cmd: %s (agentic)", cmd_preview)
            spinner = _Spinner(f"[{step_name}] {agent.name} (agentic) running...")
            spinner.start()

        try:
            start = time.monotonic()
            result = subprocess.run(
                cmd,
                input=input_data,
                capture_output=True,
                text=True,
                timeout=timeout,
                cwd=worktree_path,
                env=env,
            )
            duration = time.monotonic() - start
        except subprocess.TimeoutExpired:
            if spinner:
                spinner.stop(f"[{step_name}] TIMEOUT after {timeout}s")
            raise
        except Exception:
            # Stop the spinner before propagating so the terminal is left clean.
            if spinner:
                spinner.stop(f"[{step_name}] ERROR")
            raise
    finally:
        # Clean up temp task file (it's in /tmp, not in worktree)
        task_file.unlink(missing_ok=True)

    if result.returncode != 0:
        if spinner:
            spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
        # Prefer stderr; fall back to stdout when stderr is blank.
        err_detail = result.stderr.strip() or result.stdout.strip()
        if err_detail and len(err_detail) > 500:
            err_detail = err_detail[:500] + "..."
        failure_type, suggested_action = _classify_agent_failure(err_detail or "")
        raise AgentInvocationError(
            agent_name=agent.name,
            step_name=step_name,
            cmd_preview=cmd_preview,
            raw_error=err_detail or "(no output)",
            failure_type=failure_type,
            suggested_action=suggested_action,
        )

    # Capture git diff as the output (changes since last commit on the branch)
    diff_output = capture_diff(worktree_path)

    if not diff_output:
        stdout_excerpt = (result.stdout or "").strip()
        stderr_excerpt = (result.stderr or "").strip()

        # Detect two failure modes:
        # 1. Agent claims changes in stdout but produced no diff
        # 2. Agent stderr contains permission or write-failure indicators
        claims_changes = _claims_file_changes(stdout_excerpt)
        has_write_failure = _has_write_failure_indicators(stderr_excerpt)

        if claims_changes or has_write_failure:
            if spinner:
                spinner.stop(f"[{step_name}] FAILED (empty diff)")
            raw_error = stdout_excerpt or "(stdout empty)"
            if stderr_excerpt:
                raw_error = f"{raw_error}\n\n[stderr]\n{stderr_excerpt}"
            if len(raw_error) > 2000:
                raw_error = raw_error[:2000] + "..."

            # A write failure takes precedence over a mere "claimed changes" mismatch.
            if has_write_failure:
                failure_type = "WRITE_FAILURE"
                suggested_action = (
                    "Agent encountered file write errors (permission denied, read-only, "
                    "or sandbox restriction). Check agent permissions and worktree state."
                )
            else:
                failure_type = "EMPTY_DIFF"
                suggested_action = (
                    "Agent reported code changes but produced no git diff. "
                    "Treat this run as failed and require a real worktree diff before continuing."
                )

            raise AgentInvocationError(
                agent_name=agent.name,
                step_name=step_name,
                cmd_preview=cmd_preview,
                raw_error=raw_error,
                failure_type=failure_type,
                suggested_action=suggested_action,
            )

        # An empty diff without failure indicators is reported, not raised.
        diff_output = "(no changes)"
        logger.warning(
            "Agent '%s' made no file changes at step '%s'",
            agent.name, step_name,
        )

    chars = len(diff_output)
    if spinner:
        spinner.stop(f"[{step_name}] done — {chars} chars (agentic)")

    transcript = _build_transcript(
        command_preview=cmd_preview,
        stdout=result.stdout,
        stderr=result.stderr,
        exit_code=result.returncode,
        duration_seconds=round(duration, 1),
        cwd=str(worktree_path),
    )

    return AgentResult(
        output=diff_output,
        exit_code=result.returncode,
        agent_name=agent.name,
        step_name=step_name,
        duration_seconds=round(duration, 1),
        transcript=transcript,
        command_preview=cmd_preview,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _build_transcript(
|
||||||
|
*,
|
||||||
|
command_preview: str,
|
||||||
|
stdout: str,
|
||||||
|
stderr: str,
|
||||||
|
exit_code: int = 0,
|
||||||
|
duration_seconds: float = 0.0,
|
||||||
|
cwd: str = "",
|
||||||
|
) -> str:
|
||||||
|
"""Build a compact execution transcript for debugging/audit output."""
|
||||||
|
sections = [
|
||||||
|
"# Agent Execution Transcript",
|
||||||
|
"",
|
||||||
|
"## Command",
|
||||||
|
"```",
|
||||||
|
command_preview or "(unknown command)",
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
if cwd:
|
||||||
|
sections.extend(["## Working Directory", f"`{cwd}`", ""])
|
||||||
|
sections.extend([
|
||||||
|
f"## Exit Code: {exit_code}",
|
||||||
|
"",
|
||||||
|
])
|
||||||
|
if duration_seconds > 0:
|
||||||
|
sections.extend([f"## Duration: {duration_seconds}s", ""])
|
||||||
|
sections.extend([
|
||||||
|
"## Stdout",
|
||||||
|
"```",
|
||||||
|
(stdout or "(empty)").strip(),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
"## Stderr",
|
||||||
|
"```",
|
||||||
|
(stderr or "(empty)").strip(),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
])
|
||||||
|
return "\n".join(sections)
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ import sys
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from cross_eval import __version__
|
from cross_eval import __version__
|
||||||
from cross_eval.config import REASONING_EFFORT_CHOICES
|
from cross_eval.config import REASONING_EFFORT_CHOICES, resolve_agent_shorthand
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -38,7 +38,7 @@ coders: [claude-coder]
|
|||||||
reviewers: [claude-reviewer]
|
reviewers: [claude-reviewer]
|
||||||
# seniors: [codex-senior]
|
# seniors: [codex-senior]
|
||||||
|
|
||||||
# 파이프라인 종류: simple | cross-review | review-only | review-fix
|
# 파이프라인 종류: simple | cross-review | plan-review | review-only | review-fix | coding-review-fix
|
||||||
pipeline: preset:{preset}
|
pipeline: preset:{preset}
|
||||||
|
|
||||||
# 반복 설정
|
# 반복 설정
|
||||||
@@ -49,7 +49,7 @@ max_iterations: 3
|
|||||||
language: {language}
|
language: {language}
|
||||||
|
|
||||||
# 결과 저장 경로
|
# 결과 저장 경로
|
||||||
output_dir: output
|
output_dir: .cross-eval/output
|
||||||
|
|
||||||
# ─── 커스텀 에이전트 (선택) ────────────────────────────────────
|
# ─── 커스텀 에이전트 (선택) ────────────────────────────────────
|
||||||
# 기본 제공 에이전트를 덮어쓰거나 새 에이전트를 정의할 수 있습니다.
|
# 기본 제공 에이전트를 덮어쓰거나 새 에이전트를 정의할 수 있습니다.
|
||||||
@@ -145,7 +145,7 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
"AI 코딩 에이전트의 결과물을 자동으로 검증하는 CLI 도구.\n"
|
"AI 코딩 에이전트의 결과물을 자동으로 검증하는 CLI 도구.\n"
|
||||||
"\n"
|
"\n"
|
||||||
"동작 방식:\n"
|
"동작 방식:\n"
|
||||||
" 1. 기획서(plan)를 바탕으로 Coder 에이전트가 코드를 생성\n"
|
" 1. 기획서(plan)를 바탕으로 Coder 에이전트가 코드를 작성\n"
|
||||||
" 2. Reviewer 에이전트가 기획서 대비 코드를 검토하고 PASS/FAIL 판정\n"
|
" 2. Reviewer 에이전트가 기획서 대비 코드를 검토하고 PASS/FAIL 판정\n"
|
||||||
" 3. FAIL이면 피드백을 반영해서 1~2를 반복 (최대 N회)\n"
|
" 3. FAIL이면 피드백을 반영해서 1~2를 반복 (최대 N회)\n"
|
||||||
"\n"
|
"\n"
|
||||||
@@ -195,11 +195,19 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
init_parser.add_argument(
|
init_parser.add_argument(
|
||||||
"--preset",
|
"--preset",
|
||||||
default="simple",
|
default="simple",
|
||||||
choices=["simple", "cross-review", "review-only", "review-fix"],
|
choices=[
|
||||||
|
"simple",
|
||||||
|
"cross-review",
|
||||||
|
"plan-review",
|
||||||
|
"review-only",
|
||||||
|
"review-fix",
|
||||||
|
"coding-review-fix",
|
||||||
|
],
|
||||||
help=(
|
help=(
|
||||||
"파이프라인 종류 (기본: simple). "
|
"파이프라인 종류 (기본: simple). "
|
||||||
"simple=코딩+리뷰, cross-review=교차리뷰, "
|
"simple=코딩+리뷰, cross-review=교차리뷰, plan-review=문서기획검토, "
|
||||||
"review-only=리뷰만, review-fix=리뷰수렴+자동수정"
|
"review-only=리뷰만, review-fix=리뷰수렴+자동수정, "
|
||||||
|
"coding-review-fix=초기코딩후리뷰수렴"
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
init_parser.add_argument(
|
init_parser.add_argument(
|
||||||
@@ -208,13 +216,65 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
choices=["en", "ko"],
|
choices=["en", "ko"],
|
||||||
help="프롬프트 언어 (기본: ko)",
|
help="프롬프트 언어 (기본: ko)",
|
||||||
)
|
)
|
||||||
|
init_parser.add_argument(
|
||||||
|
"--guided",
|
||||||
|
action="store_true",
|
||||||
|
help="대화형 설정 마법사 실행",
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- doctor ---
|
||||||
|
doctor_parser = subparsers.add_parser(
|
||||||
|
"doctor",
|
||||||
|
help="실행 환경 점검 (CLI 설치, 인증, 설정 파일 검증)",
|
||||||
|
description="cross-eval 실행에 필요한 환경을 점검합니다.",
|
||||||
|
)
|
||||||
|
doctor_parser.add_argument(
|
||||||
|
"--dir",
|
||||||
|
type=Path,
|
||||||
|
default=Path("."),
|
||||||
|
help="점검할 디렉토리 (기본: 현재 디렉토리)",
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- demo ---
|
||||||
|
demo_parser = subparsers.add_parser(
|
||||||
|
"demo",
|
||||||
|
help="내장 데모 실행 (파이프라인 동작 체험)",
|
||||||
|
description=(
|
||||||
|
"내장된 간단한 기획서로 cross-eval 파이프라인의 전체 동작을 체험합니다.\n"
|
||||||
|
"기본값은 mock 모드(시뮬레이션)이며, --live로 실제 에이전트를 호출할 수 있습니다."
|
||||||
|
),
|
||||||
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
|
)
|
||||||
|
demo_parser.add_argument(
|
||||||
|
"--live",
|
||||||
|
action="store_true",
|
||||||
|
help="실제 에이전트를 호출하여 데모 실행 (API 비용 발생)",
|
||||||
|
)
|
||||||
|
demo_parser.add_argument(
|
||||||
|
"--preset",
|
||||||
|
default="simple",
|
||||||
|
choices=["simple", "review-fix", "coding-review-fix"],
|
||||||
|
help="데모할 파이프라인 종류 (기본: simple)",
|
||||||
|
)
|
||||||
|
demo_parser.add_argument(
|
||||||
|
"--escalate",
|
||||||
|
action="store_true",
|
||||||
|
help="ESCALATE 시나리오 데모 (mock 모드 전용)",
|
||||||
|
)
|
||||||
|
demo_parser.add_argument(
|
||||||
|
"--timeout",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
metavar="SEC",
|
||||||
|
help="에이전트 1회 호출 제한 시간(초). 0=무제한 (기본: 무제한, --live 전용)",
|
||||||
|
)
|
||||||
|
|
||||||
# --- run ---
|
# --- run ---
|
||||||
run_parser = subparsers.add_parser(
|
run_parser = subparsers.add_parser(
|
||||||
"run",
|
"run",
|
||||||
help="검증 파이프라인 실행",
|
help="검증 파이프라인 실행",
|
||||||
description=(
|
description=(
|
||||||
"기획서(plan)를 기반으로 AI 에이전트가 코드 생성과 리뷰를 반복합니다.\n"
|
"기획서(plan)를 기반으로 AI 에이전트가 코딩과 리뷰를 반복합니다.\n"
|
||||||
"\n"
|
"\n"
|
||||||
"설정 파일 없이 바로 실행할 수 있고, config.yaml로도 실행할 수 있습니다.\n"
|
"설정 파일 없이 바로 실행할 수 있고, config.yaml로도 실행할 수 있습니다.\n"
|
||||||
"CLI 옵션이 config.yaml보다 우선합니다."
|
"CLI 옵션이 config.yaml보다 우선합니다."
|
||||||
@@ -222,13 +282,19 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
epilog=(
|
epilog=(
|
||||||
"파이프라인 종류 (--preset):\n"
|
"파이프라인 종류 (--preset):\n"
|
||||||
" ┌──────────────┬─────────────────────────────────────────────────────┐\n"
|
" ┌──────────────┬─────────────────────────────────────────────────────┐\n"
|
||||||
" │ simple │ Coder가 코드 생성 → Reviewer가 리뷰 │\n"
|
" │ simple │ Coder가 코드 작성 → Reviewer가 리뷰 │\n"
|
||||||
" │ (기본값) │ FAIL이면 피드백 반영해서 재생성, PASS까지 반복 │\n"
|
" │ (기본값) │ FAIL이면 피드백 반영해서 재코딩, PASS까지 반복 │\n"
|
||||||
" ├──────────────┼─────────────────────────────────────────────────────┤\n"
|
" ├──────────────┼─────────────────────────────────────────────────────┤\n"
|
||||||
" │ review-fix │ 2단계 파이프라인: │\n"
|
" │ review-fix │ 2단계 파이프라인: │\n"
|
||||||
" │ │ Reviewer N명 병렬 리뷰 → 취합 → 수정 → 재검증 │\n"
|
" │ │ Reviewer N명 병렬 리뷰 → 취합 → 수정 → 재검증 │\n"
|
||||||
" ├──────────────┼─────────────────────────────────────────────────────┤\n"
|
" ├──────────────┼─────────────────────────────────────────────────────┤\n"
|
||||||
" │ review-only │ 코드 생성 없이 Reviewer N명이 기존 코드만 검토 │\n"
|
" │ coding- │ 3단계 파이프라인: │\n"
|
||||||
|
" │ review-fix │ 초기 코딩 1회 → 리뷰 취합 → 수정 → 재검증 반복 │\n"
|
||||||
|
" ├──────────────┼─────────────────────────────────────────────────────┤\n"
|
||||||
|
" │ plan-review │ 구현 전 기획서/체크리스트/문서를 검토 │\n"
|
||||||
|
" │ │ 필요하면 현재 코드베이스와의 정합성도 점검 │\n"
|
||||||
|
" ├──────────────┼─────────────────────────────────────────────────────┤\n"
|
||||||
|
" │ review-only │ 코드 작성 없이 Reviewer N명이 기존 코드만 검토 │\n"
|
||||||
" │ │ (이미 작성된 코드의 품질 감사용) │\n"
|
" │ │ (이미 작성된 코드의 품질 감사용) │\n"
|
||||||
" ├──────────────┼─────────────────────────────────────────────────────┤\n"
|
" ├──────────────┼─────────────────────────────────────────────────────┤\n"
|
||||||
" │ cross-review │ Coder 2명이 각각 구현 → 상대방 코드를 교차 리뷰 │\n"
|
" │ cross-review │ Coder 2명이 각각 구현 → 상대방 코드를 교차 리뷰 │\n"
|
||||||
@@ -239,10 +305,10 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
" ┌──────────────────┬─────────┬───────────┬──────────────────────────┐\n"
|
" ┌──────────────────┬─────────┬───────────┬──────────────────────────┐\n"
|
||||||
" │ 이름 │ CLI │ 기본 모델 │ 역할 │\n"
|
" │ 이름 │ CLI │ 기본 모델 │ 역할 │\n"
|
||||||
" ├──────────────────┼─────────┼───────────┼──────────────────────────┤\n"
|
" ├──────────────────┼─────────┼───────────┼──────────────────────────┤\n"
|
||||||
" │ claude-coder │ claude │ opus │ 코드 생성 │\n"
|
" │ claude-coder │ claude │ opus │ 코드 작성 │\n"
|
||||||
" │ claude-reviewer │ claude │ opus │ 코드 리뷰 │\n"
|
" │ claude-reviewer │ claude │ opus │ 코드 리뷰 │\n"
|
||||||
" │ claude-senior │ claude │ opus │ 리뷰 취합/판정 │\n"
|
" │ claude-senior │ claude │ opus │ 리뷰 취합/판정 │\n"
|
||||||
" │ codex-coder │ codex │ gpt-5.4 │ 코드 생성 │\n"
|
" │ codex-coder │ codex │ gpt-5.4 │ 코드 작성 │\n"
|
||||||
" │ codex-reviewer │ codex │ gpt-5.4 │ 코드 리뷰 │\n"
|
" │ codex-reviewer │ codex │ gpt-5.4 │ 코드 리뷰 │\n"
|
||||||
" │ codex-senior │ codex │ gpt-5.4 │ 리뷰 취합/판정 │\n"
|
" │ codex-senior │ codex │ gpt-5.4 │ 리뷰 취합/판정 │\n"
|
||||||
" └──────────────────┴─────────┴───────────┴──────────────────────────┘\n"
|
" └──────────────────┴─────────┴───────────┴──────────────────────────┘\n"
|
||||||
@@ -267,10 +333,18 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
" cross-eval run --plan plan.md --preset review-fix \\\n"
|
" cross-eval run --plan plan.md --preset review-fix \\\n"
|
||||||
" --reviewer claude --reviewer codex\n"
|
" --reviewer claude --reviewer codex\n"
|
||||||
"\n"
|
"\n"
|
||||||
|
" 초기 코딩 후 리뷰 수렴 + 자동 수정 (coding-review-fix):\n"
|
||||||
|
" cross-eval run --plan plan.md --preset coding-review-fix \\\n"
|
||||||
|
" --reviewer claude --reviewer codex\n"
|
||||||
|
"\n"
|
||||||
" 기존 코드 리뷰만 (review-only):\n"
|
" 기존 코드 리뷰만 (review-only):\n"
|
||||||
" cross-eval run --plan plan.md --preset review-only \\\n"
|
" cross-eval run --plan plan.md --preset review-only \\\n"
|
||||||
" --reviewer claude --reviewer codex\n"
|
" --reviewer claude --reviewer codex\n"
|
||||||
"\n"
|
"\n"
|
||||||
|
" 구현 전 문서/기획 검토 (plan-review):\n"
|
||||||
|
" cross-eval run --plan plan.md --preset plan-review \\\n"
|
||||||
|
" --reviewer claude --reviewer codex\n"
|
||||||
|
"\n"
|
||||||
" 모델 변경:\n"
|
" 모델 변경:\n"
|
||||||
" cross-eval run --plan plan.md --model sonnet\n"
|
" cross-eval run --plan plan.md --model sonnet\n"
|
||||||
"\n"
|
"\n"
|
||||||
@@ -298,6 +372,14 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
"--input", action="append", dest="inputs", metavar="KEY=PATH",
|
"--input", action="append", dest="inputs", metavar="KEY=PATH",
|
||||||
help="추가 입력 파일 (예: --input spec=./api-spec.md)",
|
help="추가 입력 파일 (예: --input spec=./api-spec.md)",
|
||||||
)
|
)
|
||||||
|
input_group.add_argument(
|
||||||
|
"--env-file", action="append", dest="env_files", type=Path, default=None,
|
||||||
|
help="에이전트 subprocess에 주입할 추가 .env 파일 (여러 개 가능)",
|
||||||
|
)
|
||||||
|
input_group.add_argument(
|
||||||
|
"--target", action="append", dest="execution_targets", default=None,
|
||||||
|
help="에이전트에게 강조할 실행 대상 힌트 (예: clickhouse, postgres)",
|
||||||
|
)
|
||||||
|
|
||||||
# -- 에이전트 설정 --
|
# -- 에이전트 설정 --
|
||||||
agent_group = run_parser.add_argument_group(
|
agent_group = run_parser.add_argument_group(
|
||||||
@@ -336,24 +418,39 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
choices=REASONING_EFFORT_CHOICES + ("extra-high", "extra_high", "x-high"),
|
choices=REASONING_EFFORT_CHOICES + ("extra-high", "extra_high", "x-high"),
|
||||||
help="Senior용 reasoning effort",
|
help="Senior용 reasoning effort",
|
||||||
)
|
)
|
||||||
|
agent_group.add_argument(
|
||||||
|
"--agentic", action="store_true", default=False,
|
||||||
|
help="Coder를 agentic 모드로 실행 (worktree에서 파일 직접 수정, git diff로 결과 캡처)",
|
||||||
|
)
|
||||||
agent_group.add_argument(
|
agent_group.add_argument(
|
||||||
"--model", default=None, metavar="MODEL",
|
"--model", default=None, metavar="MODEL",
|
||||||
help="모든 에이전트의 모델을 한번에 변경 (예: sonnet, opus)",
|
help="모든 에이전트의 모델을 한번에 변경 (예: sonnet, opus)",
|
||||||
)
|
)
|
||||||
agent_group.add_argument(
|
agent_group.add_argument(
|
||||||
"--generator-model", default=None, metavar="MODEL",
|
"--coder-model", default=None, metavar="MODEL",
|
||||||
help="Coder 에이전트 모델만 변경",
|
help="Coder 에이전트 모델만 변경",
|
||||||
)
|
)
|
||||||
agent_group.add_argument(
|
agent_group.add_argument(
|
||||||
"--reviewer-model", default=None, metavar="MODEL",
|
"--reviewer-model", default=None, metavar="MODEL",
|
||||||
help="Reviewer 에이전트 모델만 변경",
|
help="Reviewer 에이전트 모델만 변경",
|
||||||
)
|
)
|
||||||
|
agent_group.add_argument(
|
||||||
|
"--senior-model", default=None, metavar="MODEL",
|
||||||
|
help="Senior 에이전트 모델만 변경",
|
||||||
|
)
|
||||||
|
|
||||||
# -- 파이프라인 --
|
# -- 파이프라인 --
|
||||||
pipe_group = run_parser.add_argument_group("파이프라인")
|
pipe_group = run_parser.add_argument_group("파이프라인")
|
||||||
pipe_group.add_argument(
|
pipe_group.add_argument(
|
||||||
"--preset", default=None,
|
"--preset", default=None,
|
||||||
choices=["simple", "cross-review", "review-only", "review-fix"],
|
choices=[
|
||||||
|
"simple",
|
||||||
|
"cross-review",
|
||||||
|
"plan-review",
|
||||||
|
"review-only",
|
||||||
|
"review-fix",
|
||||||
|
"coding-review-fix",
|
||||||
|
],
|
||||||
help="파이프라인 종류 (기본: simple). 각 종류 설명은 아래 참조",
|
help="파이프라인 종류 (기본: simple). 각 종류 설명은 아래 참조",
|
||||||
)
|
)
|
||||||
pipe_group.add_argument(
|
pipe_group.add_argument(
|
||||||
@@ -381,7 +478,7 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
)
|
)
|
||||||
etc_group.add_argument(
|
etc_group.add_argument(
|
||||||
"--output-dir", type=Path, default=None,
|
"--output-dir", type=Path, default=None,
|
||||||
help="결과 저장 디렉토리 (기본: output/)",
|
help="결과 저장 디렉토리 (기본: .cross-eval/output/)",
|
||||||
)
|
)
|
||||||
etc_group.add_argument(
|
etc_group.add_argument(
|
||||||
"--dry-run", action="store_true",
|
"--dry-run", action="store_true",
|
||||||
@@ -400,6 +497,10 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
|
|
||||||
if args.command == "init":
|
if args.command == "init":
|
||||||
return cmd_init(args)
|
return cmd_init(args)
|
||||||
|
elif args.command == "doctor":
|
||||||
|
return cmd_doctor(args)
|
||||||
|
elif args.command == "demo":
|
||||||
|
return cmd_demo(args)
|
||||||
elif args.command == "run":
|
elif args.command == "run":
|
||||||
return cmd_run(args)
|
return cmd_run(args)
|
||||||
else:
|
else:
|
||||||
@@ -407,9 +508,186 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_doctor(args: argparse.Namespace) -> int:
    """Run the environment health checks and print their report.

    Returns 1 when at least one check is both failed and critical,
    otherwise 0.
    """
    from cross_eval.doctor import format_doctor_results, run_doctor

    results = run_doctor(args.dir.resolve())
    print(format_doctor_results(results))

    # Only a failed check that is also marked critical affects the exit code.
    for check in results:
        if not check.passed and check.critical:
            return 1
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_demo(args: argparse.Namespace) -> int:
    """Run the built-in demo, either simulated (default) or against live agents.

    Returns 0 on success or when the user declines the live run, 130 when the
    live run is interrupted, and 1 when it raises ``RuntimeError``.
    """
    from cross_eval.demo import run_live_demo, run_mock_demo

    # Mock mode needs no confirmation.
    if not args.live:
        run_mock_demo(preset=args.preset, show_escalate=args.escalate)
        return 0

    print("\n⚠ --live 모드: 실제 AI 에이전트를 호출합니다 (API 비용 발생).")
    print(" 내장 피보나치 함수 기획서를 사용합니다.\n")
    try:
        confirmation = input("계속하시겠습니까? [y/N] ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        print("\n취소됨.")
        return 0
    if confirmation != "y" and confirmation != "yes":
        print("취소됨.")
        return 0

    try:
        # A missing or zero timeout both mean "no limit".
        agent_timeout = args.timeout or None
        outcome = run_live_demo(preset=args.preset, timeout=agent_timeout)
        print(f"\nResult: {outcome.final_verdict}")
        print(f"Iterations: {len(outcome.iterations)}")
        if outcome.run_dir:
            print(f"Output: {outcome.run_dir}/")
        return 0
    except KeyboardInterrupt:
        print("\nInterrupted.")
        return 130
    except RuntimeError as exc:
        print(f"Demo error: {exc}", file=sys.stderr)
        return 1
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Guided init wizard
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_PRESET_DESCRIPTIONS = {
|
||||||
|
"simple": "코딩 + 리뷰 (가장 기본)",
|
||||||
|
"review-fix": "리뷰 → 취합 → 수정 → 재검증 반복",
|
||||||
|
"coding-review-fix": "초기 코딩 + 리뷰 수렴 반복",
|
||||||
|
"plan-review": "구현 전 기획서/문서 검토",
|
||||||
|
"review-only": "기존 코드만 리뷰 (코딩 없음)",
|
||||||
|
"cross-review": "2명이 각각 구현 후 교차 리뷰",
|
||||||
|
}
|
||||||
|
|
||||||
|
_PRESET_ORDER = [
|
||||||
|
"simple", "review-fix", "coding-review-fix",
|
||||||
|
"plan-review", "review-only", "cross-review",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _prompt_choice(
|
||||||
|
message: str,
|
||||||
|
choices: list[str],
|
||||||
|
descriptions: dict[str, str] | None = None,
|
||||||
|
default: int = 1,
|
||||||
|
) -> str:
|
||||||
|
"""Prompt user to pick from a numbered list."""
|
||||||
|
print(f"\n{message}")
|
||||||
|
for i, choice in enumerate(choices, 1):
|
||||||
|
desc = f" — {descriptions[choice]}" if descriptions and choice in descriptions else ""
|
||||||
|
marker = " (기본)" if i == default else ""
|
||||||
|
print(f" {i}. {choice}{desc}{marker}")
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
raw = input(f"선택 [{default}]: ").strip()
|
||||||
|
except (EOFError, KeyboardInterrupt):
|
||||||
|
print()
|
||||||
|
return choices[default - 1]
|
||||||
|
if not raw:
|
||||||
|
return choices[default - 1]
|
||||||
|
try:
|
||||||
|
idx = int(raw)
|
||||||
|
if 1 <= idx <= len(choices):
|
||||||
|
return choices[idx - 1]
|
||||||
|
except ValueError:
|
||||||
|
if raw in choices:
|
||||||
|
return raw
|
||||||
|
print(f" 1-{len(choices)} 사이 숫자를 입력하세요.")
|
||||||
|
|
||||||
|
|
||||||
|
def _prompt_text(message: str, default: str = "") -> str:
|
||||||
|
"""Prompt for text input with default."""
|
||||||
|
suffix = f" [{default}]" if default else ""
|
||||||
|
try:
|
||||||
|
raw = input(f"{message}{suffix}: ").strip()
|
||||||
|
except (EOFError, KeyboardInterrupt):
|
||||||
|
print()
|
||||||
|
return default
|
||||||
|
return raw or default
|
||||||
|
|
||||||
|
|
||||||
|
def _run_guided_init(target: Path) -> dict:
    """Walk the user through the interactive setup and collect the answers.

    ``target`` is accepted but not read inside this function.

    Returns:
        A dict with the keys ``lang``, ``preset``, ``coder``, ``reviewer``,
        ``senior``, ``max_iter`` (int) and ``create_templates`` (bool).
    """
    print("\n━━━ cross-eval 설정 마법사 ━━━\n")

    answers: dict = {}

    answers["lang"] = _prompt_choice(
        "언어 / Language:",
        ["ko", "en"],
        {"ko": "한국어", "en": "English"},
        default=1,
    )
    answers["preset"] = _prompt_choice(
        "파이프라인 종류:",
        _PRESET_ORDER,
        _PRESET_DESCRIPTIONS,
        default=1,
    )

    print("\n--- 에이전트 설정 ---")
    print(" 사용 가능: claude, codex (또는 claude-coder, codex-reviewer 등)")

    answers["coder"] = _prompt_text(" Coder 에이전트", default="claude")
    answers["reviewer"] = _prompt_text(" Reviewer 에이전트", default="claude")

    # For these two presets the senior prompt defaults to the reviewer;
    # for every other preset it defaults to empty.
    if answers["preset"] in ("review-fix", "coding-review-fix"):
        senior_label = " Senior 에이전트"
        senior_default = answers["reviewer"]
    else:
        senior_label = " Senior 에이전트 (선택, Enter로 건너뛰기)"
        senior_default = ""
    answers["senior"] = _prompt_text(senior_label, default=senior_default)

    # Non-numeric input falls back to 3 iterations.
    iterations_reply = _prompt_text("최대 반복 횟수", default="3")
    try:
        answers["max_iter"] = int(iterations_reply)
    except ValueError:
        answers["max_iter"] = 3

    templates_reply = _prompt_text(
        "\n템플릿 파일(plan.md, checklist.md) 생성?", default="Y",
    )
    answers["create_templates"] = templates_reply.lower() in ("y", "yes", "")

    return answers
|
||||||
|
|
||||||
|
|
||||||
def cmd_init(args: argparse.Namespace) -> int:
    """Scaffold a new cross-eval project, optionally via the guided wizard.

    With ``--guided`` the wizard's language and preset answers overwrite the
    matching attributes on ``args`` before the files are written.
    """
    target = args.dir.resolve()

    if not args.guided:
        return _write_init_files(target, args)

    wizard_answers = _run_guided_init(target)
    args.lang = wizard_answers["lang"]
    args.preset = wizard_answers["preset"]
    # The full answer set is forwarded so the config can be generated from it.
    return _write_init_files(target, args, guided_settings=wizard_answers)
|
||||||
|
|
||||||
|
|
||||||
|
def _write_init_files(
|
||||||
|
target: Path,
|
||||||
|
args: argparse.Namespace,
|
||||||
|
guided_settings: dict | None = None,
|
||||||
|
) -> int:
|
||||||
|
"""Write config and template files to target directory."""
|
||||||
ce_dir = target / ".cross-eval"
|
ce_dir = target / ".cross-eval"
|
||||||
ce_dir.mkdir(parents=True, exist_ok=True)
|
ce_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
@@ -417,14 +695,23 @@ def cmd_init(args: argparse.Namespace) -> int:
|
|||||||
plan_sample = PLAN_SAMPLE_KO if lang == "ko" else PLAN_SAMPLE_EN
|
plan_sample = PLAN_SAMPLE_KO if lang == "ko" else PLAN_SAMPLE_EN
|
||||||
checklist_sample = CHECKLIST_SAMPLE_KO if lang == "ko" else CHECKLIST_SAMPLE_EN
|
checklist_sample = CHECKLIST_SAMPLE_KO if lang == "ko" else CHECKLIST_SAMPLE_EN
|
||||||
|
|
||||||
files = {
|
# Generate config content
|
||||||
".cross-eval/config.yaml": DEFAULT_CONFIG_YAML.format(
|
if guided_settings:
|
||||||
|
config_content = _generate_guided_config(args.preset, lang, guided_settings)
|
||||||
|
else:
|
||||||
|
config_content = DEFAULT_CONFIG_YAML.format(
|
||||||
preset=args.preset, language=lang,
|
preset=args.preset, language=lang,
|
||||||
),
|
)
|
||||||
".cross-eval/plan.md": plan_sample,
|
|
||||||
".cross-eval/checklist.md": checklist_sample,
|
files: dict[str, str] = {
|
||||||
|
".cross-eval/config.yaml": config_content,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Add templates unless guided mode opted out
|
||||||
|
if not guided_settings or guided_settings.get("create_templates", True):
|
||||||
|
files[".cross-eval/plan.md"] = plan_sample
|
||||||
|
files[".cross-eval/checklist.md"] = checklist_sample
|
||||||
|
|
||||||
created = []
|
created = []
|
||||||
skipped = []
|
skipped = []
|
||||||
for name, content in files.items():
|
for name, content in files.items():
|
||||||
@@ -436,23 +723,67 @@ def cmd_init(args: argparse.Namespace) -> int:
|
|||||||
created.append(name)
|
created.append(name)
|
||||||
|
|
||||||
if created:
|
if created:
|
||||||
print(f" 생성: {', '.join(created)}")
|
print(f"\n 생성: {', '.join(created)}")
|
||||||
if skipped:
|
if skipped:
|
||||||
print(f" 이미 존재 (건너뜀): {', '.join(skipped)}")
|
print(f" 이미 존재 (건너뜀): {', '.join(skipped)}")
|
||||||
|
|
||||||
print(f"\n 파이프라인: {args.preset}")
|
print(f"\n 파이프라인: {args.preset}")
|
||||||
print(f" 언어: {lang}")
|
print(f" 언어: {lang}")
|
||||||
|
if guided_settings:
|
||||||
|
print(f" Coder: {guided_settings['coder']}")
|
||||||
|
print(f" Reviewer: {guided_settings['reviewer']}")
|
||||||
|
if guided_settings.get("senior"):
|
||||||
|
print(f" Senior: {guided_settings['senior']}")
|
||||||
|
print(f" 최대 반복: {guided_settings['max_iter']}")
|
||||||
print("")
|
print("")
|
||||||
print("다음 단계:")
|
print("다음 단계:")
|
||||||
print(" 1. .cross-eval/plan.md 에 기획서 작성")
|
print(" 1. .cross-eval/plan.md 에 기획서 작성")
|
||||||
print(" 2. .cross-eval/checklist.md 에 체크리스트 작성 (선택)")
|
print(" 2. .cross-eval/checklist.md 에 체크리스트 작성 (선택)")
|
||||||
print(" 3. cross-eval run 으로 실행")
|
print(" 3. cross-eval run 으로 실행")
|
||||||
print("")
|
print("")
|
||||||
print("주의: 에이전트는 기본적으로 파일 읽기/쓰기/실행 권한을 가집니다.")
|
print("팁: cross-eval doctor 로 환경 점검을 먼저 하세요.")
|
||||||
print(" 실행 전에 .cross-eval/config.yaml 을 확인하세요.")
|
print(" cross-eval demo 로 동작 방식을 미리 볼 수 있습니다.")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_guided_config(
    preset: str,
    lang: str,
    settings: dict,
) -> str:
    """Render config.yaml text from the answers collected by the guided wizard.

    Agent entries are passed through ``resolve_agent_shorthand`` with their
    role. The ``seniors`` line is emitted only when a senior was given.
    """
    coder_name = resolve_agent_shorthand(settings["coder"], "coder")
    reviewer_name = resolve_agent_shorthand(settings["reviewer"], "reviewer")

    header = [
        "# cross-eval 설정 (guided init으로 생성됨)",
        "",
        "inputs:",
        " plan: plan.md",
        " checklist: checklist.md",
        "",
        f"coders: [{coder_name}]",
        f"reviewers: [{reviewer_name}]",
    ]

    senior_section: list[str] = []
    senior = settings.get("senior", "")
    if senior:
        senior_name = resolve_agent_shorthand(senior, "senior")
        senior_section = [f"seniors: [{senior_name}]"]

    footer = [
        "",
        f"pipeline: preset:{preset}",
        "",
        f"max_iterations: {settings['max_iter']}",
        f"language: {lang}",
        "output_dir: .cross-eval/output",
        "",
    ]

    return "\n".join(header + senior_section + footer) + "\n"
|
||||||
|
|
||||||
|
|
||||||
def _read_docs_dir(docs_dir: Path) -> str:
|
def _read_docs_dir(docs_dir: Path) -> str:
|
||||||
"""Read all files in a directory and concatenate with filename headers."""
|
"""Read all files in a directory and concatenate with filename headers."""
|
||||||
parts: list[str] = []
|
parts: list[str] = []
|
||||||
@@ -482,12 +813,21 @@ def _apply_model_override(config, agent_name: str, model: str) -> None:
|
|||||||
agent.args = new_args
|
agent.args = new_args
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_phased_iteration_override(config, max_iter: int | None) -> None:
    """Forward the CLI ``--max-iter`` value to the phased-pipeline sync helper.

    Args:
        config: The pipeline configuration to update in place.
        max_iter: Iteration limit from the command line, or ``None`` when the
            flag was not given.
    """
    # Imported lazily, matching the other function-scope imports in this module.
    from cross_eval.config import sync_phased_iterations as _sync_phases

    _sync_phases(config, max_iter)
|
||||||
|
|
||||||
|
|
||||||
def cmd_run(args: argparse.Namespace) -> int:
|
def cmd_run(args: argparse.Namespace) -> int:
|
||||||
"""Load config, validate, and execute the pipeline."""
|
"""Load config, validate, and execute the pipeline."""
|
||||||
from cross_eval.config import (
|
from cross_eval.config import (
|
||||||
|
ensure_fix_preset_agentic,
|
||||||
apply_input_overrides,
|
apply_input_overrides,
|
||||||
default_config,
|
default_config,
|
||||||
load_config,
|
load_config,
|
||||||
|
sync_phased_iterations,
|
||||||
validate_config,
|
validate_config,
|
||||||
)
|
)
|
||||||
from cross_eval.prompts import PIPELINE_PRESETS
|
from cross_eval.prompts import PIPELINE_PRESETS
|
||||||
@@ -562,7 +902,7 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
preset = args.preset or "simple"
|
preset = args.preset or "simple"
|
||||||
# Determine which preset was configured (from YAML or defaults)
|
# Determine which preset was configured (from YAML or defaults)
|
||||||
if args.preset is None and config.phases:
|
if args.preset is None and config.phases:
|
||||||
preset = "review-fix" # only phased preset currently
|
preset = config.preset_name if config.preset_name != "custom" else "review-fix"
|
||||||
elif args.preset is None and not args.coders and not args.reviewers and not args.seniors:
|
elif args.preset is None and not args.coders and not args.reviewers and not args.seniors:
|
||||||
pass # no changes needed
|
pass # no changes needed
|
||||||
inferred_coders, inferred_reviewers, inferred_seniors = _infer_roles(
|
inferred_coders, inferred_reviewers, inferred_seniors = _infer_roles(
|
||||||
@@ -584,13 +924,18 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
config.preset_name = preset
|
config.preset_name = preset
|
||||||
if preset in PHASED_PRESETS:
|
if preset in PHASED_PRESETS:
|
||||||
config.phases = PHASED_PRESETS[preset](coders, reviewers, seniors)
|
config.phases = PHASED_PRESETS[preset](coders, reviewers, seniors)
|
||||||
|
_apply_phased_iteration_override(config, args.max_iter)
|
||||||
config.pipeline = []
|
config.pipeline = []
|
||||||
elif preset in PIPELINE_PRESETS:
|
elif preset in PIPELINE_PRESETS:
|
||||||
config.pipeline = PIPELINE_PRESETS[preset](coders, reviewers, seniors)
|
config.pipeline = PIPELINE_PRESETS[preset](coders, reviewers, seniors)
|
||||||
config.phases = []
|
config.phases = []
|
||||||
if preset == "review-only" and args.max_iter is None and args.min_iter is None:
|
if preset in {"plan-review", "review-only"} and args.max_iter is None and args.min_iter is None:
|
||||||
config.max_iterations = 1
|
config.max_iterations = 1
|
||||||
|
|
||||||
|
sync_phased_iterations(config)
|
||||||
|
if args.max_iter is not None:
|
||||||
|
sync_phased_iterations(config, args.max_iter)
|
||||||
|
|
||||||
apply_reasoning_effort_settings(
|
apply_reasoning_effort_settings(
|
||||||
config,
|
config,
|
||||||
reasoning_effort=args.reasoning_effort,
|
reasoning_effort=args.reasoning_effort,
|
||||||
@@ -599,17 +944,29 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
senior_effort=args.senior_effort,
|
senior_effort=args.senior_effort,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# --agentic: convert coder agents to agentic mode
|
||||||
|
if args.agentic:
|
||||||
|
from cross_eval.config import _make_agentic
|
||||||
|
for coder_name in config.coders:
|
||||||
|
if coder_name in config.agents:
|
||||||
|
_make_agentic(config.agents[coder_name])
|
||||||
|
|
||||||
|
ensure_fix_preset_agentic(config)
|
||||||
|
|
||||||
# --model: apply to ALL agents
|
# --model: apply to ALL agents
|
||||||
if args.model is not None:
|
if args.model is not None:
|
||||||
for agent_name in config.agents:
|
for agent_name in config.agents:
|
||||||
_apply_model_override(config, agent_name, args.model)
|
_apply_model_override(config, agent_name, args.model)
|
||||||
# --generator-model / --reviewer-model: apply by role
|
# --coder-model / --reviewer-model / --senior-model: apply by role
|
||||||
if args.generator_model is not None:
|
if args.coder_model is not None:
|
||||||
for coder_name in config.coders:
|
for coder_name in config.coders:
|
||||||
_apply_model_override(config, coder_name, args.generator_model)
|
_apply_model_override(config, coder_name, args.coder_model)
|
||||||
if args.reviewer_model is not None:
|
if args.reviewer_model is not None:
|
||||||
for reviewer_name in config.reviewers:
|
for reviewer_name in config.reviewers:
|
||||||
_apply_model_override(config, reviewer_name, args.reviewer_model)
|
_apply_model_override(config, reviewer_name, args.reviewer_model)
|
||||||
|
if args.senior_model is not None:
|
||||||
|
for senior_name in config.seniors:
|
||||||
|
_apply_model_override(config, senior_name, args.senior_model)
|
||||||
|
|
||||||
# --plan / --checklist shortcuts
|
# --plan / --checklist shortcuts
|
||||||
for key, val in [("plan", args.plan), ("checklist", args.checklist)]:
|
for key, val in [("plan", args.plan), ("checklist", args.checklist)]:
|
||||||
@@ -631,6 +988,18 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
print(f"No files found in: {docs_dir}", file=sys.stderr)
|
print(f"No files found in: {docs_dir}", file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
config.inputs["docs"] = docs_content
|
config.inputs["docs"] = docs_content
|
||||||
|
config.inputs["docs_ref"] = str(docs_dir)
|
||||||
|
|
||||||
|
if args.env_files:
|
||||||
|
for env_file in args.env_files:
|
||||||
|
resolved = env_file.resolve()
|
||||||
|
if not resolved.exists():
|
||||||
|
print(f"Env file not found: {resolved}", file=sys.stderr)
|
||||||
|
return 1
|
||||||
|
config.execution.env_files.append(str(resolved))
|
||||||
|
|
||||||
|
if args.execution_targets:
|
||||||
|
config.execution.auto_context_targets = list(args.execution_targets)
|
||||||
|
|
||||||
if args.inputs:
|
if args.inputs:
|
||||||
overrides = {}
|
overrides = {}
|
||||||
@@ -646,7 +1015,6 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
apply_input_overrides(config, overrides)
|
apply_input_overrides(config, overrides)
|
||||||
|
|
||||||
# 3. Validate after all overrides
|
# 3. Validate after all overrides
|
||||||
from cross_eval.config import validate_config
|
|
||||||
errors = validate_config(config)
|
errors = validate_config(config)
|
||||||
if errors:
|
if errors:
|
||||||
print("Config error:\n " + "\n ".join(errors), file=sys.stderr)
|
print("Config error:\n " + "\n ".join(errors), file=sys.stderr)
|
||||||
@@ -694,6 +1062,11 @@ def cmd_run(args: argparse.Namespace) -> int:
|
|||||||
if not args.dry_run and result.run_dir:
|
if not args.dry_run and result.run_dir:
|
||||||
print(f"Output: {result.run_dir}/")
|
print(f"Output: {result.run_dir}/")
|
||||||
|
|
||||||
|
if result.final_verdict == "ESCALATE":
|
||||||
|
from cross_eval.report import print_escalation_report
|
||||||
|
print_escalation_report(config, result)
|
||||||
|
return 2
|
||||||
|
|
||||||
return 0 if result.final_verdict == "PASS" else 1
|
return 0 if result.final_verdict == "PASS" else 1
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
"""Configuration loading, validation, and preset resolution."""
|
"""Configuration loading, validation, and preset resolution."""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import copy
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -8,7 +9,13 @@ from typing import Any
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from cross_eval.models import AgentConfig, PhaseConfig, PipelineConfig, StepConfig
|
from cross_eval.models import (
|
||||||
|
AgentConfig,
|
||||||
|
ExecutionConfig,
|
||||||
|
PhaseConfig,
|
||||||
|
PipelineConfig,
|
||||||
|
StepConfig,
|
||||||
|
)
|
||||||
from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS
|
from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -24,6 +31,7 @@ DEFAULT_ROLE_REASONING_EFFORTS = {
|
|||||||
"reviewer": "medium",
|
"reviewer": "medium",
|
||||||
"senior": "high",
|
"senior": "high",
|
||||||
}
|
}
|
||||||
|
# Preset names for which ensure_fix_preset_agentic() switches the configured
# coder agents to agentic mode.
FIX_STYLE_PRESETS = {"review-fix", "coding-review-fix"}
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -39,34 +47,62 @@ _CODEX_ARGS = [
|
|||||||
"-",
|
"-",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Command-line flags shared by every built-in Claude agent.
_CLAUDE_BASE_ARGS = [
    "-p",
    "--setting-sources", "user",
    "--disable-slash-commands",
    "--model", "opus",
]

# Coder agents: the shared flags followed by the permission-bypass flags.
_CLAUDE_CODER_ARGS = [
    *_CLAUDE_BASE_ARGS,
    "--dangerously-skip-permissions",
    "--permission-mode", "bypassPermissions",
]

# Reviewer and senior agents: an independent copy of the shared flags, so
# mutating one list never affects the other.
_CLAUDE_REVIEW_ARGS = _CLAUDE_BASE_ARGS.copy()
|
||||||
|
|
||||||
_CODER_SYSTEM_PROMPT = (
|
_CODER_SYSTEM_PROMPT = (
|
||||||
"You are a senior software engineer implementing code changes.\n"
|
"You are a senior software engineer implementing code changes.\n"
|
||||||
"Rules:\n"
|
"Rules:\n"
|
||||||
"1. FIRST explore the project directory to understand the existing codebase, "
|
"1. FIRST explore the project directory to understand the existing codebase, "
|
||||||
"patterns, and conventions before writing any code.\n"
|
"patterns, and conventions before writing any code.\n"
|
||||||
"2. Implement ONLY what the plan specifies. Do NOT add extra features, "
|
"2. You MUST use the Edit and Write tools to make ACTUAL file changes. "
|
||||||
|
"Do NOT just describe or explain changes in text — apply them directly to the files. "
|
||||||
|
"Your text output alone has no effect; only tool-based edits count.\n"
|
||||||
|
"3. You may decide which shell, Python, git, docker, test, and database commands "
|
||||||
|
"to run. The user does not need to pre-specify exact commands.\n"
|
||||||
|
"4. Environment variables from configured .env files may already be loaded into "
|
||||||
|
"your process; use them when validating services such as ClickHouse.\n"
|
||||||
|
"5. Implement ONLY what the plan specifies. Do NOT add extra features, "
|
||||||
"unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
|
"unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
|
||||||
"3. Follow the project's existing coding style, naming conventions, and directory structure.\n"
|
"6. Follow the project's existing coding style, naming conventions, and directory structure.\n"
|
||||||
"4. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
|
"7. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
|
||||||
"Do NOT refactor unrelated code.\n"
|
"Do NOT refactor unrelated code.\n"
|
||||||
"5. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
|
"8. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
|
||||||
"6. When in doubt about scope, do LESS, not more."
|
"9. When in doubt about scope, do LESS, not more."
|
||||||
)
|
)
|
||||||
|
|
||||||
_REVIEWER_SYSTEM_PROMPT = (
|
_REVIEWER_SYSTEM_PROMPT = (
|
||||||
"You are a code reviewer. You MUST NOT create, modify, or delete any files.\n"
|
"You are a code reviewer. You MUST NOT create, modify, or delete any files.\n"
|
||||||
"Rules:\n"
|
"Rules:\n"
|
||||||
"1. Explore the project directory to understand the full codebase context.\n"
|
"1. Explore the project directory to understand the full codebase context.\n"
|
||||||
"2. Compare the implementation against the plan and checklist ONLY.\n"
|
"2. You may decide which shell, Python, test, git, docker, and database read commands "
|
||||||
"3. Classify every issue with BOTH severity AND category:\n"
|
"to run in order to verify behavior. The user does not need to pre-specify exact commands.\n"
|
||||||
|
"3. Environment variables from configured .env files may already be loaded into "
|
||||||
|
"your process; use them for verification when relevant.\n"
|
||||||
|
"4. Compare the implementation against the plan and checklist ONLY.\n"
|
||||||
|
"5. Classify every issue with BOTH severity AND category:\n"
|
||||||
" - Severity: Critical (breaks functionality/security) > Major (requirement mismatch) > Minor (convention/style)\n"
|
" - Severity: Critical (breaks functionality/security) > Major (requirement mismatch) > Minor (convention/style)\n"
|
||||||
" - Category: Over-engineering / Omission\n"
|
" - Category: Over-engineering / Omission\n"
|
||||||
"4. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
|
"6. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
|
||||||
"or DISMISSED (false positive) with rationale.\n"
|
"or DISMISSED (false positive) with rationale.\n"
|
||||||
"5. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
|
"7. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
|
||||||
"6. Order issues by severity (Critical first).\n"
|
"8. Order issues by severity (Critical first).\n"
|
||||||
"7. Do NOT suggest improvements beyond the plan scope.\n"
|
"9. Do NOT suggest improvements beyond the plan scope.\n"
|
||||||
"8. End with VERDICT: PASS (all requirements met, no over-engineering) "
|
"10. End with VERDICT: PASS (all requirements met, no over-engineering) "
|
||||||
"or VERDICT: FAIL (issues found)."
|
"or VERDICT: FAIL (issues found)."
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -74,36 +110,48 @@ _SENIOR_SYSTEM_PROMPT = (
|
|||||||
"You are a senior technical reviewer coordinating a review-fix-verification loop.\n"
|
"You are a senior technical reviewer coordinating a review-fix-verification loop.\n"
|
||||||
"Rules:\n"
|
"Rules:\n"
|
||||||
"1. Explore the project directory to understand the full codebase context.\n"
|
"1. Explore the project directory to understand the full codebase context.\n"
|
||||||
"2. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
|
"2. You may decide which shell, Python, test, git, docker, and database read commands "
|
||||||
|
"to run to verify disputed issues. The user does not need to pre-specify exact commands.\n"
|
||||||
|
"3. Environment variables from configured .env files may already be loaded into "
|
||||||
|
"your process; use them when validating service integrations.\n"
|
||||||
|
"4. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
|
||||||
"evidence-backed issues. Categorize dismissed findings as [False positive] or [Already fixed].\n"
|
"evidence-backed issues. Categorize dismissed findings as [False positive] or [Already fixed].\n"
|
||||||
"3. In verification mode, judge the current implementation directly against ONLY the "
|
"5. In verification mode, judge the current implementation directly against ONLY the "
|
||||||
"plan and checklist.\n"
|
"plan and checklist.\n"
|
||||||
"4. Be skeptical of false positives, but do not lower the bar on real requirement "
|
"6. Be skeptical of false positives, but do not lower the bar on real requirement "
|
||||||
"gaps.\n"
|
"gaps.\n"
|
||||||
"5. When issues remain, produce a concise prioritized action list the coder can act on.\n"
|
"7. When issues remain, produce a concise prioritized action list the coder can act on.\n"
|
||||||
"6. Do NOT invent new requirements beyond the plan and checklist.\n"
|
"8. Maintain an Issue Tracker table across iterations to track issue status.\n"
|
||||||
"7. End with VERDICT: PASS or VERDICT: FAIL."
|
"9. Do NOT invent new requirements beyond the plan and checklist.\n"
|
||||||
|
"10. End with one of three verdicts:\n"
|
||||||
|
" - VERDICT: PASS — all requirements met, no issues remain.\n"
|
||||||
|
" - VERDICT: FAIL — issues found that the coder can fix.\n"
|
||||||
|
" - VERDICT: ESCALATE — issues that require human intervention. Use ESCALATE when:\n"
|
||||||
|
" * Requirements are ambiguous and need clarification from stakeholders\n"
|
||||||
|
" * Architecture decisions are needed that go beyond the plan scope\n"
|
||||||
|
" * External dependency issues block progress\n"
|
||||||
|
" * The coder has failed to resolve the same issue 2+ times"
|
||||||
)
|
)
|
||||||
|
|
||||||
BUILTIN_AGENTS: dict[str, AgentConfig] = {
|
BUILTIN_AGENTS: dict[str, AgentConfig] = {
|
||||||
"claude-coder": AgentConfig(
|
"claude-coder": AgentConfig(
|
||||||
name="claude-coder",
|
name="claude-coder",
|
||||||
command="claude",
|
command="claude",
|
||||||
args=["-p", "--model", "opus", "--permission-mode", "auto"],
|
args=list(_CLAUDE_CODER_ARGS),
|
||||||
system_prompt=_CODER_SYSTEM_PROMPT,
|
system_prompt=_CODER_SYSTEM_PROMPT,
|
||||||
reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["coder"],
|
reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["coder"],
|
||||||
),
|
),
|
||||||
"claude-reviewer": AgentConfig(
|
"claude-reviewer": AgentConfig(
|
||||||
name="claude-reviewer",
|
name="claude-reviewer",
|
||||||
command="claude",
|
command="claude",
|
||||||
args=["-p", "--model", "opus", "--permission-mode", "auto"],
|
args=list(_CLAUDE_REVIEW_ARGS),
|
||||||
system_prompt=_REVIEWER_SYSTEM_PROMPT,
|
system_prompt=_REVIEWER_SYSTEM_PROMPT,
|
||||||
reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["reviewer"],
|
reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["reviewer"],
|
||||||
),
|
),
|
||||||
"claude-senior": AgentConfig(
|
"claude-senior": AgentConfig(
|
||||||
name="claude-senior",
|
name="claude-senior",
|
||||||
command="claude",
|
command="claude",
|
||||||
args=["-p", "--model", "opus", "--permission-mode", "auto"],
|
args=list(_CLAUDE_REVIEW_ARGS),
|
||||||
system_prompt=_SENIOR_SYSTEM_PROMPT,
|
system_prompt=_SENIOR_SYSTEM_PROMPT,
|
||||||
reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["senior"],
|
reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["senior"],
|
||||||
),
|
),
|
||||||
@@ -136,6 +184,11 @@ _AGENT_ALIASES: dict[str, str] = {
|
|||||||
"codex": "codex",
|
"codex": "codex",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Lookup table from a step-role spelling to its canonical role name.
# Both current entries map to themselves.
_ROLE_ALIASES: dict[str, str] = {
    canonical: canonical for canonical in ("coding", "review")
}
|
||||||
|
|
||||||
|
|
||||||
def resolve_agent_shorthand(name: str, role: str) -> str:
|
def resolve_agent_shorthand(name: str, role: str) -> str:
|
||||||
"""Resolve shorthand agent name to full builtin name.
|
"""Resolve shorthand agent name to full builtin name.
|
||||||
@@ -150,6 +203,16 @@ def resolve_agent_shorthand(name: str, role: str) -> str:
|
|||||||
return name
|
return name
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_step_role(role: str) -> str:
    """Map a step role to its canonical name.

    Args:
        role: Role string as written in a pipeline step.

    Returns:
        The entry registered for ``role`` in ``_ROLE_ALIASES``, or ``role``
        itself when no alias is registered.
    """
    try:
        return _ROLE_ALIASES[role]
    except KeyError:
        # Unknown roles pass through untouched.
        return role
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_prompt_template(template_ref: str) -> str:
    """Return the canonical form of a prompt-template reference.

    No template aliases are handled here at present, so the reference is
    handed back exactly as received.

    Args:
        template_ref: Template reference string, e.g. ``"default:coding"``.

    Returns:
        ``template_ref`` unchanged.
    """
    canonical_ref = template_ref
    return canonical_ref
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Role inference (backward compatibility)
|
# Role inference (backward compatibility)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -220,7 +283,7 @@ def _resolve_agents(
|
|||||||
|
|
||||||
for name in all_referenced:
|
for name in all_referenced:
|
||||||
if name not in result and name in BUILTIN_AGENTS:
|
if name not in result and name in BUILTIN_AGENTS:
|
||||||
result[name] = BUILTIN_AGENTS[name]
|
result[name] = copy.deepcopy(BUILTIN_AGENTS[name])
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@@ -233,7 +296,7 @@ def _default_seniors_for_preset(
|
|||||||
"""Infer a default senior agent for presets that benefit from adjudication."""
|
"""Infer a default senior agent for presets that benefit from adjudication."""
|
||||||
if not (
|
if not (
|
||||||
isinstance(pipeline_raw, str)
|
isinstance(pipeline_raw, str)
|
||||||
and pipeline_raw == "preset:review-fix"
|
and pipeline_raw in {"preset:review-fix", "preset:coding-review-fix"}
|
||||||
and reviewers
|
and reviewers
|
||||||
):
|
):
|
||||||
return []
|
return []
|
||||||
@@ -311,15 +374,16 @@ def _apply_role_effort(
|
|||||||
|
|
||||||
def default_config() -> PipelineConfig:
|
def default_config() -> PipelineConfig:
|
||||||
"""Return a PipelineConfig with sensible defaults (no YAML needed)."""
|
"""Return a PipelineConfig with sensible defaults (no YAML needed)."""
|
||||||
agents = dict(BUILTIN_AGENTS)
|
agents = copy.deepcopy(BUILTIN_AGENTS)
|
||||||
coders = ["claude-coder"]
|
coders = ["claude-coder"]
|
||||||
reviewers = ["claude-reviewer"]
|
reviewers = ["claude-reviewer"]
|
||||||
seniors: list[str] = []
|
seniors: list[str] = []
|
||||||
pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
|
pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
|
||||||
return PipelineConfig(
|
return PipelineConfig(
|
||||||
output_dir=Path("output"),
|
output_dir=Path(".cross-eval/output"),
|
||||||
max_iterations=3,
|
max_iterations=3,
|
||||||
language="ko",
|
language="ko",
|
||||||
|
execution=ExecutionConfig(),
|
||||||
inputs={},
|
inputs={},
|
||||||
agents=agents,
|
agents=agents,
|
||||||
coders=coders,
|
coders=coders,
|
||||||
@@ -353,6 +417,8 @@ def load_config(path: Path) -> PipelineConfig:
|
|||||||
|
|
||||||
def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
||||||
"""Parse raw YAML dict into PipelineConfig."""
|
"""Parse raw YAML dict into PipelineConfig."""
|
||||||
|
project_root = config_path.parent.parent if config_path.parent.name == ".cross-eval" else config_path.parent
|
||||||
|
|
||||||
# --- agents ---
|
# --- agents ---
|
||||||
agents: dict[str, AgentConfig] = {}
|
agents: dict[str, AgentConfig] = {}
|
||||||
for name, agent_data in raw.get("agents", {}).items():
|
for name, agent_data in raw.get("agents", {}).items():
|
||||||
@@ -363,6 +429,7 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
|||||||
system_prompt=agent_data.get("system_prompt"),
|
system_prompt=agent_data.get("system_prompt"),
|
||||||
reasoning_effort=agent_data.get("reasoning_effort"),
|
reasoning_effort=agent_data.get("reasoning_effort"),
|
||||||
stdin_mode=agent_data.get("stdin_mode", False),
|
stdin_mode=agent_data.get("stdin_mode", False),
|
||||||
|
agentic=agent_data.get("agentic", False),
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- roles: explicit or inferred ---
|
# --- roles: explicit or inferred ---
|
||||||
@@ -402,6 +469,17 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
|||||||
p = config_dir / p
|
p = config_dir / p
|
||||||
inputs[key] = p
|
inputs[key] = p
|
||||||
|
|
||||||
|
execution_raw = raw.get("execution", {}) or {}
|
||||||
|
execution = ExecutionConfig(
|
||||||
|
mode=execution_raw.get("mode", "agent-decides"),
|
||||||
|
command_policy=execution_raw.get("command_policy", "broad"),
|
||||||
|
inherit_env=bool(execution_raw.get("inherit_env", True)),
|
||||||
|
auto_env_files=list(execution_raw.get("auto_env_files", [".env", ".env.local"])),
|
||||||
|
env_files=list(execution_raw.get("env_files", [])),
|
||||||
|
expose_env_names=bool(execution_raw.get("expose_env_names", True)),
|
||||||
|
auto_context_targets=list(execution_raw.get("auto_context_targets", [])),
|
||||||
|
)
|
||||||
|
|
||||||
# --- pipeline (preset or custom) ---
|
# --- pipeline (preset or custom) ---
|
||||||
steps, phases = _resolve_pipeline(pipeline_raw, coders, reviewers, seniors)
|
steps, phases = _resolve_pipeline(pipeline_raw, coders, reviewers, seniors)
|
||||||
|
|
||||||
@@ -410,12 +488,17 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
|||||||
if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
|
if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
|
||||||
preset_name = pipeline_raw.split(":", 1)[1]
|
preset_name = pipeline_raw.split(":", 1)[1]
|
||||||
|
|
||||||
return PipelineConfig(
|
output_dir = Path(raw.get("output_dir", ".cross-eval/output"))
|
||||||
output_dir=Path(raw.get("output_dir", "output")),
|
if not output_dir.is_absolute():
|
||||||
|
output_dir = project_root / output_dir
|
||||||
|
|
||||||
|
config = PipelineConfig(
|
||||||
|
output_dir=output_dir,
|
||||||
max_iterations=int(raw.get("max_iterations", 3)),
|
max_iterations=int(raw.get("max_iterations", 3)),
|
||||||
min_iterations=int(raw.get("min_iterations", 1)),
|
min_iterations=int(raw.get("min_iterations", 1)),
|
||||||
verbose=bool(raw.get("verbose", False)),
|
verbose=bool(raw.get("verbose", False)),
|
||||||
language=raw.get("language", "en"),
|
language=raw.get("language", "en"),
|
||||||
|
execution=execution,
|
||||||
inputs=inputs,
|
inputs=inputs,
|
||||||
agents=agents,
|
agents=agents,
|
||||||
coders=coders,
|
coders=coders,
|
||||||
@@ -427,6 +510,9 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
|||||||
_config_path=config_path,
|
_config_path=config_path,
|
||||||
_config_mtime=config_path.stat().st_mtime,
|
_config_mtime=config_path.stat().st_mtime,
|
||||||
)
|
)
|
||||||
|
sync_phased_iterations(config)
|
||||||
|
ensure_fix_preset_agentic(config)
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
def try_reload_config(config: PipelineConfig) -> PipelineConfig:
|
def try_reload_config(config: PipelineConfig) -> PipelineConfig:
|
||||||
@@ -465,7 +551,7 @@ def _resolve_pipeline(
|
|||||||
"""Resolve pipeline from preset string or explicit step list.
|
"""Resolve pipeline from preset string or explicit step list.
|
||||||
|
|
||||||
Returns (steps, phases) tuple. Only one will be non-empty.
|
Returns (steps, phases) tuple. Only one will be non-empty.
|
||||||
- Simple/cross-review/review-only → steps populated, phases empty.
|
- Simple/cross-review/plan-review/review-only → steps populated, phases empty.
|
||||||
- Phased presets (review-fix) → steps empty, phases populated.
|
- Phased presets (review-fix) → steps empty, phases populated.
|
||||||
"""
|
"""
|
||||||
# Preset: "preset:simple" or "preset:review-fix"
|
# Preset: "preset:simple" or "preset:review-fix"
|
||||||
@@ -485,11 +571,15 @@ def _resolve_pipeline(
|
|||||||
if isinstance(pipeline_raw, list):
|
if isinstance(pipeline_raw, list):
|
||||||
steps = []
|
steps = []
|
||||||
for step_data in pipeline_raw:
|
for step_data in pipeline_raw:
|
||||||
|
raw_role = step_data.get("role", "coding")
|
||||||
|
normalized_role = normalize_step_role(raw_role)
|
||||||
steps.append(StepConfig(
|
steps.append(StepConfig(
|
||||||
name=step_data["name"],
|
name=step_data["name"],
|
||||||
agent=step_data["agent"],
|
agent=step_data["agent"],
|
||||||
role=step_data.get("role", "generate"),
|
role=normalized_role,
|
||||||
prompt_template=step_data.get("prompt_template", f"default:{step_data.get('role', 'generate')}"),
|
prompt_template=normalize_prompt_template(
|
||||||
|
step_data.get("prompt_template", f"default:{normalized_role}")
|
||||||
|
),
|
||||||
output_key=step_data["output_key"],
|
output_key=step_data["output_key"],
|
||||||
verdict=step_data.get("verdict", False),
|
verdict=step_data.get("verdict", False),
|
||||||
verdict_pattern=step_data.get("verdict_pattern", r"VERDICT:\s*PASS"),
|
verdict_pattern=step_data.get("verdict_pattern", r"VERDICT:\s*PASS"),
|
||||||
@@ -524,10 +614,6 @@ def validate_config(config: PipelineConfig) -> list[str]:
|
|||||||
errors,
|
errors,
|
||||||
scope=f"Phase '{phase.name}'",
|
scope=f"Phase '{phase.name}'",
|
||||||
)
|
)
|
||||||
if not any(s.verdict for s in phase.steps):
|
|
||||||
errors.append(
|
|
||||||
f"Phase '{phase.name}' must have at least one step with verdict: true"
|
|
||||||
)
|
|
||||||
# Validate verdict patterns
|
# Validate verdict patterns
|
||||||
for step in phase.steps:
|
for step in phase.steps:
|
||||||
if step.verdict:
|
if step.verdict:
|
||||||
@@ -576,6 +662,16 @@ def validate_config(config: PipelineConfig) -> list[str]:
|
|||||||
if config.language not in ("en", "ko"):
|
if config.language not in ("en", "ko"):
|
||||||
errors.append(f"Unsupported language '{config.language}'. Use 'en' or 'ko'.")
|
errors.append(f"Unsupported language '{config.language}'. Use 'en' or 'ko'.")
|
||||||
|
|
||||||
|
if config.execution.mode not in {"agent-decides"}:
|
||||||
|
errors.append(
|
||||||
|
f"Unsupported execution.mode '{config.execution.mode}'. Use 'agent-decides'."
|
||||||
|
)
|
||||||
|
if config.execution.command_policy not in {"broad", "restricted"}:
|
||||||
|
errors.append(
|
||||||
|
"Unsupported execution.command_policy "
|
||||||
|
f"'{config.execution.command_policy}'. Use 'broad' or 'restricted'."
|
||||||
|
)
|
||||||
|
|
||||||
return errors
|
return errors
|
||||||
|
|
||||||
|
|
||||||
@@ -599,6 +695,37 @@ def _validate_unique_step_fields(
|
|||||||
seen_output_keys.add(step.output_key)
|
seen_output_keys.add(step.output_key)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_agentic(agent: AgentConfig) -> None:
    """Switch ``agent`` to agentic mode, mutating it in place.

    Sets ``agent.agentic`` to ``True`` and rebuilds ``agent.args`` without any
    ``-p`` / ``--print`` entries (presumably the CLI's print-mode flags; confirm
    against the agent CLI). All other arguments keep their order.

    Args:
        agent: The agent configuration to modify.
    """
    print_flags = {"-p", "--print"}

    agent.agentic = True

    retained_args = []
    for arg in agent.args:
        if arg in print_flags:
            continue
        retained_args.append(arg)
    agent.args = retained_args
|
||||||
|
|
||||||
|
|
||||||
|
def sync_phased_iterations(
    config: PipelineConfig,
    max_iter: int | None = None,
) -> None:
    """Propagate the effective iteration limit to every verdict-bearing phase.

    A phase is updated only when at least one of its steps has a truthy
    ``verdict``; phases without such a step keep their ``max_iterations``.
    Nothing happens when the configuration has no phases.

    Args:
        config: Pipeline configuration whose phases are updated in place.
        max_iter: Explicit limit to apply. When ``None``,
            ``config.max_iterations`` is used instead.
    """
    if config.phases:
        # An explicit value (including 0) wins over the configured default.
        limit = max_iter if max_iter is not None else config.max_iterations
        verdict_phases = (
            phase
            for phase in config.phases
            if any(step.verdict for step in phase.steps)
        )
        for phase in verdict_phases:
            phase.max_iterations = limit
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_fix_preset_agentic(config: PipelineConfig) -> None:
    """Make coder agents agentic when the active preset is a fix-style preset.

    Does nothing unless ``config.preset_name`` is in ``FIX_STYLE_PRESETS``.
    Otherwise each name in ``config.coders`` is looked up in ``config.agents``,
    and every agent that exists and is not already agentic is converted in
    place.

    Args:
        config: Pipeline configuration whose coder agents may be modified.
    """
    if config.preset_name in FIX_STYLE_PRESETS:
        for coder_name in config.coders:
            coder_agent = config.agents.get(coder_name)
            # Skip unknown coder names and agents that are already agentic.
            if coder_agent is None or coder_agent.agentic:
                continue
            _make_agentic(coder_agent)
|
||||||
|
|
||||||
|
|
||||||
def apply_input_overrides(
|
def apply_input_overrides(
|
||||||
config: PipelineConfig, overrides: dict[str, str]
|
config: PipelineConfig, overrides: dict[str, str]
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|||||||
282
cross_eval/demo.py
Normal file
282
cross_eval/demo.py
Normal file
@@ -0,0 +1,282 @@
|
|||||||
|
"""Built-in demo for cross-eval — lets new users see the full lifecycle."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from cross_eval.models import PipelineConfig, PipelineResult
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Built-in demo plan & checklist
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DEMO_PLAN = """\
|
||||||
|
# Demo: Fibonacci Function
|
||||||
|
|
||||||
|
## Objective
|
||||||
|
Implement a `fibonacci(n)` function in Python.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
1. `fibonacci(0)` returns `0`, `fibonacci(1)` returns `1`.
|
||||||
|
2. For `n >= 2`, return the sum of the two preceding values.
|
||||||
|
3. Raise `ValueError` for negative `n`.
|
||||||
|
4. Use an iterative approach (not recursive).
|
||||||
|
|
||||||
|
## Constraints
|
||||||
|
- Single file: `fib.py`
|
||||||
|
- No external dependencies.
|
||||||
|
"""
|
||||||
|
|
||||||
|
DEMO_CHECKLIST = """\
|
||||||
|
# Demo Checklist
|
||||||
|
- [ ] fibonacci(0) → 0
|
||||||
|
- [ ] fibonacci(1) → 1
|
||||||
|
- [ ] fibonacci(10) → 55
|
||||||
|
- [ ] fibonacci(-1) raises ValueError
|
||||||
|
- [ ] Iterative implementation (no recursion)
|
||||||
|
- [ ] No unnecessary abstractions
|
||||||
|
"""
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Mock outputs (realistic-looking)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_MOCK_CODING_V1 = """\
|
||||||
|
I'll implement the fibonacci function in `fib.py`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# fib.py
|
||||||
|
|
||||||
|
def fibonacci(n: int) -> int:
|
||||||
|
\"\"\"Return the nth Fibonacci number using iteration.\"\"\"
|
||||||
|
if n < 0:
|
||||||
|
return -1 # invalid input
|
||||||
|
if n <= 1:
|
||||||
|
return n
|
||||||
|
a, b = 0, 1
|
||||||
|
for _ in range(2, n + 1):
|
||||||
|
a, b = b, a + b
|
||||||
|
return b
|
||||||
|
```
|
||||||
|
|
||||||
|
Created `fib.py` with the iterative fibonacci function.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_MOCK_REVIEW_V1 = """\
|
||||||
|
### Previous Feedback Assessment
|
||||||
|
N/A — first iteration.
|
||||||
|
|
||||||
|
### Issues Found
|
||||||
|
- ISS-001 [Major][Omission] Negative input should raise `ValueError`, \
|
||||||
|
but implementation returns `-1` instead. (Requirement 3: "Raise ValueError for negative n")
|
||||||
|
|
||||||
|
### Out of Scope Issues
|
||||||
|
None
|
||||||
|
|
||||||
|
### Summary
|
||||||
|
- Critical: 0, Major: 1, Minor: 0
|
||||||
|
- Over-engineering count: 0
|
||||||
|
- Omission count: 1
|
||||||
|
- CONFIRMED: 0, DISMISSED: 0
|
||||||
|
- Overall quality: Good structure, one requirement gap.
|
||||||
|
|
||||||
|
### Verdict
|
||||||
|
VERDICT: FAIL
|
||||||
|
"""
|
||||||
|
|
||||||
|
_MOCK_CODING_V2 = """\
|
||||||
|
Fixing the negative input handling per review feedback (ISS-001).
|
||||||
|
|
||||||
|
```python
|
||||||
|
# fib.py
|
||||||
|
|
||||||
|
def fibonacci(n: int) -> int:
|
||||||
|
\"\"\"Return the nth Fibonacci number using iteration.\"\"\"
|
||||||
|
if n < 0:
|
||||||
|
raise ValueError(f"n must be non-negative, got {n}")
|
||||||
|
if n <= 1:
|
||||||
|
return n
|
||||||
|
a, b = 0, 1
|
||||||
|
for _ in range(2, n + 1):
|
||||||
|
a, b = b, a + b
|
||||||
|
return b
|
||||||
|
```
|
||||||
|
|
||||||
|
Updated `fib.py`: negative input now raises `ValueError`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_MOCK_REVIEW_V2 = """\
|
||||||
|
### Previous Feedback Assessment
|
||||||
|
- DISMISSED (false positive): None
|
||||||
|
- CONFIRMED: None — ISS-001 has been fixed.
|
||||||
|
|
||||||
|
### Issues Found
|
||||||
|
None — all checklist items satisfied.
|
||||||
|
|
||||||
|
### Out of Scope Issues
|
||||||
|
None
|
||||||
|
|
||||||
|
### Summary
|
||||||
|
- Critical: 0, Major: 0, Minor: 0
|
||||||
|
- Over-engineering count: 0
|
||||||
|
- Omission count: 0
|
||||||
|
- CONFIRMED: 0, DISMISSED: 0
|
||||||
|
- Overall quality: All requirements met, clean implementation.
|
||||||
|
|
||||||
|
### Verdict
|
||||||
|
VERDICT: PASS
|
||||||
|
"""
|
||||||
|
|
||||||
|
_MOCK_STEPS = [
|
||||||
|
# (iteration, step_name, agent, duration, output_chars, verdict, output)
|
||||||
|
(1, "coding", "claude-coder", 2.1, 347, None, _MOCK_CODING_V1),
|
||||||
|
(1, "review", "claude-reviewer", 1.8, 423, "FAIL", _MOCK_REVIEW_V1),
|
||||||
|
(2, "coding", "claude-coder", 2.3, 382, None, _MOCK_CODING_V2),
|
||||||
|
(2, "review", "claude-reviewer", 1.5, 312, "PASS", _MOCK_REVIEW_V2),
|
||||||
|
]
|
||||||
|
|
||||||
|
_MOCK_ESCALATE_REVIEW = """\
|
||||||
|
### Issues Found
|
||||||
|
- ISS-001 [Critical][Omission] Requirements are ambiguous: "iterative approach" is unclear — \
|
||||||
|
does this exclude memoization? The plan needs clarification from stakeholders.
|
||||||
|
|
||||||
|
### Verdict
|
||||||
|
VERDICT: ESCALATE
|
||||||
|
"""
|
||||||
|
|
||||||
|
_MOCK_ESCALATE_STEPS = [
|
||||||
|
(1, "coding", "claude-coder", 2.1, 347, None, _MOCK_CODING_V1),
|
||||||
|
(1, "review", "claude-reviewer", 1.8, 520, "ESCALATE", _MOCK_ESCALATE_REVIEW),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Mock demo runner
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DIM = "\033[2m"
|
||||||
|
BOLD = "\033[1m"
|
||||||
|
GREEN = "\033[32m"
|
||||||
|
RED = "\033[31m"
|
||||||
|
YELLOW = "\033[33m"
|
||||||
|
CYAN = "\033[36m"
|
||||||
|
RESET = "\033[0m"
|
||||||
|
|
||||||
|
|
||||||
|
def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None:
    """Replay a canned pipeline run on the terminal.

    No agent is invoked: the scripted step tuples are printed one by one
    with a short pause so the output resembles a real run.

    Args:
        preset: Preset name shown in the banner (display only).
        show_escalate: Replay the escalation scenario instead of the
            fail-then-pass scenario.
    """
    script = _MOCK_STEPS
    if show_escalate:
        script = _MOCK_ESCALATE_STEPS

    print(f"\n{BOLD}=== cross-eval demo (mock) ==={RESET}")
    print(f"{DIM}Preset: {preset} | Coder: claude-coder | Reviewer: claude-reviewer{RESET}")
    print(f"{DIM}Plan: fibonacci function | Max iterations: 3{RESET}\n")

    rule = "━" * 50
    verdict_colors = {"PASS": GREEN, "ESCALATE": YELLOW}

    current_iter = 0
    for iteration, step_name, agent, duration, chars, verdict, _output in script:
        # Print the iteration banner once, when the iteration number changes.
        if iteration != current_iter:
            current_iter = iteration
            print(f"{BOLD}{rule}")
            print(f" Iteration {iteration}/3")
            print(f"{rule}{RESET}")

        # Fake a running step: spinner line, pause, then overwrite it.
        sys.stdout.write(f" ⠋ [{step_name}] {agent} running...")
        sys.stdout.flush()
        time.sleep(0.5)
        sys.stdout.write(f"\r {GREEN}✓{RESET} [{step_name}] {agent} — {chars} chars ({duration}s)\n")

        if not verdict:
            continue

        color = verdict_colors.get(verdict, RED)
        print(f" {color}{BOLD}Verdict: {verdict}{RESET}")

        if verdict == "FAIL":
            # Surface the key piece of reviewer feedback.
            print(f" {DIM}Feedback: ISS-001 [Major] Negative input returns -1 instead of ValueError{RESET}")
        elif verdict == "ESCALATE":
            print(f" {YELLOW}Reason: Requirements need clarification from stakeholders{RESET}")

        print()

    # Final result
    final, color = ("ESCALATE", YELLOW) if show_escalate else ("PASS", GREEN)
    print(f"{BOLD}Result: {color}{final}{RESET}")
    print(f"Iterations: {current_iter}")

    if show_escalate:
        banner = "=" * 50
        print(f"\n{RED}{BOLD}{banner}")
        print(" Escalation Report")
        print(f"{banner}{RESET}")
        print(f"{YELLOW}Human review required.{RESET}")
        print(f" {RED}•{RESET} Requirements are ambiguous — needs stakeholder clarification")
        print(f"{RED}{BOLD}{banner}{RESET}")

    print(f"\n{DIM}This was a mock demo. To run with real agents:{RESET}")
    print(f"{DIM} cross-eval demo --live{RESET}")
    print(f"{DIM} cross-eval run --plan plan.md{RESET}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def run_live_demo(
    preset: str = "simple",
    timeout: int | None = None,
) -> PipelineResult:
    """Run the built-in Fibonacci demo against real agents.

    The demo plan and checklist are written to a temporary directory that
    lives for the duration of the pipeline run.

    Args:
        preset: Name of a pipeline or phased preset. A name found in
            neither registry falls back to the ``"simple"`` pipeline.
        timeout: Forwarded unchanged to ``run_pipeline``.

    Returns:
        Whatever ``run_pipeline`` returns for the demo configuration.
    """
    import tempfile

    from cross_eval.config import (
        BUILTIN_AGENTS,
        _resolve_agents,
        apply_reasoning_effort_settings,
    )
    from cross_eval.pipeline import run_pipeline
    from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS

    coders = ["claude-coder"]
    reviewers = ["claude-reviewer"]
    seniors: list[str] = []
    agents = _resolve_agents(dict(BUILTIN_AGENTS), coders, reviewers, seniors)

    # Exactly one of pipeline / phases is populated, depending on the preset kind.
    pipeline = []
    phases = []
    if preset in PIPELINE_PRESETS:
        pipeline = PIPELINE_PRESETS[preset](coders, reviewers, seniors)
    elif preset in PHASED_PRESETS:
        phases = PHASED_PRESETS[preset](coders, reviewers, seniors)
    else:
        pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)

    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        plan_path = workdir / "plan.md"
        checklist_path = workdir / "checklist.md"
        plan_path.write_text(DEMO_PLAN, encoding="utf-8")
        checklist_path.write_text(DEMO_CHECKLIST, encoding="utf-8")

        config = PipelineConfig(
            output_dir=Path(".cross-eval/output"),
            max_iterations=3,
            language="en",
            inputs={"plan": plan_path, "checklist": checklist_path},
            agents=agents,
            coders=coders,
            reviewers=reviewers,
            seniors=seniors,
            pipeline=pipeline,
            phases=phases,
            preset_name=f"demo-{preset}",
        )
        apply_reasoning_effort_settings(config)

        # Run inside the ``with`` block so the input files still exist.
        return run_pipeline(config, timeout=timeout)
|
||||||
330
cross_eval/discovery.py
Normal file
330
cross_eval/discovery.py
Normal file
@@ -0,0 +1,330 @@
|
|||||||
|
"""Repository/service discovery helpers for autonomous execution prompts."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RepoDiscovery:
    """Accumulator for stack signals inferred from a repository.

    Every field starts empty; each instance gets its own containers.
    """

    # Language names, e.g. "python" or "node".
    languages: set[str] = field(default_factory=set)
    # Package manager names, e.g. "pip" or "npm".
    package_managers: set[str] = field(default_factory=set)
    # Database names found via manifests, dependencies or env var names.
    databases: set[str] = field(default_factory=set)
    # Service names found in docker compose files.
    services: set[str] = field(default_factory=set)
    # Framework names found in manifest content.
    frameworks: set[str] = field(default_factory=set)
    # Free-form, human-readable notes, kept in insertion order.
    hints: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
def _read_text(path: Path) -> str:
|
||||||
|
try:
|
||||||
|
return path.read_text(encoding="utf-8")
|
||||||
|
except (OSError, UnicodeDecodeError):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _add_if_contains(target: set[str], content: str, mapping: dict[str, str]) -> None:
|
||||||
|
lowered = content.lower()
|
||||||
|
for needle, name in mapping.items():
|
||||||
|
if needle in lowered:
|
||||||
|
target.add(name)
|
||||||
|
|
||||||
|
|
||||||
|
# Shared mapping for database signals found in manifest content
|
||||||
|
_MANIFEST_DB_SIGNALS: dict[str, str] = {
|
||||||
|
# PostgreSQL
|
||||||
|
"psycopg": "postgresql",
|
||||||
|
"asyncpg": "postgresql",
|
||||||
|
"postgres": "postgresql",
|
||||||
|
"pgx": "postgresql",
|
||||||
|
# MySQL / MariaDB
|
||||||
|
"mysql": "mysql",
|
||||||
|
"mariadb": "mysql",
|
||||||
|
"pymysql": "mysql",
|
||||||
|
# MongoDB
|
||||||
|
"pymongo": "mongodb",
|
||||||
|
"mongodb": "mongodb",
|
||||||
|
"mongoengine": "mongodb",
|
||||||
|
"mongosh": "mongodb",
|
||||||
|
# ClickHouse
|
||||||
|
"clickhouse": "clickhouse",
|
||||||
|
"clickhouse-driver": "clickhouse",
|
||||||
|
"clickhouse_connect": "clickhouse",
|
||||||
|
# Redis
|
||||||
|
"redis": "redis",
|
||||||
|
"ioredis": "redis",
|
||||||
|
# SQLite
|
||||||
|
"sqlite": "sqlite",
|
||||||
|
"better-sqlite3": "sqlite",
|
||||||
|
"aiosqlite": "sqlite",
|
||||||
|
# Elasticsearch / OpenSearch
|
||||||
|
"elasticsearch": "elasticsearch",
|
||||||
|
"opensearch": "elasticsearch",
|
||||||
|
# DynamoDB
|
||||||
|
"dynamodb": "dynamodb",
|
||||||
|
"boto3": "dynamodb", # broad but common signal
|
||||||
|
# Cassandra
|
||||||
|
"cassandra-driver": "cassandra",
|
||||||
|
"cassandra": "cassandra",
|
||||||
|
# RabbitMQ
|
||||||
|
"amqplib": "rabbitmq",
|
||||||
|
"pika": "rabbitmq",
|
||||||
|
"rabbitmq": "rabbitmq",
|
||||||
|
# Kafka
|
||||||
|
"kafka": "kafka",
|
||||||
|
"confluent-kafka": "kafka",
|
||||||
|
"kafkajs": "kafka",
|
||||||
|
# Neo4j
|
||||||
|
"neo4j": "neo4j",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Node package.json dependency → database mapping
|
||||||
|
_NODE_DEP_DB_SIGNALS: dict[str, str] = {
|
||||||
|
"pg": "postgresql",
|
||||||
|
"mysql": "mysql",
|
||||||
|
"mysql2": "mysql",
|
||||||
|
"mongoose": "mongodb",
|
||||||
|
"mongodb": "mongodb",
|
||||||
|
"@clickhouse/client": "clickhouse",
|
||||||
|
"redis": "redis",
|
||||||
|
"ioredis": "redis",
|
||||||
|
"prisma": "postgresql",
|
||||||
|
"better-sqlite3": "sqlite",
|
||||||
|
"sqlite3": "sqlite",
|
||||||
|
"@elastic/elasticsearch": "elasticsearch",
|
||||||
|
"@aws-sdk/client-dynamodb": "dynamodb",
|
||||||
|
"kafkajs": "kafka",
|
||||||
|
"amqplib": "rabbitmq",
|
||||||
|
"neo4j-driver": "neo4j",
|
||||||
|
"cassandra-driver": "cassandra",
|
||||||
|
"typeorm": "postgresql",
|
||||||
|
"sequelize": "postgresql",
|
||||||
|
"knex": "postgresql",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Docker compose service image → service name mapping
|
||||||
|
_COMPOSE_SERVICE_SIGNALS: dict[str, str] = {
|
||||||
|
"clickhouse": "clickhouse",
|
||||||
|
"postgres": "postgresql",
|
||||||
|
"mysql": "mysql",
|
||||||
|
"mariadb": "mysql",
|
||||||
|
"mongo": "mongodb",
|
||||||
|
"redis": "redis",
|
||||||
|
"elasticsearch": "elasticsearch",
|
||||||
|
"opensearch": "elasticsearch",
|
||||||
|
"rabbitmq": "rabbitmq",
|
||||||
|
"kafka": "kafka",
|
||||||
|
"zookeeper": "kafka",
|
||||||
|
"cassandra": "cassandra",
|
||||||
|
"neo4j": "neo4j",
|
||||||
|
"minio": "s3",
|
||||||
|
"localstack": "aws-local",
|
||||||
|
"dynamodb": "dynamodb",
|
||||||
|
"memcached": "memcached",
|
||||||
|
"nginx": "nginx",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Environment variable name patterns → database mapping
|
||||||
|
_ENV_DB_PATTERNS: list[tuple[str, str]] = [
|
||||||
|
("CLICKHOUSE", "clickhouse"),
|
||||||
|
("CH_", "clickhouse"),
|
||||||
|
("POSTGRES", "postgresql"),
|
||||||
|
("PG", "postgresql"),
|
||||||
|
("DATABASE_URL", "postgresql"),
|
||||||
|
("MYSQL", "mysql"),
|
||||||
|
("MARIADB", "mysql"),
|
||||||
|
("MONGO", "mongodb"),
|
||||||
|
("REDIS", "redis"),
|
||||||
|
("ELASTICSEARCH", "elasticsearch"),
|
||||||
|
("OPENSEARCH", "elasticsearch"),
|
||||||
|
("DYNAMO", "dynamodb"),
|
||||||
|
("CASSANDRA", "cassandra"),
|
||||||
|
("KAFKA", "kafka"),
|
||||||
|
("RABBIT", "rabbitmq"),
|
||||||
|
("AMQP", "rabbitmq"),
|
||||||
|
("NEO4J", "neo4j"),
|
||||||
|
("SQLITE", "sqlite"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_package_json(text: str) -> dict:
    """Parse ``package.json`` text, returning ``{}`` unless it is a JSON object."""
    try:
        parsed = json.loads(text or "{}")
    except json.JSONDecodeError:
        return {}
    # Valid JSON whose top level is a list/string/number carries no usable keys.
    return parsed if isinstance(parsed, dict) else {}


def _env_name_matches(env_name: str, pattern: str) -> bool:
    """Tell whether *pattern* starts *env_name* or one of its ``_``-separated words."""
    return env_name.startswith(pattern) or f"_{pattern}" in env_name


def discover_repo(project_root: Path, env_names: set[str] | None = None) -> RepoDiscovery:
    """Infer runtime-relevant stack hints from common manifest/config files.

    Only well-known files directly under *project_root* (plus
    ``prisma/schema.prisma``) are inspected; nothing is executed.

    Args:
        project_root: Directory to inspect.
        env_names: Names of available environment variables. They are
            compared case-insensitively against the database patterns.

    Returns:
        A ``RepoDiscovery`` holding the detected languages, package
        managers, databases, compose services, frameworks and hints.
    """
    discovery = RepoDiscovery()
    env_names = {name.upper() for name in (env_names or set())}

    file_map: dict[str, Path] = {
        "pyproject": project_root / "pyproject.toml",
        "requirements": project_root / "requirements.txt",
        "requirements_dev": project_root / "requirements-dev.txt",
        "setup_py": project_root / "setup.py",
        "setup_cfg": project_root / "setup.cfg",
        "package": project_root / "package.json",
        "go_mod": project_root / "go.mod",
        "cargo": project_root / "Cargo.toml",
        "gemfile": project_root / "Gemfile",
        "build_gradle": project_root / "build.gradle",
        "build_gradle_kts": project_root / "build.gradle.kts",
        "pom": project_root / "pom.xml",
        "composer": project_root / "composer.json",
        "mix": project_root / "mix.exs",
        "docker_compose": project_root / "docker-compose.yml",
        "docker_compose_alt": project_root / "docker-compose.yaml",
        "compose": project_root / "compose.yaml",
        "prisma": project_root / "prisma" / "schema.prisma",
        "dockerfile": project_root / "Dockerfile",
    }

    # Probe the filesystem once; every later check works from these results.
    present = {name for name, path in file_map.items() if path.exists()}
    manifests = {
        name: _read_text(path)
        for name, path in file_map.items()
        if name in present
    }
    combined = "\n".join(manifests.values())

    # package.json is parsed once and shared by the package-manager and
    # dependency checks. Non-object JSON degrades to {} instead of crashing.
    package_json = _parse_package_json(manifests.get("package", ""))

    # ---- Language detection ----
    language_markers = (
        ("python", ("pyproject", "requirements", "requirements_dev", "setup_py", "setup_cfg")),
        ("node", ("package",)),
        ("go", ("go_mod",)),
        ("rust", ("cargo",)),
        ("ruby", ("gemfile",)),
        ("java", ("build_gradle", "build_gradle_kts", "pom")),
        ("php", ("composer",)),
        ("elixir", ("mix",)),
    )
    for language, markers in language_markers:
        if present.intersection(markers):
            discovery.languages.add(language)

    # ---- Package manager detection ----
    if present.intersection(("pyproject", "requirements", "setup_py")):
        discovery.package_managers.add("pip")
    if "package" in present:
        pm = package_json.get("packageManager")
        if isinstance(pm, str) and pm:
            # "pnpm@8.6.0" -> "pnpm"
            discovery.package_managers.add(pm.split("@", 1)[0])
        elif (project_root / "pnpm-lock.yaml").exists():
            # No explicit packageManager: use lockfiles to tell npm/yarn/pnpm apart.
            discovery.package_managers.add("pnpm")
        elif (project_root / "yarn.lock").exists():
            discovery.package_managers.add("yarn")
        else:
            discovery.package_managers.add("npm")
    manager_markers = (
        ("go", ("go_mod",)),
        ("cargo", ("cargo",)),
        ("bundler", ("gemfile",)),
        ("gradle", ("build_gradle", "build_gradle_kts")),
        ("maven", ("pom",)),
        ("composer", ("composer",)),
        ("mix", ("mix",)),
    )
    for manager, markers in manager_markers:
        if present.intersection(markers):
            discovery.package_managers.add(manager)

    # ---- Database detection from manifest content ----
    _add_if_contains(discovery.databases, combined, _MANIFEST_DB_SIGNALS)

    # ---- Node.js dependency-specific detection ----
    if "package" in present:
        dep_names: list[str] = []
        for section in ("dependencies", "devDependencies"):
            entries = package_json.get(section)
            # Skip sections that are missing or not a JSON object.
            if isinstance(entries, dict):
                dep_names.extend(entries)
        dep_blob = "\n".join(dep_names).lower()
        _add_if_contains(discovery.databases, dep_blob, _NODE_DEP_DB_SIGNALS)

    # ---- Framework detection from manifest content ----
    _add_if_contains(
        discovery.frameworks,
        combined,
        {
            "fastapi": "fastapi",
            "django": "django",
            "flask": "flask",
            "express": "express",
            "nextjs": "next.js",
            "next": "next.js",
            "nestjs": "nestjs",
            "spring": "spring",
            "rails": "rails",
            "laravel": "laravel",
            "phoenix": "phoenix",
            # Match the Go module path: a bare "gin" is a substring of
            # "engines", "plugins", "login", "nginx", ...
            "gin-gonic/gin": "gin",
            "actix": "actix",
        },
    )

    # ---- Database detection from environment variable names ----
    # Patterns must start the name or one of its "_"-separated words, so
    # short ones like "PG" / "CH_" do not fire on UPGRADE or BATCH_SIZE.
    for env_name in env_names:
        for pattern, db_name in _ENV_DB_PATTERNS:
            if _env_name_matches(env_name, pattern):
                discovery.databases.add(db_name)
                break

    # ---- Docker compose service detection ----
    compose_blob = "\n".join(
        manifests.get(key, "")
        for key in ("docker_compose", "docker_compose_alt", "compose")
    ).lower()
    _add_if_contains(discovery.services, compose_blob, _COMPOSE_SERVICE_SIGNALS)

    # ---- Hints from config files ----
    if "prisma" in present:
        discovery.hints.append("Prisma schema detected.")
    if (project_root / "alembic.ini").exists():
        discovery.hints.append("Alembic migration config detected.")
    if (project_root / "knexfile.js").exists() or (project_root / "knexfile.ts").exists():
        discovery.hints.append("Knex migration config detected.")
    if (project_root / "ormconfig.json").exists() or (project_root / "ormconfig.ts").exists():
        discovery.hints.append("TypeORM config detected.")
    if (project_root / "drizzle.config.ts").exists():
        discovery.hints.append("Drizzle ORM config detected.")
    if (project_root / "Makefile").exists():
        discovery.hints.append("Makefile available for build/task automation.")
    if "dockerfile" in present or (project_root / "docker").exists() or discovery.services:
        discovery.hints.append("Containerized services may be available for local verification.")

    return discovery
|
||||||
|
|
||||||
|
|
||||||
|
def format_repo_discovery(discovery: RepoDiscovery) -> str:
    """Turn a discovery result into a short multi-line prompt summary.

    Each non-empty category becomes one line with its values sorted and
    comma-separated; hints follow verbatim, one per line. A fixed
    fallback sentence is returned when there is nothing to report.
    """
    categories = (
        ("Detected languages: ", discovery.languages),
        ("Likely package managers: ", discovery.package_managers),
        ("Detected databases/services in code or env: ", discovery.databases),
        ("Detected local service containers: ", discovery.services),
        ("Detected frameworks: ", discovery.frameworks),
    )
    summary = [
        label + ", ".join(sorted(values))
        for label, values in categories
        if values
    ]
    summary.extend(discovery.hints or ())
    if summary:
        return "\n".join(summary)
    return "No strong runtime/service signals were detected from repository manifests."
|
||||||
200
cross_eval/doctor.py
Normal file
200
cross_eval/doctor.py
Normal file
@@ -0,0 +1,200 @@
|
|||||||
|
"""Environment health checks for cross-eval."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DoctorCheck:
    """Outcome of one environment health check."""

    # Short label shown in the report, e.g. "claude CLI".
    name: str
    # True when the check succeeded.
    passed: bool
    # True when a failure counts as a critical issue rather than a warning.
    critical: bool
    # One-line status text shown next to the name.
    message: str
    # Optional extra text; the report prints it only for failed checks.
    detail: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
def check_cli_installed(command: str) -> tuple[bool, str]:
    """Check if a CLI tool is on PATH and get its version.

    Args:
        command: Executable name (or path) to look up.

    Returns:
        ``(False, reason)`` when the executable cannot be found. Otherwise
        ``(True, text)``, where *text* is the first line printed by
        ``<command> --version`` (stdout preferred, stderr as fallback) or
        a placeholder when no version could be obtained.
    """
    path = shutil.which(command)
    if not path:
        return False, f"'{command}' not found on PATH"

    try:
        # Run the resolved path, not the bare name: the lookup done by
        # shutil.which (PATHEXT shims such as ``.cmd`` on Windows) is not
        # repeated by subprocess for an unqualified name.
        result = subprocess.run(
            [path, "--version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (subprocess.TimeoutExpired, OSError):
        # The binary exists, so it still counts as installed.
        return True, "(installed but version check failed)"

    version = (result.stdout.strip() or result.stderr.strip()).split("\n")[0]
    return True, version or "(version unknown)"
|
||||||
|
|
||||||
|
|
||||||
|
def check_cli_authenticated(command: str) -> tuple[bool, str]:
    """Check if a CLI tool is authenticated by running a minimal probe.

    Only ``"claude"`` and ``"codex"`` are supported. For ``claude`` a
    one-turn prompt is sent; for ``codex`` only ``--version`` is run and
    its output scanned, so a positive result there means "no
    authentication complaint seen" rather than a verified login.

    Args:
        command: Executable name to probe.

    Returns:
        ``(ok, message)``, where *message* is a short status text.
    """
    path = shutil.which(command)
    if not path:
        return False, "not installed"

    if command == "claude":
        try:
            # Run the resolved path so the executable found by shutil.which
            # (e.g. a ``.cmd`` shim on Windows) is the one that is started.
            result = subprocess.run(
                [path, "-p", "--model", "haiku", "--max-turns", "1"],
                input="respond with just 'ok'",
                capture_output=True,
                text=True,
                timeout=30,
            )
        except subprocess.TimeoutExpired:
            return False, "timed out (30s) — possible network issue"
        except OSError as e:
            return False, str(e)

        combined = result.stdout + result.stderr
        # Authentication complaints take precedence over the exit code.
        if any(kw in combined.lower() for kw in (
            "not logged in", "login", "unauthorized", "unauthenticated",
            "api key", "invalid key",
        )):
            return False, "not authenticated — run: claude login"
        if result.returncode == 0:
            return True, "authenticated"
        return False, f"exit code {result.returncode}: {combined[:100]}"

    elif command == "codex":
        try:
            result = subprocess.run(
                [path, "--version"],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except (subprocess.TimeoutExpired, OSError) as e:
            return False, str(e)

        combined = result.stdout + result.stderr
        if any(kw in combined.lower() for kw in (
            "not logged in", "login", "unauthorized", "api key",
        )):
            return False, "not authenticated — run: codex login"
        return True, "installed (auth check: codex login if needed)"

    return False, f"unknown command: {command}"
|
||||||
|
|
||||||
|
|
||||||
|
def check_config(directory: Path) -> tuple[bool, Optional[Path], list[str]]:
    """Validate ``.cross-eval/config.yaml`` under *directory*.

    Returns:
        ``(ok, path, errors)``. *path* is ``None`` when no config file
        exists (``ok`` is then ``False`` and *errors* empty). When the
        file exists, *ok* reports whether ``load_config`` accepted it and
        *errors* holds the error message if it did not.
    """
    config_path = directory / ".cross-eval" / "config.yaml"
    if config_path.exists():
        try:
            # Imported lazily, only when there is a file to load.
            from cross_eval.config import load_config

            load_config(config_path)
        except (ValueError, FileNotFoundError) as e:
            return False, config_path, [str(e)]
        return True, config_path, []
    return False, None, []
|
||||||
|
|
||||||
|
|
||||||
|
def run_doctor(directory: Path) -> list[DoctorCheck]:
    """Run every health check and collect the results in report order.

    The order is: claude CLI, claude auth, codex CLI, codex auth, config.
    An auth check is only run (and reported) when the matching CLI is
    installed.

    Args:
        directory: Project directory whose ``.cross-eval/config.yaml`` is checked.
    """
    results: list[DoctorCheck] = []

    # (command, critical, message when missing, install hint)
    cli_specs = (
        ("claude", True, "not found", "Install: https://docs.anthropic.com/en/docs/claude-code"),
        ("codex", False, "not found (optional)", "Install: https://github.com/openai/codex"),
    )
    for command, critical, missing_message, install_hint in cli_specs:
        installed, version = check_cli_installed(command)
        results.append(DoctorCheck(
            name=f"{command} CLI",
            passed=installed,
            critical=critical,
            message=version if installed else missing_message,
            detail=None if installed else install_hint,
        ))
        if not installed:
            continue

        auth_ok, auth_msg = check_cli_authenticated(command)
        results.append(DoctorCheck(
            name=f"{command} auth",
            passed=auth_ok,
            critical=critical,
            message=auth_msg,
        ))

    # Config: a missing file is fine, an invalid one is critical.
    config_ok, config_path, config_errors = check_config(directory)
    if config_path is None:
        config_check = DoctorCheck(
            name="config",
            passed=True,
            critical=False,
            message="no .cross-eval/config.yaml (will use defaults)",
            detail="Run: cross-eval init",
        )
    elif config_ok:
        config_check = DoctorCheck(
            name="config",
            passed=True,
            critical=False,
            message=f"valid ({config_path.name})",
        )
    else:
        config_check = DoctorCheck(
            name="config",
            passed=False,
            critical=True,
            message="invalid config",
            detail="\n".join(config_errors),
        )
    results.append(config_check)

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def format_doctor_results(checks: list[DoctorCheck]) -> str:
    """Render health-check results as terminal text.

    Each check gets one status line; the detail of a failed check is
    printed underneath it, one output line per detail line. A summary
    line closes the report.
    """
    report: list[str] = ["\n cross-eval doctor\n"]
    critical_failures = 0
    warnings = 0

    for check in checks:
        marker = " ✓" if check.passed else " ✗"
        report.append(f"{marker} {check.name}: {check.message}")
        if check.passed:
            continue
        if check.critical:
            critical_failures += 1
        else:
            warnings += 1
        if check.detail:
            report.extend(f" {part}" for part in check.detail.split("\n"))

    # Summary
    report.append("")
    if critical_failures:
        report.append(f" {critical_failures} critical issue(s) found.")
    elif warnings:
        report.append(f" {warnings} warning(s), no critical issues.")
    else:
        report.append(" All checks passed!")

    report.append("")
    return "\n".join(report)
|
||||||
@@ -16,6 +16,7 @@ class AgentConfig:
|
|||||||
system_prompt: Optional[str] = None
|
system_prompt: Optional[str] = None
|
||||||
reasoning_effort: Optional[str] = None
|
reasoning_effort: Optional[str] = None
|
||||||
stdin_mode: bool = False
|
stdin_mode: bool = False
|
||||||
|
agentic: bool = False # run in worktree, capture git diff instead of stdout
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -24,7 +25,7 @@ class StepConfig:
|
|||||||
|
|
||||||
name: str
|
name: str
|
||||||
agent: str # reference to agents key
|
agent: str # reference to agents key
|
||||||
role: str # "generate" or "review"
|
role: str # "coding" or "review"
|
||||||
prompt_template: str # "default:<role>" or file path
|
prompt_template: str # "default:<role>" or file path
|
||||||
output_key: str
|
output_key: str
|
||||||
verdict: bool = False
|
verdict: bool = False
|
||||||
@@ -43,15 +44,29 @@ class PhaseConfig:
|
|||||||
consecutive_pass: int = 1 # stop after N consecutive PASSes
|
consecutive_pass: int = 1 # stop after N consecutive PASSes
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExecutionConfig:
    """Runtime execution policy for agent subprocesses."""

    # Execution mode name; only the default value is visible here.
    # NOTE(review): the set of accepted values is defined elsewhere — confirm.
    mode: str = "agent-decides"
    # Command policy name; only the default value is visible here.
    # NOTE(review): the set of accepted values is defined elsewhere — confirm.
    command_policy: str = "broad"
    # Presumably whether subprocesses inherit the parent environment — confirm.
    inherit_env: bool = True
    # Default env-file names; built per instance so the list is never shared.
    auto_env_files: list[str] = field(default_factory=lambda: [".env", ".env.local"])
    # Additional env files; empty by default.
    env_files: list[str] = field(default_factory=list)
    # Presumably controls whether env variable names are shown to agents — confirm.
    expose_env_names: bool = True
    # Empty by default. NOTE(review): meaning of a "context target" is not
    # visible in this chunk — confirm against the consumer.
    auto_context_targets: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class PipelineConfig:
|
class PipelineConfig:
|
||||||
"""Full cross-eval configuration."""
|
"""Full cross-eval configuration."""
|
||||||
|
|
||||||
output_dir: Path = field(default_factory=lambda: Path("output"))
|
output_dir: Path = field(default_factory=lambda: Path(".cross-eval/output"))
|
||||||
max_iterations: int = 3
|
max_iterations: int = 3
|
||||||
min_iterations: int = 1
|
min_iterations: int = 1
|
||||||
verbose: bool = False
|
verbose: bool = False
|
||||||
language: str = "en" # "en" or "ko"
|
language: str = "en" # "en" or "ko"
|
||||||
|
execution: ExecutionConfig = field(default_factory=ExecutionConfig)
|
||||||
inputs: dict[str, Path | str] = field(default_factory=dict)
|
inputs: dict[str, Path | str] = field(default_factory=dict)
|
||||||
agents: dict[str, AgentConfig] = field(default_factory=dict)
|
agents: dict[str, AgentConfig] = field(default_factory=dict)
|
||||||
coders: list[str] = field(default_factory=list)
|
coders: list[str] = field(default_factory=list)
|
||||||
@@ -73,6 +88,8 @@ class AgentResult:
|
|||||||
agent_name: str
|
agent_name: str
|
||||||
step_name: str
|
step_name: str
|
||||||
duration_seconds: float
|
duration_seconds: float
|
||||||
|
transcript: str = ""
|
||||||
|
command_preview: str = ""
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -105,6 +122,7 @@ class IterationResult:
|
|||||||
phase_name: Optional[str] = None
|
phase_name: Optional[str] = None
|
||||||
repeated_aggregate_warning: Optional[str] = None
|
repeated_aggregate_warning: Optional[str] = None
|
||||||
review_metrics: Optional[ReviewMetrics] = None
|
review_metrics: Optional[ReviewMetrics] = None
|
||||||
|
escalated_issues: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -116,3 +134,5 @@ class PipelineResult:
|
|||||||
total_duration: float = 0.0
|
total_duration: float = 0.0
|
||||||
run_dir: Optional[Path] = None
|
run_dir: Optional[Path] = None
|
||||||
repeated_aggregate_warnings: list[str] = field(default_factory=list)
|
repeated_aggregate_warnings: list[str] = field(default_factory=list)
|
||||||
|
escalated_issues: list[str] = field(default_factory=list)
|
||||||
|
agentic_branch: Optional[str] = None
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -12,56 +12,42 @@ from cross_eval.models import PhaseConfig, StepConfig
|
|||||||
# Default prompt templates
|
# Default prompt templates
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
GENERATE_TEMPLATE = """\
|
CODING_TEMPLATE = """\
|
||||||
You are tasked with implementing code based on a plan and checklist.
|
You are tasked with implementing code based on a plan and checklist.
|
||||||
|
|
||||||
## Plan
|
## Artifact References
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## Checklist
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## Reference Documents
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## Previous Review Feedback
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## Iteration
|
## Iteration
|
||||||
This is iteration {iteration} of {max_iterations}.
|
This is iteration {iteration} of {max_iterations}.
|
||||||
|
|
||||||
## Instructions
|
## Instructions
|
||||||
1. Explore the project directory to understand the existing codebase structure.
|
1. Read the referenced plan/checklist/docs/review artifacts directly from disk.
|
||||||
2. Implement ONLY what the plan specifies. Do NOT add extra features, \
|
2. Explore the project directory and git state to understand the current codebase structure.
|
||||||
|
3. Implement ONLY what the plan specifies. Do NOT add extra features, \
|
||||||
unnecessary abstractions, or premature optimizations.
|
unnecessary abstractions, or premature optimizations.
|
||||||
3. Follow every item in the checklist.
|
4. Follow every item in the checklist.
|
||||||
4. If there is previous feedback, address ONLY the specific issues mentioned.
|
5. If there is previous feedback in the referenced markdown artifacts, address ONLY those issues.
|
||||||
5. If previous feedback contains items marked as DISMISSED or false positive, \
|
6. If previous feedback contains items marked as DISMISSED or false positive, \
|
||||||
IGNORE those items — they have been verified as correct.
|
IGNORE those items — they have been verified as correct.
|
||||||
6. Output the complete implementation.
|
7. Prefer git and markdown artifacts as the source of truth. Use commit hashes, `git show`, `git diff`, and referenced markdown files instead of relying on inline summaries.
|
||||||
|
8. Output the complete implementation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
REVIEW_TEMPLATE = """\
|
REVIEW_TEMPLATE = """\
|
||||||
You are tasked with reviewing code against a plan and checklist.
|
You are tasked with reviewing code against a plan and checklist.
|
||||||
|
|
||||||
## Plan
|
## Artifact References
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## Checklist
|
## Execution Evidence
|
||||||
{checklist}
|
{execution_evidence}
|
||||||
|
|
||||||
## Reference Documents
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## Generated Code / Previous Step Output
|
|
||||||
{generated_code}
|
|
||||||
|
|
||||||
## Previous Review Feedback
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## Review Instructions
|
## Review Instructions
|
||||||
Explore the project directory to understand the full codebase context, \
|
Read the referenced plan/checklist/docs/review artifacts directly from disk. \
|
||||||
then evaluate the code against ONLY the plan and checklist above.
|
Inspect the referenced commit/git state and markdown artifacts, then evaluate \
|
||||||
|
the code against ONLY the plan and checklist. Use the execution evidence above \
|
||||||
|
to verify agent claims against actual command outputs, artifact paths, and exit codes.
|
||||||
|
|
||||||
For each issue found, classify it with BOTH severity AND category:
|
For each issue found, classify it with BOTH severity AND category:
|
||||||
|
|
||||||
@@ -94,10 +80,10 @@ security concerns, performance problems), report them separately under \
|
|||||||
(Write "N/A" if no previous feedback was provided.)
|
(Write "N/A" if no previous feedback was provided.)
|
||||||
|
|
||||||
### Issues Found
|
### Issues Found
|
||||||
List issues ordered by severity (Critical first):
|
List issues ordered by severity (Critical first). Assign each issue a unique ID (ISS-NNN):
|
||||||
- [Critical][Over-engineering] Description (reference specific plan/checklist item)
|
- ISS-001 [Critical][Over-engineering] Description (reference specific plan/checklist item)
|
||||||
- [Major][Omission] Description (reference specific plan/checklist item)
|
- ISS-002 [Major][Omission] Description (reference specific plan/checklist item)
|
||||||
- [Minor][Omission] Description (reference specific plan/checklist item)
|
- ISS-003 [Minor][Omission] Description (reference specific plan/checklist item)
|
||||||
|
|
||||||
### Out of Scope Issues
|
### Out of Scope Issues
|
||||||
Issues found outside plan/checklist scope but worth noting:
|
Issues found outside plan/checklist scope but worth noting:
|
||||||
@@ -119,54 +105,39 @@ Otherwise output: VERDICT: FAIL
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
GENERATE_TEMPLATE_KO = """\
|
CODING_TEMPLATE_KO = """\
|
||||||
당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다.
|
당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
|
||||||
{checklist}
|
|
||||||
|
|
||||||
## 참고 문서
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## 이전 리뷰 피드백
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 반복 정보
|
## 반복 정보
|
||||||
현재 {max_iterations}회 중 {iteration}번째 반복입니다.
|
현재 {max_iterations}회 중 {iteration}번째 반복입니다.
|
||||||
|
|
||||||
## 지침
|
## 지침
|
||||||
1. 프로젝트 디렉토리를 탐색하여 기존 코드베이스 구조를 파악하세요.
|
1. 참조된 plan/checklist/docs/review markdown를 직접 읽으세요.
|
||||||
2. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
|
2. 프로젝트 디렉토리와 git 상태를 탐색하여 현재 코드베이스 구조를 파악하세요.
|
||||||
3. 체크리스트의 모든 항목을 충족하세요.
|
3. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
|
||||||
4. 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
|
4. 체크리스트의 모든 항목을 충족하세요.
|
||||||
5. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
|
5. 참조된 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
|
||||||
6. 완전한 구현을 출력하세요.
|
6. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
|
||||||
|
7. inline 요약보다 git commit hash, `git show`, `git diff`, markdown 아티팩트를 우선 사용하세요.
|
||||||
|
8. 완전한 구현을 출력하세요.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
REVIEW_TEMPLATE_KO = """\
|
REVIEW_TEMPLATE_KO = """\
|
||||||
당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다.
|
당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
## 실행 증거
|
||||||
{checklist}
|
{execution_evidence}
|
||||||
|
|
||||||
## 참고 문서
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## 검토 대상 코드
|
|
||||||
{generated_code}
|
|
||||||
|
|
||||||
## 이전 리뷰 피드백
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 검토 지침
|
## 검토 지침
|
||||||
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스 맥락을 파악한 뒤, \
|
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
|
||||||
위 기획서와 체크리스트 기준으로만 코드를 평가하세요.
|
그 내용을 기준으로만 코드를 평가하세요. \
|
||||||
|
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
|
||||||
|
|
||||||
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
|
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
|
||||||
|
|
||||||
@@ -195,10 +166,10 @@ REVIEW_TEMPLATE_KO = """\
|
|||||||
(이전 피드백이 없으면 "해당 없음"이라고 작성하세요.)
|
(이전 피드백이 없으면 "해당 없음"이라고 작성하세요.)
|
||||||
|
|
||||||
### 발견된 이슈
|
### 발견된 이슈
|
||||||
심각도 순서(Critical 먼저)로 나열:
|
심각도 순서(Critical 먼저)로 나열. 각 이슈에 고유 ID(ISS-NNN)를 부여하세요:
|
||||||
- [Critical][과최적화] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
- ISS-001 [Critical][과최적화] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||||
- [Major][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
- ISS-002 [Major][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||||
- [Minor][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
- ISS-003 [Minor][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||||
|
|
||||||
### 범위 밖 이슈
|
### 범위 밖 이슈
|
||||||
기획서/체크리스트 범위 밖이지만 주목할 만한 이슈:
|
기획서/체크리스트 범위 밖이지만 주목할 만한 이슈:
|
||||||
@@ -234,9 +205,14 @@ You are tasked with reviewing existing code against a plan and checklist.
|
|||||||
## Previous Review (iteration {iteration} of {max_iterations})
|
## Previous Review (iteration {iteration} of {max_iterations})
|
||||||
{feedback}
|
{feedback}
|
||||||
|
|
||||||
|
## Execution Evidence
|
||||||
|
{execution_evidence}
|
||||||
|
|
||||||
## Review Instructions
|
## Review Instructions
|
||||||
Explore the project directory thoroughly to understand the full codebase, \
|
Explore the project directory thoroughly to understand the full codebase, \
|
||||||
then evaluate the EXISTING code against ONLY the plan and checklist above.
|
then evaluate the EXISTING code against ONLY the plan and checklist above. \
|
||||||
|
Use the execution evidence above to verify agent claims against actual \
|
||||||
|
command outputs and exit codes.
|
||||||
|
|
||||||
You are NOT generating or modifying code. You are auditing what already exists.
|
You are NOT generating or modifying code. You are auditing what already exists.
|
||||||
|
|
||||||
@@ -293,21 +269,16 @@ Otherwise output: VERDICT: FAIL
|
|||||||
REVIEW_ONLY_TEMPLATE_KO = """\
|
REVIEW_ONLY_TEMPLATE_KO = """\
|
||||||
당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다.
|
당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
## 실행 증거
|
||||||
{checklist}
|
{execution_evidence}
|
||||||
|
|
||||||
## 참고 문서
|
|
||||||
{docs}
|
|
||||||
|
|
||||||
## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 검토 지침
|
## 검토 지침
|
||||||
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \
|
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
|
||||||
위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요.
|
그 내용을 기준으로 **기존 코드**를 평가하세요. \
|
||||||
|
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
|
||||||
|
|
||||||
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
|
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
|
||||||
|
|
||||||
@@ -357,8 +328,8 @@ REVIEW_ONLY_TEMPLATE_KO = """\
|
|||||||
그렇지 않으면: VERDICT: FAIL
|
그렇지 않으면: VERDICT: FAIL
|
||||||
"""
|
"""
|
||||||
|
|
||||||
AGGREGATE_REVIEW_TEMPLATE = """\
|
PLAN_REVIEW_TEMPLATE = """\
|
||||||
You are adjudicating multiple review results and turning them into an actionable decision.
|
You are tasked with reviewing planning documents before implementation begins.
|
||||||
|
|
||||||
## Plan
|
## Plan
|
||||||
{plan}
|
{plan}
|
||||||
@@ -369,23 +340,166 @@ You are adjudicating multiple review results and turning them into an actionable
|
|||||||
## Reference Documents
|
## Reference Documents
|
||||||
{docs}
|
{docs}
|
||||||
|
|
||||||
## Candidate Outputs
|
## Previous Review (iteration {iteration} of {max_iterations})
|
||||||
{candidate_outputs}
|
|
||||||
|
|
||||||
## Reviewer Findings
|
|
||||||
{reviews_bundle}
|
|
||||||
|
|
||||||
## Previous Verification Feedback
|
|
||||||
{feedback}
|
{feedback}
|
||||||
|
|
||||||
|
## Review Instructions
|
||||||
|
Review the planning package itself: the plan, checklist, and reference documents.
|
||||||
|
You MAY inspect the current repository to validate feasibility, constraints, and integration assumptions.
|
||||||
|
Do NOT write or modify code. Assume implementation has NOT started yet.
|
||||||
|
|
||||||
|
Your job is to find planning issues that would likely cause bad implementation outcomes:
|
||||||
|
- Ambiguous or contradictory requirements
|
||||||
|
- Missing acceptance criteria, constraints, edge cases, or dependencies
|
||||||
|
- Scope that is broader or more complex than the stated objective
|
||||||
|
- Checklist items that do not verify the actual requirements
|
||||||
|
- Plan details that conflict with the current codebase or architecture
|
||||||
|
|
||||||
|
If previous review results are provided above, you MUST:
|
||||||
|
1. Verify each previously reported issue — is it a real issue or a false positive?
|
||||||
|
2. Look for issues the previous review MISSED.
|
||||||
|
3. Do NOT simply repeat the previous review. Provide your own independent assessment.
|
||||||
|
4. Explicitly mark items as CONFIRMED (still an issue) or DISMISSED (false positive).
|
||||||
|
|
||||||
|
For each issue found, classify it with BOTH severity AND category:
|
||||||
|
|
||||||
|
Severity levels:
|
||||||
|
- **Critical**: The plan is likely to cause fundamentally wrong implementation or unsafe behavior.
|
||||||
|
- **Major**: Important requirements, constraints, or acceptance criteria are unclear, conflicting, missing, or incompatible with the existing system.
|
||||||
|
- **Minor**: Wording, structure, or checklist quality problems that reduce implementation clarity.
|
||||||
|
|
||||||
|
Categories:
|
||||||
|
- **Over-engineering**: The plan introduces scope, abstractions, or complexity not justified by the stated objective.
|
||||||
|
- **Omission**: A necessary requirement, constraint, acceptance criterion, edge case, dependency, or compatibility consideration is missing or incomplete.
|
||||||
|
|
||||||
|
If you find issues outside the planning scope (e.g. repository health, pre-existing code problems), report them separately under "Out of Scope Issues".
|
||||||
|
|
||||||
|
## Output Format
|
||||||
|
|
||||||
|
### Issues Found
|
||||||
|
List issues ordered by severity (Critical first):
|
||||||
|
- [Critical][Over-engineering] Description (reference specific plan/checklist item)
|
||||||
|
- [Major][Omission] Description (reference specific plan/checklist item)
|
||||||
|
- [Minor][Omission] Description (reference specific plan/checklist item)
|
||||||
|
|
||||||
|
### Out of Scope Issues
|
||||||
|
Issues found outside planning scope but worth noting:
|
||||||
|
- [Critical] Description of issue
|
||||||
|
- [Minor] Description of issue
|
||||||
|
(Write "None" if no out-of-scope issues found.)
|
||||||
|
|
||||||
|
### Summary
|
||||||
|
- Critical: N, Major: N, Minor: N
|
||||||
|
- Over-engineering count: N
|
||||||
|
- Omission count: N
|
||||||
|
- CONFIRMED: N, DISMISSED: N
|
||||||
|
- Overall quality: [BRIEF ASSESSMENT]
|
||||||
|
|
||||||
|
### Verdict
|
||||||
|
If the planning documents are clear, complete enough to implement, compatible with the current repository, and free of unjustified scope, output: VERDICT: PASS
|
||||||
|
Otherwise output: VERDICT: FAIL
|
||||||
|
"""
|
||||||
|
|
||||||
|
PLAN_REVIEW_TEMPLATE_KO = """\
|
||||||
|
당신은 구현 시작 전에 기획 문서를 검토하는 리뷰어입니다.
|
||||||
|
|
||||||
|
## 기획서
|
||||||
|
{plan}
|
||||||
|
|
||||||
|
## 체크리스트
|
||||||
|
{checklist}
|
||||||
|
|
||||||
|
## 참고 문서
|
||||||
|
{docs}
|
||||||
|
|
||||||
|
## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
|
||||||
|
{feedback}
|
||||||
|
|
||||||
|
## 검토 지침
|
||||||
|
검토 대상은 코드가 아니라 기획 패키지 자체입니다: 기획서, 체크리스트, 참고 문서를 함께 검토하세요.
|
||||||
|
현재 저장소를 살펴보며 구현 가능성, 제약조건, 통합 가정이 맞는지도 확인할 수 있습니다.
|
||||||
|
코드를 생성하거나 수정하지 마세요. 아직 구현이 시작되지 않았다고 가정하세요.
|
||||||
|
|
||||||
|
목표는 구현 단계에서 문제를 일으킬 기획 결함을 찾는 것입니다:
|
||||||
|
- 요구사항이 모호하거나 서로 충돌하는 경우
|
||||||
|
- 수용 기준, 제약조건, 엣지 케이스, 의존성이 빠진 경우
|
||||||
|
- 목표 대비 범위가 지나치게 넓거나 복잡한 경우
|
||||||
|
- 체크리스트가 실제 요구사항 검증에 충분하지 않은 경우
|
||||||
|
- 기획 내용이 현재 코드베이스나 아키텍처와 충돌하는 경우
|
||||||
|
|
||||||
|
이전 리뷰 결과가 제공된 경우 반드시:
|
||||||
|
1. 이전에 보고된 각 이슈를 검증하세요 — 진짜 이슈인지 오탐인지?
|
||||||
|
2. 이전 리뷰가 놓친 새로운 이슈를 찾으세요.
|
||||||
|
3. 이전 리뷰를 그대로 반복하지 마세요. 독립적인 평가를 제공하세요.
|
||||||
|
4. 각 항목에 CONFIRMED (여전히 이슈) 또는 DISMISSED (오탐) 태그를 명시하세요.
|
||||||
|
|
||||||
|
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
|
||||||
|
|
||||||
|
심각도:
|
||||||
|
- **Critical**: 잘못된 구현이나 위험한 동작으로 직결될 가능성이 큰 기획 결함.
|
||||||
|
- **Major**: 중요한 요구사항, 제약조건, 수용 기준이 모호하거나 충돌하거나 누락되었거나 기존 시스템과 맞지 않는 경우.
|
||||||
|
- **Minor**: 문서 표현, 구조, 체크리스트 품질 문제로 구현 명확성이 떨어지는 경우.
|
||||||
|
|
||||||
|
카테고리:
|
||||||
|
- **과최적화**: 목표 대비 불필요한 범위, 추상화, 복잡성을 기획에 추가한 경우.
|
||||||
|
- **누락**: 필요한 요구사항, 제약조건, 수용 기준, 엣지 케이스, 의존성, 호환성 고려가 빠졌거나 불완전한 경우.
|
||||||
|
|
||||||
|
기획 범위 밖에서 발견된 문제(저장소 상태, 기존 코드 문제 등)는 "범위 밖 이슈" 섹션에 별도로 보고하세요.
|
||||||
|
|
||||||
|
## 출력 형식
|
||||||
|
|
||||||
|
### 발견된 이슈
|
||||||
|
심각도 순서(Critical 먼저)로 나열:
|
||||||
|
- [Critical][과최적화] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||||
|
- [Major][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||||
|
- [Minor][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||||
|
|
||||||
|
### 범위 밖 이슈
|
||||||
|
기획 범위 밖이지만 주목할 만한 이슈:
|
||||||
|
- [Critical] 이슈 설명
|
||||||
|
- [Minor] 이슈 설명
|
||||||
|
(범위 밖 이슈가 없으면 "없음"이라고 작성하세요.)
|
||||||
|
|
||||||
|
### 요약
|
||||||
|
- Critical: N, Major: N, Minor: N
|
||||||
|
- 과최적화 수: N
|
||||||
|
- 누락 수: N
|
||||||
|
- CONFIRMED: N, DISMISSED: N
|
||||||
|
- 전체 품질: [간략한 평가]
|
||||||
|
|
||||||
|
### 판정
|
||||||
|
기획 문서가 구현 가능한 수준으로 명확하고 충분하며 현재 저장소와도 정합적이고, 불필요한 범위 확장이 없으면: VERDICT: PASS
|
||||||
|
그렇지 않으면: VERDICT: FAIL
|
||||||
|
"""
|
||||||
|
|
||||||
|
AGGREGATE_REVIEW_TEMPLATE = """\
|
||||||
|
You are adjudicating multiple review results and turning them into an actionable decision.
|
||||||
|
|
||||||
|
## Artifact References
|
||||||
|
{artifact_references}
|
||||||
|
|
||||||
|
## Previous Issue Tracker
|
||||||
|
{previous_senior_tracker}
|
||||||
|
|
||||||
|
## Execution Evidence
|
||||||
|
{execution_evidence}
|
||||||
|
|
||||||
## Instructions
|
## Instructions
|
||||||
Explore the project directory to confirm the current codebase state. Then:
|
Read the referenced plan/checklist/docs/review artifacts directly from disk. \
|
||||||
|
Explore the project directory and the referenced git commit/diff to confirm the \
|
||||||
|
current codebase state. Use the execution evidence above to verify claims against \
|
||||||
|
actual command outputs, artifact paths, and exit codes. Then:
|
||||||
1. Deduplicate overlapping issues across reviewers.
|
1. Deduplicate overlapping issues across reviewers.
|
||||||
2. Resolve disagreements explicitly.
|
2. Resolve disagreements explicitly.
|
||||||
3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
|
3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
|
||||||
4. When evidence is mixed, explain what was confirmed, what was dismissed, and what still needs follow-up.
|
4. When evidence is mixed, explain what was confirmed, what was dismissed, and what still needs follow-up.
|
||||||
5. Produce a prioritized action list for the coder.
|
5. Produce a prioritized action list for the coder.
|
||||||
6. If no confirmed issue remains, output VERDICT: PASS. Otherwise VERDICT: FAIL.
|
6. Maintain the Issue Tracker table across iterations (carry forward unresolved issues).
|
||||||
|
7. If no confirmed issue remains, output VERDICT: PASS.
|
||||||
|
8. If issues exist that the coder can fix, output VERDICT: FAIL.
|
||||||
|
9. If issues require human intervention (ambiguous requirements, architecture decisions, \
|
||||||
|
external dependency problems, or the same issue persists after 2+ fix attempts), \
|
||||||
|
output VERDICT: ESCALATE.
|
||||||
|
|
||||||
## Output Format
|
## Output Format
|
||||||
|
|
||||||
@@ -401,44 +515,47 @@ Explore the project directory to confirm the current codebase state. Then:
|
|||||||
1. Concrete fix the coder should make
|
1. Concrete fix the coder should make
|
||||||
2. Concrete fix the coder should make
|
2. Concrete fix the coder should make
|
||||||
|
|
||||||
|
## Issue Tracker
|
||||||
|
|
||||||
|
| ISS-ID | Severity | Description | Status | Since |
|
||||||
|
|--------|----------|-------------|--------|-------|
|
||||||
|
| ISS-001 | Critical | ... | Open/Fixed/Dismissed | v1 |
|
||||||
|
|
||||||
### Summary
|
### Summary
|
||||||
- Confirmed issues: N
|
- Confirmed issues: N
|
||||||
- Dismissed findings: N (false positive: N, already fixed: N)
|
- Dismissed findings: N (false positive: N, already fixed: N)
|
||||||
- Overall quality: [BRIEF ASSESSMENT]
|
- Overall quality: [BRIEF ASSESSMENT]
|
||||||
|
|
||||||
### Verdict
|
### Verdict
|
||||||
VERDICT: PASS or VERDICT: FAIL
|
VERDICT: PASS or VERDICT: FAIL or VERDICT: ESCALATE
|
||||||
"""
|
"""
|
||||||
|
|
||||||
AGGREGATE_REVIEW_TEMPLATE_KO = """\
|
AGGREGATE_REVIEW_TEMPLATE_KO = """\
|
||||||
당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다.
|
당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다.
|
||||||
|
|
||||||
## 기획서
|
## 참조 아티팩트
|
||||||
{plan}
|
{artifact_references}
|
||||||
|
|
||||||
## 체크리스트
|
## 이전 이슈 트래커
|
||||||
{checklist}
|
{previous_senior_tracker}
|
||||||
|
|
||||||
## 참고 문서
|
## 실행 증거
|
||||||
{docs}
|
{execution_evidence}
|
||||||
|
|
||||||
## 후보 결과물
|
|
||||||
{candidate_outputs}
|
|
||||||
|
|
||||||
## 개별 리뷰 결과
|
|
||||||
{reviews_bundle}
|
|
||||||
|
|
||||||
## 이전 검증 피드백
|
|
||||||
{feedback}
|
|
||||||
|
|
||||||
## 지침
|
## 지침
|
||||||
프로젝트 디렉토리를 탐색하여 현재 코드베이스 상태를 확인한 뒤 다음을 수행하세요.
|
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽어 현재 코드베이스 상태를 확인한 뒤, \
|
||||||
|
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요. \
|
||||||
|
그런 다음 아래를 수행하세요.
|
||||||
1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
|
1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
|
||||||
2. 의견 충돌은 명시적으로 정리하세요.
|
2. 의견 충돌은 명시적으로 정리하세요.
|
||||||
3. 기획서, 체크리스트, 코드, 리뷰 근거로 뒷받침되는 이슈만 남기세요.
|
3. 기획서, 체크리스트, 코드, 리뷰 근거로 뒷받침되는 이슈만 남기세요.
|
||||||
4. 근거가 엇갈리면 무엇이 확정이고 무엇이 기각 또는 추가확인 대상인지 분명히 적으세요.
|
4. 근거가 엇갈리면 무엇이 확정이고 무엇이 기각 또는 추가확인 대상인지 분명히 적으세요.
|
||||||
5. coder가 바로 수정할 수 있는 우선순위 액션 아이템을 만드세요.
|
5. coder가 바로 수정할 수 있는 우선순위 액션 아이템을 만드세요.
|
||||||
6. 확정된 이슈가 없으면 VERDICT: PASS, 있으면 VERDICT: FAIL 을 출력하세요.
|
6. 이슈 트래커 테이블을 반복 간에 유지하세요 (미해결 이슈를 이월).
|
||||||
|
7. 확정된 이슈가 없으면 VERDICT: PASS 를 출력하세요.
|
||||||
|
8. coder가 수정 가능한 이슈가 있으면 VERDICT: FAIL 을 출력하세요.
|
||||||
|
9. 사람의 개입이 필요한 이슈(모호한 요구사항, 아키텍처 결정, 외부 의존성 문제, \
|
||||||
|
동일 이슈가 2회 이상 해결 실패)가 있으면 VERDICT: ESCALATE 를 출력하세요.
|
||||||
|
|
||||||
## 출력 형식
|
## 출력 형식
|
||||||
|
|
||||||
@@ -454,26 +571,34 @@ AGGREGATE_REVIEW_TEMPLATE_KO = """\
|
|||||||
1. coder가 수정해야 할 구체적인 작업
|
1. coder가 수정해야 할 구체적인 작업
|
||||||
2. coder가 수정해야 할 구체적인 작업
|
2. coder가 수정해야 할 구체적인 작업
|
||||||
|
|
||||||
|
## 이슈 트래커
|
||||||
|
|
||||||
|
| ISS-ID | 심각도 | 설명 | 상태 | 최초 발견 |
|
||||||
|
|--------|--------|------|------|-----------|
|
||||||
|
| ISS-001 | Critical | ... | Open/Fixed/Dismissed | v1 |
|
||||||
|
|
||||||
### 요약
|
### 요약
|
||||||
- 확정 이슈 수: N
|
- 확정 이슈 수: N
|
||||||
- 기각된 주장 수: N (오탐: N, 수정 완료: N)
|
- 기각된 주장 수: N (오탐: N, 수정 완료: N)
|
||||||
- 전체 품질: [간략한 평가]
|
- 전체 품질: [간략한 평가]
|
||||||
|
|
||||||
### 판정
|
### 판정
|
||||||
VERDICT: PASS 또는 VERDICT: FAIL
|
VERDICT: PASS 또는 VERDICT: FAIL 또는 VERDICT: ESCALATE
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_TEMPLATES: dict[str, dict[str, str]] = {
|
DEFAULT_TEMPLATES: dict[str, dict[str, str]] = {
|
||||||
"en": {
|
"en": {
|
||||||
"generate": GENERATE_TEMPLATE,
|
"coding": CODING_TEMPLATE,
|
||||||
"review": REVIEW_TEMPLATE,
|
"review": REVIEW_TEMPLATE,
|
||||||
|
"plan-review": PLAN_REVIEW_TEMPLATE,
|
||||||
"review-only": REVIEW_ONLY_TEMPLATE,
|
"review-only": REVIEW_ONLY_TEMPLATE,
|
||||||
"aggregate-review": AGGREGATE_REVIEW_TEMPLATE,
|
"aggregate-review": AGGREGATE_REVIEW_TEMPLATE,
|
||||||
},
|
},
|
||||||
"ko": {
|
"ko": {
|
||||||
"generate": GENERATE_TEMPLATE_KO,
|
"coding": CODING_TEMPLATE_KO,
|
||||||
"review": REVIEW_TEMPLATE_KO,
|
"review": REVIEW_TEMPLATE_KO,
|
||||||
|
"plan-review": PLAN_REVIEW_TEMPLATE_KO,
|
||||||
"review-only": REVIEW_ONLY_TEMPLATE_KO,
|
"review-only": REVIEW_ONLY_TEMPLATE_KO,
|
||||||
"aggregate-review": AGGREGATE_REVIEW_TEMPLATE_KO,
|
"aggregate-review": AGGREGATE_REVIEW_TEMPLATE_KO,
|
||||||
},
|
},
|
||||||
@@ -544,18 +669,18 @@ def _build_named_bundle(
|
|||||||
def _build_simple_preset(
|
def _build_simple_preset(
|
||||||
coders: list[str], reviewers: list[str], seniors: list[str],
|
coders: list[str], reviewers: list[str], seniors: list[str],
|
||||||
) -> list[StepConfig]:
|
) -> list[StepConfig]:
|
||||||
"""First coder generates, first reviewer reviews."""
|
"""First coder writes code, first reviewer reviews."""
|
||||||
if not coders:
|
if not coders:
|
||||||
raise ValueError("'simple' preset requires at least 1 coder")
|
raise ValueError("'simple' preset requires at least 1 coder")
|
||||||
if not reviewers:
|
if not reviewers:
|
||||||
raise ValueError("'simple' preset requires at least 1 reviewer")
|
raise ValueError("'simple' preset requires at least 1 reviewer")
|
||||||
steps = [
|
steps = [
|
||||||
StepConfig(
|
StepConfig(
|
||||||
name="generate",
|
name="coding",
|
||||||
agent=coders[0],
|
agent=coders[0],
|
||||||
role="generate",
|
role="coding",
|
||||||
prompt_template="default:generate",
|
prompt_template="default:coding",
|
||||||
output_key="generated_code",
|
output_key="coding_output",
|
||||||
),
|
),
|
||||||
StepConfig(
|
StepConfig(
|
||||||
name="review",
|
name="review",
|
||||||
@@ -576,7 +701,7 @@ def _build_simple_preset(
|
|||||||
output_key="senior_review_result",
|
output_key="senior_review_result",
|
||||||
verdict=True,
|
verdict=True,
|
||||||
context_override={
|
context_override={
|
||||||
"candidate_outputs": "## Generated code\n{generated_code}",
|
"candidate_outputs": "## Coding output\n{coding_output}",
|
||||||
"reviews_bundle": f"## Review: {reviewers[0]} (review)\n{{review_result}}",
|
"reviews_bundle": f"## Review: {reviewers[0]} (review)\n{{review_result}}",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
@@ -587,25 +712,25 @@ def _build_simple_preset(
|
|||||||
def _build_cross_review_preset(
|
def _build_cross_review_preset(
|
||||||
coders: list[str], reviewers: list[str], seniors: list[str],
|
coders: list[str], reviewers: list[str], seniors: list[str],
|
||||||
) -> list[StepConfig]:
|
) -> list[StepConfig]:
|
||||||
"""Both coders generate, then cross-review each other's output."""
|
"""Both coders write code, then cross-review each other's output."""
|
||||||
if len(coders) < 2:
|
if len(coders) < 2:
|
||||||
raise ValueError("'cross-review' preset requires at least 2 coders")
|
raise ValueError("'cross-review' preset requires at least 2 coders")
|
||||||
a, b = coders[0], coders[1]
|
a, b = coders[0], coders[1]
|
||||||
ak, bk = _unique_safe_keys([a, b])
|
ak, bk = _unique_safe_keys([a, b])
|
||||||
steps = [
|
steps = [
|
||||||
StepConfig(
|
StepConfig(
|
||||||
name=f"generate_{ak}",
|
name=f"coding_{ak}",
|
||||||
agent=a,
|
agent=a,
|
||||||
role="generate",
|
role="coding",
|
||||||
prompt_template="default:generate",
|
prompt_template="default:coding",
|
||||||
output_key=f"code_{ak}",
|
output_key=f"code_{ak}",
|
||||||
parallel=True,
|
parallel=True,
|
||||||
),
|
),
|
||||||
StepConfig(
|
StepConfig(
|
||||||
name=f"generate_{bk}",
|
name=f"coding_{bk}",
|
||||||
agent=b,
|
agent=b,
|
||||||
role="generate",
|
role="coding",
|
||||||
prompt_template="default:generate",
|
prompt_template="default:coding",
|
||||||
output_key=f"code_{bk}",
|
output_key=f"code_{bk}",
|
||||||
parallel=True,
|
parallel=True,
|
||||||
),
|
),
|
||||||
@@ -615,7 +740,7 @@ def _build_cross_review_preset(
|
|||||||
role="review",
|
role="review",
|
||||||
prompt_template="default:review",
|
prompt_template="default:review",
|
||||||
output_key=f"review_by_{ak}",
|
output_key=f"review_by_{ak}",
|
||||||
context_override={"generated_code": f"{{code_{bk}}}"},
|
context_override={"coding_output": f"{{code_{bk}}}"},
|
||||||
parallel=True,
|
parallel=True,
|
||||||
verdict=not seniors,
|
verdict=not seniors,
|
||||||
),
|
),
|
||||||
@@ -626,7 +751,7 @@ def _build_cross_review_preset(
|
|||||||
prompt_template="default:review",
|
prompt_template="default:review",
|
||||||
output_key=f"review_by_{bk}",
|
output_key=f"review_by_{bk}",
|
||||||
verdict=not seniors,
|
verdict=not seniors,
|
||||||
context_override={"generated_code": f"{{code_{ak}}}"},
|
context_override={"coding_output": f"{{code_{ak}}}"},
|
||||||
parallel=True,
|
parallel=True,
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
@@ -642,9 +767,9 @@ def _build_cross_review_preset(
|
|||||||
context_override={
|
context_override={
|
||||||
"candidate_outputs": _build_named_bundle(
|
"candidate_outputs": _build_named_bundle(
|
||||||
[a, b],
|
[a, b],
|
||||||
[f"generate_{ak}", f"generate_{bk}"],
|
[f"coding_{ak}", f"coding_{bk}"],
|
||||||
[f"code_{ak}", f"code_{bk}"],
|
[f"code_{ak}", f"code_{bk}"],
|
||||||
"Candidate",
|
"Coding Output",
|
||||||
),
|
),
|
||||||
"reviews_bundle": _build_named_bundle(
|
"reviews_bundle": _build_named_bundle(
|
||||||
[a, b],
|
[a, b],
|
||||||
@@ -715,6 +840,61 @@ def _build_review_only_preset(
|
|||||||
return steps
|
return steps
|
||||||
|
|
||||||
|
|
||||||
|
def _build_plan_review_preset(
|
||||||
|
coders: list[str], reviewers: list[str], seniors: list[str],
|
||||||
|
) -> list[StepConfig]:
|
||||||
|
"""Plan-review: reviewers audit planning docs before implementation."""
|
||||||
|
if not reviewers:
|
||||||
|
raise ValueError("'plan-review' preset requires at least 1 reviewer")
|
||||||
|
|
||||||
|
if len(reviewers) == 1 and not seniors:
|
||||||
|
return [
|
||||||
|
StepConfig(
|
||||||
|
name="plan_review",
|
||||||
|
agent=reviewers[0],
|
||||||
|
role="review",
|
||||||
|
prompt_template="default:plan-review",
|
||||||
|
output_key="plan_review_result",
|
||||||
|
verdict=True,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
steps: list[StepConfig] = []
|
||||||
|
reviewer_keys = _unique_safe_keys(reviewers)
|
||||||
|
for reviewer, rk in zip(reviewers, reviewer_keys):
|
||||||
|
steps.append(
|
||||||
|
StepConfig(
|
||||||
|
name=f"plan_review_{rk}",
|
||||||
|
agent=reviewer,
|
||||||
|
role="review",
|
||||||
|
prompt_template="default:plan-review",
|
||||||
|
output_key=f"plan_review_{rk}",
|
||||||
|
verdict=not seniors,
|
||||||
|
parallel=True,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
if seniors:
|
||||||
|
step_names = [f"plan_review_{rk}" for rk in reviewer_keys]
|
||||||
|
output_keys = [f"plan_review_{rk}" for rk in reviewer_keys]
|
||||||
|
steps.append(
|
||||||
|
StepConfig(
|
||||||
|
name="senior_review",
|
||||||
|
agent=seniors[0],
|
||||||
|
role="review",
|
||||||
|
prompt_template="default:aggregate-review",
|
||||||
|
output_key="senior_review_result",
|
||||||
|
verdict=True,
|
||||||
|
context_override={
|
||||||
|
"candidate_outputs": "Planning documents under review (plan/checklist/reference docs).",
|
||||||
|
"reviews_bundle": _build_named_bundle(
|
||||||
|
reviewers, step_names, output_keys, "Review",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return steps
|
||||||
|
|
||||||
|
|
||||||
def _build_review_fix_preset(
|
def _build_review_fix_preset(
|
||||||
coders: list[str], reviewers: list[str], seniors: list[str],
|
coders: list[str], reviewers: list[str], seniors: list[str],
|
||||||
) -> list[PhaseConfig]:
|
) -> list[PhaseConfig]:
|
||||||
@@ -762,11 +942,11 @@ def _build_review_fix_preset(
|
|||||||
},
|
},
|
||||||
),
|
),
|
||||||
StepConfig(
|
StepConfig(
|
||||||
name="generate",
|
name="coding",
|
||||||
agent=fix_coder,
|
agent=fix_coder,
|
||||||
role="generate",
|
role="coding",
|
||||||
prompt_template="default:generate",
|
prompt_template="default:coding",
|
||||||
output_key="generated_code",
|
output_key="coding_output",
|
||||||
context_override={"feedback": "{aggregate_review}"},
|
context_override={"feedback": "{aggregate_review}"},
|
||||||
),
|
),
|
||||||
StepConfig(
|
StepConfig(
|
||||||
@@ -784,14 +964,44 @@ def _build_review_fix_preset(
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _build_coding_review_fix_preset(
|
||||||
|
coders: list[str], reviewers: list[str], seniors: list[str],
|
||||||
|
) -> list[PhaseConfig]:
|
||||||
|
"""Write code once, then run the review-fix convergence loop."""
|
||||||
|
if not coders:
|
||||||
|
raise ValueError("'coding-review-fix' preset requires at least 1 coder")
|
||||||
|
if not reviewers:
|
||||||
|
raise ValueError("'coding-review-fix' preset requires at least 1 reviewer")
|
||||||
|
|
||||||
|
return [
|
||||||
|
PhaseConfig(
|
||||||
|
name="initial_coding",
|
||||||
|
steps=[
|
||||||
|
StepConfig(
|
||||||
|
name="coding",
|
||||||
|
agent=coders[0],
|
||||||
|
role="coding",
|
||||||
|
prompt_template="default:coding",
|
||||||
|
output_key="coding_output",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
max_iterations=1,
|
||||||
|
consecutive_pass=1,
|
||||||
|
),
|
||||||
|
*_build_review_fix_preset(coders, reviewers, seniors),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
PIPELINE_PRESETS: dict[str, Callable] = {
|
PIPELINE_PRESETS: dict[str, Callable] = {
|
||||||
"simple": _build_simple_preset,
|
"simple": _build_simple_preset,
|
||||||
"cross-review": _build_cross_review_preset,
|
"cross-review": _build_cross_review_preset,
|
||||||
|
"plan-review": _build_plan_review_preset,
|
||||||
"review-only": _build_review_only_preset,
|
"review-only": _build_review_only_preset,
|
||||||
}
|
}
|
||||||
|
|
||||||
PHASED_PRESETS: dict[str, Callable] = {
|
PHASED_PRESETS: dict[str, Callable] = {
|
||||||
"review-fix": _build_review_fix_preset,
|
"review-fix": _build_review_fix_preset,
|
||||||
|
"coding-review-fix": _build_coding_review_fix_preset,
|
||||||
}
|
}
|
||||||
|
|
||||||
ALL_PRESET_NAMES: list[str] = list(PIPELINE_PRESETS.keys()) + list(PHASED_PRESETS.keys())
|
ALL_PRESET_NAMES: list[str] = list(PIPELINE_PRESETS.keys()) + list(PHASED_PRESETS.keys())
|
||||||
@@ -805,7 +1015,7 @@ def resolve_template(template_ref: str, templates_dir: Optional[Path] = None) ->
|
|||||||
"""Resolve a template reference to its content string.
|
"""Resolve a template reference to its content string.
|
||||||
|
|
||||||
Formats:
|
Formats:
|
||||||
- "default:generate" -> built-in GENERATE_TEMPLATE
|
- "default:coding" -> built-in CODING_TEMPLATE
|
||||||
- "default:review" -> built-in REVIEW_TEMPLATE
|
- "default:review" -> built-in REVIEW_TEMPLATE
|
||||||
- "path/to/file.md" -> read file contents
|
- "path/to/file.md" -> read file contents
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -48,11 +48,22 @@ _STRINGS: dict[str, dict[str, str]] = {
|
|||||||
"pass_msg": "All checklist items satisfied. No over-engineering or omissions detected.",
|
"pass_msg": "All checklist items satisfied. No over-engineering or omissions detected.",
|
||||||
"fail_phased": "Pipeline phases ({phases}) completed without full convergence.",
|
"fail_phased": "Pipeline phases ({phases}) completed without full convergence.",
|
||||||
"fail_simple": "Maximum iterations ({max_iter}) reached without passing all checks.",
|
"fail_simple": "Maximum iterations ({max_iter}) reached without passing all checks.",
|
||||||
|
"escalate_msg": "Human review required. The following issues could not be resolved automatically:",
|
||||||
|
"escalate_title": "Escalation Report",
|
||||||
|
"issue_tracker_title": "Issue Tracker Summary",
|
||||||
|
"issue_tracker_desc": "Issues discovered across iterations and their final resolution status.",
|
||||||
"metrics_title": "Review Metrics",
|
"metrics_title": "Review Metrics",
|
||||||
"metrics_trend_title": "Metrics Trend",
|
"metrics_trend_title": "Metrics Trend",
|
||||||
"metrics_iter": "Iter",
|
"metrics_iter": "Iter",
|
||||||
"metrics_total_issues": "Total Issues",
|
"metrics_total_issues": "Total Issues",
|
||||||
"metrics_na": "N/A",
|
"metrics_na": "N/A",
|
||||||
|
"iteration_details": "Iteration Details",
|
||||||
|
"evidence_summary": "Evidence Summary",
|
||||||
|
"evidence_agent": "Agent",
|
||||||
|
"evidence_exit_code": "Exit Code",
|
||||||
|
"evidence_duration": "Duration",
|
||||||
|
"evidence_output_size": "Output Size",
|
||||||
|
"evidence_transcript": "Execution transcript",
|
||||||
},
|
},
|
||||||
"ko": {
|
"ko": {
|
||||||
"title": "교차 검증 리포트",
|
"title": "교차 검증 리포트",
|
||||||
@@ -84,11 +95,22 @@ _STRINGS: dict[str, dict[str, str]] = {
|
|||||||
"pass_msg": "모든 체크리스트 항목 충족. 과최적화/누락 없음.",
|
"pass_msg": "모든 체크리스트 항목 충족. 과최적화/누락 없음.",
|
||||||
"fail_phased": "파이프라인 페이즈 ({phases}) 완료, 완전한 수렴에 도달하지 못함.",
|
"fail_phased": "파이프라인 페이즈 ({phases}) 완료, 완전한 수렴에 도달하지 못함.",
|
||||||
"fail_simple": "최대 반복 횟수 ({max_iter})에 도달, 모든 검증을 통과하지 못함.",
|
"fail_simple": "최대 반복 횟수 ({max_iter})에 도달, 모든 검증을 통과하지 못함.",
|
||||||
|
"escalate_msg": "사람의 확인이 필요합니다. 아래 이슈는 자동으로 해결할 수 없었습니다:",
|
||||||
|
"escalate_title": "에스컬레이션 리포트",
|
||||||
|
"issue_tracker_title": "이슈 트래커 요약",
|
||||||
|
"issue_tracker_desc": "반복 과정에서 발견된 이슈와 최종 처리 상태입니다.",
|
||||||
"metrics_title": "리뷰 메트릭",
|
"metrics_title": "리뷰 메트릭",
|
||||||
"metrics_trend_title": "메트릭 추이",
|
"metrics_trend_title": "메트릭 추이",
|
||||||
"metrics_iter": "반복",
|
"metrics_iter": "반복",
|
||||||
"metrics_total_issues": "총 이슈",
|
"metrics_total_issues": "총 이슈",
|
||||||
"metrics_na": "해당 없음",
|
"metrics_na": "해당 없음",
|
||||||
|
"iteration_details": "반복 상세",
|
||||||
|
"evidence_summary": "실행 증거 요약",
|
||||||
|
"evidence_agent": "에이전트",
|
||||||
|
"evidence_exit_code": "종료 코드",
|
||||||
|
"evidence_duration": "소요 시간",
|
||||||
|
"evidence_output_size": "출력 크기",
|
||||||
|
"evidence_transcript": "실행 트랜스크립트",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -181,20 +203,41 @@ def _build_simple_report(
|
|||||||
|
|
||||||
out_of_scope_items: list[tuple[int, str]] = []
|
out_of_scope_items: list[tuple[int, str]] = []
|
||||||
|
|
||||||
|
# Pre-scan iterations to collect out-of-scope items and review metrics
|
||||||
|
# (needed before rendering final verdict / metrics sections)
|
||||||
for iter_result in result.iterations:
|
for iter_result in result.iterations:
|
||||||
lines.append("---\n")
|
for step in config.pipeline:
|
||||||
lines.append(f"## {_t(config, 'iteration')} {iter_result.iteration}\n")
|
output = iter_result.step_outputs.get(step.output_key, "")
|
||||||
|
if step.role == "review":
|
||||||
|
oos = _extract_out_of_scope(output)
|
||||||
|
if oos:
|
||||||
|
out_of_scope_items.append((iter_result.iteration, oos))
|
||||||
|
step_metrics = parse_review_metrics(output)
|
||||||
|
if iter_result.review_metrics is None:
|
||||||
|
iter_result.review_metrics = step_metrics
|
||||||
|
else:
|
||||||
|
iter_result.review_metrics = _aggregate_metrics(
|
||||||
|
iter_result.review_metrics, step_metrics,
|
||||||
|
)
|
||||||
|
|
||||||
_append_iteration_steps(lines, config, iter_result, config.pipeline, out_of_scope_items)
|
_append_final_verdict(lines, config, result)
|
||||||
|
_append_issue_tracker_summary(lines, config, result)
|
||||||
|
_append_review_metrics_table(lines, config, result)
|
||||||
|
|
||||||
|
lines.append("---\n")
|
||||||
|
lines.append(f"## {_t(config, 'iteration_details')}\n")
|
||||||
|
|
||||||
|
for iter_result in result.iterations:
|
||||||
|
lines.append(f"### {_t(config, 'iteration')} {iter_result.iteration}\n")
|
||||||
|
|
||||||
|
_append_iteration_steps(lines, config, iter_result, config.pipeline, out_of_scope_items, skip_extraction=True)
|
||||||
|
|
||||||
if iter_result.feedback:
|
if iter_result.feedback:
|
||||||
lines.append(f"**{_t(config, 'feedback_next')}** {iter_result.feedback[:200]}...")
|
lines.append(f"**{_t(config, 'feedback_next')}** {iter_result.feedback[:200]}...")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
_append_out_of_scope(lines, config, out_of_scope_items)
|
_append_out_of_scope(lines, config, out_of_scope_items)
|
||||||
_append_review_metrics_table(lines, config, result)
|
|
||||||
_append_repeated_aggregate(lines, config, result)
|
_append_repeated_aggregate(lines, config, result)
|
||||||
_append_final_verdict(lines, config, result)
|
|
||||||
|
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
@@ -211,14 +254,42 @@ def _build_phased_report(
|
|||||||
phase_map = {p.name: p for p in config.phases}
|
phase_map = {p.name: p for p in config.phases}
|
||||||
out_of_scope_items: list[tuple[int, str]] = []
|
out_of_scope_items: list[tuple[int, str]] = []
|
||||||
|
|
||||||
|
# Pre-scan iterations to collect out-of-scope items and review metrics
|
||||||
|
for phase_name, phase_iters_iter in groupby(
|
||||||
|
result.iterations, key=lambda ir: ir.phase_name,
|
||||||
|
):
|
||||||
|
phase_iters = list(phase_iters_iter)
|
||||||
|
phase_config = phase_map.get(phase_name or "")
|
||||||
|
steps = phase_config.steps if phase_config else config.pipeline
|
||||||
|
for iter_result in phase_iters:
|
||||||
|
for step in steps:
|
||||||
|
output = iter_result.step_outputs.get(step.output_key, "")
|
||||||
|
if step.role == "review":
|
||||||
|
oos = _extract_out_of_scope(output)
|
||||||
|
if oos:
|
||||||
|
out_of_scope_items.append((iter_result.iteration, oos))
|
||||||
|
step_metrics = parse_review_metrics(output)
|
||||||
|
if iter_result.review_metrics is None:
|
||||||
|
iter_result.review_metrics = step_metrics
|
||||||
|
else:
|
||||||
|
iter_result.review_metrics = _aggregate_metrics(
|
||||||
|
iter_result.review_metrics, step_metrics,
|
||||||
|
)
|
||||||
|
|
||||||
|
_append_final_verdict(lines, config, result)
|
||||||
|
_append_issue_tracker_summary(lines, config, result)
|
||||||
|
_append_review_metrics_table(lines, config, result)
|
||||||
|
|
||||||
|
lines.append("---\n")
|
||||||
|
lines.append(f"## {_t(config, 'iteration_details')}\n")
|
||||||
|
|
||||||
for phase_name, phase_iters_iter in groupby(
|
for phase_name, phase_iters_iter in groupby(
|
||||||
result.iterations, key=lambda ir: ir.phase_name,
|
result.iterations, key=lambda ir: ir.phase_name,
|
||||||
):
|
):
|
||||||
phase_iters = list(phase_iters_iter)
|
phase_iters = list(phase_iters_iter)
|
||||||
phase_config = phase_map.get(phase_name or "")
|
phase_config = phase_map.get(phase_name or "")
|
||||||
|
|
||||||
lines.append("---\n")
|
lines.append(f"### {_t(config, 'phase')}: {phase_name}\n")
|
||||||
lines.append(f"## {_t(config, 'phase')}: {phase_name}\n")
|
|
||||||
|
|
||||||
if phase_config:
|
if phase_config:
|
||||||
step_desc = " → ".join(s.name for s in phase_config.steps)
|
step_desc = " → ".join(s.name for s in phase_config.steps)
|
||||||
@@ -242,14 +313,17 @@ def _build_phased_report(
|
|||||||
verdict_label += " ✓"
|
verdict_label += " ✓"
|
||||||
else:
|
else:
|
||||||
verdict_label = " — PASS ✓"
|
verdict_label = " — PASS ✓"
|
||||||
|
elif iter_result.verdict == "ESCALATE":
|
||||||
|
consecutive = 0
|
||||||
|
verdict_label = " — ESCALATE"
|
||||||
else:
|
else:
|
||||||
consecutive = 0
|
consecutive = 0
|
||||||
verdict_label = " — FAIL"
|
verdict_label = " — FAIL"
|
||||||
|
|
||||||
lines.append(
|
lines.append(
|
||||||
f"### {_t(config, 'iteration')} {iter_result.iteration}{verdict_label}\n"
|
f"#### {_t(config, 'iteration')} {iter_result.iteration}{verdict_label}\n"
|
||||||
)
|
)
|
||||||
_append_iteration_steps(lines, config, iter_result, steps, out_of_scope_items)
|
_append_iteration_steps(lines, config, iter_result, steps, out_of_scope_items, skip_extraction=True)
|
||||||
|
|
||||||
if iter_result.feedback:
|
if iter_result.feedback:
|
||||||
lines.append(
|
lines.append(
|
||||||
@@ -258,9 +332,7 @@ def _build_phased_report(
|
|||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
_append_out_of_scope(lines, config, out_of_scope_items)
|
_append_out_of_scope(lines, config, out_of_scope_items)
|
||||||
_append_review_metrics_table(lines, config, result)
|
|
||||||
_append_repeated_aggregate(lines, config, result)
|
_append_repeated_aggregate(lines, config, result)
|
||||||
_append_final_verdict(lines, config, result)
|
|
||||||
|
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
@@ -309,8 +381,38 @@ def _append_iteration_steps(
|
|||||||
iter_result: IterationResult,
|
iter_result: IterationResult,
|
||||||
steps: list[StepConfig],
|
steps: list[StepConfig],
|
||||||
out_of_scope_items: list[tuple[int, str]],
|
out_of_scope_items: list[tuple[int, str]],
|
||||||
|
*,
|
||||||
|
skip_extraction: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Append step details for one iteration."""
|
"""Append step details for one iteration.
|
||||||
|
|
||||||
|
If *skip_extraction* is True, out-of-scope and review-metrics parsing
|
||||||
|
is skipped (useful when a pre-scan already collected that data).
|
||||||
|
"""
|
||||||
|
# Evidence summary table — quick overview of all steps' execution data
|
||||||
|
has_evidence = any(
|
||||||
|
iter_result.step_results.get(s.output_key) for s in steps
|
||||||
|
)
|
||||||
|
if has_evidence:
|
||||||
|
s_step = _t(config, "step")
|
||||||
|
s_agent = _t(config, "evidence_agent")
|
||||||
|
s_exit = _t(config, "evidence_exit_code")
|
||||||
|
s_dur = _t(config, "evidence_duration")
|
||||||
|
s_size = _t(config, "evidence_output_size")
|
||||||
|
lines.append(f"**{_t(config, 'evidence_summary')}**\n")
|
||||||
|
lines.append(f"| {s_step} | {s_agent} | {s_exit} | {s_dur} | {s_size} |")
|
||||||
|
lines.append("|------|-------|-----------|----------|-------------|")
|
||||||
|
for step in steps:
|
||||||
|
ar = iter_result.step_results.get(step.output_key)
|
||||||
|
out = iter_result.step_outputs.get(step.output_key, "")
|
||||||
|
if ar:
|
||||||
|
lines.append(
|
||||||
|
f"| {step.name} | {ar.agent_name} "
|
||||||
|
f"| {ar.exit_code} | {ar.duration_seconds}s "
|
||||||
|
f"| {len(out)} chars |"
|
||||||
|
)
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
for step in steps:
|
for step in steps:
|
||||||
agent_result = iter_result.step_results.get(step.output_key)
|
agent_result = iter_result.step_results.get(step.output_key)
|
||||||
output = iter_result.step_outputs.get(step.output_key, "")
|
output = iter_result.step_outputs.get(step.output_key, "")
|
||||||
@@ -320,6 +422,11 @@ def _append_iteration_steps(
|
|||||||
|
|
||||||
lines.append(f"### {_t(config, 'step')}: {step.name} ({agent_name}){duration}\n")
|
lines.append(f"### {_t(config, 'step')}: {step.name} ({agent_name}){duration}\n")
|
||||||
|
|
||||||
|
# Show command preview and exit code for execution evidence
|
||||||
|
if agent_result and agent_result.command_preview:
|
||||||
|
lines.append(f"**Command**: `{agent_result.command_preview}`")
|
||||||
|
lines.append(f"**Exit code**: {agent_result.exit_code}\n")
|
||||||
|
|
||||||
if step.verdict and iter_result.verdict:
|
if step.verdict and iter_result.verdict:
|
||||||
lines.append(f"**{_t(config, 'verdict')}: {iter_result.verdict}**\n")
|
lines.append(f"**{_t(config, 'verdict')}: {iter_result.verdict}**\n")
|
||||||
|
|
||||||
@@ -334,7 +441,18 @@ def _append_iteration_steps(
|
|||||||
lines.append(output)
|
lines.append(output)
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
if step.role == "review":
|
# Include transcript excerpt for execution evidence visibility
|
||||||
|
if agent_result and agent_result.transcript:
|
||||||
|
transcript_preview = agent_result.transcript[:1500]
|
||||||
|
if len(agent_result.transcript) > 1500:
|
||||||
|
transcript_preview += "\n... (truncated)"
|
||||||
|
transcript_label = _t(config, "evidence_transcript")
|
||||||
|
lines.append("<details>")
|
||||||
|
lines.append(f"<summary>{transcript_label}</summary>\n")
|
||||||
|
lines.append(transcript_preview)
|
||||||
|
lines.append("\n</details>\n")
|
||||||
|
|
||||||
|
if not skip_extraction and step.role == "review":
|
||||||
oos = _extract_out_of_scope(output)
|
oos = _extract_out_of_scope(output)
|
||||||
if oos:
|
if oos:
|
||||||
out_of_scope_items.append((iter_result.iteration, oos))
|
out_of_scope_items.append((iter_result.iteration, oos))
|
||||||
@@ -469,8 +587,18 @@ def _append_final_verdict(
|
|||||||
lines.append("---\n")
|
lines.append("---\n")
|
||||||
lines.append(f"## {_t(config, 'final_verdict_title')}: {result.final_verdict}\n")
|
lines.append(f"## {_t(config, 'final_verdict_title')}: {result.final_verdict}\n")
|
||||||
|
|
||||||
|
if result.agentic_branch:
|
||||||
|
lines.append(f"**Agentic branch**: `{result.agentic_branch}`")
|
||||||
|
lines.append(f"```bash\ngit checkout {result.agentic_branch}\n```\n")
|
||||||
|
|
||||||
if result.final_verdict == "PASS":
|
if result.final_verdict == "PASS":
|
||||||
lines.append(_t(config, "pass_msg"))
|
lines.append(_t(config, "pass_msg"))
|
||||||
|
elif result.final_verdict == "ESCALATE":
|
||||||
|
lines.append(_t(config, "escalate_msg"))
|
||||||
|
lines.append("")
|
||||||
|
for issue in result.escalated_issues:
|
||||||
|
lines.append(f"- {issue}")
|
||||||
|
lines.append("")
|
||||||
else:
|
else:
|
||||||
if config.phases:
|
if config.phases:
|
||||||
phase_names = " → ".join(p.name for p in config.phases)
|
phase_names = " → ".join(p.name for p in config.phases)
|
||||||
@@ -481,6 +609,121 @@ def _append_final_verdict(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Issue Tracker extraction from senior/aggregate outputs
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_ISSUE_TRACKER_PATTERN = re.compile(
|
||||||
|
r"##+ (?:Issue Tracker|이슈 트래커)[^\n]*\n((?:\|[^\n]+\|\n?)+)",
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
|
||||||
|
_TRACKER_ROW_PATTERN = re.compile(
|
||||||
|
r"^\|\s*(ISS-\d+)\s*\|\s*(\S+)\s*\|\s*(.*?)\s*\|\s*(\S+)\s*\|\s*(\S+)\s*\|",
|
||||||
|
re.MULTILINE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_issue_tracker_rows(
    result: PipelineResult,
) -> list[dict[str, str]]:
    """Return issue-tracker rows for the report, newest table first.

    Pass 1 walks iterations from last to first and returns the rows of the
    first step output that contains an "Issue Tracker" table with at least
    one ISS-NNN row.

    Pass 2 (only when no such table exists) walks iterations in order and
    collects ``ISS-NNN [severity][...] text`` mentions, keeping the first
    occurrence of each id, with status "Open" and the description cut to
    80 characters.

    Each row is a dict with keys: id, severity, description, status, since.
    """
    # Pass 1: most recent tracker table wins.
    for iteration in reversed(result.iterations):
        for text in iteration.step_outputs.values():
            table = _ISSUE_TRACKER_PATTERN.search(text)
            if table is None:
                continue
            parsed = [
                {
                    "id": issue_id,
                    "severity": severity,
                    "description": description.strip(),
                    "status": status,
                    "since": since,
                }
                for issue_id, severity, description, status, since
                in _TRACKER_ROW_PATTERN.findall(table.group(1))
            ]
            if parsed:
                return parsed

    # Pass 2: fall back to inline ISS-NNN tags; first sighting of an id wins.
    tagged_issue = re.compile(r"(ISS-\d+)\s*\[(\w+)\]\[.*?\]\s*(.*?)(?:\n|$)")
    first_seen: dict[str, dict[str, str]] = {}
    for iteration in result.iterations:
        for text in iteration.step_outputs.values():
            for hit in tagged_issue.finditer(text):
                issue_id, severity, summary = hit.groups()
                if issue_id in first_seen:
                    continue
                first_seen[issue_id] = {
                    "id": issue_id,
                    "severity": severity,
                    "description": summary.strip()[:80],
                    "status": "Open",
                    "since": f"v{iteration.iteration}",
                }
    return list(first_seen.values())
|
||||||
|
|
||||||
|
|
||||||
|
def _append_issue_tracker_summary(
    lines: list[str],
    config: PipelineConfig,
    result: PipelineResult,
) -> None:
    """Append the consolidated issue-tracker section to ``lines``.

    Nothing is appended when no tracker rows can be extracted from
    ``result``.  Otherwise a rule, a localized title and description, a
    table header (Korean when ``config.language`` is "ko", English
    otherwise) and one table line per row are appended, then a blank entry.
    """
    tracker_rows = _extract_issue_tracker_rows(result)
    if not tracker_rows:
        return

    lines.append("---\n")
    lines.append(f"## {_t(config, 'issue_tracker_title')}\n")
    lines.append(f"{_t(config, 'issue_tracker_desc')}\n")

    # A config without a ``language`` attribute is treated as English.
    use_korean = getattr(config, "language", "en") == "ko"
    lines.append(
        "| ISS-ID | 심각도 | 설명 | 상태 | 최초 발견 |"
        if use_korean
        else "| ISS-ID | Severity | Description | Status | Since |"
    )
    lines.append("|--------|----------|-------------|--------|-------|")

    lines.extend(
        f"| {row['id']} | {row['severity']} "
        f"| {row['description']} | {row['status']} | {row['since']} |"
        for row in tracker_rows
    )
    lines.append("")
|
||||||
|
|
||||||
|
|
||||||
|
def print_escalation_report(
    config: PipelineConfig,
    result: PipelineResult,
) -> None:
    """Print the escalation banner and unresolved issues to stdout.

    The output is a red/bold framed title, the localized escalation message
    in yellow, one bullet per entry of ``result.escalated_issues``, and a
    closing red/bold rule.
    """
    # ANSI SGR escape sequences used for the banner.
    red, yellow, bold, reset = "\033[31m", "\033[33m", "\033[1m", "\033[0m"
    rule = "=" * 60

    heading = _t(config, "escalate_title")
    intro = _t(config, "escalate_msg")

    print(f"\n{red}{bold}{rule}")
    print(f" {heading}")
    print(f"{rule}{reset}\n")
    print(f"{yellow}{intro}{reset}\n")

    for unresolved in result.escalated_issues:
        print(f" {red}•{reset} {unresolved}")

    print(f"\n{red}{bold}{rule}{reset}\n")
|
||||||
|
|
||||||
|
|
||||||
def _append_repeated_aggregate(
|
def _append_repeated_aggregate(
|
||||||
lines: list[str],
|
lines: list[str],
|
||||||
config: PipelineConfig,
|
config: PipelineConfig,
|
||||||
|
|||||||
167
cross_eval/runtime_env.py
Normal file
167
cross_eval/runtime_env.py
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
"""Helpers for building agent runtime environments from .env files."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from cross_eval.models import ExecutionConfig
|
||||||
|
|
||||||
|
_SUMMARY_PREFIXES = (
|
||||||
|
"CLICKHOUSE",
|
||||||
|
"CH_",
|
||||||
|
"DB_",
|
||||||
|
"DATABASE",
|
||||||
|
"PG",
|
||||||
|
"POSTGRES",
|
||||||
|
"MYSQL",
|
||||||
|
"MARIADB",
|
||||||
|
"REDIS",
|
||||||
|
"MONGO",
|
||||||
|
"ELASTICSEARCH",
|
||||||
|
"OPENSEARCH",
|
||||||
|
"DYNAMO",
|
||||||
|
"CASSANDRA",
|
||||||
|
"KAFKA",
|
||||||
|
"RABBIT",
|
||||||
|
"AMQP",
|
||||||
|
"NEO4J",
|
||||||
|
"SQLITE",
|
||||||
|
"MEMCACHED",
|
||||||
|
"AWS",
|
||||||
|
"S3",
|
||||||
|
"MINIO",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_quotes(value: str) -> str:
|
||||||
|
if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"'}:
|
||||||
|
unwrapped = value[1:-1]
|
||||||
|
if value[0] == '"':
|
||||||
|
return bytes(unwrapped, "utf-8").decode("unicode_escape")
|
||||||
|
return unwrapped
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
def parse_dotenv(path: Path) -> dict[str, str]:
    """Parse a simple dotenv file into key/value pairs.

    Rules applied per line:
    - blank lines and lines starting with ``#`` are skipped;
    - a leading ``export `` is dropped;
    - lines without ``=`` or with an empty key are skipped;
    - the line is split on the first ``=``; key and value are stripped and
      the value is passed through ``_strip_quotes``;
    - a key that appears more than once keeps its last value.

    The file is read as UTF-8.  A leading byte-order mark is discarded so
    it cannot become part of the first key.
    """
    values: dict[str, str] = {}
    # "utf-8-sig" decodes plain UTF-8 identically and drops a leading BOM.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("export "):
            line = line[len("export "):].strip()
        key, separator, value = line.partition("=")
        if not separator:
            continue
        key = key.strip()
        if not key:
            continue
        values[key] = _strip_quotes(value.strip())
    return values
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_env_files(execution: ExecutionConfig, project_root: Path) -> list[Path]:
    """Return the existing env files named by ``execution``, deduplicated.

    ``execution.env_files`` entries come first (relative ones are joined to
    ``project_root``), followed by ``execution.auto_env_files`` joined to
    ``project_root``.  Each candidate is resolved; candidates that were
    already seen, do not exist, or are not regular files are dropped.
    First-seen order is preserved.
    """
    explicit = [
        entry if entry.is_absolute() else project_root / entry
        for entry in map(Path, execution.env_files)
    ]
    automatic = [project_root / raw for raw in execution.auto_env_files]

    # A dict keeps insertion order and gives O(1) duplicate checks.
    unique: dict[Path, None] = {}
    for candidate in (*explicit, *automatic):
        try:
            target = candidate.resolve()
        except OSError:
            # Use the unresolved path if resolution fails.
            target = candidate
        if target in unique:
            continue
        if not target.exists():
            continue
        if not target.is_file():
            continue
        unique[target] = None
    return list(unique)
|
||||||
|
|
||||||
|
|
||||||
|
def build_runtime_environment(
    execution: ExecutionConfig,
    project_root: Path,
) -> tuple[dict[str, str], list[Path], dict[str, str]]:
    """Assemble the subprocess environment and describe what was loaded.

    Returns a tuple ``(env, loaded_files, loaded_values)``:
    - ``env``: a copy of ``os.environ`` when ``execution.inherit_env`` is
      truthy (otherwise empty), without ``CLAUDECODE``, overlaid with the
      values parsed from each env file in order (later files win);
    - ``loaded_files``: the paths returned by ``resolve_env_files``;
    - ``loaded_values``: only the key/value pairs that came from env files.
    """
    env: dict[str, str] = dict(os.environ) if execution.inherit_env else {}

    # Dropping CLAUDECODE avoids "nested session" errors when Claude Code
    # is spawned as a subprocess from within a Claude Code session.
    env.pop("CLAUDECODE", None)

    loaded_files = resolve_env_files(execution, project_root)
    loaded_values: dict[str, str] = {}
    for env_path in loaded_files:
        parsed = parse_dotenv(env_path)
        for sink in (loaded_values, env):
            sink.update(parsed)
    return env, loaded_files, loaded_values
|
||||||
|
|
||||||
|
|
||||||
|
def summarize_environment(
    execution: ExecutionConfig,
    loaded_files: list[Path],
    env: dict[str, str],
    loaded_values: dict[str, str],
) -> str:
    """Describe the agent environment for a prompt, never including values.

    The newline-joined summary states, in order:
    1. which env files were loaded (or that none were);
    2. the user's ``auto_context_targets``, when any;
    3. either the sorted variable names (from ``env`` and ``loaded_values``)
       that start with one of ``_SUMMARY_PREFIXES``, or a note that names
       are hidden when ``execution.expose_env_names`` is falsy;
    4. a ClickHouse note, when ClickHouse is among the targets or ``env``
       contains a ClickHouse-looking variable name.
    """
    summary: list[str] = []

    if not loaded_files:
        summary.append("No .env file was auto-loaded into the agent process.")
    else:
        summary.append(
            "Loaded env files into the agent process: "
            + ", ".join(map(str, loaded_files))
        )

    targets = execution.auto_context_targets
    if targets:
        summary.append("Execution targets hinted by the user: " + ", ".join(targets))

    if not execution.expose_env_names:
        summary.append("Environment variable values are loaded but names are hidden from the prompt.")
    else:
        matching = sorted(
            name
            for name in {*loaded_values, *env}
            if name.startswith(_SUMMARY_PREFIXES)
        )
        if matching:
            summary.append("Relevant env var names available to commands: " + ", ".join(matching))
        else:
            summary.append("No DB/service env var names matched the default summary filters.")

    clickhouse_requested = "clickhouse" in [target.lower() for target in targets]
    clickhouse_names = [
        name for name in env if "CLICKHOUSE" in name or name.startswith("CH_")
    ]
    if clickhouse_names:
        summary.append("ClickHouse-related environment variables are available to the agent.")
    elif clickhouse_requested:
        summary.append("No ClickHouse-specific env vars were detected in the loaded environment.")

    return "\n".join(summary)
|
||||||
|
|
||||||
|
|
||||||
|
def build_execution_policy(execution: ExecutionConfig) -> str:
    """Render the execution-latitude statement shown to agents.

    The text reports ``execution.mode`` and ``execution.command_policy``,
    two fixed sentences about the agent choosing its own commands, and a
    closing sentence that depends on whether the policy is "broad".
    """
    policy = execution.command_policy
    if policy == "broad":
        closing = "Prefer direct validation by running the minimum set of commands needed to prove a fix."
    else:
        closing = "Keep command usage minimal and focused on validation."

    statements = (
        f"Execution mode: {execution.mode}",
        f"Command policy: {policy}",
        "The agent may choose shell, Python, git, docker, test, and database commands on its own when needed.",
        "The user does not need to pre-specify exact commands.",
        closing,
    )
    return "\n".join(statements)
|
||||||
152
cross_eval/worktree.py
Normal file
152
cross_eval/worktree.py
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
"""Git worktree lifecycle management for agentic mode."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class WorktreeError(RuntimeError):
    """Raised when a git branch or worktree command fails."""
|
||||||
|
|
||||||
|
|
||||||
|
def make_branch_name(preset_name: str) -> str:
    """Return ``cross-eval/<preset_name>_<YYYYmmdd_HHMMSS>`` for the current local time."""
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    return f"cross-eval/{preset_name}_{stamp}"
|
||||||
|
|
||||||
|
|
||||||
|
def make_worktree_dir(base_cwd: Path, branch_name: str) -> Path:
    """Pick a worktree location under the system temp directory.

    Placing agentic worktrees outside the source checkout avoids tools that
    incorrectly walk up to the outer repo and write into the base worktree.

    Args:
        base_cwd: Path of the base repository; only its resolved final
            component is used (``"repo"`` when that component is empty).
        branch_name: Branch name; every ``/`` becomes ``__`` so the branch
            maps to a single directory name.

    Returns:
        ``<tempdir>/cross-eval-worktrees/<repo name>/<branch slug>``.  The
        directory itself is not created here.
    """
    repo_name = base_cwd.resolve().name
    if not repo_name:
        repo_name = "repo"
    slug = "__".join(branch_name.split("/"))
    worktrees_root = Path(tempfile.gettempdir(), "cross-eval-worktrees")
    return worktrees_root.joinpath(repo_name, slug)
|
||||||
|
|
||||||
|
|
||||||
|
def create_worktree(base_cwd: Path, work_dir: Path, branch_name: str) -> Path:
    """Create a git worktree on a new branch from HEAD.

    1. Remove any leftover directory at ``work_dir`` and prune stale
       worktree registrations in the base repo
    2. Create branch from HEAD
    3. Create worktree checked out to that branch

    The branch lives in the original repo, so it survives worktree removal.

    Args:
        base_cwd: Directory inside the base repository; git runs from here.
        work_dir: Where the worktree should be created.  An existing
            directory at this path is deleted first.
        branch_name: Name of the new branch to create at HEAD.

    Returns:
        The resolved ``work_dir``.

    Raises:
        WorktreeError: If the branch or the worktree cannot be created.  When
            worktree creation fails, the freshly created branch is deleted
            again (best effort).
    """
    work_dir = work_dir.resolve()
    if work_dir.exists():
        shutil.rmtree(work_dir)

    # Deleting a worktree directory by hand leaves its registration behind,
    # and `git worktree add` refuses a path that is "missing but already
    # registered".  Pruning clears such stale entries; it is best effort, so
    # its exit status is intentionally not checked.
    subprocess.run(
        ["git", "worktree", "prune"],
        cwd=base_cwd,
        capture_output=True,
    )

    # Create the branch at HEAD
    try:
        subprocess.run(
            ["git", "branch", branch_name, "HEAD"],
            cwd=base_cwd,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        raise WorktreeError(
            f"Failed to create branch '{branch_name}': {e.stderr.strip()}"
        ) from e

    # Create worktree on that branch
    try:
        subprocess.run(
            ["git", "worktree", "add", str(work_dir), branch_name],
            cwd=base_cwd,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # Clean up the branch if worktree creation fails (best effort: the
        # original failure is what gets reported to the caller).
        subprocess.run(
            ["git", "branch", "-D", branch_name],
            cwd=base_cwd,
            capture_output=True,
        )
        raise WorktreeError(
            f"Failed to create worktree at {work_dir}: {e.stderr.strip()}"
        ) from e

    logger.debug("Created worktree on branch '%s': %s", branch_name, work_dir)
    return work_dir
|
||||||
|
|
||||||
|
|
||||||
|
def capture_diff(worktree_path: Path) -> str:
    """Capture all changes made in the worktree as a unified diff.

    Includes both tracked modifications and new untracked files.  As a side
    effect, everything in the worktree is staged (``git add -A``).

    Args:
        worktree_path: Directory of the worktree to inspect.

    Returns:
        Output of ``git diff --cached HEAD`` with surrounding whitespace
        stripped; an empty string when nothing changed.

    Raises:
        subprocess.CalledProcessError: If staging or diffing fails.  A failed
            diff must not be reported as an empty "no changes" result.
    """
    subprocess.run(
        ["git", "add", "-A"],
        cwd=worktree_path,
        capture_output=True,
        check=True,
    )

    result = subprocess.run(
        ["git", "diff", "--cached", "HEAD"],
        cwd=worktree_path,
        capture_output=True,
        text=True,
        # Diffs may contain file content that is not valid UTF-8; substitute
        # replacement characters instead of raising UnicodeDecodeError.
        encoding="utf-8",
        errors="replace",
        check=True,
    )
    return result.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def commit_worktree(worktree_path: Path, message: str) -> bool:
    """Stage and commit all changes in the worktree.

    Returns True if a commit was made, False if nothing to commit.

    A commit can also fail while changes are staged (for example a missing
    author identity or a rejecting hook).  That case still returns False, as
    before, but is logged as a warning so it is not mistaken for "nothing to
    commit".

    Args:
        worktree_path: Directory of the worktree to commit in.
        message: Commit message passed to ``git commit -m``.

    Raises:
        subprocess.CalledProcessError: If ``git add -A`` fails.
    """
    subprocess.run(
        ["git", "add", "-A"],
        cwd=worktree_path,
        capture_output=True,
        check=True,
    )

    result = subprocess.run(
        ["git", "commit", "-m", message],
        cwd=worktree_path,
        capture_output=True,
        text=True,
    )
    if result.returncode == 0:
        return True

    # A non-zero exit usually means "nothing to commit", but not always.
    # `git diff --cached --quiet` exits 0 only when no changes are staged, so
    # any other status means the commit failed for a different reason.
    staged = subprocess.run(
        ["git", "diff", "--cached", "--quiet"],
        cwd=worktree_path,
        capture_output=True,
    )
    if staged.returncode != 0:
        logger.warning(
            "git commit failed in %s (exit %d): %s",
            worktree_path,
            result.returncode,
            (result.stderr or result.stdout or "").strip(),
        )
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def remove_worktree(base_cwd: Path, work_dir: Path) -> None:
    """Detach and delete a git worktree, leaving its branch in the base repo.

    ``git worktree remove --force`` is tried first.  If git reports failure,
    the directory is deleted manually (errors ignored) and stale worktree
    registrations are pruned.  No exception is raised for a failed removal.

    Args:
        base_cwd: Directory inside the base repository; git runs from here.
        work_dir: Path of the worktree to remove.
    """
    target = work_dir.resolve()
    removal = subprocess.run(
        ["git", "worktree", "remove", "--force", str(target)],
        cwd=base_cwd,
        capture_output=True,
        text=True,
    )
    if removal.returncode != 0:
        # Fallback: remove the directory ourselves, then let git forget it.
        if target.exists():
            shutil.rmtree(target, ignore_errors=True)
        subprocess.run(
            ["git", "worktree", "prune"],
            cwd=base_cwd,
            capture_output=True,
        )
    logger.debug("Removed worktree: %s (branch preserved)", target)
|
||||||
@@ -4,15 +4,65 @@ build-backend = "setuptools.build_meta"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "cross-eval"
|
name = "cross-eval"
|
||||||
version = "0.1.0"
|
version = "0.2.0"
|
||||||
description = "AI agent cross-evaluation CLI tool"
|
description = "AI agent cross-evaluation CLI tool"
|
||||||
requires-python = ">=3.9"
|
requires-python = ">=3.9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pyyaml>=6.0",
|
"pyyaml>=6.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"coverage[toml]>=7.6",
|
||||||
|
"pyright>=1.1.390",
|
||||||
|
"pytest-cov>=6.0",
|
||||||
|
"ruff>=0.8.0",
|
||||||
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
cross-eval = "cross_eval.cli:main"
|
cross-eval = "cross_eval.cli:main"
|
||||||
|
|
||||||
[tool.setuptools.packages.find]
|
[tool.setuptools.packages.find]
|
||||||
include = ["cross_eval*"]
|
include = ["cross_eval*"]
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
testpaths = ["tests"]
|
||||||
|
addopts = "-q"
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
target-version = "py39"
|
||||||
|
extend-exclude = [".cross-eval"]
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = ["F"]
|
||||||
|
|
||||||
|
[tool.pyright]
|
||||||
|
include = ["cross_eval", "tests"]
|
||||||
|
exclude = [".cross-eval"]
|
||||||
|
typeCheckingMode = "basic"
|
||||||
|
pythonVersion = "3.9"
|
||||||
|
reportMissingImports = true
|
||||||
|
reportMissingTypeStubs = false
|
||||||
|
|
||||||
|
[tool.coverage.run]
|
||||||
|
branch = true
|
||||||
|
source = ["cross_eval"]
|
||||||
|
omit = [
|
||||||
|
"cross_eval/config.py",
|
||||||
|
"cross_eval/discovery.py",
|
||||||
|
"cross_eval/cli.py",
|
||||||
|
"cross_eval/demo.py",
|
||||||
|
"cross_eval/doctor.py",
|
||||||
|
"cross_eval/prompts.py",
|
||||||
|
"cross_eval/report.py",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.coverage.report]
|
||||||
|
skip_empty = true
|
||||||
|
show_missing = true
|
||||||
|
fail_under = 90
|
||||||
|
exclude_lines = [
|
||||||
|
"pragma: no cover",
|
||||||
|
"if TYPE_CHECKING:",
|
||||||
|
"raise NotImplementedError",
|
||||||
|
]
|
||||||
|
|||||||
Binary file not shown.
854
tests/test_agentic.py
Normal file
854
tests/test_agentic.py
Normal file
@@ -0,0 +1,854 @@
|
|||||||
|
"""Comprehensive tests for the agentic worktree flow.
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
1. worktree.py unit tests (real temp git repo)
|
||||||
|
2. agent.py agentic tests (mocking subprocess)
|
||||||
|
3. config.py _make_agentic tests
|
||||||
|
4. pipeline integration tests (mock invoke_agent / invoke_agent_agentic)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
from cross_eval.agent import AgentInvocationError, invoke_agent_agentic
|
||||||
|
from cross_eval.config import _make_agentic
|
||||||
|
from cross_eval.models import (
|
||||||
|
AgentConfig,
|
||||||
|
AgentResult,
|
||||||
|
PipelineConfig,
|
||||||
|
StepConfig,
|
||||||
|
)
|
||||||
|
from cross_eval.pipeline import (
|
||||||
|
_assert_base_repo_isolation,
|
||||||
|
_has_agentic_steps,
|
||||||
|
_setup_worktree,
|
||||||
|
run_pipeline,
|
||||||
|
)
|
||||||
|
from cross_eval.worktree import (
|
||||||
|
capture_diff,
|
||||||
|
commit_worktree,
|
||||||
|
create_worktree,
|
||||||
|
make_branch_name,
|
||||||
|
make_worktree_dir,
|
||||||
|
remove_worktree,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _init_git_repo(path: Path) -> None:
|
||||||
|
"""Initialise a minimal git repo with one commit."""
|
||||||
|
subprocess.run(["git", "init"], cwd=path, capture_output=True, check=True)
|
||||||
|
subprocess.run(
|
||||||
|
["git", "config", "user.email", "test@test.com"],
|
||||||
|
cwd=path, capture_output=True, check=True,
|
||||||
|
)
|
||||||
|
subprocess.run(
|
||||||
|
["git", "config", "user.name", "Test"],
|
||||||
|
cwd=path, capture_output=True, check=True,
|
||||||
|
)
|
||||||
|
(path / "README.md").write_text("# init\n")
|
||||||
|
subprocess.run(["git", "add", "."], cwd=path, capture_output=True, check=True)
|
||||||
|
subprocess.run(
|
||||||
|
["git", "commit", "-m", "initial"],
|
||||||
|
cwd=path, capture_output=True, check=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ===================================================================
|
||||||
|
# 1. worktree.py unit tests (real temp git repo)
|
||||||
|
# ===================================================================
|
||||||
|
|
||||||
|
class TestCreateWorktree(unittest.TestCase):
    """create_worktree yields a checked-out directory and a new branch."""

    def test_creates_worktree_and_branch(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo = Path(tmp, "repo")
            repo.mkdir()
            _init_git_repo(repo)

            branch_name = "cross-eval/test_branch"
            target = Path(tmp, "wt")
            created = create_worktree(repo, target, branch_name)

            # The returned path must exist on disk.
            self.assertTrue(created.exists())

            # The branch must be listed by the base repository.
            listing = subprocess.run(
                ["git", "branch", "--list", branch_name],
                cwd=repo, capture_output=True, text=True,
            )
            self.assertIn(branch_name, listing.stdout)

            # Detach the worktree again.
            remove_worktree(repo, target)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCaptureDiff(unittest.TestCase):
    """capture_diff reports both added and modified files."""

    def test_captures_new_and_modified_files(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo = Path(tmp, "repo")
            repo.mkdir()
            _init_git_repo(repo)

            checkout = Path(tmp, "wt")
            create_worktree(repo, checkout, "cross-eval/diff_test")

            # One brand-new file plus an edit to the committed README.
            checkout.joinpath("new_file.txt").write_text("hello\n")
            checkout.joinpath("README.md").write_text("# modified\n")

            captured = capture_diff(checkout)
            for expected_fragment in ("new_file.txt", "hello", "modified"):
                self.assertIn(expected_fragment, captured)

            remove_worktree(repo, checkout)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCommitWorktree(unittest.TestCase):
    """commit_worktree reports whether it actually created a commit."""

    def test_commit_returns_true_on_changes(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo = Path(tmp, "repo")
            repo.mkdir()
            _init_git_repo(repo)

            checkout = Path(tmp, "wt")
            create_worktree(repo, checkout, "cross-eval/commit_test")

            # A new file gives the commit something to record.
            checkout.joinpath("file.txt").write_text("data\n")
            committed = commit_worktree(checkout, "test commit")
            self.assertTrue(committed)

            remove_worktree(repo, checkout)

    def test_commit_returns_false_when_nothing_to_commit(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo = Path(tmp, "repo")
            repo.mkdir()
            _init_git_repo(repo)

            checkout = Path(tmp, "wt")
            create_worktree(repo, checkout, "cross-eval/empty_commit")

            # The worktree is untouched, so there is nothing to commit.
            committed = commit_worktree(checkout, "empty")
            self.assertFalse(committed)

            remove_worktree(repo, checkout)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoveWorktree(unittest.TestCase):
    """remove_worktree deletes the directory but keeps the branch."""

    def test_branch_survives_worktree_removal(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo = Path(tmp, "repo")
            repo.mkdir()
            _init_git_repo(repo)

            branch_name = "cross-eval/remove_test"
            checkout = Path(tmp, "wt")
            create_worktree(repo, checkout, branch_name)

            remove_worktree(repo, checkout)

            # The checkout directory is gone...
            self.assertFalse(checkout.exists())

            # ...while the branch is still known to the base repository.
            listing = subprocess.run(
                ["git", "branch", "--list", branch_name],
                cwd=repo, capture_output=True, text=True,
            )
            self.assertIn(branch_name, listing.stdout)
|
||||||
|
|
||||||
|
|
||||||
|
class TestMakeBranchName(unittest.TestCase):
    """make_branch_name yields ``cross-eval/<preset>_<timestamp>``."""

    def test_format(self) -> None:
        generated = make_branch_name("review-fix")
        self.assertTrue(generated.startswith("cross-eval/review-fix_"))

        # The preset contains no underscore, so the first "_" separates the
        # prefix from the timestamp.
        pieces = generated.split("_", 1)
        self.assertEqual(len(pieces), 2)

        # The timestamp looks like 20260313_123456 (YYYYMMDD_HHMMSS).
        timestamp = pieces[1]
        self.assertEqual(len(timestamp), 15)
|
||||||
|
|
||||||
|
|
||||||
|
class TestMakeWorktreeDir(unittest.TestCase):
    """make_worktree_dir returns a path that is not inside the base repo."""

    def test_uses_tmp_dir_outside_repo(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo = Path(tmp, "repo")
            repo.mkdir()

            location = str(
                make_worktree_dir(repo, "cross-eval/review-fix_20260313_123456")
            )

            self.assertIn("cross-eval-worktrees", location)
            self.assertNotIn(str(repo), location)
|
||||||
|
|
||||||
|
|
||||||
|
class TestBaseRepoIsolation(unittest.TestCase):
    """A changed base-repo state must abort agentic execution with an error."""

    def test_raises_when_base_repo_state_changes(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo_dir = Path(tmp, "repo")
            worktree_dir = Path(tmp, "worktree")
            repo_dir.mkdir()
            worktree_dir.mkdir()

            # The recorded baseline carries a diff.  Neither directory is a
            # git repository, so (per the original test's note) the current
            # state is expected to come back empty and differ from it.
            recorded_state = {
                "diff": "diff --git a/file.py ...\n",
                "untracked": "",
            }

            with self.assertRaises(RuntimeError) as caught:
                _assert_base_repo_isolation(
                    repo_dir,
                    recorded_state,
                    step_name="coding",
                    agent_name="claude-coder",
                    worktree_path=worktree_dir,
                    baseline_status="M file.py",
                )

            self.assertIn("base repository", str(caught.exception))
|
||||||
|
|
||||||
|
|
||||||
|
# ===================================================================
|
||||||
|
# 2. agent.py agentic tests (mocking subprocess)
|
||||||
|
# ===================================================================
|
||||||
|
|
||||||
|
class TestInvokeAgentAgenticClaude(unittest.TestCase):
    """For claude, the agentic command has no ``-p`` and the prompt goes to stdin."""

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("subprocess.run")
    def test_claude_cmd_has_no_dash_p_and_prompt_via_stdin(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")

        coder = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user", "--dangerously-skip-permissions"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmp:
            checkout = Path(tmp)
            _init_git_repo(checkout)

            invoke_agent_agentic(
                coder, "implement feature X", "coding",
                worktree_path=checkout, quiet=True,
            )

        # Pick the first recorded subprocess.run call whose argv starts with
        # "claude"; argv is the first positional arg or the "args" keyword.
        agent_call = next(
            (
                recorded
                for recorded in mock_run.call_args_list
                if (
                    argv := recorded[0][0]
                    if recorded[0]
                    else recorded[1].get("args", [])
                )
                and argv[0] == "claude"
            ),
            None,
        )

        self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'claude'")
        assert agent_call is not None
        argv = agent_call[0][0]

        # The print-mode flag must be absent.
        self.assertNotIn("-p", argv)
        # The prompt travels through the ``input`` keyword (stdin).
        stdin_text = agent_call[1].get("input")
        self.assertIsNotNone(stdin_text)
        assert stdin_text is not None
        self.assertIn("implement feature X", stdin_text)
|
||||||
|
|
||||||
|
|
||||||
|
class TestInvokeAgentAgenticCodex(unittest.TestCase):
    """For codex, the agentic command ends with ``-`` and the prompt goes to stdin."""

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("subprocess.run")
    def test_codex_cmd_uses_stdin_with_dash_sentinel(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")

        coder = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto", "--skip-git-repo-check"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmp:
            checkout = Path(tmp)
            _init_git_repo(checkout)

            invoke_agent_agentic(
                coder, "implement feature Y", "coding",
                worktree_path=checkout, quiet=True,
            )

        # Pick the first recorded subprocess.run call whose argv starts with
        # "codex"; argv is the first positional arg or the "args" keyword.
        agent_call = next(
            (
                recorded
                for recorded in mock_run.call_args_list
                if (
                    argv := recorded[0][0]
                    if recorded[0]
                    else recorded[1].get("args", [])
                )
                and argv[0] == "codex"
            ),
            None,
        )

        self.assertIsNotNone(agent_call, "Expected a subprocess.run call with 'codex'")
        assert agent_call is not None
        argv = agent_call[0][0]

        # The trailing "-" is the stdin sentinel.
        self.assertEqual(argv[-1], "-")
        # The prompt itself is supplied through the ``input`` keyword.
        stdin_text = agent_call[1].get("input")
        self.assertIsNotNone(stdin_text)
        assert stdin_text is not None
        self.assertIn("implement feature Y", stdin_text)
|
||||||
|
|
||||||
|
|
||||||
|
class TestTaskFileCleanup(unittest.TestCase):
    """No ``CROSS_EVAL_TASK.md`` is left in the worktree after an agentic run."""

    @patch("cross_eval.worktree.capture_diff", return_value="(no changes)")
    @patch("subprocess.run")
    def test_task_file_in_tmp_not_worktree(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")

        coder = AgentConfig(
            name="claude-coder", command="claude", args=[], agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmp:
            checkout = Path(tmp)
            _init_git_repo(checkout)

            invoke_agent_agentic(
                coder, "do stuff", "coding",
                worktree_path=checkout, quiet=True,
            )

            # The task file must not be present in the worktree afterwards.
            task_file = checkout / "CROSS_EVAL_TASK.md"
            self.assertFalse(task_file.exists())
|
||||||
|
|
||||||
|
|
||||||
|
class TestAgenticEmptyDiffDetection(unittest.TestCase):
    """An empty diff is an error only when the agent's output claims changes."""

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_claude_empty_diff_with_change_claim_fails(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        # The agent exits cleanly and describes changes, yet the diff is empty.
        claimed_summary = (
            "All tests pass.\n"
            "Here's a summary of all changes made:\n"
            "- Updated discovery.py\n"
        )
        mock_run.return_value = MagicMock(
            returncode=0, stdout=claimed_summary, stderr="",
        )

        coder = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmp:
            checkout = Path(tmp)
            _init_git_repo(checkout)

            with self.assertRaises(AgentInvocationError) as caught:
                invoke_agent_agentic(
                    coder, "implement feature X", "coding",
                    worktree_path=checkout, quiet=True,
                )

        failure = caught.exception
        self.assertEqual(failure.failure_type, "EMPTY_DIFF")
        self.assertIn("summary of all changes made", failure.raw_error.lower())

    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("subprocess.run")
    def test_empty_diff_without_change_claim_is_allowed(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        # The agent states explicitly that no edit was needed.
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="No changes were required; the current implementation already satisfies the task.",
            stderr="",
        )

        coder = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as tmp:
            checkout = Path(tmp)
            _init_git_repo(checkout)

            outcome = invoke_agent_agentic(
                coder, "check whether any fix is needed", "coding",
                worktree_path=checkout, quiet=True,
            )

        self.assertEqual(outcome.output, "(no changes)")
|
||||||
|
|
||||||
|
|
||||||
|
# ===================================================================
|
||||||
|
# 3. config.py tests
|
||||||
|
# ===================================================================
|
||||||
|
|
||||||
|
class TestMakeAgenticClaude(unittest.TestCase):
    """_make_agentic drops claude's print flag and marks the agent agentic."""

    def test_strips_dash_p_and_sets_agentic(self) -> None:
        coder = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["-p", "--setting-sources", "user", "--model", "opus"],
        )
        # A freshly built config is not agentic.
        self.assertFalse(coder.agentic)

        _make_agentic(coder)

        self.assertTrue(coder.agentic)
        self.assertNotIn("-p", coder.args)
        self.assertIn("--setting-sources", coder.args)

    def test_strips_dash_dash_print_alias(self) -> None:
        coder = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--print", "--setting-sources", "user"],
        )

        _make_agentic(coder)

        self.assertTrue(coder.agentic)
        self.assertNotIn("--print", coder.args)

    def test_idempotent_when_no_dash_p(self) -> None:
        coder = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["--setting-sources", "user"],
        )

        _make_agentic(coder)

        self.assertTrue(coder.agentic)
        # With no print flag present, the args stay as they were.
        self.assertEqual(coder.args, ["--setting-sources", "user"])
|
||||||
|
|
||||||
|
|
||||||
|
class TestMakeAgenticCodex(unittest.TestCase):
    """_make_agentic also handles a codex agent, which has no ``-p`` flag."""

    def test_codex_agentic_works(self) -> None:
        coder = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto", "-"],
        )

        _make_agentic(coder)

        self.assertTrue(coder.agentic)
        # The codex-specific arguments are still present afterwards.
        for kept_arg in ("exec", "--full-auto"):
            self.assertIn(kept_arg, coder.args)
|
||||||
|
|
||||||
|
|
||||||
|
# ===================================================================
|
||||||
|
# 4. pipeline integration tests
|
||||||
|
# ===================================================================
|
||||||
|
|
||||||
|
def _make_agentic_config(
    run_dir: Path,
    agentic_coder: bool = True,
) -> PipelineConfig:
    """Assemble a two-step pipeline config: a coder followed by a reviewer.

    Args:
        run_dir: Used as the pipeline's ``output_dir``.
        agentic_coder: Value of the coder's ``agentic`` flag.  The reviewer is
            always non-agentic.

    Returns:
        A ``PipelineConfig`` for the ``simple`` preset with a ``coding`` step
        and a verdict-producing ``review`` step.
    """
    agents = {
        "claude-coder": AgentConfig(
            name="claude-coder", command="claude",
            args=["--setting-sources", "user"],
            agentic=agentic_coder,
        ),
        "claude-reviewer": AgentConfig(
            name="claude-reviewer", command="claude",
            args=["-p", "--setting-sources", "user"],
            agentic=False,
        ),
    }
    coding_step = StepConfig(
        name="coding",
        agent="claude-coder",
        role="coding",
        prompt_template="default:coding",
        output_key="coding_output",
    )
    review_step = StepConfig(
        name="review",
        agent="claude-reviewer",
        role="review",
        prompt_template="default:review",
        output_key="review_result",
        verdict=True,
    )
    return PipelineConfig(
        output_dir=run_dir,
        max_iterations=2,
        min_iterations=1,
        language="en",
        inputs={"plan": "Test plan", "checklist": "Test checklist"},
        agents=agents,
        coders=["claude-coder"],
        reviewers=["claude-reviewer"],
        pipeline=[coding_step, review_step],
        preset_name="simple",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class TestSetupWorktreeCalledForAgentic(unittest.TestCase):
    """A pipeline with an agentic coder sets up a worktree exactly once."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_setup_worktree_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            output_root = Path(tmp)
            pipeline_config = _make_agentic_config(output_root)

            # Stand-in worktree returned by the patched _setup_worktree.
            fake_worktree = output_root / "work"
            fake_worktree.mkdir()
            mock_setup.return_value = (fake_worktree, "cross-eval/test")

            coder_result = AgentResult(
                output="diff output", exit_code=0,
                agent_name="claude-coder", step_name="coding",
                duration_seconds=0.1,
            )
            reviewer_result = AgentResult(
                output="VERDICT: PASS", exit_code=0,
                agent_name="claude-reviewer", step_name="review",
                duration_seconds=0.1,
            )
            mock_invoke_agentic.return_value = coder_result
            mock_invoke.return_value = reviewer_result

            run_pipeline(pipeline_config, cwd=Path(tmp))

            mock_setup.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
class TestSetupWorktreeLocation(unittest.TestCase):
    """_setup_worktree creates the worktree outside the repo and records it."""

    def test_worktree_is_created_outside_repo(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo = Path(tmp, "repo")
            output_dir = repo / ".cross-eval" / "output" / "smoke"
            repo.mkdir()
            output_dir.mkdir(parents=True)
            _init_git_repo(repo)

            worktree_path, branch_name = _setup_worktree(repo, output_dir, "review-fix")
            try:
                self.assertTrue(worktree_path.exists())
                self.assertNotIn(str(repo.resolve()), str(worktree_path.resolve()))

                # The run directory records both the worktree path and branch.
                recorded_path = (output_dir / "worktree_path.txt").read_text(encoding="utf-8")
                self.assertEqual(recorded_path.strip(), str(worktree_path))

                recorded_branch = (output_dir / "worktree_branch.txt").read_text(encoding="utf-8")
                self.assertEqual(recorded_branch.strip(), branch_name)
            finally:
                # Always detach the real worktree, even if an assertion failed.
                remove_worktree(repo, worktree_path)
|
||||||
|
|
||||||
|
|
||||||
|
class TestReviewerRunsInWorktreeCwd(unittest.TestCase):
    """With a worktree present, the non-agentic reviewer runs inside it."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_reviewer_uses_worktree_cwd(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            output_root = Path(tmp)
            pipeline_config = _make_agentic_config(output_root)

            # Stand-in worktree returned by the patched _setup_worktree.
            fake_worktree = output_root / "work"
            fake_worktree.mkdir()
            mock_setup.return_value = (fake_worktree, "cross-eval/test")

            coder_result = AgentResult(
                output="diff output", exit_code=0,
                agent_name="claude-coder", step_name="coding",
                duration_seconds=0.1,
            )
            reviewer_result = AgentResult(
                output="VERDICT: PASS", exit_code=0,
                agent_name="claude-reviewer", step_name="review",
                duration_seconds=0.1,
            )
            mock_invoke_agentic.return_value = coder_result
            mock_invoke.return_value = reviewer_result

            run_pipeline(pipeline_config, cwd=Path(tmp))

            # cwd is read from the keyword if given, otherwise from the
            # fourth positional argument of the last invoke_agent call.
            last_call = mock_invoke.call_args
            used_cwd = last_call[1].get("cwd") or last_call[0][3]
            self.assertEqual(used_cwd, fake_worktree)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCommitIterationCalled(unittest.TestCase):
    """A single passing iteration triggers one _commit_iteration on the worktree."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_commit_iteration_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            output_root = Path(tmp)
            pipeline_config = _make_agentic_config(output_root)

            # Stand-in worktree returned by the patched _setup_worktree.
            fake_worktree = output_root / "work"
            fake_worktree.mkdir()
            mock_setup.return_value = (fake_worktree, "cross-eval/test")

            coder_result = AgentResult(
                output="diff output", exit_code=0,
                agent_name="claude-coder", step_name="coding",
                duration_seconds=0.1,
            )
            reviewer_result = AgentResult(
                output="VERDICT: PASS", exit_code=0,
                agent_name="claude-reviewer", step_name="review",
                duration_seconds=0.1,
            )
            mock_invoke_agentic.return_value = coder_result
            mock_invoke.return_value = reviewer_result

            run_pipeline(pipeline_config, cwd=Path(tmp))

            mock_commit_iter.assert_called_once()
            # The first positional argument is the worktree path.
            positional_args = mock_commit_iter.call_args[0]
            self.assertEqual(positional_args[0], fake_worktree)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFinalizeWorktreeCalled(unittest.TestCase):
    """Check that run_pipeline calls _finalize_worktree with the worktree and branch."""

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_finalize_called(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """Run one passing iteration and inspect the single _finalize_worktree call."""
        branch = "cross-eval/test"
        with tempfile.TemporaryDirectory() as tmp_root:
            root = Path(tmp_root)
            pipeline_config = _make_agentic_config(root)

            worktree = root / "work"
            worktree.mkdir()
            mock_setup.return_value = (worktree, branch)

            mock_invoke_agentic.return_value = AgentResult(
                output="diff output",
                exit_code=0,
                agent_name="claude-coder",
                step_name="coding",
                duration_seconds=0.1,
            )
            mock_invoke.return_value = AgentResult(
                output="VERDICT: PASS",
                exit_code=0,
                agent_name="claude-reviewer",
                step_name="review",
                duration_seconds=0.1,
            )

            run_pipeline(pipeline_config, cwd=root)

            mock_finalize.assert_called_once()
            # Positional arguments are expected in the order:
            # cwd, worktree_path, branch_name, preset_name, verdict.
            positional = mock_finalize.call_args.args
            self.assertEqual(positional[1], worktree)
            self.assertEqual(positional[2], branch)
|
||||||
|
|
||||||
|
|
||||||
|
class TestParallelAgenticFallsBackToSequential(unittest.TestCase):
    """Multiple agentic steps in parallel batch fall back to sequential."""

    def test_has_agentic_steps_detects_agentic(self) -> None:
        """_has_agentic_steps is truthy when a step uses an agent with agentic=True."""
        coder = AgentConfig(
            name="claude-coder", command="claude", args=[], agentic=True,
        )
        reviewer = AgentConfig(
            name="claude-reviewer", command="claude", args=[], agentic=False,
        )
        config = PipelineConfig(
            agents={"claude-coder": coder, "claude-reviewer": reviewer},
        )
        steps = [
            StepConfig(name="a", agent="claude-coder", role="coding",
                       prompt_template="default:coding", output_key="a"),
        ]
        self.assertTrue(_has_agentic_steps(config, steps))

    def test_has_agentic_steps_returns_false_without_agentic(self) -> None:
        """_has_agentic_steps is falsy when the only step uses a non-agentic agent."""
        reviewer = AgentConfig(
            name="claude-reviewer", command="claude", args=[], agentic=False,
        )
        config = PipelineConfig(
            agents={"claude-reviewer": reviewer},
        )
        steps = [
            StepConfig(name="r", agent="claude-reviewer", role="review",
                       prompt_template="default:review", output_key="r", verdict=True),
        ]
        self.assertFalse(_has_agentic_steps(config, steps))

    @patch("cross_eval.pipeline._finalize_worktree", return_value="cross-eval/test")
    @patch("cross_eval.pipeline._commit_iteration")
    @patch("cross_eval.pipeline._setup_worktree")
    @patch("cross_eval.pipeline.invoke_agent_agentic")
    @patch("cross_eval.pipeline.invoke_agent")
    def test_parallel_agentic_runs_sequentially(
        self,
        mock_invoke: MagicMock,
        mock_invoke_agentic: MagicMock,
        mock_setup: MagicMock,
        mock_commit_iter: MagicMock,
        mock_finalize: MagicMock,
    ) -> None:
        """When multiple agentic steps are parallel, they should run sequentially.

        Besides the call order, the test tracks how many agentic invocations
        are in flight at once. Call order alone cannot prove sequential
        execution, because a thread pool that submits ``code_a`` first would
        normally record the same order.
        """
        # Local imports: only this test needs them.
        import threading
        import time

        with tempfile.TemporaryDirectory() as td:
            run_dir = Path(td)

            coder_a = AgentConfig(
                name="coder-a", command="claude", args=[], agentic=True,
            )
            coder_b = AgentConfig(
                name="coder-b", command="claude", args=[], agentic=True,
            )
            reviewer = AgentConfig(
                name="reviewer", command="claude", args=["-p"], agentic=False,
            )

            steps = [
                StepConfig(
                    name="code_a", agent="coder-a", role="coding",
                    prompt_template="default:coding", output_key="code_a",
                    parallel=True,
                ),
                StepConfig(
                    name="code_b", agent="coder-b", role="coding",
                    prompt_template="default:coding", output_key="code_b",
                    parallel=True,
                ),
                StepConfig(
                    name="review", agent="reviewer", role="review",
                    prompt_template="default:review", output_key="review_result",
                    verdict=True,
                ),
            ]

            config = PipelineConfig(
                output_dir=run_dir,
                max_iterations=1,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents={
                    "coder-a": coder_a,
                    "coder-b": coder_b,
                    "reviewer": reviewer,
                },
                coders=["coder-a", "coder-b"],
                reviewers=["reviewer"],
                pipeline=steps,
                preset_name="custom",
            )

            wt_path = run_dir / "work"
            wt_path.mkdir()
            mock_setup.return_value = (wt_path, "cross-eval/test")

            call_order: list[str] = []
            state_lock = threading.Lock()
            in_flight = 0
            peak_in_flight = 0

            def _track_agentic(agent_config, prompt, step_name, **kwargs):
                nonlocal in_flight, peak_in_flight
                with state_lock:
                    in_flight += 1
                    peak_in_flight = max(peak_in_flight, in_flight)
                    call_order.append(step_name)
                try:
                    # Hold the slot briefly so a concurrent invocation would
                    # be observed as overlapping.
                    time.sleep(0.01)
                    return AgentResult(
                        output="diff", exit_code=0,
                        agent_name=agent_config.name, step_name=step_name,
                        duration_seconds=0.1,
                    )
                finally:
                    with state_lock:
                        in_flight -= 1

            mock_invoke_agentic.side_effect = _track_agentic
            mock_invoke.return_value = AgentResult(
                output="VERDICT: PASS", exit_code=0,
                agent_name="reviewer", step_name="review",
                duration_seconds=0.1,
            )

            run_pipeline(config, cwd=Path(td))

            # Both agentic steps should have been called (sequentially)
            agentic_calls = [c for c in call_order if c.startswith("code_")]
            self.assertEqual(len(agentic_calls), 2)
            # They should appear in order (sequential, not concurrent)
            self.assertEqual(agentic_calls, ["code_a", "code_b"])
            # At no point may two agentic invocations have been active together.
            self.assertEqual(
                peak_in_flight, 1, "agentic steps overlapped instead of running sequentially",
            )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run this module's test cases when the file is executed directly.
    unittest.main()
|
||||||
@@ -1,45 +1,79 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
from cross_eval.agent import _supports_reasoning_effort
|
from cross_eval.agent import AgentInvocationError, _supports_reasoning_effort
|
||||||
|
from cross_eval.cli import _apply_phased_iteration_override, main
|
||||||
from cross_eval.agent import invoke_agent
|
from cross_eval.agent import invoke_agent
|
||||||
from cross_eval.config import (
|
from cross_eval.config import (
|
||||||
BUILTIN_AGENTS,
|
BUILTIN_AGENTS,
|
||||||
|
_SENIOR_SYSTEM_PROMPT,
|
||||||
_default_seniors_for_preset,
|
_default_seniors_for_preset,
|
||||||
apply_reasoning_effort_settings,
|
apply_reasoning_effort_settings,
|
||||||
|
load_config,
|
||||||
normalize_reasoning_effort,
|
normalize_reasoning_effort,
|
||||||
|
normalize_prompt_template,
|
||||||
|
normalize_step_role,
|
||||||
validate_config,
|
validate_config,
|
||||||
)
|
)
|
||||||
from cross_eval.models import (
|
from cross_eval.models import (
|
||||||
AgentConfig,
|
AgentConfig,
|
||||||
|
AgentResult,
|
||||||
IterationResult,
|
IterationResult,
|
||||||
PhaseConfig,
|
PhaseConfig,
|
||||||
PipelineConfig,
|
PipelineConfig,
|
||||||
PipelineResult,
|
PipelineResult,
|
||||||
ReviewMetrics,
|
|
||||||
StepConfig,
|
StepConfig,
|
||||||
)
|
)
|
||||||
from cross_eval.pipeline import _detect_repeated_aggregate
|
from cross_eval.pipeline import (
|
||||||
|
_detect_auto_escalate,
|
||||||
|
_detect_repeated_aggregate,
|
||||||
|
_execute_parallel_batch,
|
||||||
|
_extract_senior_tracker,
|
||||||
|
_extract_verdict,
|
||||||
|
)
|
||||||
from cross_eval.prompts import (
|
from cross_eval.prompts import (
|
||||||
GENERATE_TEMPLATE,
|
CODING_TEMPLATE,
|
||||||
GENERATE_TEMPLATE_KO,
|
CODING_TEMPLATE_KO,
|
||||||
REVIEW_TEMPLATE,
|
REVIEW_TEMPLATE,
|
||||||
REVIEW_TEMPLATE_KO,
|
REVIEW_TEMPLATE_KO,
|
||||||
|
PLAN_REVIEW_TEMPLATE,
|
||||||
|
PLAN_REVIEW_TEMPLATE_KO,
|
||||||
REVIEW_ONLY_TEMPLATE,
|
REVIEW_ONLY_TEMPLATE,
|
||||||
REVIEW_ONLY_TEMPLATE_KO,
|
REVIEW_ONLY_TEMPLATE_KO,
|
||||||
AGGREGATE_REVIEW_TEMPLATE,
|
AGGREGATE_REVIEW_TEMPLATE,
|
||||||
AGGREGATE_REVIEW_TEMPLATE_KO,
|
AGGREGATE_REVIEW_TEMPLATE_KO,
|
||||||
_build_cross_review_preset,
|
_build_cross_review_preset,
|
||||||
|
_build_coding_review_fix_preset,
|
||||||
|
_build_plan_review_preset,
|
||||||
_build_review_fix_preset,
|
_build_review_fix_preset,
|
||||||
_build_review_only_preset,
|
_build_review_only_preset,
|
||||||
_build_simple_preset,
|
_build_simple_preset,
|
||||||
)
|
)
|
||||||
from cross_eval.report import build_report, parse_review_metrics
|
from cross_eval.report import build_report, parse_review_metrics
|
||||||
|
|
||||||
|
|
||||||
class BuiltinAgentConfigTest(unittest.TestCase):
|
class BuiltinAgentConfigTest(unittest.TestCase):
|
||||||
|
def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None:
    """Each built-in Claude agent's args contain the settings and slash-command flags."""
    required_tokens = ("--setting-sources", "user", "--disable-slash-commands")
    for agent_name in ("claude-coder", "claude-reviewer", "claude-senior"):
        with self.subTest(agent=agent_name):
            cli_args = BUILTIN_AGENTS[agent_name].args
            for token in required_tokens:
                self.assertIn(token, cli_args)
|
||||||
|
|
||||||
|
def test_claude_builtin_agents_use_role_specific_permission_modes(self) -> None:
    """Coder args carry the bypass tokens; reviewer and senior args carry "plan"."""
    # Fetch all three argument lists up front, in the same order as the checks below.
    args_by_agent = {
        agent: BUILTIN_AGENTS[agent].args
        for agent in ("claude-coder", "claude-reviewer", "claude-senior")
    }
    expectations = (
        ("--dangerously-skip-permissions", "claude-coder"),
        ("bypassPermissions", "claude-coder"),
        ("plan", "claude-reviewer"),
        ("plan", "claude-senior"),
    )
    for token, agent in expectations:
        self.assertIn(token, args_by_agent[agent])
|
||||||
|
|
||||||
def test_codex_builtin_agents_skip_git_repo_check(self) -> None:
|
def test_codex_builtin_agents_skip_git_repo_check(self) -> None:
|
||||||
for agent_name in ("codex-coder", "codex-reviewer", "codex-senior"):
|
for agent_name in ("codex-coder", "codex-reviewer", "codex-senior"):
|
||||||
with self.subTest(agent=agent_name):
|
with self.subTest(agent=agent_name):
|
||||||
@@ -62,6 +96,10 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
|||||||
self.assertEqual(normalize_reasoning_effort("extra_high"), "xhigh")
|
self.assertEqual(normalize_reasoning_effort("extra_high"), "xhigh")
|
||||||
self.assertEqual(normalize_reasoning_effort("x-high"), "xhigh")
|
self.assertEqual(normalize_reasoning_effort("x-high"), "xhigh")
|
||||||
|
|
||||||
|
def test_normalize_step_role_and_template_aliases(self) -> None:
    """The "coding" role and "default:coding" template normalize to themselves."""
    normalized_role = normalize_step_role("coding")
    self.assertEqual(normalized_role, "coding")
    normalized_template = normalize_prompt_template("default:coding")
    self.assertEqual(normalized_template, "default:coding")
|
||||||
|
|
||||||
def test_apply_reasoning_effort_settings_uses_defaults_and_role_overrides(self) -> None:
|
def test_apply_reasoning_effort_settings_uses_defaults_and_role_overrides(self) -> None:
|
||||||
config = PipelineConfig(
|
config = PipelineConfig(
|
||||||
agents={
|
agents={
|
||||||
@@ -116,6 +154,123 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
|||||||
["codex", "-c", 'model_reasoning_effort="high"'],
|
["codex", "-c", 'model_reasoning_effort="high"'],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_invoke_agent_classifies_auth_failures(self) -> None:
    """A "Not logged in" stderr makes invoke_agent raise an AUTH-typed error."""

    class _FailedProcess:
        # Stand-in for the object subprocess.run returns.
        returncode = 1
        stdout = ""
        stderr = "Not logged in · Please run /login"

    def _fake_run(cmd, **kwargs):
        return _FailedProcess()

    reviewer = AgentConfig(
        name="claude-reviewer",
        command="claude",
        args=["-p", "--model", "opus"],
    )

    with patch("subprocess.run", side_effect=_fake_run), \
            self.assertRaises(AgentInvocationError) as raised:
        invoke_agent(reviewer, "prompt", "review", quiet=True)

    error = raised.exception
    self.assertEqual(error.failure_type, "AUTH")
    self.assertIn("Re-authenticate", error.suggested_action)
|
||||||
|
|
||||||
|
def test_invoke_agent_classifies_usage_limit_failures(self) -> None:
    """A 429 rate-limit stderr makes invoke_agent raise a USAGE_LIMIT-typed error."""

    class _FailedProcess:
        # Stand-in for the object subprocess.run returns.
        returncode = 1
        stdout = ""
        stderr = "API Error: 429 rate limit exceeded for current quota"

    def _fake_run(cmd, **kwargs):
        return _FailedProcess()

    reviewer = AgentConfig(
        name="codex-reviewer",
        command="codex",
        args=["exec", "--model", "gpt-5.4", "-"],
    )

    with patch("subprocess.run", side_effect=_fake_run), \
            self.assertRaises(AgentInvocationError) as raised:
        invoke_agent(reviewer, "prompt", "review", quiet=True)

    error = raised.exception
    self.assertEqual(error.failure_type, "USAGE_LIMIT")
    self.assertIn("quota", error.suggested_action)
|
||||||
|
|
||||||
|
def test_parallel_batch_saves_successes_before_failure(self) -> None:
    """One failing step in a parallel batch still leaves the other step's output saved.

    The batch is expected to raise RuntimeError naming the successful step,
    record its output, and write both ``v1/review_ok.md`` and
    ``v1/review_bad_error.md`` under the run directory.
    """
    config = PipelineConfig(
        agents={
            "ok-reviewer": AgentConfig(name="ok-reviewer", command="codex"),
            "bad-reviewer": AgentConfig(name="bad-reviewer", command="claude"),
        },
    )
    steps = [
        StepConfig(
            name=step_name,
            agent=agent_name,
            role="review",
            prompt_template="default:review-only",
            output_key=step_name,
            parallel=True,
        )
        for step_name, agent_name in (
            ("review_ok", "ok-reviewer"),
            ("review_bad", "bad-reviewer"),
        )
    ]
    step_outputs: dict[str, str] = {}
    step_results: dict[str, AgentResult] = {}

    def _fake_invoke(agent, prompt, step_name, **kwargs):
        # Every step except "review_ok" fails with a usage-limit error.
        if step_name != "review_ok":
            raise AgentInvocationError(
                agent_name=agent.name,
                step_name=step_name,
                cmd_preview="claude -p ...",
                raw_error="API Error: 429 rate limit exceeded for current quota",
                failure_type="USAGE_LIMIT",
                suggested_action="Agent CLI hit a quota, billing, or token budget limit. Refill or raise the limit, then rerun.",
            )
        return AgentResult(
            output="VERDICT: PASS",
            exit_code=0,
            agent_name=agent.name,
            step_name=step_name,
            duration_seconds=1.0,
        )

    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke), \
                self.assertRaises(RuntimeError) as raised:
            _execute_parallel_batch(
                steps,
                config,
                input_contents={},
                feedback="",
                iteration=1,
                max_iterations=3,
                cwd=workdir,
                timeout=None,
                dry_run=False,
                step_outputs=step_outputs,
                step_results=step_results,
                run_dir=workdir,
                output_iter=1,
            )

        self.assertIn("Successful outputs were saved for: review_ok", str(raised.exception))
        self.assertEqual(step_outputs["review_ok"], "VERDICT: PASS")

        version_dir = workdir / "v1"
        self.assertTrue((version_dir / "review_ok.md").exists())
        error_file = version_dir / "review_bad_error.md"
        self.assertTrue(error_file.exists())
        error_text = error_file.read_text(encoding="utf-8")
        self.assertIn("Failure Type", error_text)
        self.assertIn("USAGE_LIMIT", error_text)
|
||||||
|
|
||||||
def test_detect_repeated_aggregate_warns_on_same_output(self) -> None:
|
def test_detect_repeated_aggregate_warns_on_same_output(self) -> None:
|
||||||
steps = [
|
steps = [
|
||||||
StepConfig(
|
StepConfig(
|
||||||
@@ -169,6 +324,14 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
|||||||
),
|
),
|
||||||
["claude-senior"],
|
["claude-senior"],
|
||||||
)
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
_default_seniors_for_preset(
|
||||||
|
"preset:coding-review-fix",
|
||||||
|
["codex-reviewer"],
|
||||||
|
BUILTIN_AGENTS,
|
||||||
|
),
|
||||||
|
["codex-senior"],
|
||||||
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
_default_seniors_for_preset(
|
_default_seniors_for_preset(
|
||||||
"preset:simple",
|
"preset:simple",
|
||||||
@@ -204,9 +367,37 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
[step.name for step in converge.steps[3:]],
|
[step.name for step in converge.steps[3:]],
|
||||||
["aggregate_review", "generate", "verify"],
|
["aggregate_review", "coding", "verify"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_coding_review_fix_starts_with_single_coding_phase(self) -> None:
    """The coding-review-fix preset is a one-shot coding phase followed by review_fix."""
    phases = _build_coding_review_fix_preset(
        ["codex-coder"],
        ["claude-reviewer", "codex-reviewer"],
        ["codex-senior"],
    )

    phase_names = [phase.name for phase in phases]
    self.assertEqual(phase_names, ["initial_coding", "review_fix"])

    self.assertEqual(phases[0].max_iterations, 1)

    initial_step_names = [step.name for step in phases[0].steps]
    self.assertEqual(initial_step_names, ["coding"])

    # Skip the first two steps of the second phase and check what follows them.
    trailing_step_names = [step.name for step in phases[1].steps[2:]]
    self.assertEqual(trailing_step_names, ["aggregate_review", "coding", "verify"])
|
||||||
|
|
||||||
|
def test_apply_phased_iteration_override_updates_only_verdict_phases(self) -> None:
    """An override of 10 leaves initial_coding at 1 iteration and sets review_fix to 10."""
    preset_phases = _build_coding_review_fix_preset(
        ["codex-coder"],
        ["codex-reviewer"],
        ["codex-senior"],
    )
    config = PipelineConfig(phases=preset_phases)

    _apply_phased_iteration_override(config, 10)

    expected = (("initial_coding", 1), ("review_fix", 10))
    for index, (phase_name, iteration_cap) in enumerate(expected):
        self.assertEqual(config.phases[index].name, phase_name)
        self.assertEqual(config.phases[index].max_iterations, iteration_cap)
|
||||||
|
|
||||||
def test_review_only_duplicate_reviewers_get_unique_step_keys(self) -> None:
|
def test_review_only_duplicate_reviewers_get_unique_step_keys(self) -> None:
|
||||||
steps = _build_review_only_preset(
|
steps = _build_review_only_preset(
|
||||||
["codex-coder"],
|
["codex-coder"],
|
||||||
@@ -219,6 +410,31 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
|||||||
["review_codex_reviewer", "review_codex_reviewer_2"],
|
["review_codex_reviewer", "review_codex_reviewer_2"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_plan_review_duplicate_reviewers_get_unique_step_keys(self) -> None:
    """Listing the same reviewer twice yields two distinct plan-review output keys."""
    duplicated_reviewers = ["codex-reviewer", "codex-reviewer"]
    steps = _build_plan_review_preset(["codex-coder"], duplicated_reviewers, [])

    output_keys = [step.output_key for step in steps]
    self.assertEqual(
        output_keys,
        ["plan_review_codex_reviewer", "plan_review_codex_reviewer_2"],
    )
|
||||||
|
|
||||||
|
def test_plan_review_with_senior_adds_aggregate_step(self) -> None:
    """With a senior configured, the last step is the senior review and owns the verdict."""
    steps = _build_plan_review_preset(
        ["codex-coder"],
        ["claude-reviewer", "codex-reviewer"],
        ["claude-senior"],
    )

    final_step = steps[-1]
    self.assertEqual(final_step.name, "senior_review")
    self.assertEqual(final_step.agent, "claude-senior")
    self.assertTrue(final_step.verdict)

    # The first two steps must not carry the verdict flag.
    for index in (0, 1):
        self.assertFalse(steps[index].verdict)
|
||||||
|
|
||||||
def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None:
|
def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None:
|
||||||
steps = _build_cross_review_preset(
|
steps = _build_cross_review_preset(
|
||||||
["codex-coder", "codex-coder"],
|
["codex-coder", "codex-coder"],
|
||||||
@@ -246,7 +462,7 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
|||||||
steps = phases[0].steps
|
steps = phases[0].steps
|
||||||
self.assertEqual(steps[2].name, "aggregate_review")
|
self.assertEqual(steps[2].name, "aggregate_review")
|
||||||
self.assertEqual(steps[2].agent, "codex-senior")
|
self.assertEqual(steps[2].agent, "codex-senior")
|
||||||
self.assertEqual(steps[3].name, "generate")
|
self.assertEqual(steps[3].name, "coding")
|
||||||
self.assertEqual(steps[4].name, "verify")
|
self.assertEqual(steps[4].name, "verify")
|
||||||
self.assertEqual(steps[4].agent, "codex-senior")
|
self.assertEqual(steps[4].agent, "codex-senior")
|
||||||
self.assertTrue(steps[4].verdict)
|
self.assertTrue(steps[4].verdict)
|
||||||
@@ -273,7 +489,7 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
[step.name for step in steps],
|
[step.name for step in steps],
|
||||||
["generate", "review", "senior_review"],
|
["coding", "review", "senior_review"],
|
||||||
)
|
)
|
||||||
self.assertFalse(steps[1].verdict)
|
self.assertFalse(steps[1].verdict)
|
||||||
self.assertTrue(steps[2].verdict)
|
self.assertTrue(steps[2].verdict)
|
||||||
@@ -325,6 +541,8 @@ class PromptTemplateTest(unittest.TestCase):
|
|||||||
for tmpl, label in [
|
for tmpl, label in [
|
||||||
(REVIEW_TEMPLATE, "REVIEW_TEMPLATE"),
|
(REVIEW_TEMPLATE, "REVIEW_TEMPLATE"),
|
||||||
(REVIEW_TEMPLATE_KO, "REVIEW_TEMPLATE_KO"),
|
(REVIEW_TEMPLATE_KO, "REVIEW_TEMPLATE_KO"),
|
||||||
|
(PLAN_REVIEW_TEMPLATE, "PLAN_REVIEW_TEMPLATE"),
|
||||||
|
(PLAN_REVIEW_TEMPLATE_KO, "PLAN_REVIEW_TEMPLATE_KO"),
|
||||||
(REVIEW_ONLY_TEMPLATE, "REVIEW_ONLY_TEMPLATE"),
|
(REVIEW_ONLY_TEMPLATE, "REVIEW_ONLY_TEMPLATE"),
|
||||||
(REVIEW_ONLY_TEMPLATE_KO, "REVIEW_ONLY_TEMPLATE_KO"),
|
(REVIEW_ONLY_TEMPLATE_KO, "REVIEW_ONLY_TEMPLATE_KO"),
|
||||||
]:
|
]:
|
||||||
@@ -351,10 +569,10 @@ class PromptTemplateTest(unittest.TestCase):
|
|||||||
self.assertIn("CONFIRMED", tmpl)
|
self.assertIn("CONFIRMED", tmpl)
|
||||||
self.assertIn("DISMISSED", tmpl)
|
self.assertIn("DISMISSED", tmpl)
|
||||||
|
|
||||||
def test_generate_templates_ignore_dismissed(self) -> None:
|
def test_coding_templates_ignore_dismissed(self) -> None:
|
||||||
"""Generate templates should tell coder to ignore DISMISSED items."""
|
"""Coding templates should tell coder to ignore DISMISSED items."""
|
||||||
self.assertIn("DISMISSED", GENERATE_TEMPLATE)
|
self.assertIn("DISMISSED", CODING_TEMPLATE)
|
||||||
self.assertIn("DISMISSED", GENERATE_TEMPLATE_KO)
|
self.assertIn("DISMISSED", CODING_TEMPLATE_KO)
|
||||||
|
|
||||||
def test_aggregate_templates_dismissed_structure(self) -> None:
|
def test_aggregate_templates_dismissed_structure(self) -> None:
|
||||||
"""Aggregate templates should use [False positive] / [Already fixed] tags."""
|
"""Aggregate templates should use [False positive] / [Already fixed] tags."""
|
||||||
@@ -487,11 +705,11 @@ class ReviewMetricsParsingTest(unittest.TestCase):
|
|||||||
language="en",
|
language="en",
|
||||||
pipeline=[
|
pipeline=[
|
||||||
StepConfig(
|
StepConfig(
|
||||||
name="generate",
|
name="coding",
|
||||||
agent="claude-coder",
|
agent="claude-coder",
|
||||||
role="generate",
|
role="coding",
|
||||||
prompt_template="default:generate",
|
prompt_template="default:coding",
|
||||||
output_key="generated_code",
|
output_key="coding_output",
|
||||||
verdict=True,
|
verdict=True,
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
@@ -500,7 +718,7 @@ class ReviewMetricsParsingTest(unittest.TestCase):
|
|||||||
iterations=[
|
iterations=[
|
||||||
IterationResult(
|
IterationResult(
|
||||||
iteration=1,
|
iteration=1,
|
||||||
step_outputs={"generated_code": "some code"},
|
step_outputs={"coding_output": "some code"},
|
||||||
verdict="PASS",
|
verdict="PASS",
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
@@ -511,5 +729,361 @@ class ReviewMetricsParsingTest(unittest.TestCase):
|
|||||||
self.assertNotIn("Review Metrics", report)
|
self.assertNotIn("Review Metrics", report)
|
||||||
|
|
||||||
|
|
||||||
|
class EscalateVerdictTest(unittest.TestCase):
|
||||||
|
"""Test ESCALATE verdict functionality."""
|
||||||
|
|
||||||
|
def test_extract_verdict_escalate(self) -> None:
    """Output ending in "VERDICT: ESCALATE" is reported as ESCALATE."""
    review_text = "Some review content\n\nVERDICT: ESCALATE\n"
    self.assertEqual(_extract_verdict(review_text, r"VERDICT:\s*PASS"), "ESCALATE")
|
||||||
|
|
||||||
|
def test_extract_verdict_escalate_priority(self) -> None:
    """ESCALATE wins when the output contains both a PASS and an ESCALATE verdict."""
    review_text = "VERDICT: PASS\n\nVERDICT: ESCALATE\n"
    self.assertEqual(_extract_verdict(review_text, r"VERDICT:\s*PASS"), "ESCALATE")
|
||||||
|
|
||||||
|
def test_extract_verdict_pass_still_works(self) -> None:
    """Output matching the PASS pattern is reported as PASS."""
    review_text = "All good\n\nVERDICT: PASS\n"
    self.assertEqual(_extract_verdict(review_text, r"VERDICT:\s*PASS"), "PASS")
|
||||||
|
|
||||||
|
def test_extract_verdict_fail_still_works(self) -> None:
    """Output with "VERDICT: FAIL" is reported as FAIL."""
    review_text = "Issues found\n\nVERDICT: FAIL\n"
    self.assertEqual(_extract_verdict(review_text, r"VERDICT:\s*PASS"), "FAIL")
|
||||||
|
|
||||||
|
def test_extract_senior_tracker(self) -> None:
    """The extracted tracker keeps the heading and both issue rows."""
    senior_output = "\n".join(
        [
            "Some text",
            "",
            "## Issue Tracker",
            "| ISS-ID | Severity | Description | Status | Since |",
            "|--------|----------|-------------|--------|-------|",
            "| ISS-001 | Critical | Missing auth | Open | v1 |",
            "| ISS-002 | Major | Bad naming | Fixed | v1 |",
            "",
            "More text",
        ]
    )
    tracker = _extract_senior_tracker(senior_output)
    for fragment in ("Issue Tracker", "ISS-001", "ISS-002"):
        self.assertIn(fragment, tracker)
|
||||||
|
|
||||||
|
def test_extract_senior_tracker_empty(self) -> None:
    """Output without a tracker table yields an empty string."""
    self.assertEqual(_extract_senior_tracker("No tracker table here"), "")
|
||||||
|
|
||||||
|
def test_auto_escalate_heuristic(self) -> None:
    """The same validation issue reported three times in a row triggers escalation."""
    history = [
        "Issue in src/auth.py: missing validation",
        "Issue in src/auth.py: validation still missing",
    ]
    latest = "Issue in src/auth.py: validation not implemented"

    # Should detect repeated issue
    escalate = _detect_auto_escalate(history, latest, threshold=2)
    self.assertTrue(escalate)
|
||||||
|
|
||||||
|
def test_auto_escalate_no_repeat(self) -> None:
    """An unrelated issue in a different file does not trigger escalation."""
    history = ["Issue in src/auth.py: missing validation"]
    latest = "Issue in src/database.py: connection pool"

    escalate = _detect_auto_escalate(history, latest, threshold=2)
    self.assertFalse(escalate)
|
||||||
|
|
||||||
|
def test_auto_escalate_different_issues_same_file(self) -> None:
    """Same file path but different issues should NOT trigger escalation."""
    history = [
        "Issue in src/utils.py: missing validation on input",
        "Issue in src/utils.py: unused import at top of file",
    ]
    latest = "Issue in src/utils.py: error handling not implemented"

    # Every entry names src/utils.py, yet each describes a different problem,
    # so the detector must not treat them as one repeated issue.
    escalate = _detect_auto_escalate(history, latest, threshold=2)
    self.assertFalse(escalate)
|
||||||
|
|
||||||
|
def test_report_escalate_verdict(self) -> None:
    """The English report for an ESCALATE result shows the verdict, notice and issue."""
    outcome = PipelineResult(
        final_verdict="ESCALATE",
        escalated_issues=["Requirements are ambiguous — need stakeholder input"],
    )

    report = build_report(PipelineConfig(language="en"), outcome)

    for fragment in ("ESCALATE", "Human review required", "ambiguous"):
        self.assertIn(fragment, report)
|
||||||
|
|
||||||
|
def test_report_escalate_verdict_ko(self) -> None:
    """The Korean report for an ESCALATE result shows the verdict and Korean notice."""
    outcome = PipelineResult(
        final_verdict="ESCALATE",
        escalated_issues=["요구사항이 모호함"],
    )

    report = build_report(PipelineConfig(language="ko"), outcome)

    for fragment in ("ESCALATE", "사람의 확인이 필요합니다"):
        self.assertIn(fragment, report)
|
||||||
|
|
||||||
|
def test_exit_code_escalate(self) -> None:
    """main() returns exit code 2 when the pipeline's final verdict is ESCALATE.

    Config loading, validation, pipeline execution and escalation printing
    are all patched, so only the CLI's exit-code mapping is exercised.
    """
    from cross_eval.cli import main

    mock_result = PipelineResult(
        final_verdict="ESCALATE",
        escalated_issues=["Needs human review"],
    )

    with patch("cross_eval.config.load_config") as mock_load, \
         patch("cross_eval.config.validate_config", return_value=[]), \
         patch("cross_eval.pipeline.run_pipeline", return_value=mock_result), \
         patch("cross_eval.report.print_escalation_report"):
        mock_config = PipelineConfig(
            pipeline=[
                StepConfig(
                    name="review",
                    agent="claude-reviewer",
                    role="review",
                    prompt_template="default:review",
                    output_key="review_result",
                    verdict=True,
                ),
            ],
            agents=dict(BUILTIN_AGENTS),
            coders=["claude-coder"],
            reviewers=["claude-reviewer"],
            inputs={"plan": Path("/tmp/plan.md")},
            language="en",
            max_iterations=3,
            preset_name="simple",
        )
        mock_load.return_value = mock_config

        # Write the config into a temporary directory rather than passing the
        # name of a still-open NamedTemporaryFile: such a file cannot be
        # reopened by name on Windows while the handle is held.
        with tempfile.TemporaryDirectory() as tmpdir:
            config_path = Path(tmpdir) / "config.yaml"
            config_path.write_text("inputs:\n plan: /tmp/plan.md\n", encoding="utf-8")
            exit_code = main(["run", "-c", str(config_path)])

    self.assertEqual(exit_code, 2)
|
||||||
|
|
||||||
|
def test_senior_prompt_includes_escalate(self) -> None:
    """The senior system prompt names ESCALATE and mentions ambiguity."""
    prompt_text = _SENIOR_SYSTEM_PROMPT
    self.assertIn("ESCALATE", prompt_text)
    # The wording check is case-insensitive.
    self.assertIn("ambiguous", prompt_text.lower())
|
||||||
|
|
||||||
|
def test_aggregate_template_has_tracker(self) -> None:
    """The aggregate-review template has the tracker placeholder, heading and ESCALATE verdict."""
    required_fragments = (
        "{previous_senior_tracker}",
        "Issue Tracker",
        "VERDICT: ESCALATE",
    )
    for fragment in required_fragments:
        self.assertIn(fragment, AGGREGATE_REVIEW_TEMPLATE)
|
||||||
|
|
||||||
|
def test_report_includes_issue_tracker_summary(self) -> None:
    """Issue IDs in a reviewer's output appear in the report's tracker summary."""
    review_step = StepConfig(
        name="review",
        agent="claude-reviewer",
        role="review",
        prompt_template="default:review",
        output_key="review_result",
        verdict=True,
    )
    config = PipelineConfig(language="en", pipeline=[review_step])

    # Reviewer output listing two ISS-prefixed issues and a FAIL verdict.
    review_text = (
        "### Issues Found\n"
        "- ISS-001 [Critical][Omission] Missing auth check\n"
        "- ISS-002 [Major][Omission] No input validation\n"
        "### Verdict\nVERDICT: FAIL"
    )
    failed_iteration = IterationResult(
        iteration=1,
        step_outputs={"review_result": review_text},
        verdict="FAIL",
    )
    result = PipelineResult(iterations=[failed_iteration], final_verdict="FAIL")

    report = build_report(config, result)

    for expected in ("Issue Tracker Summary", "ISS-001", "ISS-002"):
        self.assertIn(expected, report)
|
||||||
|
|
||||||
|
def test_report_includes_senior_tracker_table(self) -> None:
    """Rows from a senior review's Issue Tracker table appear in the report."""
    senior_step = StepConfig(
        name="senior_review",
        agent="claude-senior",
        role="review",
        prompt_template="default:aggregate-review",
        output_key="senior_review_result",
        verdict=True,
    )
    config = PipelineConfig(language="en", pipeline=[senior_step])

    # Senior output containing a markdown tracker table with one open
    # and one fixed issue.
    senior_text = (
        "### Confirmed Issues\n- Missing auth\n\n"
        "## Issue Tracker\n"
        "| ISS-ID | Severity | Description | Status | Since |\n"
        "|--------|----------|-------------|--------|-------|\n"
        "| ISS-001 | Critical | Missing auth check | Open | v1 |\n"
        "| ISS-002 | Major | No validation | Fixed | v1 |\n"
        "\n### Verdict\nVERDICT: FAIL"
    )
    failed_iteration = IterationResult(
        iteration=1,
        step_outputs={"senior_review_result": senior_text},
        verdict="FAIL",
    )
    result = PipelineResult(iterations=[failed_iteration], final_verdict="FAIL")

    report = build_report(config, result)

    for expected in ("Issue Tracker Summary", "ISS-001", "Fixed"):
        self.assertIn(expected, report)
|
||||||
|
|
||||||
|
def test_aggregate_template_ko_has_tracker(self) -> None:
    """The Korean aggregate-review template has the tracker placeholder, heading and ESCALATE verdict."""
    required_fragments = (
        "{previous_senior_tracker}",
        "이슈 트래커",
        "VERDICT: ESCALATE",
    )
    for fragment in required_fragments:
        self.assertIn(fragment, AGGREGATE_REVIEW_TEMPLATE_KO)
|
||||||
|
|
||||||
|
|
||||||
|
class FixPresetBehaviorTest(unittest.TestCase):
    """Behavior of the ``review-fix`` preset via ``load_config`` and the ``run`` CLI."""

    def _write_fix_config(self, root: Path, *, max_iterations: int = 7) -> Path:
        """Write plan/checklist inputs and a review-fix ``config.yaml`` under *root*.

        Returns the path of the written config file.
        """
        (root / "plan.md").write_text("# plan\n", encoding="utf-8")
        (root / "checklist.md").write_text("# checklist\n", encoding="utf-8")

        yaml_text = (
            "inputs:\n"
            " plan: plan.md\n"
            " checklist: checklist.md\n"
            "coders: [claude-coder]\n"
            "reviewers: [claude-reviewer]\n"
            "pipeline: preset:review-fix\n"
            f"max_iterations: {max_iterations}\n"
            "language: en\n"
        )
        target = root / "config.yaml"
        target.write_text(yaml_text, encoding="utf-8")
        return target

    def test_load_config_syncs_phased_iterations_and_enables_agentic(self) -> None:
        """Loading the config sets phase 0's limit to 7 and makes the coder agentic without ``-p``."""
        with tempfile.TemporaryDirectory() as workdir:
            cfg_file = self._write_fix_config(Path(workdir), max_iterations=7)
            loaded = load_config(cfg_file)

            self.assertEqual(loaded.preset_name, "review-fix")
            self.assertEqual(loaded.phases[0].max_iterations, 7)
            self.assertTrue(loaded.agents["claude-coder"].agentic)
            self.assertNotIn("-p", loaded.agents["claude-coder"].args)

    def test_run_config_max_iter_updates_existing_phased_pipeline(self) -> None:
        """``--max-iter 9`` replaces the phase limit of 7 read from the config file."""
        with tempfile.TemporaryDirectory() as workdir:
            cfg_file = self._write_fix_config(Path(workdir), max_iterations=7)
            seen: dict[str, object] = {}

            def _record_and_pass(config, **kwargs):
                # Stand-in for run_pipeline: record what the CLI resolved.
                seen["phase_max"] = config.phases[0].max_iterations
                seen["agentic"] = config.agents[config.coders[0]].agentic
                return PipelineResult(
                    iterations=[],
                    final_verdict="PASS",
                    run_dir=Path(workdir) / "output",
                )

            cli_args = [
                "run",
                "--config", str(cfg_file),
                "--max-iter", "9",
                "--dry-run",
            ]
            with patch("cross_eval.pipeline.run_pipeline", side_effect=_record_and_pass):
                status = main(cli_args)

            self.assertEqual(status, 0)
            self.assertEqual(seen["phase_max"], 9)
            self.assertTrue(seen["agentic"])

    def test_run_preset_review_fix_auto_enables_agentic_without_flag(self) -> None:
        """``--preset review-fix`` alone yields an agentic coder and a phase limit of 3."""
        seen: dict[str, object] = {}

        def _record_and_pass(config, **kwargs):
            # Stand-in for run_pipeline: record what the CLI resolved.
            seen["preset"] = config.preset_name
            seen["agentic"] = config.agents[config.coders[0]].agentic
            seen["phase_max"] = config.phases[0].max_iterations
            return PipelineResult(
                iterations=[],
                final_verdict="PASS",
                run_dir=Path(".cross-eval/output"),
            )

        with patch("cross_eval.pipeline.run_pipeline", side_effect=_record_and_pass):
            status = main(["run", "--preset", "review-fix", "--dry-run"])

        self.assertEqual(status, 0)
        self.assertEqual(seen["preset"], "review-fix")
        self.assertTrue(seen["agentic"])
        self.assertEqual(seen["phase_max"], 3)

    def test_run_senior_model_override_applies_only_to_seniors(self) -> None:
        """``--senior-model sonnet`` changes senior args; coder and reviewer keep ``opus``."""
        seen: dict[str, list[str]] = {}

        def _record_and_pass(config, **kwargs):
            # Snapshot the argument lists of the first coder, reviewer and senior.
            seen["coder_args"] = list(config.agents[config.coders[0]].args)
            seen["reviewer_args"] = list(config.agents[config.reviewers[0]].args)
            seen["senior_args"] = list(config.agents[config.seniors[0]].args)
            return PipelineResult(
                iterations=[],
                final_verdict="PASS",
                run_dir=Path(".cross-eval/output"),
            )

        cli_args = [
            "run",
            "--preset", "review-fix",
            "--coder", "claude",
            "--reviewer", "claude",
            "--senior", "claude",
            "--senior-model", "sonnet",
            "--dry-run",
        ]
        with patch("cross_eval.pipeline.run_pipeline", side_effect=_record_and_pass):
            status = main(cli_args)

        self.assertEqual(status, 0)
        self.assertIn("opus", seen["coder_args"])
        self.assertIn("opus", seen["reviewer_args"])
        self.assertIn("sonnet", seen["senior_args"])
|
||||||
|
|
||||||
|
|
||||||
|
class OutputDirectoryResolutionTest(unittest.TestCase):
    """Checks how ``load_config`` resolves a relative ``output_dir``."""

    def test_load_config_resolves_output_dir_from_project_root(self) -> None:
        """``output_dir`` in ``.cross-eval/config.yaml`` resolves against the parent of ``.cross-eval``."""
        with tempfile.TemporaryDirectory() as workdir:
            project_root = Path(workdir)
            cross_eval_dir = project_root / ".cross-eval"
            cross_eval_dir.mkdir()
            (cross_eval_dir / "plan.md").write_text("# plan\n", encoding="utf-8")

            yaml_text = (
                "inputs:\n"
                " plan: plan.md\n"
                "coders: [claude-coder]\n"
                "reviewers: [claude-reviewer]\n"
                "pipeline: preset:simple\n"
                "output_dir: .cross-eval/output\n"
            )
            cfg_file = cross_eval_dir / "config.yaml"
            cfg_file.write_text(yaml_text, encoding="utf-8")

            loaded = load_config(cfg_file)

            # Compare resolved paths so symlinked temp directories do not matter.
            expected_dir = (project_root / ".cross-eval" / "output").resolve()
            self.assertEqual(loaded.output_dir.resolve(), expected_dir)
|
||||||
|
|
||||||
|
|
||||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||||
|
|||||||
945
tests/test_evidence.py
Normal file
945
tests/test_evidence.py
Normal file
@@ -0,0 +1,945 @@
|
|||||||
|
"""Regression tests for runtime evidence propagation and report visibility.
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
1. Execution evidence is surfaced in reviewer/senior prompt context.
|
||||||
|
2. Reports include command preview and transcript excerpts.
|
||||||
|
3. Claude agentic failure detection (empty diff, write failure, expanded markers).
|
||||||
|
4. _format_execution_evidence produces expected output.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
from cross_eval.agent import (
|
||||||
|
AgentInvocationError,
|
||||||
|
_claims_file_changes,
|
||||||
|
_has_write_failure_indicators,
|
||||||
|
invoke_agent_agentic,
|
||||||
|
)
|
||||||
|
from cross_eval.config import BUILTIN_AGENTS
|
||||||
|
from cross_eval.models import (
|
||||||
|
AgentConfig,
|
||||||
|
AgentResult,
|
||||||
|
IterationResult,
|
||||||
|
PipelineConfig,
|
||||||
|
PipelineResult,
|
||||||
|
StepConfig,
|
||||||
|
)
|
||||||
|
from cross_eval.pipeline import _build_artifact_references, _format_execution_evidence, run_pipeline
|
||||||
|
from cross_eval.report import build_report
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 1. Execution evidence formatting
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestFormatExecutionEvidence(unittest.TestCase):
    """Checks the text summary produced by ``_format_execution_evidence``."""

    def test_empty_results_returns_placeholder(self) -> None:
        """An empty result mapping yields the 'no prior execution evidence' text."""
        summary = _format_execution_evidence({})
        self.assertIn("no prior execution evidence", summary)

    def test_single_result_includes_key_fields(self) -> None:
        """Agent, step, exit code, duration and command appear; no transcript excerpt."""
        coding = AgentResult(
            output="some diff",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=12.3,
            transcript="# Agent Execution Transcript\n\n## Command\nclaude ...",
            command_preview="claude --setting-sources user",
        )

        summary = _format_execution_evidence({"coding_output": coding})

        expected_fragments = (
            "claude-coder",
            "coding",
            "Exit code: 0",
            "12.3s",
            "claude --setting-sources user",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, summary)
        self.assertNotIn("Transcript excerpt", summary)

    def test_multiple_results_separated(self) -> None:
        """Two results both appear and the summary contains a ``---`` separator."""
        coder_result = AgentResult(
            output="diff1",
            exit_code=0,
            agent_name="coder",
            step_name="coding",
            duration_seconds=1.0,
            command_preview="cmd1",
        )
        reviewer_result = AgentResult(
            output="review text",
            exit_code=0,
            agent_name="reviewer",
            step_name="review",
            duration_seconds=2.0,
            command_preview="cmd2",
        )
        by_key = {
            "coding_output": coder_result,
            "review_result": reviewer_result,
        }

        summary = _format_execution_evidence(by_key)

        for fragment in ("coder", "reviewer", "---"):
            self.assertIn(fragment, summary)

    def test_transcript_truncated_at_2000_chars(self) -> None:
        """A 3000-character transcript is not embedded verbatim in the summary."""
        oversized = "x" * 3000
        result = AgentResult(
            output="out",
            exit_code=0,
            agent_name="agent",
            step_name="step",
            duration_seconds=1.0,
            transcript=oversized,
        )

        summary = _format_execution_evidence({"key": result})

        self.assertNotIn("x" * 3000, summary)

    def test_artifact_paths_included_when_run_dir_provided(self) -> None:
        """With ``run_dir`` and ``iteration=2`` the summary names the ``v2/`` artifact files."""
        with tempfile.TemporaryDirectory() as workdir:
            coding = AgentResult(
                output="diff",
                exit_code=0,
                agent_name="coder",
                step_name="coding",
                duration_seconds=1.2,
                transcript="stdout",
                command_preview="claude ...",
            )

            summary = _format_execution_evidence(
                {"coding_output": coding},
                run_dir=Path(workdir),
                iteration=2,
            )

            for artifact in ("v2/coding.md", "v2/coding_transcript.md"):
                self.assertIn(artifact, summary)
|
||||||
|
|
||||||
|
|
||||||
|
class TestArtifactReferences(unittest.TestCase):
    """Checks the reference text built by ``_build_artifact_references``."""

    def test_contains_input_refs_and_git_context(self) -> None:
        """In a committed git repo the references list the plan, a commit and git commands."""
        with tempfile.TemporaryDirectory() as workdir:
            repo = Path(workdir) / "repo"
            repo.mkdir()
            plan_file = repo / "plan.md"
            checklist_file = repo / "checklist.md"
            plan_file.write_text("plan", encoding="utf-8")
            checklist_file.write_text("checklist", encoding="utf-8")

            import subprocess

            # Real git is used here: init, identity, then commit both files.
            git_setup = (
                ["git", "init"],
                ["git", "config", "user.email", "test@test.com"],
                ["git", "config", "user.name", "Test"],
                ["git", "add", "."],
                ["git", "commit", "-m", "init"],
            )
            for argv in git_setup:
                subprocess.run(argv, cwd=repo, capture_output=True, check=True)

            input_refs = {
                "plan_ref": str(plan_file.resolve()),
                "checklist_ref": str(checklist_file.resolve()),
                "docs_ref": "(none)",
            }
            refs = _build_artifact_references(
                input_refs,
                cwd=repo,
                run_dir=repo / ".cross-eval" / "output" / "run",
                iteration=1,
                worktree_path=None,
            )

            for expected in ("Plan:", "Git commit:", "Suggested git commands"):
                self.assertIn(expected, refs)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 2. Evidence in reviewer prompts (integration)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestEvidenceInReviewerPrompt(unittest.TestCase):
    """Reviewer prompts include execution evidence from prior coding step."""

    def test_reviewer_receives_evidence(self) -> None:
        """Run a coding+review pipeline with a stubbed agent and inspect the review prompt."""
        with tempfile.TemporaryDirectory() as tmpdir:
            steps = [
                StepConfig(
                    name="coding", agent="claude-coder", role="coding",
                    prompt_template="default:coding", output_key="coding_output",
                ),
                StepConfig(
                    name="review", agent="claude-reviewer", role="review",
                    prompt_template="default:review", output_key="review_result",
                    verdict=True,
                ),
            ]
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=1,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=steps,
                preset_name="simple",
            )

            captured_prompts: list[dict] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                # Stand-in for invoke_agent: record every prompt, then return
                # a canned result depending on which step is running.
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                })
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=5.0,
                        transcript="# Transcript\nclaude ran...",
                        command_preview="claude --setting-sources user",
                    )
                return AgentResult(
                    output="VERDICT: PASS",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # The reviewer prompt should contain execution evidence
            review_prompts = [
                p for p in captured_prompts if p["step_name"] == "review"
            ]
            # assertGreaterEqual reports the actual count on failure, which
            # assertTrue(len(...) >= 1) would hide behind "False is not true".
            self.assertGreaterEqual(len(review_prompts), 1)
            review_prompt = review_prompts[0]["prompt"]
            self.assertIn("Artifact References", review_prompt)
            self.assertIn("Execution Evidence", review_prompt)
            self.assertIn("claude-coder", review_prompt)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 3. Report includes evidence
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestReportIncludesEvidence(unittest.TestCase):
    """Checks that ``build_report`` surfaces command, transcript and exit code."""

    def _make_pipeline_result(self) -> tuple[PipelineConfig, PipelineResult]:
        """Build a one-iteration coding+review run that ended in PASS.

        Returns the ``(config, result)`` pair to pass to ``build_report``.
        """
        coding_step = StepConfig(
            name="coding", agent="claude-coder", role="coding",
            prompt_template="default:coding", output_key="coding_output",
        )
        review_step = StepConfig(
            name="review", agent="claude-reviewer", role="review",
            prompt_template="default:review", output_key="review_result",
            verdict=True,
        )
        config = PipelineConfig(
            max_iterations=1,
            language="en",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[coding_step, review_step],
            preset_name="simple",
        )

        coding_result = AgentResult(
            output="diff --git a/file ...",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude ...\n## Stdout\nok",
            command_preview="claude --setting-sources user",
        )
        review_result = AgentResult(
            output="All good.\n\nVERDICT: PASS",
            exit_code=0,
            agent_name="claude-reviewer",
            step_name="review",
            duration_seconds=5.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude -p ...\n## Stdout\nAll good.",
            command_preview="claude -p --setting-sources user",
        )

        only_iteration = IterationResult(
            iteration=1,
            step_results={
                "coding_output": coding_result,
                "review_result": review_result,
            },
            step_outputs={
                "coding_output": "diff --git a/file ...",
                "review_result": "All good.\n\nVERDICT: PASS",
            },
            verdict="PASS",
        )
        pipeline_result = PipelineResult(
            iterations=[only_iteration],
            final_verdict="PASS",
            total_duration=15.0,
        )
        return config, pipeline_result

    def test_report_contains_command_preview(self) -> None:
        """The coder's command preview and a ``**Command**`` label are in the report."""
        report = build_report(*self._make_pipeline_result())
        self.assertIn("claude --setting-sources user", report)
        self.assertIn("**Command**", report)

    def test_report_contains_transcript_excerpt(self) -> None:
        """The report has an execution-transcript section with the transcript heading."""
        report = build_report(*self._make_pipeline_result())
        self.assertIn("Execution transcript", report)
        self.assertIn("Agent Execution Transcript", report)

    def test_report_contains_exit_code(self) -> None:
        """The report shows ``**Exit code**: 0``."""
        report = build_report(*self._make_pipeline_result())
        self.assertIn("**Exit code**: 0", report)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 4. Claude agentic hardened failure detection
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestClaimsFileChangesExpanded(unittest.TestCase):
    """Phrases that ``_claims_file_changes`` should and should not treat as change claims."""

    def _assert_claim(self, text: str) -> None:
        """Assert that *text* is recognized as claiming file changes."""
        self.assertTrue(_claims_file_changes(text))

    def _assert_no_claim(self, text: str) -> None:
        """Assert that *text* is not recognized as claiming file changes."""
        self.assertFalse(_claims_file_changes(text))

    def test_ive_implemented(self) -> None:
        self._assert_claim("I've implemented the feature")

    def test_ive_updated(self) -> None:
        self._assert_claim("I've updated the config")

    def test_made_the_following_changes(self) -> None:
        self._assert_claim("I made the following changes to the file")

    def test_applied_the_fix(self) -> None:
        self._assert_claim("Applied the fix for the bug")

    def test_changes_have_been_applied(self) -> None:
        self._assert_claim("Changes have been applied successfully")

    def test_wrote_the_code(self) -> None:
        self._assert_claim("Wrote the code for the new module")

    def test_refactored(self) -> None:
        self._assert_claim("I refactored the pipeline")

    def test_no_changes_still_returns_false(self) -> None:
        self._assert_no_claim("No changes were necessary")

    def test_empty_string_returns_false(self) -> None:
        self._assert_no_claim("")
|
||||||
|
|
||||||
|
|
||||||
|
class TestWriteFailureIndicators(unittest.TestCase):
    """Stderr texts that ``_has_write_failure_indicators`` should and should not flag."""

    def _assert_flagged(self, stderr_text: str) -> None:
        """Assert that *stderr_text* is reported as a write failure."""
        self.assertTrue(_has_write_failure_indicators(stderr_text))

    def _assert_clean(self, stderr_text: str) -> None:
        """Assert that *stderr_text* is not reported as a write failure."""
        self.assertFalse(_has_write_failure_indicators(stderr_text))

    def test_permission_denied(self) -> None:
        self._assert_flagged("Error: Permission denied")

    def test_read_only_filesystem(self) -> None:
        self._assert_flagged("read-only file system")

    def test_sandbox_restriction(self) -> None:
        self._assert_flagged("Blocked by sandbox policy")

    def test_eacces(self) -> None:
        self._assert_flagged("EACCES: operation not permitted")

    def test_empty_stderr_returns_false(self) -> None:
        self._assert_clean("")

    def test_normal_stderr_returns_false(self) -> None:
        self._assert_clean("Downloading model...")
|
||||||
|
|
||||||
|
|
||||||
|
class TestAgenticWriteFailureRaisesError(unittest.TestCase):
    """Agentic mode raises AgentInvocationError on stderr write-failure indicators."""

    @staticmethod
    def _init_git_repo(worktree: Path) -> None:
        """Create a one-commit git repository in *worktree* with the real git CLI.

        Must be called while ``subprocess.run`` is NOT patched; otherwise the
        git commands would hit the mock and no repository would exist.
        """
        import subprocess

        def git(*argv: str) -> None:
            subprocess.run(["git", *argv], cwd=worktree, capture_output=True, check=True)

        git("init")
        git("config", "user.email", "t@t.com")
        git("config", "user.name", "T")
        (worktree / "README.md").write_text("# init\n", encoding="utf-8")
        git("add", ".")
        git("commit", "-m", "init")

    @patch("cross_eval.worktree.capture_diff", return_value="")
    def test_write_failure_detected_from_stderr(self, mock_diff: MagicMock) -> None:
        """A zero exit code with 'Permission denied' on stderr raises WRITE_FAILURE."""
        # What the stubbed agent process reports: success exit code, but a
        # write error on stderr.
        fake_process = MagicMock(
            returncode=0,
            stdout="Done.",
            stderr="Error: Permission denied writing to /src/main.py",
        )
        agent = AgentConfig(
            name="claude-coder", command="claude",
            args=["--setting-sources", "user"], agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            self._init_git_repo(wt)

            # Patch subprocess.run only around the agent call so the repository
            # setup above runs for real.
            with patch("subprocess.run", return_value=fake_process):
                with self.assertRaises(AgentInvocationError) as ctx:
                    invoke_agent_agentic(
                        agent, "implement feature", "coding",
                        worktree_path=wt, quiet=True,
                    )

        self.assertEqual(ctx.exception.failure_type, "WRITE_FAILURE")
        self.assertIn("Permission denied", ctx.exception.raw_error)
|
||||||
|
|
||||||
|
|
||||||
|
class TestAgenticExpandedClaimMarkers(unittest.TestCase):
    """Agentic mode detects expanded claim markers in empty diff scenarios."""

    @staticmethod
    def _init_git_repo(worktree: Path) -> None:
        """Create a one-commit git repository in *worktree* with the real git CLI.

        Must be called while ``subprocess.run`` is NOT patched; otherwise the
        git commands would hit the mock and no repository would exist.
        """
        import subprocess

        def git(*argv: str) -> None:
            subprocess.run(["git", *argv], cwd=worktree, capture_output=True, check=True)

        git("init")
        git("config", "user.email", "t@t.com")
        git("config", "user.name", "T")
        (worktree / "README.md").write_text("# init\n", encoding="utf-8")
        git("add", ".")
        git("commit", "-m", "init")

    @patch("cross_eval.worktree.capture_diff", return_value="")
    def test_ive_implemented_triggers_empty_diff_error(self, mock_diff: MagicMock) -> None:
        """An "I've implemented ..." claim with an empty diff raises EMPTY_DIFF."""
        # What the stubbed agent process reports: it claims changes, with a
        # clean stderr, while the patched capture_diff returns no diff.
        fake_process = MagicMock(
            returncode=0,
            stdout="I've implemented the requested changes to the pipeline.",
            stderr="",
        )
        agent = AgentConfig(
            name="claude-coder", command="claude",
            args=["--setting-sources", "user"], agentic=True,
        )

        with tempfile.TemporaryDirectory() as td:
            wt = Path(td)
            self._init_git_repo(wt)

            # Patch subprocess.run only around the agent call so the repository
            # setup above runs for real.
            with patch("subprocess.run", return_value=fake_process):
                with self.assertRaises(AgentInvocationError) as ctx:
                    invoke_agent_agentic(
                        agent, "implement feature", "coding",
                        worktree_path=wt, quiet=True,
                    )

        self.assertEqual(ctx.exception.failure_type, "EMPTY_DIFF")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 5. Expanded claim/no-change markers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestExpandedClaimMarkers(unittest.TestCase):
    """Completion-style phrases that ``_claims_file_changes`` should treat as change claims."""

    def _assert_claim(self, text: str) -> None:
        """Assert that *text* is recognized as claiming file changes."""
        self.assertTrue(_claims_file_changes(text))

    def test_completed_all_the_changes(self) -> None:
        self._assert_claim("I completed all the changes")

    def test_finished_implementing(self) -> None:
        self._assert_claim("Finished implementing the feature")

    def test_all_tasks_completed(self) -> None:
        self._assert_claim("All tasks completed successfully")

    def test_done_with_the_implementation(self) -> None:
        self._assert_claim("Done with the implementation")

    def test_successfully_implemented(self) -> None:
        self._assert_claim("Successfully implemented the changes")

    def test_changes_are_complete(self) -> None:
        self._assert_claim("All changes are complete")
|
||||||
|
|
||||||
|
|
||||||
|
class TestExpandedNoChangeMarkers(unittest.TestCase):
    """Phrases stating nothing was changed must not count as change claims."""

    def _assert_no_claim(self, text: str) -> None:
        """Assert that *text* is not recognized as claiming file changes."""
        self.assertFalse(_claims_file_changes(text))

    def test_no_changes_needed(self) -> None:
        self._assert_no_claim("No changes needed")

    def test_no_fixes_needed(self) -> None:
        self._assert_no_claim("No fixes needed for this code")

    def test_code_is_correct_as_is(self) -> None:
        self._assert_no_claim("The code is correct as-is")

    def test_already_correct(self) -> None:
        self._assert_no_claim("Implementation is already correct")

    def test_no_action_required(self) -> None:
        self._assert_no_claim("No action required")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 6. Cross-iteration evidence propagation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestCrossIterationEvidencePropagation(unittest.TestCase):
    """Execution evidence is present in review prompts across iterations."""

    def test_prior_evidence_available_in_iteration_2(self) -> None:
        """The second review prompt must contain coding execution evidence."""
        coding_step = StepConfig(
            name="coding", agent="claude-coder", role="coding",
            prompt_template="default:coding", output_key="coding_output",
        )
        review_step = StepConfig(
            name="review", agent="claude-reviewer", role="review",
            prompt_template="default:review", output_key="review_result",
            verdict=True,
        )
        captured_prompts: list[dict] = []

        def _mock(agent_config, prompt, step_name, **kwargs):
            captured_prompts.append({"step_name": step_name, "prompt": prompt})

            if step_name == "coding":
                return AgentResult(
                    output="Implemented feature X",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=5.0,
                    transcript="# Transcript\nclaude ran the task",
                    command_preview="claude --setting-sources user",
                )

            # The current call is already recorded above, so a count of one
            # identifies the first review (FAIL); later reviews PASS.
            reviews_seen = sum(
                1 for call in captured_prompts if call["step_name"] == "review"
            )
            if reviews_seen > 1:
                return AgentResult(
                    output="All good\n\nVERDICT: PASS",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                )
            return AgentResult(
                output="Issues found\n\nVERDICT: FAIL",
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=2.0,
                transcript="# Transcript\nreview ran",
                command_preview="claude -p --setting-sources user",
            )

        with tempfile.TemporaryDirectory() as tmpdir:
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=2,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=[coding_step, review_step],
                preset_name="simple",
            )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 2)

            # One review prompt is expected per iteration.
            review_calls = [
                call for call in captured_prompts
                if call["step_name"] == "review"
            ]
            self.assertEqual(len(review_calls), 2)

            # The review step runs after coding, so the prompt also holds the
            # current iteration's coding evidence; the check here is only
            # that evidence is present in the second review prompt.
            second_review_prompt = review_calls[1]["prompt"]
            self.assertIn("Exit code: 0", second_review_prompt)
            self.assertIn("claude-coder", second_review_prompt)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 7. Report evidence summary table
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestReportEvidenceSummaryTable(unittest.TestCase):
    """The built report contains an evidence summary for the iteration."""

    def test_report_contains_evidence_summary(self) -> None:
        """Agent names and step durations must show up in the report text."""
        config = PipelineConfig(
            max_iterations=1,
            language="en",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[
                StepConfig(
                    name="coding", agent="claude-coder", role="coding",
                    prompt_template="default:coding", output_key="coding_output",
                ),
                StepConfig(
                    name="review", agent="claude-reviewer", role="review",
                    prompt_template="default:review", output_key="review_result",
                    verdict=True,
                ),
            ],
            preset_name="simple",
        )

        results_by_key = {
            "coding_output": AgentResult(
                output="diff --git a/file ...",
                exit_code=0,
                agent_name="claude-coder",
                step_name="coding",
                duration_seconds=10.0,
                transcript="# Transcript",
                command_preview="claude --setting-sources user",
            ),
            "review_result": AgentResult(
                output="VERDICT: PASS",
                exit_code=0,
                agent_name="claude-reviewer",
                step_name="review",
                duration_seconds=5.0,
                transcript="# Transcript",
                command_preview="claude -p",
            ),
        }

        single_iteration = IterationResult(
            iteration=1,
            step_results=results_by_key,
            step_outputs={
                "coding_output": "diff --git a/file ...",
                "review_result": "VERDICT: PASS",
            },
            verdict="PASS",
        )
        outcome = PipelineResult(
            iterations=[single_iteration],
            final_verdict="PASS",
            total_duration=15.0,
        )

        report = build_report(config, outcome)

        expected_fragments = (
            "Evidence Summary",
            "claude-coder",
            "claude-reviewer",
            "10.0s",
            "5.0s",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, report)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 8. _build_context merges prior and current evidence
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestBuildContextMergesEvidence(unittest.TestCase):
    """``_build_context`` keeps prior evidence and adds current step evidence."""

    # Evidence text supplied through the inputs, standing in for an earlier iteration.
    _PRIOR_EVIDENCE = "### Step: coding (coder)\n- Exit code: 0"

    def test_prior_evidence_used_when_no_current_results(self) -> None:
        """With ``step_results=None`` the prior evidence is kept in the context."""
        from cross_eval.pipeline import _build_context

        inputs = {"plan": "test", "execution_evidence": self._PRIOR_EVIDENCE}
        context = _build_context(
            inputs, {}, "feedback", 2, 5, step_results=None,
        )

        self.assertIn("coding (coder)", context["execution_evidence"])

    def test_current_and_prior_merged(self) -> None:
        """Prior and current evidence both appear under their own headings."""
        from cross_eval.pipeline import _build_context

        inputs = {"plan": "test", "execution_evidence": self._PRIOR_EVIDENCE}
        review_outcome = AgentResult(
            output="review text", exit_code=0, agent_name="reviewer",
            step_name="review", duration_seconds=3.0,
            command_preview="cmd",
        )
        context = _build_context(
            inputs, {}, "feedback", 2, 5,
            step_results={"review_result": review_outcome},
        )

        merged = context["execution_evidence"]
        for expected in (
            "Prior Iteration Evidence",
            "Current Iteration Evidence",
            "coding (coder)",
            "reviewer",
        ):
            self.assertIn(expected, merged)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 9. Evidence in review-only template (used by review-fix preset)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestReviewOnlyTemplateIncludesEvidence(unittest.TestCase):
    """The review-only templates expose an ``{execution_evidence}`` slot."""

    def test_review_only_template_has_evidence_placeholder(self) -> None:
        """Both the default and the ``_KO`` template carry the placeholder."""
        from cross_eval.prompts import REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO

        for template in (REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO):
            self.assertIn("{execution_evidence}", template)

    def test_review_only_renders_evidence(self) -> None:
        """Rendering the template copies the evidence text into the output."""
        from cross_eval.prompts import render_template, REVIEW_ONLY_TEMPLATE

        evidence_text = "### Step: coding (coder)\n- Exit code: 0\n- Duration: 5.0s"
        rendered = render_template(
            REVIEW_ONLY_TEMPLATE,
            {
                "plan": "Test plan",
                "checklist": "Test checklist",
                "docs": "Test docs",
                "feedback": "No feedback",
                "execution_evidence": evidence_text,
                "iteration": "1",
                "max_iterations": "3",
            },
        )

        self.assertIn("Exit code: 0", rendered)
        self.assertIn("Duration: 5.0s", rendered)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 10. Evidence propagation in phased pipeline (coding-review-fix)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestPhasedPipelineEvidencePropagation(unittest.TestCase):
    """Evidence reaches review prompts in the coding-review-fix phased pipeline."""

    def test_reviewer_receives_coding_evidence_in_phased_pipeline(self) -> None:
        """The first ``review_*`` prompt must contain an evidence section."""
        from cross_eval.prompts import _build_coding_review_fix_preset

        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["claude-reviewer"]
            seniors = ["claude-senior"]
            phases = _build_coding_review_fix_preset(coders, reviewers, seniors)

            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=phases,
                preset_name="coding-review-fix",
            )

            captured_prompts: list[dict] = []

            def _mock(agent_config, prompt, step_name, **kwargs):
                # Record every invocation so the prompts can be inspected later.
                captured_prompts.append({
                    "step_name": step_name,
                    "prompt": prompt,
                    "agent_name": agent_config.name,
                })
                if step_name == "coding":
                    return AgentResult(
                        output="Implemented feature X",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=10.0,
                        transcript="# Transcript\nclaude executed coding task",
                        command_preview="claude --setting-sources user",
                    )
                if step_name == "verify":
                    # The verify step is the one that reports the PASS verdict.
                    return AgentResult(
                        output="All good\n\nVERDICT: PASS",
                        exit_code=0,
                        agent_name=agent_config.name,
                        step_name=step_name,
                        duration_seconds=3.0,
                    )
                return AgentResult(
                    output=f"Output for {step_name}",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=2.0,
                    transcript=f"# Transcript for {step_name}",
                    command_preview=f"cmd-{step_name}",
                )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            review_prompts = [
                p for p in captured_prompts
                if p["step_name"].startswith("review_")
            ]
            # assertGreaterEqual reports the actual count on failure, unlike
            # assertTrue on a pre-computed boolean.
            self.assertGreaterEqual(len(review_prompts), 1)

            review_prompt = review_prompts[0]["prompt"]
            self.assertIn("Execution Evidence", review_prompt)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 11. Evidence format includes output size
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestEvidenceIncludesOutputSize(unittest.TestCase):
    """``_format_execution_evidence`` reports the size of the step output."""

    def test_output_size_in_evidence(self) -> None:
        """A 500-character output is reported as ``Output size: 500 chars``."""
        payload = "x" * 500
        coding_result = AgentResult(
            output=payload,
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=5.0,
            command_preview="claude --setting-sources user",
        )

        rendered = _format_execution_evidence({"coding_output": coding_result})

        self.assertIn("Output size: 500 chars", rendered)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 12. Report transcript label i18n
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestReportTranscriptLabelI18n(unittest.TestCase):
    """The report's transcript label follows the configured language."""

    def test_korean_transcript_label(self) -> None:
        """With ``language="ko"`` the report uses the Korean transcript label."""
        coding_step = StepConfig(
            name="coding", agent="claude-coder", role="coding",
            prompt_template="default:coding", output_key="coding_output",
        )
        config = PipelineConfig(
            max_iterations=1,
            language="ko",
            inputs={"plan": "Plan", "checklist": "CL"},
            agents=dict(BUILTIN_AGENTS),
            pipeline=[coding_step],
            preset_name="simple",
        )

        coding_result = AgentResult(
            output="diff --git a/file ...",
            exit_code=0,
            agent_name="claude-coder",
            step_name="coding",
            duration_seconds=10.0,
            transcript="# Agent Execution Transcript\n## Command\nclaude ...",
            command_preview="claude --setting-sources user",
        )
        outcome = PipelineResult(
            iterations=[
                IterationResult(
                    iteration=1,
                    step_results={"coding_output": coding_result},
                    step_outputs={"coding_output": "diff --git a/file ..."},
                ),
            ],
            final_verdict="MAX_ITERATIONS_REACHED",
            total_duration=10.0,
        )

        report = build_report(config, outcome)

        self.assertIn("실행 트랜스크립트", report)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 13. Claude coder + Codex reviewer/senior combination
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestCodingReviewFixClaudeCodexCombination(unittest.TestCase):
    """coding-review-fix runs with a Claude coder and Codex reviewer/senior."""

    def test_claude_coder_codex_reviewer_completes(self) -> None:
        """The preset reaches PASS and records results from both agent families."""
        from cross_eval.prompts import _build_coding_review_fix_preset

        def _mock(agent_config, prompt, step_name, **kwargs):
            if step_name != "verify":
                return AgentResult(
                    output=f"Output for {step_name}",
                    exit_code=0,
                    agent_name=agent_config.name,
                    step_name=step_name,
                    duration_seconds=3.0,
                    transcript=f"# Transcript for {step_name}",
                    command_preview=f"cmd-{step_name}",
                )
            # Only the verify step emits the verdict line.
            return AgentResult(
                output="All good\n\nVERDICT: PASS",
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=2.0,
                transcript="# Transcript",
                command_preview="codex exec",
            )

        with tempfile.TemporaryDirectory() as tmpdir:
            coders = ["claude-coder"]
            reviewers = ["codex-reviewer"]
            seniors = ["codex-senior"]
            phases = _build_coding_review_fix_preset(coders, reviewers, seniors)

            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coders,
                reviewers=reviewers,
                seniors=seniors,
                phases=phases,
                preset_name="coding-review-fix",
            )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_mock):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")

            # Collect every agent name that produced a recorded step result.
            agents_used = {
                step_result.agent_name
                for iteration in result.iterations
                for step_result in iteration.step_results.values()
            }
            self.assertIn("claude-coder", agents_used)
            self.assertIn("codex-reviewer", agents_used)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the unittest runner when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
||||||
266
tests/test_onboarding.py
Normal file
266
tests/test_onboarding.py
Normal file
@@ -0,0 +1,266 @@
|
|||||||
|
"""Tests for doctor, demo, and guided init features."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
|
||||||
|
from cross_eval.doctor import (
|
||||||
|
DoctorCheck,
|
||||||
|
check_cli_installed,
|
||||||
|
check_config,
|
||||||
|
format_doctor_results,
|
||||||
|
)
|
||||||
|
from cross_eval.demo import (
|
||||||
|
DEMO_CHECKLIST,
|
||||||
|
DEMO_PLAN,
|
||||||
|
run_mock_demo,
|
||||||
|
)
|
||||||
|
from cross_eval.cli import (
|
||||||
|
_generate_guided_config,
|
||||||
|
_prompt_choice,
|
||||||
|
_prompt_text,
|
||||||
|
main,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Doctor tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class DoctorCheckInstalledTest(unittest.TestCase):
    """Tests for the doctor helpers and the ``doctor`` CLI command."""

    def test_check_cli_installed_found(self) -> None:
        """A resolvable executable is reported as found, with its version text."""
        fake_completed = MagicMock(stdout="Python 3.12.0", stderr="")
        with patch(
            "cross_eval.doctor.shutil.which", return_value="/usr/bin/python3"
        ), patch(
            "cross_eval.doctor.subprocess.run", return_value=fake_completed
        ):
            found, version = check_cli_installed("python3")

        self.assertTrue(found)
        self.assertIn("Python", version)

    def test_check_cli_installed_not_found(self) -> None:
        """An unresolvable executable is reported as missing."""
        with patch("cross_eval.doctor.shutil.which", return_value=None):
            found, msg = check_cli_installed("nonexistent-tool")

        self.assertFalse(found)
        self.assertIn("not found", msg)

    def test_check_config_exists_valid(self) -> None:
        """A config with an existing plan file validates without errors."""
        config_text = (
            "inputs:\n plan: plan.md\ncoders: [claude-coder]\n"
            "reviewers: [claude-reviewer]\npipeline: preset:simple\n"
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            ce_dir = Path(tmpdir) / ".cross-eval"
            ce_dir.mkdir()
            (ce_dir / "config.yaml").write_text(config_text, encoding="utf-8")
            # The referenced plan file is created alongside the config.
            (ce_dir / "plan.md").write_text("# Plan", encoding="utf-8")

            ok, path, errors = check_config(Path(tmpdir))

        self.assertTrue(ok)
        self.assertIsNotNone(path)
        self.assertEqual(errors, [])

    def test_check_config_not_exists(self) -> None:
        """An empty directory yields a failed check and no config path."""
        with tempfile.TemporaryDirectory() as tmpdir:
            ok, path, errors = check_config(Path(tmpdir))

        self.assertFalse(ok)
        self.assertIsNone(path)

    def test_check_config_invalid(self) -> None:
        """A config that fails validation is still located on disk."""
        with tempfile.TemporaryDirectory() as tmpdir:
            ce_dir = Path(tmpdir) / ".cross-eval"
            ce_dir.mkdir()
            # Parseable YAML whose plan path does not exist.
            (ce_dir / "config.yaml").write_text(
                "inputs:\n plan: /nonexistent/plan.md\n",
                encoding="utf-8",
            )

            ok, path, errors = check_config(Path(tmpdir))

        self.assertFalse(ok)
        self.assertIsNotNone(path)

    def test_format_doctor_results_all_pass(self) -> None:
        """Passing checks render a check mark and the all-passed summary."""
        passing = [
            DoctorCheck("test", True, True, "ok"),
            DoctorCheck("test2", True, False, "ok"),
        ]

        output = format_doctor_results(passing)

        self.assertIn("✓", output)
        self.assertIn("All checks passed", output)

    def test_format_doctor_results_critical_fail(self) -> None:
        """A failing check renders a cross and mentions "critical"."""
        failing = [DoctorCheck("claude CLI", False, True, "not found")]

        output = format_doctor_results(failing)

        self.assertIn("✗", output)
        self.assertIn("critical", output.lower())

    def test_cmd_doctor_returns_0_all_pass(self) -> None:
        """``main(["doctor"])`` exits 0 when the patched checks all pass."""
        checks = [DoctorCheck("test", True, True, "ok")]
        with patch("cross_eval.doctor.run_doctor", return_value=checks):
            exit_code = main(["doctor"])

        self.assertEqual(exit_code, 0)

    def test_cmd_doctor_returns_1_critical_fail(self) -> None:
        """``main(["doctor"])`` exits 1 when the patched check fails."""
        checks = [DoctorCheck("claude CLI", False, True, "not found")]
        with patch("cross_eval.doctor.run_doctor", return_value=checks):
            exit_code = main(["doctor"])

        self.assertEqual(exit_code, 1)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Demo tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class DemoTest(unittest.TestCase):
    """Tests for the demo fixtures, the mock demo and the ``demo`` CLI command."""

    def test_demo_plan_is_nonempty(self) -> None:
        """The demo plan text mentions fibonacci (case-insensitive)."""
        plan_text = DEMO_PLAN.lower()
        self.assertIn("fibonacci", plan_text)

    def test_demo_checklist_is_nonempty(self) -> None:
        """The demo checklist text mentions fibonacci (case-insensitive)."""
        checklist_text = DEMO_CHECKLIST.lower()
        self.assertIn("fibonacci", checklist_text)

    def test_mock_demo_runs_without_error(self) -> None:
        """The mock demo completes without raising while stdout is patched."""
        with patch("sys.stdout"):
            run_mock_demo(preset="simple")

    def test_mock_demo_escalate_runs_without_error(self) -> None:
        """The escalate variant of the mock demo completes without raising."""
        with patch("sys.stdout"):
            run_mock_demo(preset="simple", show_escalate=True)

    def test_cmd_demo_mock_default(self) -> None:
        """``demo`` without flags calls the mock demo with escalation off."""
        with patch("cross_eval.demo.run_mock_demo") as demo_runner:
            exit_code = main(["demo"])

        demo_runner.assert_called_once_with(preset="simple", show_escalate=False)
        self.assertEqual(exit_code, 0)

    def test_cmd_demo_escalate_flag(self) -> None:
        """``demo --escalate`` calls the mock demo with escalation on."""
        with patch("cross_eval.demo.run_mock_demo") as demo_runner:
            exit_code = main(["demo", "--escalate"])

        demo_runner.assert_called_once_with(preset="simple", show_escalate=True)
        self.assertEqual(exit_code, 0)

    def test_cmd_demo_live_requires_confirmation(self) -> None:
        """Answering "n" at the ``demo --live`` prompt still exits with 0."""
        with patch("builtins.input", return_value="n"):
            exit_code = main(["demo", "--live"])

        self.assertEqual(exit_code, 0)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Guided init tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class GuidedInitTest(unittest.TestCase):
    """Tests for the interactive prompt helpers and ``init --guided``."""

    def test_prompt_choice_default(self) -> None:
        """Empty input selects the option at the 1-based ``default`` position."""
        with patch("builtins.input", return_value=""):
            result = _prompt_choice("Pick:", ["a", "b", "c"], default=2)
        self.assertEqual(result, "b")

    def test_prompt_choice_by_number(self) -> None:
        """A numeric answer selects the option at that 1-based position."""
        with patch("builtins.input", return_value="3"):
            result = _prompt_choice("Pick:", ["a", "b", "c"], default=1)
        self.assertEqual(result, "c")

    def test_prompt_choice_by_name(self) -> None:
        """Typing an option's name selects that option."""
        with patch("builtins.input", return_value="simple"):
            result = _prompt_choice("Pick:", ["simple", "review-fix"], default=1)
        self.assertEqual(result, "simple")

    def test_prompt_text_default(self) -> None:
        """Empty input falls back to the supplied default text."""
        with patch("builtins.input", return_value=""):
            result = _prompt_text("Name", default="claude")
        self.assertEqual(result, "claude")

    def test_prompt_text_custom(self) -> None:
        """Non-empty input is returned instead of the default."""
        with patch("builtins.input", return_value="codex"):
            result = _prompt_text("Name", default="claude")
        self.assertEqual(result, "codex")

    def test_generate_guided_config(self) -> None:
        """Short agent names appear in the config with their role suffix."""
        config = _generate_guided_config(
            "review-fix", "ko",
            {
                "coder": "claude",
                "reviewer": "codex",
                "senior": "codex",
                "max_iter": 5,
            },
        )
        self.assertIn("preset:review-fix", config)
        self.assertIn("language: ko", config)
        self.assertIn("claude-coder", config)
        self.assertIn("codex-reviewer", config)
        self.assertIn("codex-senior", config)
        self.assertIn("max_iterations: 5", config)

    def test_generate_guided_config_full_name(self) -> None:
        """Names that already carry a role suffix are not suffixed twice."""
        config = _generate_guided_config(
            "simple", "ko",
            {
                "coder": "claude-coder",
                "reviewer": "codex-reviewer",
                "senior": "",
                "max_iter": 3,
            },
        )
        self.assertIn("claude-coder", config)
        self.assertNotIn("claude-coder-coder", config)
        self.assertIn("codex-reviewer", config)
        self.assertNotIn("codex-reviewer-reviewer", config)

    def test_generate_guided_config_no_senior(self) -> None:
        """An empty senior answer leaves "senior" out of the config text."""
        config = _generate_guided_config(
            "simple", "en",
            {
                "coder": "claude",
                "reviewer": "claude",
                "senior": "",
                "max_iter": 3,
            },
        )
        self.assertNotIn("senior", config.lower())

    def test_guided_init_creates_files(self) -> None:
        """Accepting every default writes ``.cross-eval/config.yaml`` and exits 0."""
        # Every prompt is answered with an empty string, i.e. its default.
        inputs = iter(["", "", "", "", "", "", ""])
        with tempfile.TemporaryDirectory() as tmpdir:
            with patch("builtins.input", side_effect=lambda _="": next(inputs, "")):
                exit_code = main(["init", "--guided", "--dir", tmpdir])

            config_path = Path(tmpdir) / ".cross-eval" / "config.yaml"
            self.assertTrue(config_path.exists())
            self.assertEqual(exit_code, 0)

    def test_guided_init_preserves_existing_files(self) -> None:
        """An existing ``config.yaml`` is left untouched by guided init."""
        inputs = iter(["", "", "", "", "", "", ""])
        with tempfile.TemporaryDirectory() as tmpdir:
            ce_dir = Path(tmpdir) / ".cross-eval"
            ce_dir.mkdir()
            existing = ce_dir / "config.yaml"
            existing.write_text("# existing", encoding="utf-8")

            with patch("builtins.input", side_effect=lambda _="": next(inputs, "")):
                main(["init", "--guided", "--dir", tmpdir])

            # Read back with the encoding used for the write so the check
            # does not depend on the platform's default encoding.
            self.assertEqual(existing.read_text(encoding="utf-8"), "# existing")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the unittest runner when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
||||||
459
tests/test_pipeline_integration.py
Normal file
459
tests/test_pipeline_integration.py
Normal file
@@ -0,0 +1,459 @@
|
|||||||
|
"""Integration tests for cross-eval pipeline with mocked agents."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from cross_eval.config import BUILTIN_AGENTS
|
||||||
|
from cross_eval.models import (
|
||||||
|
AgentResult,
|
||||||
|
PipelineConfig,
|
||||||
|
StepConfig,
|
||||||
|
)
|
||||||
|
from cross_eval.pipeline import run_pipeline
|
||||||
|
from cross_eval.prompts import _build_review_fix_preset, _build_simple_preset
|
||||||
|
|
||||||
|
|
||||||
|
def _make_mock_agent(outputs: list[str]):
|
||||||
|
"""Returns a side_effect function that returns outputs in sequence."""
|
||||||
|
call_count = [0]
|
||||||
|
|
||||||
|
def _mock(agent_config, prompt, step_name, **kwargs):
|
||||||
|
idx = min(call_count[0], len(outputs) - 1)
|
||||||
|
call_count[0] += 1
|
||||||
|
return AgentResult(
|
||||||
|
output=outputs[idx],
|
||||||
|
exit_code=0,
|
||||||
|
agent_name=agent_config.name,
|
||||||
|
step_name=step_name,
|
||||||
|
duration_seconds=0.1,
|
||||||
|
)
|
||||||
|
|
||||||
|
return _mock
|
||||||
|
|
||||||
|
|
||||||
|
def _make_step_mock(step_outputs: dict[str, list[str]]):
|
||||||
|
"""Returns a side_effect that dispatches by step_name, cycling through outputs."""
|
||||||
|
counters: dict[str, int] = {}
|
||||||
|
|
||||||
|
def _mock(agent_config, prompt, step_name, **kwargs):
|
||||||
|
if step_name not in counters:
|
||||||
|
counters[step_name] = 0
|
||||||
|
outputs = step_outputs.get(step_name, [""])
|
||||||
|
idx = min(counters[step_name], len(outputs) - 1)
|
||||||
|
counters[step_name] += 1
|
||||||
|
return AgentResult(
|
||||||
|
output=outputs[idx],
|
||||||
|
exit_code=0,
|
||||||
|
agent_name=agent_config.name,
|
||||||
|
step_name=step_name,
|
||||||
|
duration_seconds=0.1,
|
||||||
|
)
|
||||||
|
|
||||||
|
return _mock
|
||||||
|
|
||||||
|
|
||||||
|
def _minimal_simple_config(
    run_dir: Path,
    max_iterations: int = 3,
    seniors: list[str] | None = None,
) -> PipelineConfig:
    """Create the smallest "simple" preset configuration used by these tests.

    Args:
        run_dir: Directory passed to the config as ``output_dir``.
        max_iterations: Upper bound on pipeline iterations.
        seniors: Senior agent names; ``None`` is treated as no seniors.

    Returns:
        A ``PipelineConfig`` with one coder and one reviewer, both built-in
        Claude agents, and the steps produced by ``_build_simple_preset``.
    """
    coder_names = ["claude-coder"]
    reviewer_names = ["claude-reviewer"]
    senior_names = [] if seniors is None else seniors

    preset_steps = _build_simple_preset(coder_names, reviewer_names, senior_names)

    return PipelineConfig(
        output_dir=run_dir,
        max_iterations=max_iterations,
        min_iterations=1,
        language="en",
        inputs={"plan": "Test plan", "checklist": "Test checklist"},
        agents=dict(BUILTIN_AGENTS),
        coders=coder_names,
        reviewers=reviewer_names,
        seniors=senior_names,
        pipeline=preset_steps,
        preset_name="simple",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class TestSimplePipelinePassStopsLoop(unittest.TestCase):
    """Test 1: a PASS verdict on the first review ends the run after one iteration."""

    def test_simple_pipeline_pass_stops_loop(self) -> None:
        """The pipeline reports PASS and records exactly one iteration."""
        scripted_outputs = [
            "Coding output here",         # returned for the coding step
            "All good\n\nVERDICT: PASS",  # returned for the review step
        ]
        agent_stub = _make_mock_agent(scripted_outputs)

        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(Path(tmpdir))

            with patch("cross_eval.pipeline.invoke_agent", side_effect=agent_stub):
                result = run_pipeline(config)

            self.assertEqual(result.final_verdict, "PASS")
            self.assertEqual(len(result.iterations), 1)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSimplePipelineFailThenPass(unittest.TestCase):
    """Scenario 2: the review fails once and then passes, giving two iterations."""

    def test_simple_pipeline_fail_then_pass(self) -> None:
        """Expect a final PASS verdict after exactly two recorded iterations."""
        # Per-step reply queues handed to the step-aware mock factory.
        replies_by_step = {
            "coding": ["Coding output v1", "Coding output v2"],
            "review": [
                "Issues found\n\nVERDICT: FAIL",
                "All good\n\nVERDICT: PASS",
            ],
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(Path(tmpdir), max_iterations=5)
            fake_agent = _make_step_mock(replies_by_step)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
                outcome = run_pipeline(config)

            self.assertEqual(outcome.final_verdict, "PASS")
            self.assertEqual(len(outcome.iterations), 2)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSimplePipelineEscalateBreaksLoop(unittest.TestCase):
    """Scenario 3: an ESCALATE verdict from the senior review ends the run at once."""

    def test_simple_pipeline_escalate_breaks_loop(self) -> None:
        """Expect final_verdict ESCALATE, one iteration, and recorded escalated issues."""
        senior_reply = (
            "### Confirmed Issues\n"
            "- [Critical] Requirements are ambiguous\n\n"
            "### Escalated Issues\n"
            "Requirements need stakeholder clarification\n\n"
            "### Verdict\n"
            "VERDICT: ESCALATE\n"
        )
        replies_by_step = {
            "coding": ["Coding output"],
            "review": ["Issues found\n\nVERDICT: FAIL"],
            "senior_review": [senior_reply],
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(
                Path(tmpdir),
                max_iterations=5,
                seniors=["claude-senior"],
            )
            fake_agent = _make_step_mock(replies_by_step)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
                outcome = run_pipeline(config)

            self.assertEqual(outcome.final_verdict, "ESCALATE")
            self.assertEqual(len(outcome.iterations), 1)
            self.assertGreater(len(outcome.escalated_issues), 0)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSimplePipelineEscalatePriorityOverPass(unittest.TestCase):
    """Scenario 4: with one PASS and one ESCALATE verdict step, ESCALATE wins."""

    def test_simple_pipeline_escalate_priority_over_pass(self) -> None:
        """Build a custom pipeline with two verdict steps and no senior."""
        coding_step = StepConfig(
            name="coding",
            agent="claude-coder",
            role="coding",
            prompt_template="default:coding",
            output_key="coding_output",
        )
        first_review = StepConfig(
            name="review_a",
            agent="claude-reviewer",
            role="review",
            prompt_template="default:review",
            output_key="review_a_result",
            verdict=True,
        )
        second_review = StepConfig(
            name="review_b",
            agent="claude-reviewer",
            role="review",
            prompt_template="default:review",
            output_key="review_b_result",
            verdict=True,
        )
        escalating_reply = (
            "### Escalated Issues\n"
            "Ambiguous requirements need clarification\n\n"
            "VERDICT: ESCALATE\n"
        )
        replies_by_step = {
            "coding": ["Coding output"],
            "review_a": ["All good\n\nVERDICT: PASS"],
            "review_b": [escalating_reply],
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=3,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=[coding_step, first_review, second_review],
                preset_name="custom",
            )
            fake_agent = _make_step_mock(replies_by_step)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
                outcome = run_pipeline(config)

            self.assertEqual(outcome.final_verdict, "ESCALATE")
            self.assertGreater(len(outcome.escalated_issues), 0)
|
||||||
|
|
||||||
|
|
||||||
|
class TestPhasedPipelineEscalateBreaksPhase(unittest.TestCase):
    """Scenario 5: in the phased review-fix preset, an ESCALATE from verify stops the phase."""

    def test_phased_pipeline_escalate_breaks_phase(self) -> None:
        """Expect final_verdict ESCALATE and at least one escalated issue."""
        coder_names = ["claude-coder"]
        reviewer_names = ["claude-reviewer"]
        senior_names = ["claude-senior"]
        preset_phases = _build_review_fix_preset(coder_names, reviewer_names, senior_names)

        verify_reply = (
            "### Escalated Issues\n"
            "Architecture decisions needed beyond plan scope\n\n"
            "### Verdict\n"
            "VERDICT: ESCALATE\n"
        )
        replies_by_step = {
            "review_claude_reviewer": ["Review findings here"],
            "aggregate_review": ["Aggregated review\n\nAction items: fix X"],
            "coding": ["Fixed code"],
            "verify": [verify_reply],
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            config = PipelineConfig(
                output_dir=Path(tmpdir),
                max_iterations=5,
                min_iterations=1,
                language="en",
                inputs={"plan": "Test plan", "checklist": "Test checklist"},
                agents=dict(BUILTIN_AGENTS),
                coders=coder_names,
                reviewers=reviewer_names,
                seniors=senior_names,
                phases=preset_phases,
                preset_name="review-fix",
            )
            fake_agent = _make_step_mock(replies_by_step)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
                outcome = run_pipeline(config)

            self.assertEqual(outcome.final_verdict, "ESCALATE")
            self.assertGreater(len(outcome.escalated_issues), 0)
|
||||||
|
|
||||||
|
|
||||||
|
class TestAutoEscalateFiresWithoutSenior(unittest.TestCase):
    """Scenario 6: no senior, identical FAIL feedback three times -> auto-escalate."""

    def test_auto_escalate_fires_without_senior(self) -> None:
        """Expect ESCALATE with an "Auto-escalated" entry among the escalated issues."""
        # The same feedback, naming the same file path, is returned on every review.
        stuck_feedback = (
            "Issues found in src/auth.py: missing validation check.\n"
            "The file src/auth.py still has the same problem.\n\n"
            "VERDICT: FAIL"
        )
        replies_by_step = {
            "coding": ["Coding output v1", "Coding output v2", "Coding output v3"],
            "review": [stuck_feedback] * 3,
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            # Without seniors the review step itself carries the verdict.
            config = _minimal_simple_config(
                Path(tmpdir),
                max_iterations=5,
                seniors=None,
            )
            fake_agent = _make_step_mock(replies_by_step)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
                outcome = run_pipeline(config)

            self.assertEqual(outcome.final_verdict, "ESCALATE")
            auto_flagged = any(
                "Auto-escalated" in issue for issue in outcome.escalated_issues
            )
            self.assertTrue(auto_flagged)
|
||||||
|
|
||||||
|
|
||||||
|
class TestAutoEscalateDoesNotFireWithSenior(unittest.TestCase):
    """Scenario 7: repeated FAIL feedback with a senior step must not auto-escalate."""

    def test_auto_escalate_does_not_fire_with_senior(self) -> None:
        """Expect the run to exhaust all five iterations instead of escalating."""
        reviewer_feedback = (
            "Issues found in src/auth.py: missing validation check.\n"
            "VERDICT: FAIL"
        )
        # The senior also fails every round; auto-escalation must still stay off.
        # NOTE(review): presumably the pipeline treats a populated seniors list
        # as an aggregator (has_aggregator) - confirm in cross_eval.pipeline.
        senior_feedback = (
            "### Confirmed Issues\n"
            "- Missing validation in src/auth.py\n\n"
            "### Action Items\n"
            "1. Add validation in src/auth.py\n\n"
            "VERDICT: FAIL"
        )
        rounds = 5
        replies_by_step = {
            "coding": [
                "Coding output v1",
                "Coding output v2",
                "Coding output v3",
                "Coding output v4",
                "Coding output v5",
            ],
            "review": [reviewer_feedback] * rounds,
            "senior_review": [senior_feedback] * rounds,
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            # With seniors, senior_review carries the verdict and review does not.
            config = _minimal_simple_config(
                Path(tmpdir),
                max_iterations=5,
                seniors=["claude-senior"],
            )
            fake_agent = _make_step_mock(replies_by_step)

            with patch("cross_eval.pipeline.invoke_agent", side_effect=fake_agent):
                outcome = run_pipeline(config)

            # No auto-escalation: the loop runs until the iteration cap.
            self.assertNotEqual(outcome.final_verdict, "ESCALATE")
            self.assertEqual(outcome.final_verdict, "MAX_ITERATIONS_REACHED")
            self.assertEqual(len(outcome.iterations), 5)
|
||||||
|
|
||||||
|
|
||||||
|
class TestTrackerExtractionAcrossIterations(unittest.TestCase):
    """Scenario 8: the senior's Issue Tracker table is carried into a later prompt."""

    def test_tracker_extraction_across_iterations(self) -> None:
        """Fail once with a tracker table, then pass; check a senior prompt contains it."""
        tracker_table = (
            "## Issue Tracker\n"
            "| ISS-ID | Severity | Description | Status | Since |\n"
            "|--------|----------|-------------|--------|-------|\n"
            "| ISS-001 | Critical | Missing auth check | Open | v1 |\n"
            "| ISS-002 | Major | No validation | Open | v1 |\n"
        )
        senior_output_v1 = (
            "### Confirmed Issues\n"
            "- Missing auth\n\n"
            f"{tracker_table}\n"
            "### Verdict\n"
            "VERDICT: FAIL"
        )
        senior_output_v2 = (
            "### Confirmed Issues\n"
            "- None remaining\n\n"
            "## Issue Tracker\n"
            "| ISS-ID | Severity | Description | Status | Since |\n"
            "|--------|----------|-------------|--------|-------|\n"
            "| ISS-001 | Critical | Missing auth check | Fixed | v1 |\n"
            "| ISS-002 | Major | No validation | Fixed | v1 |\n"
            "\n### Verdict\n"
            "VERDICT: PASS"
        )

        captured_prompts: list[dict[str, str]] = []

        def _tracking_mock(agent_config, prompt, step_name, **kwargs):
            # Record every call first; the senior branch counts its own call too.
            captured_prompts.append({
                "step_name": step_name,
                "prompt": prompt,
                "agent_name": agent_config.name,
            })
            if step_name == "coding":
                reply = "Coding output"
            elif step_name == "review":
                reply = "Review findings\n\nVERDICT: FAIL"
            elif step_name == "senior_review":
                senior_calls_so_far = sum(
                    1 for record in captured_prompts
                    if record["step_name"] == "senior_review"
                )
                # First senior call fails with the tracker; later calls pass.
                reply = senior_output_v1 if senior_calls_so_far <= 1 else senior_output_v2
            else:
                reply = ""
            return AgentResult(
                output=reply,
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=0.1,
            )

        with tempfile.TemporaryDirectory() as tmpdir:
            config = _minimal_simple_config(
                Path(tmpdir),
                max_iterations=3,
                seniors=["claude-senior"],
            )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_tracking_mock):
                outcome = run_pipeline(config)

            self.assertEqual(outcome.final_verdict, "PASS")
            self.assertEqual(len(outcome.iterations), 2)

            # Senior prompts that carry the tracker rows emitted in iteration 1.
            prompts_with_tracker = [
                record for record in captured_prompts
                if record["step_name"] == "senior_review"
                and "ISS-001" in record["prompt"]
                and "Missing auth check" in record["prompt"]
            ]
            self.assertGreaterEqual(
                len(prompts_with_tracker),
                1,
                "Expected previous_senior_tracker content (ISS-001) to appear "
                "in at least one senior_review prompt",
            )
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||||
407
tests/test_runtime_context.py
Normal file
407
tests/test_runtime_context.py
Normal file
@@ -0,0 +1,407 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from cross_eval.agent import invoke_agent
|
||||||
|
from cross_eval.config import BUILTIN_AGENTS
|
||||||
|
from cross_eval.discovery import discover_repo, format_repo_discovery
|
||||||
|
from cross_eval.models import AgentConfig, AgentResult, PipelineConfig
|
||||||
|
from cross_eval.pipeline import run_pipeline
|
||||||
|
from cross_eval.prompts import _build_simple_preset
|
||||||
|
from cross_eval.runtime_env import build_runtime_environment, summarize_environment
|
||||||
|
|
||||||
|
|
||||||
|
class RuntimeEnvTest(unittest.TestCase):
    """Checks for building and summarizing the runtime environment."""

    def test_build_runtime_environment_loads_dotenv_values(self) -> None:
        """Values written to a .env file in the root show up in the built environment."""
        dotenv_text = "CLICKHOUSE_URL=http://localhost:8123\nDATABASE_URL=postgres://db\n"
        with tempfile.TemporaryDirectory() as tmpdir:
            root = Path(tmpdir)
            dotenv_path = root / ".env"
            dotenv_path.write_text(dotenv_text, encoding="utf-8")

            built = build_runtime_environment(PipelineConfig().execution, root)
            env, loaded_files, loaded_values = built

            self.assertEqual(loaded_files[0].name, ".env")
            self.assertEqual(loaded_values["CLICKHOUSE_URL"], "http://localhost:8123")
            self.assertEqual(env["DATABASE_URL"], "postgres://db")

    def test_summarize_environment_mentions_clickhouse_from_env(self) -> None:
        """The summary names the ClickHouse variable and flags it as ClickHouse-related."""
        summary = summarize_environment(
            PipelineConfig().execution,
            [Path("/tmp/.env")],
            {"CLICKHOUSE_URL": "http://localhost:8123"},
            {"CLICKHOUSE_URL": "http://localhost:8123"},
        )
        for expected_fragment in ("CLICKHOUSE_URL", "ClickHouse-related"):
            self.assertIn(expected_fragment, summary)
|
||||||
|
|
||||||
|
|
||||||
|
class RepoDiscoveryTest(unittest.TestCase):
    """Baseline repository discovery check."""

    def test_discover_repo_detects_python_postgres_and_clickhouse(self) -> None:
        """A pyproject plus compose file yields python, postgresql and clickhouse."""
        seed_files = {
            "pyproject.toml": (
                '[project]\nname = "svc"\ndependencies = ["psycopg", "clickhouse-driver"]\n'
            ),
            "docker-compose.yml": (
                "services:\n db:\n image: postgres:16\n ch:\n image: clickhouse/clickhouse-server:latest\n"
            ),
        }
        with tempfile.TemporaryDirectory() as tmpdir:
            root = Path(tmpdir)
            for file_name, file_text in seed_files.items():
                (root / file_name).write_text(file_text, encoding="utf-8")

            discovery = discover_repo(root, {"DATABASE_URL", "CLICKHOUSE_URL"})
            summary = format_repo_discovery(discovery)

            self.assertIn("python", discovery.languages)
            self.assertIn("postgresql", discovery.databases)
            self.assertIn("clickhouse", discovery.databases)
            self.assertIn("Detected local service containers", summary)
|
||||||
|
|
||||||
|
|
||||||
|
class PromptContextTest(unittest.TestCase):
    """Checks that runtime context sections reach the agent prompts."""

    def test_run_pipeline_injects_env_and_discovery_context_into_prompt(self) -> None:
        """Collect every prompt sent to the agent and look for the context headings."""
        seen_prompts: list[str] = []

        def _fake_invoke(agent_config, prompt, step_name, **kwargs):
            seen_prompts.append(prompt)
            if step_name == "review":
                reply = "VERDICT: PASS"
            else:
                reply = "coding output"
            return AgentResult(
                output=reply,
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=0.1,
                transcript="# Agent Execution Transcript",
            )

        with tempfile.TemporaryDirectory() as tmpdir:
            root = Path(tmpdir)
            (root / ".env").write_text("CLICKHOUSE_URL=http://localhost:8123\n", encoding="utf-8")
            preset_steps = _build_simple_preset(["claude-coder"], ["claude-reviewer"], [])
            config = PipelineConfig(
                output_dir=root / "out",
                max_iterations=1,
                language="en",
                inputs={"plan": "Plan", "checklist": "Checklist"},
                agents=dict(BUILTIN_AGENTS.items()),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=preset_steps,
                preset_name="simple",
            )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
                run_pipeline(config, cwd=root)

            all_prompt_text = "\n".join(seen_prompts)
            expected_fragments = (
                "Execution Policy",
                "Environment Context",
                "Repository Discovery",
                "ClickHouse-related environment variables are available",
            )
            for fragment in expected_fragments:
                self.assertIn(fragment, all_prompt_text)
            self.assertTrue((root / "out").exists())
|
||||||
|
|
||||||
|
|
||||||
|
class AgentTranscriptTest(unittest.TestCase):
    """Checks on the transcript attached to the result of ``invoke_agent``."""

    @staticmethod
    def _stub_run(stdout_text: str, stderr_text: str):
        """Return a ``subprocess.run`` stand-in yielding a zero-exit result object."""

        class _Result:
            returncode = 0
            stdout = stdout_text
            stderr = stderr_text

        def _fake_run(cmd, **kwargs):
            return _Result()

        return _fake_run

    @staticmethod
    def _reviewer_agent() -> AgentConfig:
        """Build the agent config shared by both tests."""
        return AgentConfig(
            name="codex-reviewer",
            command="codex",
            args=["exec", "--model", "gpt-5.4", "-"],
        )

    def test_invoke_agent_records_transcript(self) -> None:
        """The transcript has a command section plus the process stdout and stderr."""
        agent = self._reviewer_agent()

        with patch("subprocess.run", side_effect=self._stub_run("hello", "warn")):
            result = invoke_agent(agent, "prompt", "review", quiet=True)

        for fragment in ("## Command", "hello", "warn"):
            self.assertIn(fragment, result.transcript)

    def test_invoke_agent_transcript_includes_exit_code_and_duration(self) -> None:
        """The transcript reports the exit code of the process."""
        # NOTE(review): the test name mentions duration, but only the exit-code
        # line is asserted here.
        agent = self._reviewer_agent()

        with patch("subprocess.run", side_effect=self._stub_run("output", "")):
            result = invoke_agent(agent, "prompt", "review", quiet=True)

        self.assertIn("## Exit Code: 0", result.transcript)
|
||||||
|
|
||||||
|
|
||||||
|
class RepoDiscoveryExtendedTest(unittest.TestCase):
    """Regression coverage for the wider set of repo and service discovery signals."""

    def _scratch_repo(self, files: dict[str, str] | None = None) -> Path:
        """Create a temporary root holding ``files`` (name -> UTF-8 text).

        The directory is removed by a cleanup registered on the test case, so
        it stays on disk for the whole test method.
        """
        holder = tempfile.TemporaryDirectory()
        self.addCleanup(holder.cleanup)
        root = Path(holder.name)
        for file_name, file_text in (files or {}).items():
            (root / file_name).write_text(file_text, encoding="utf-8")
        return root

    def test_discover_go_project(self) -> None:
        root = self._scratch_repo({"go.mod": "module example.com/myapp\n\ngo 1.21\n"})
        discovery = discover_repo(root)

        self.assertIn("go", discovery.languages)
        self.assertIn("go", discovery.package_managers)

    def test_discover_rust_project(self) -> None:
        root = self._scratch_repo(
            {"Cargo.toml": '[package]\nname = "myapp"\nversion = "0.1.0"\n'}
        )
        discovery = discover_repo(root)

        self.assertIn("rust", discovery.languages)
        self.assertIn("cargo", discovery.package_managers)

    def test_discover_ruby_project(self) -> None:
        root = self._scratch_repo(
            {"Gemfile": 'source "https://rubygems.org"\ngem "rails"\n'}
        )
        discovery = discover_repo(root)

        self.assertIn("ruby", discovery.languages)
        self.assertIn("bundler", discovery.package_managers)

    def test_discover_java_gradle_project(self) -> None:
        root = self._scratch_repo({"build.gradle": "plugins { id 'java' }\n"})
        discovery = discover_repo(root)

        self.assertIn("java", discovery.languages)
        self.assertIn("gradle", discovery.package_managers)

    def test_discover_elasticsearch_from_compose(self) -> None:
        root = self._scratch_repo(
            {"docker-compose.yml": "services:\n es:\n image: elasticsearch:8.10.0\n"}
        )
        discovery = discover_repo(root)

        self.assertIn("elasticsearch", discovery.services)

    def test_discover_kafka_from_compose(self) -> None:
        root = self._scratch_repo(
            {"docker-compose.yml": "services:\n broker:\n image: confluentinc/cp-kafka:latest\n"}
        )
        discovery = discover_repo(root)

        self.assertIn("kafka", discovery.services)

    def test_discover_rabbitmq_from_env(self) -> None:
        # No files at all: the signal comes only from the env-var name.
        root = self._scratch_repo()
        discovery = discover_repo(root, {"RABBITMQ_URL"})

        self.assertIn("rabbitmq", discovery.databases)

    def test_discover_sqlite_from_requirements(self) -> None:
        root = self._scratch_repo({"requirements.txt": "aiosqlite==0.19.0\nfastapi\n"})
        discovery = discover_repo(root)

        self.assertIn("python", discovery.languages)
        self.assertIn("sqlite", discovery.databases)

    def test_discover_dynamodb_from_env(self) -> None:
        # No files at all: the signal comes only from the env-var name.
        root = self._scratch_repo()
        discovery = discover_repo(root, {"DYNAMODB_TABLE"})

        self.assertIn("dynamodb", discovery.databases)

    def test_discover_frameworks_from_pyproject(self) -> None:
        root = self._scratch_repo(
            {"pyproject.toml": '[project]\nname = "svc"\ndependencies = ["fastapi", "uvicorn"]\n'}
        )
        discovery = discover_repo(root)

        self.assertIn("fastapi", discovery.frameworks)

    def test_discover_knex_hint(self) -> None:
        root = self._scratch_repo({"knexfile.js": "module.exports = {};\n"})
        discovery = discover_repo(root)

        self.assertIn("Knex migration config detected.", discovery.hints)

    def test_discover_makefile_hint(self) -> None:
        root = self._scratch_repo({"Makefile": "all:\n\techo hello\n"})
        discovery = discover_repo(root)

        self.assertIn("Makefile available for build/task automation.", discovery.hints)

    def test_format_repo_discovery_includes_frameworks(self) -> None:
        root = self._scratch_repo(
            {"package.json": '{"dependencies": {"express": "^4.18.0"}}'}
        )
        discovery = discover_repo(root)
        summary = format_repo_discovery(discovery)

        self.assertIn("Detected frameworks", summary)
        self.assertIn("express", summary)

    def test_discover_pnpm_lockfile(self) -> None:
        """Detect pnpm from its lockfile when package.json has no packageManager field."""
        root = self._scratch_repo({
            "package.json": '{"name": "app"}',
            "pnpm-lock.yaml": "lockfileVersion: 6\n",
        })
        discovery = discover_repo(root)

        self.assertIn("pnpm", discovery.package_managers)

    def test_discover_yarn_lockfile(self) -> None:
        """Detect yarn from its lockfile when package.json has no packageManager field."""
        root = self._scratch_repo({
            "package.json": '{"name": "app"}',
            "yarn.lock": "# yarn lockfile v1\n",
        })
        discovery = discover_repo(root)

        self.assertIn("yarn", discovery.package_managers)
|
||||||
|
|
||||||
|
|
||||||
|
class SummarizeEnvExtendedTest(unittest.TestCase):
    """Regression coverage for the extra variable prefixes shown in the env summary."""

    def _summary_with_single_var(self, name: str, value: str):
        """Summarize an environment in which ``name`` is both present and loaded."""
        return summarize_environment(
            PipelineConfig().execution,
            [Path("/tmp/.env")],
            {name: value},
            {name: value},
        )

    def test_summarize_shows_mongo_env_var(self) -> None:
        summary = self._summary_with_single_var("MONGO_URI", "mongodb://localhost")
        self.assertIn("MONGO_URI", summary)

    def test_summarize_shows_kafka_env_var(self) -> None:
        summary = self._summary_with_single_var("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")
        self.assertIn("KAFKA_BOOTSTRAP_SERVERS", summary)

    def test_summarize_shows_elasticsearch_env_var(self) -> None:
        summary = self._summary_with_single_var("ELASTICSEARCH_URL", "http://localhost:9200")
        self.assertIn("ELASTICSEARCH_URL", summary)
|
||||||
|
|
||||||
|
|
||||||
|
class TranscriptSavingRegressionTest(unittest.TestCase):
    """Ensure a pipeline run writes each step's transcript as an artifact."""

    def test_transcript_files_saved_during_pipeline(self) -> None:
        """After one iteration, v1/ holds transcripts for the coding and review steps."""

        def _fake_invoke(agent_config, prompt, step_name, **kwargs):
            if step_name == "review":
                reply = "VERDICT: PASS"
            else:
                reply = "coding output"
            return AgentResult(
                output=reply,
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=0.1,
                transcript="# Agent Execution Transcript\n\n## Command\n```\nclaude -p\n```",
            )

        with tempfile.TemporaryDirectory() as tmpdir:
            root = Path(tmpdir)
            preset_steps = _build_simple_preset(["claude-coder"], ["claude-reviewer"], [])
            config = PipelineConfig(
                output_dir=root / "out",
                max_iterations=1,
                language="en",
                inputs={"plan": "Plan", "checklist": "Checklist"},
                agents=dict(BUILTIN_AGENTS.items()),
                coders=["claude-coder"],
                reviewers=["claude-reviewer"],
                pipeline=preset_steps,
                preset_name="simple",
            )

            with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke):
                outcome = run_pipeline(config, cwd=root)

            run_dir = outcome.run_dir
            self.assertIsNotNone(run_dir)
            assert run_dir is not None  # narrows the optional type for checkers

            # Files must be checked while the temporary directory still exists.
            first_iteration_dir = run_dir / "v1"
            for artifact_name in ("coding_transcript.md", "review_transcript.md"):
                artifact_path = first_iteration_dir / artifact_name
                self.assertTrue(
                    artifact_path.exists(),
                    f"Expected transcript at {artifact_path}",
                )
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||||
831
tests/test_runtime_misc.py
Normal file
831
tests/test_runtime_misc.py
Normal file
@@ -0,0 +1,831 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
from cross_eval.agent import (
|
||||||
|
AgentInvocationError,
|
||||||
|
_build_transcript,
|
||||||
|
_classify_agent_failure,
|
||||||
|
invoke_agent,
|
||||||
|
invoke_agent_agentic,
|
||||||
|
)
|
||||||
|
from cross_eval.models import AgentConfig, AgentResult, ExecutionConfig, PipelineConfig, StepConfig
|
||||||
|
from cross_eval.pipeline import (
|
||||||
|
_commit_iteration,
|
||||||
|
_execute_parallel_batch,
|
||||||
|
_execute_step,
|
||||||
|
_finalize_worktree,
|
||||||
|
_format_runtime_error_markdown,
|
||||||
|
_maybe_save_step_transcript,
|
||||||
|
_snapshot_repo_state,
|
||||||
|
)
|
||||||
|
from cross_eval.runtime_env import (
|
||||||
|
build_execution_policy,
|
||||||
|
parse_dotenv,
|
||||||
|
resolve_env_files,
|
||||||
|
summarize_environment,
|
||||||
|
)
|
||||||
|
from cross_eval.worktree import WorktreeError, create_worktree, remove_worktree
|
||||||
|
|
||||||
|
|
||||||
|
def _init_git_repo(path: Path) -> None:
    """Turn *path* into a git repository that holds a single initial commit.

    Configures a throwaway committer identity, writes a one-line ``README.md`` and commits it.
    Every git invocation runs with ``check=True``, so a failing command raises
    ``subprocess.CalledProcessError``.
    """

    def _git(*argv: str) -> None:
        # subprocess.run is resolved on every call rather than bound once up front.
        subprocess.run(["git", *argv], cwd=path, capture_output=True, check=True)

    _git("init")
    _git("config", "user.email", "test@test.com")
    _git("config", "user.name", "Test")
    (path / "README.md").write_text("# init\n", encoding="utf-8")
    _git("add", ".")
    _git("commit", "-m", "initial")
|
||||||
|
|
||||||
|
|
||||||
|
class TestInvokeAgentRuntime(unittest.TestCase):
    """Exercise ``invoke_agent`` and its helpers with ``subprocess.run`` replaced by a mock."""

    # The interactive tests expect the last command argument to end with this sentence;
    # the captured group is the file the agent is asked to write to.
    _OUTPUT_PATH_RE = re.compile(r"Write your complete output to (.+)\.$")

    @staticmethod
    def _completed(returncode: int = 0, stdout: str = "", stderr: str = "") -> MagicMock:
        # Stand-in for the object subprocess.run returns.
        return MagicMock(returncode=returncode, stdout=stdout, stderr=stderr)

    @staticmethod
    def _claude_reviewer(args: list[str], **extra: object) -> AgentConfig:
        # Agent config shared by the claude-based tests; only args / extras vary.
        return AgentConfig(name="claude-reviewer", command="claude", args=args, **extra)

    def _output_path_from(self, cmd: list[str]) -> Path:
        # Pull the requested output file out of the final command argument.
        found = self._OUTPUT_PATH_RE.search(cmd[-1])
        self.assertIsNotNone(found)
        assert found is not None
        return Path(found.group(1))

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_claude_reads_output_file(self, mock_run: MagicMock) -> None:
        # The fake CLI writes its answer to the requested file and prints nothing.
        def _fake_run(cmd: list[str], **kwargs: object) -> MagicMock:
            self._output_path_from(cmd).write_text("review result", encoding="utf-8")
            return self._completed()

        mock_run.side_effect = _fake_run
        reviewer = self._claude_reviewer(["--model", "opus"], system_prompt="system")

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "review result")
        self.assertIn("--system-prompt", mock_run.call_args.args[0])

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_claude_falls_back_to_stdout(self, mock_run: MagicMock) -> None:
        # Nothing writes an output file here, so stdout is the only source of output.
        mock_run.return_value = self._completed(stdout="stdout fallback")
        reviewer = self._claude_reviewer(["--model", "opus"])

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "stdout fallback")

    @patch("cross_eval.agent.subprocess.run")
    def test_non_claude_wraps_system_prompt_in_stdin(self, mock_run: MagicMock) -> None:
        mock_run.return_value = self._completed(stdout="ok")
        reviewer = AgentConfig(
            name="custom-reviewer",
            command="custom-cli",
            args=["run"],
            system_prompt="strict mode",
        )

        invoke_agent(reviewer, "check things", "review", quiet=True)

        stdin_payload = mock_run.call_args.kwargs["input"]
        self.assertEqual(stdin_payload, "<system>\nstrict mode\n</system>\n\ncheck things")

    @patch("cross_eval.agent.subprocess.run")
    def test_failure_raises_structured_error(self, mock_run: MagicMock) -> None:
        mock_run.return_value = self._completed(returncode=1, stderr="API Error: backend down")
        reviewer = AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"])

        with self.assertRaises(AgentInvocationError) as raised:
            invoke_agent(reviewer, "check", "review", quiet=True)

        error = raised.exception
        self.assertEqual(error.failure_type, "API_ERROR")
        self.assertIn("backend down", error.raw_error)

    def test_classify_unknown_failure(self) -> None:
        kind, advice = _classify_agent_failure("weird crash")
        self.assertEqual(kind, "UNKNOWN")
        self.assertIn("Inspect", advice)

    def test_build_transcript_includes_cwd_and_duration(self) -> None:
        fields = {
            "command_preview": "claude -p",
            "stdout": "ok",
            "stderr": "",
            "exit_code": 0,
            "duration_seconds": 1.2,
            "cwd": "/tmp/repo",
        }
        rendered = _build_transcript(**fields)
        for heading in ("## Working Directory", "## Duration: 1.2s"):
            self.assertIn(heading, rendered)

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_timeout_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
        spinner_instance = mock_spinner.return_value
        mock_run.side_effect = subprocess.TimeoutExpired(cmd=["claude"], timeout=12)
        reviewer = self._claude_reviewer(["-p"])

        with self.assertRaises(subprocess.TimeoutExpired):
            invoke_agent(reviewer, "inspect code", "review", quiet=False, timeout=12)

        spinner_instance.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_generic_exception_stops_spinner(self, mock_run: MagicMock, mock_spinner: MagicMock) -> None:
        spinner_instance = mock_spinner.return_value
        mock_run.side_effect = OSError("boom")
        reviewer = self._claude_reviewer(["-p"])

        with self.assertRaises(OSError):
            invoke_agent(reviewer, "inspect code", "review", quiet=False)

        spinner_instance.stop.assert_called_once()

    @patch("cross_eval.agent.logger.warning")
    @patch("cross_eval.agent.subprocess.run")
    def test_empty_output_logs_warning(self, mock_run: MagicMock, mock_warning: MagicMock) -> None:
        mock_run.return_value = self._completed()
        reviewer = self._claude_reviewer(["-p"])

        outcome = invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(outcome.output, "")
        mock_warning.assert_called_once()

    @patch("cross_eval.agent.subprocess.run")
    def test_print_mode_claude_uses_native_system_prompt_flag(self, mock_run: MagicMock) -> None:
        mock_run.return_value = self._completed(stdout="ok")
        reviewer = self._claude_reviewer(["-p"], system_prompt="be strict")

        invoke_agent(reviewer, "review this", "review", quiet=True)

        last_call = mock_run.call_args
        self.assertIn("--system-prompt", last_call.args[0])
        self.assertEqual(last_call.kwargs["input"], "review this")

    @patch("cross_eval.agent.subprocess.run")
    def test_interactive_failure_truncates_error_and_removes_output_file(
        self,
        mock_run: MagicMock,
    ) -> None:
        # Paths the fake CLI was asked to write to; only the most recent one is inspected.
        requested: list[Path] = []

        def _fake_run(cmd: list[str], **kwargs: object) -> MagicMock:
            requested.append(self._output_path_from(cmd))
            return self._completed(returncode=1, stderr="x" * 600)

        mock_run.side_effect = _fake_run
        reviewer = self._claude_reviewer(["--model", "opus"])

        with self.assertRaises(AgentInvocationError) as raised:
            invoke_agent(reviewer, "inspect code", "review", quiet=True)

        self.assertEqual(len(raised.exception.raw_error), 503)
        seen_output_path = requested[-1] if requested else None
        self.assertIsNotNone(seen_output_path)
        assert seen_output_path is not None
        self.assertFalse(seen_output_path.exists())

    @patch("cross_eval.agent.logger.warning")
    @patch("cross_eval.agent.subprocess.run")
    def test_empty_output_with_stderr_logs_stderr_warning(
        self,
        mock_run: MagicMock,
        mock_warning: MagicMock,
    ) -> None:
        mock_run.return_value = self._completed(stderr="stderr text")
        reviewer = self._claude_reviewer(["-p"])

        invoke_agent(reviewer, "inspect code", "review", quiet=True)

        logged_format = mock_warning.call_args.args[0]
        self.assertIn("stderr:", logged_format)
|
||||||
|
|
||||||
|
|
||||||
|
class TestInvokeAgenticRuntime(unittest.TestCase):
    """Exercise ``invoke_agent_agentic`` against a scratch repository with the agent CLI mocked."""

    @staticmethod
    def _codex_coder() -> AgentConfig:
        # Minimal agentic codex configuration reused by the failure-path tests.
        return AgentConfig(name="codex-coder", command="codex", args=["exec"], agentic=True)

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_codex_agentic_adds_reasoning_and_system_wrapper(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        coder = AgentConfig(
            name="codex-coder",
            command="codex",
            args=["exec", "--full-auto"],
            system_prompt="strict mode",
            reasoning_effort="high",
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as scratch:
            repo_dir = Path(scratch)
            _init_git_repo(repo_dir)
            invoke_agent_agentic(coder, "fix bug", "coding", repo_dir, quiet=True)

        last_call = mock_run.call_args
        argv = last_call.args[0]
        self.assertIn("-c", argv)
        self.assertEqual(argv[-1], "-")
        self.assertIn("<system>", last_call.kwargs["input"])

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_claude_success_uses_system_prompt_and_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
        coder = AgentConfig(
            name="claude-coder",
            command="claude",
            args=["-p", "--print"],
            system_prompt="stay in scope",
            agentic=True,
        )

        with tempfile.TemporaryDirectory() as scratch:
            repo_dir = Path(scratch)
            _init_git_repo(repo_dir)
            outcome = invoke_agent_agentic(coder, "fix bug", "coding", repo_dir, quiet=False)

        argv = mock_run.call_args.args[0]
        self.assertNotIn("-p", argv)
        self.assertIn("--system-prompt", argv)
        # The reported output is the (mocked) captured diff, not the process stdout.
        self.assertEqual(outcome.output, "diff --git a/file ...")
        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    def test_agentic_timeout_stops_spinner(self, mock_spinner: MagicMock) -> None:
        spinner_instance = mock_spinner.return_value
        coder = self._codex_coder()
        with tempfile.TemporaryDirectory() as scratch:
            repo_dir = Path(scratch)
            # The repository is created before subprocess.run gets patched below.
            _init_git_repo(repo_dir)
            timeout_error = subprocess.TimeoutExpired(cmd=["codex"], timeout=20)
            with patch("cross_eval.agent.subprocess.run", side_effect=timeout_error), \
                    self.assertRaises(subprocess.TimeoutExpired):
                invoke_agent_agentic(coder, "fix bug", "coding", repo_dir, quiet=False, timeout=20)

        spinner_instance.stop.assert_called_once()

    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_nonzero_exit_raises_structured_error(self, mock_run: MagicMock) -> None:
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="unauthorized")
        coder = self._codex_coder()

        with tempfile.TemporaryDirectory() as scratch:
            repo_dir = Path(scratch)
            _init_git_repo(repo_dir)
            with self.assertRaises(AgentInvocationError) as raised:
                invoke_agent_agentic(coder, "fix bug", "coding", repo_dir, quiet=True)

        self.assertEqual(raised.exception.failure_type, "AUTH")

    @patch("cross_eval.agent._Spinner")
    def test_agentic_generic_exception_stops_spinner(
        self,
        mock_spinner: MagicMock,
    ) -> None:
        coder = self._codex_coder()

        with tempfile.TemporaryDirectory() as scratch:
            repo_dir = Path(scratch)
            # The repository is created before subprocess.run gets patched below.
            _init_git_repo(repo_dir)
            with patch("cross_eval.agent.subprocess.run", side_effect=OSError("boom")), \
                    self.assertRaises(OSError):
                invoke_agent_agentic(coder, "fix bug", "coding", repo_dir, quiet=False)

        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_failure_truncates_error(
        self,
        mock_run: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="x" * 600)
        coder = self._codex_coder()

        with tempfile.TemporaryDirectory() as scratch:
            repo_dir = Path(scratch)
            _init_git_repo(repo_dir)
            with self.assertRaises(AgentInvocationError) as raised:
                invoke_agent_agentic(coder, "fix bug", "coding", repo_dir, quiet=False)

        # 600 characters of stderr are expected to come back as exactly 503.
        self.assertEqual(len(raised.exception.raw_error), 503)
        mock_spinner.return_value.stop.assert_called_once()

    @patch("cross_eval.agent._Spinner")
    @patch("cross_eval.worktree.capture_diff", return_value="")
    @patch("cross_eval.agent.subprocess.run")
    def test_agentic_empty_diff_failure_truncates_error_and_stops_spinner(
        self,
        mock_run: MagicMock,
        mock_diff: MagicMock,
        mock_spinner: MagicMock,
    ) -> None:
        # Exit code 0 but an empty captured diff plus a noisy stderr.
        noisy_stderr = "permission denied " * 300
        mock_run.return_value = MagicMock(returncode=0, stdout="implemented", stderr=noisy_stderr)
        coder = self._codex_coder()

        with tempfile.TemporaryDirectory() as scratch:
            repo_dir = Path(scratch)
            _init_git_repo(repo_dir)
            with self.assertRaises(AgentInvocationError) as raised:
                invoke_agent_agentic(coder, "fix bug", "coding", repo_dir, quiet=False)

        error = raised.exception
        self.assertLessEqual(len(error.raw_error), 2003)
        self.assertEqual(error.failure_type, "WRITE_FAILURE")
        mock_spinner.return_value.stop.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
class TestPipelineHelpers(unittest.TestCase):
    """Cover the private pipeline helpers: commits, snapshots, error reports and batch execution."""

    @staticmethod
    def _review_step(name: str, agent: str, output_key: str, **extra: object) -> StepConfig:
        # Every step in these tests is a review step using the default review template.
        return StepConfig(
            name=name,
            agent=agent,
            role="review",
            prompt_template="default:review",
            output_key=output_key,
            **extra,
        )

    @staticmethod
    def _claude_only_config() -> PipelineConfig:
        # Pipeline config that knows a single print-mode claude reviewer.
        return PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(name="claude-reviewer", command="claude", args=["-p"]),
            },
        )

    @patch("cross_eval.worktree.commit_worktree", return_value=True)
    def test_commit_iteration_logs_only_when_committed(self, mock_commit: MagicMock) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            _commit_iteration(Path(tmpdir), "review-fix", 2, "PASS")
        mock_commit.assert_called_once()

    def test_snapshot_repo_state_includes_untracked_digest(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            repo_dir = Path(tmpdir)
            _init_git_repo(repo_dir)
            # A file that was never added to the index.
            (repo_dir / "scratch.txt").write_text("draft", encoding="utf-8")

            state = _snapshot_repo_state(repo_dir)

        self.assertIn("UNTRACKED scratch.txt", state["untracked"])

    def test_finalize_worktree_deletes_empty_branch(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            sandbox = Path(tmpdir)
            base = sandbox / "repo"
            base.mkdir()
            _init_git_repo(base)

            def _git(*argv: str, **options: object) -> subprocess.CompletedProcess:
                return subprocess.run(["git", *argv], cwd=base, capture_output=True, check=True, **options)

            # Branch off HEAD and check it out in a second worktree, without adding commits.
            branch = "cross-eval/empty"
            _git("branch", branch, "HEAD")
            worktree = sandbox / "wt"
            _git("worktree", "add", str(worktree), branch)

            kept_branch = _finalize_worktree(base, worktree, branch, "review-fix", "PASS")

            self.assertIsNone(kept_branch)
            listing = _git("branch", "--list", branch, text=True)
            self.assertEqual(listing.stdout.strip(), "")

    def test_format_runtime_error_markdown_for_generic_exception(self) -> None:
        rendered = _format_runtime_error_markdown(
            RuntimeError("boom"),
            step_name="review",
            agent_name="claude-reviewer",
            phase_name="review_fix",
        )
        for fragment in ("# Agent Error", "review_fix", "boom"):
            self.assertIn(fragment, rendered)

    def test_maybe_save_step_transcript_returns_none_without_transcript(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            # No transcript is supplied when building this result.
            bare_result = AgentResult(
                output="ok",
                exit_code=0,
                agent_name="claude-reviewer",
                step_name="review",
                duration_seconds=0.1,
            )
            saved_path = _maybe_save_step_transcript(Path(tmpdir), 1, "review", bare_result)
        self.assertIsNone(saved_path)

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_step_saves_timeout_markdown(self, mock_invoke: MagicMock) -> None:
        mock_invoke.side_effect = subprocess.TimeoutExpired(
            cmd=["claude"],
            timeout=45,
            output="partial output",
            stderr="still running",
        )
        review = self._review_step("review", "claude-reviewer", "review_output")
        pipeline_config = self._claude_only_config()
        step_outputs: dict[str, str] = {}
        step_results: dict[str, AgentResult] = {}

        with tempfile.TemporaryDirectory() as tmpdir:
            out_root = Path(tmpdir)
            with self.assertRaises(RuntimeError) as raised:
                _execute_step(
                    review, pipeline_config, {"plan": "Plan", "checklist": "Checklist"},
                    "", 1, 3, out_root, 45, False, step_outputs, step_results,
                    run_dir=out_root,
                    output_iter=1,
                )

            self.assertIn("timed out after 45s", str(raised.exception))
            report = out_root / "v1" / "review_error.md"
            self.assertTrue(report.exists())
            self.assertIn("# Agent Timeout", report.read_text(encoding="utf-8"))

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_step_saves_runtime_error_markdown(self, mock_invoke: MagicMock) -> None:
        mock_invoke.side_effect = AgentInvocationError(
            agent_name="claude-reviewer",
            step_name="review",
            cmd_preview="claude -p",
            raw_error="api broke",
            failure_type="API_ERROR",
            suggested_action="retry",
        )
        review = self._review_step("review", "claude-reviewer", "review_output")
        pipeline_config = self._claude_only_config()

        with tempfile.TemporaryDirectory() as tmpdir:
            out_root = Path(tmpdir)
            with self.assertRaises(AgentInvocationError):
                _execute_step(
                    review, pipeline_config, {"plan": "Plan", "checklist": "Checklist"},
                    "", 1, 3, out_root, 45, False, {}, {},
                    run_dir=out_root,
                    output_iter=1,
                )

            report_text = (out_root / "v1" / "review_error.md").read_text(encoding="utf-8")
            for fragment in ("API_ERROR", "retry"):
                self.assertIn(fragment, report_text)

    @patch("cross_eval.pipeline.invoke_agent")
    def test_execute_parallel_batch_saves_success_and_timeout_error(self, mock_invoke: MagicMock) -> None:
        # "review_ok" succeeds; every other step times out.
        def _fake_invoke(agent_config: AgentConfig, prompt: str, step_name: str, **kwargs: object) -> AgentResult:
            if step_name != "review_ok":
                raise subprocess.TimeoutExpired(
                    cmd=["codex"],
                    timeout=30,
                    output="halfway",
                    stderr="timeout stderr",
                )
            return AgentResult(
                output="VERDICT: PASS",
                exit_code=0,
                agent_name=agent_config.name,
                step_name=step_name,
                duration_seconds=0.1,
            )

        mock_invoke.side_effect = _fake_invoke
        batch = [
            self._review_step("review_ok", "claude-reviewer", "review_ok", parallel=True),
            self._review_step("review_slow", "codex-reviewer", "review_slow", parallel=True),
        ]
        pipeline_config = PipelineConfig(
            agents={
                "claude-reviewer": AgentConfig(name="claude-reviewer", command="claude", args=["-p"]),
                "codex-reviewer": AgentConfig(name="codex-reviewer", command="codex", args=["exec", "-"]),
            },
        )
        step_outputs: dict[str, str] = {}
        step_results: dict[str, AgentResult] = {}

        with tempfile.TemporaryDirectory() as tmpdir:
            out_root = Path(tmpdir)
            with self.assertRaises(RuntimeError) as raised:
                _execute_parallel_batch(
                    batch, pipeline_config, {"plan": "Plan", "checklist": "Checklist"},
                    "", 1, 3, out_root, 30, False, step_outputs, step_results,
                    run_dir=out_root,
                    output_iter=1,
                )

            self.assertIn("Successful outputs were saved for: review_ok", str(raised.exception))
            self.assertEqual(step_outputs["review_ok"], "VERDICT: PASS")
            iteration_dir = out_root / "v1"
            self.assertTrue((iteration_dir / "review_ok.md").exists())
            self.assertTrue((iteration_dir / "review_slow_error.md").exists())

    @patch("cross_eval.pipeline._execute_step")
    def test_execute_parallel_batch_dry_run_uses_sequential_path(self, mock_step: MagicMock) -> None:
        batch = [
            self._review_step("review_a", "claude-reviewer", "review_a", parallel=True),
            self._review_step("review_b", "codex-reviewer", "review_b", parallel=True),
        ]
        pipeline_config = PipelineConfig(agents={})

        with tempfile.TemporaryDirectory() as tmpdir:
            out_root = Path(tmpdir)
            # Ninth positional argument True (False in the non-dry-run tests), per the test name the dry-run flag.
            _execute_parallel_batch(
                batch, pipeline_config, {"plan": "Plan"},
                "", 1, 3, out_root, None, True, {}, {},
                run_dir=out_root,
                output_iter=1,
            )

        # One _execute_step call per step in the batch.
        self.assertEqual(mock_step.call_count, 2)

    @patch("cross_eval.pipeline._execute_step")
    def test_execute_parallel_batch_agentic_steps_fall_back_to_sequential(self, mock_step: MagicMock) -> None:
        batch = [
            self._review_step("review_a", "agentic-a", "review_a", parallel=True),
            self._review_step("review_b", "agentic-b", "review_b", parallel=True),
        ]
        pipeline_config = PipelineConfig(
            agents={
                "agentic-a": AgentConfig(name="agentic-a", command="claude", agentic=True),
                "agentic-b": AgentConfig(name="agentic-b", command="codex", agentic=True),
            },
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            out_root = Path(tmpdir)
            _execute_parallel_batch(
                batch, pipeline_config, {"plan": "Plan"},
                "", 1, 3, out_root, None, False, {}, {},
                run_dir=out_root,
                output_iter=1,
                worktree_path=out_root,
            )

        # One _execute_step call per step in the batch.
        self.assertEqual(mock_step.call_count, 2)

    @patch("cross_eval.worktree.remove_worktree", side_effect=RuntimeError("cleanup failed"))
    @patch("cross_eval.worktree.commit_worktree", side_effect=RuntimeError("commit failed"))
    def test_finalize_worktree_handles_cleanup_failures(
        self,
        mock_commit: MagicMock,
        mock_remove: MagicMock,
    ) -> None:
        # Both the commit and the removal raise; the helper is still expected to return None.
        with tempfile.TemporaryDirectory() as tmpdir:
            sandbox = Path(tmpdir)
            kept_branch = _finalize_worktree(sandbox, sandbox / "wt", "cross-eval/fail", "review-fix", "FAIL")

        self.assertIsNone(kept_branch)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRuntimeEnvironmentHelpers(unittest.TestCase):
    """Check dotenv parsing, env-file resolution and the prompt text built from ExecutionConfig."""

    def test_parse_dotenv_handles_export_and_quotes(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            dotenv_file = Path(tmpdir) / ".env"
            # One `export`ed single-quoted value, one double-quoted value holding an escaped
            # newline, a line without `=`, and a line with an empty key.
            dotenv_file.write_text(
                "export FOO='bar'\nBAR=\"line\\nvalue\"\nINVALID\n=skip\n",
                encoding="utf-8",
            )
            parsed = parse_dotenv(dotenv_file)

        self.assertEqual(parsed["FOO"], "bar")
        self.assertEqual(parsed["BAR"], "line\nvalue")
        self.assertNotIn("INVALID", parsed)

    def test_resolve_env_files_deduplicates_and_filters_missing(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            project_root = Path(tmpdir)
            dotenv_file = project_root / ".env"
            dotenv_file.write_text("FOO=bar\n", encoding="utf-8")
            # The same file is named relatively, absolutely and via auto-discovery;
            # ".env.local" is never created.
            settings = ExecutionConfig(
                env_files=[".env", str(dotenv_file)],
                auto_env_files=[".env", ".env.local"],
            )

            found = resolve_env_files(settings, project_root)

            self.assertEqual(found, [dotenv_file.resolve()])

    def test_summarize_environment_hides_names_when_disabled(self) -> None:
        settings = ExecutionConfig(expose_env_names=False, auto_context_targets=["postgres"])
        loaded_values = {"DATABASE_URL": "postgres://localhost"}

        text = summarize_environment(settings, [], loaded_values, {})

        self.assertIn("names are hidden", text)
        self.assertIn("Execution targets hinted by the user: postgres", text)

    def test_build_execution_policy_for_minimal_mode(self) -> None:
        settings = ExecutionConfig(mode="agent-decides", command_policy="minimal")

        text = build_execution_policy(settings)

        for fragment in ("Command policy: minimal", "Keep command usage minimal"):
            self.assertIn(fragment, text)
|
||||||
|
|
||||||
|
|
||||||
|
class TestWorktreeFailures(unittest.TestCase):
    """Failure paths of ``create_worktree`` / ``remove_worktree`` with git calls mocked."""

    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_raises_when_branch_creation_fails(self, mock_run: MagicMock) -> None:
        # Every mocked git call fails, starting with the very first one.
        mock_run.side_effect = subprocess.CalledProcessError(1, ["git", "branch"], stderr="branch failed")

        with tempfile.TemporaryDirectory() as tmpdir:
            repo_dir = Path(tmpdir)
            with self.assertRaises(WorktreeError) as raised:
                create_worktree(repo_dir, repo_dir / "wt", "cross-eval/fail")

        self.assertIn("Failed to create branch", str(raised.exception))

    @patch("cross_eval.worktree.subprocess.run")
    def test_create_worktree_cleans_branch_on_worktree_failure(self, mock_run: MagicMock) -> None:
        # Call 1 succeeds, call 2 fails, call 3 (expected to be the branch cleanup) succeeds.
        worktree_failure = subprocess.CalledProcessError(
            1,
            ["git", "worktree", "add"],
            stderr="worktree failed",
        )
        mock_run.side_effect = [MagicMock(returncode=0), worktree_failure, MagicMock(returncode=0)]

        with tempfile.TemporaryDirectory() as tmpdir:
            repo_dir = Path(tmpdir)
            with self.assertRaises(WorktreeError):
                create_worktree(repo_dir, repo_dir / "wt", "cross-eval/fail")

        final_call = mock_run.call_args_list[-1]
        self.assertEqual(final_call.args[0][:3], ["git", "branch", "-D"])

    # NOTE(review): if cross_eval.worktree imports the stdlib shutil module, the rmtree patch
    # below replaces shutil.rmtree process-wide for the duration of this test, which would
    # also affect TemporaryDirectory cleanup. Confirm the temp directory is not leaked.
    @patch("cross_eval.worktree.shutil.rmtree")
    @patch("cross_eval.worktree.subprocess.run")
    def test_remove_worktree_falls_back_to_prune(self, mock_run: MagicMock, mock_rmtree: MagicMock) -> None:
        # First git call fails, second succeeds.
        mock_run.side_effect = [
            subprocess.CalledProcessError(1, ["git", "worktree", "remove"]),
            MagicMock(returncode=0),
        ]

        with tempfile.TemporaryDirectory() as tmpdir:
            sandbox = Path(tmpdir)
            repo_dir = sandbox / "repo"
            checkout_dir = sandbox / "wt"
            repo_dir.mkdir()
            checkout_dir.mkdir()

            remove_worktree(repo_dir, checkout_dir)

            mock_rmtree.assert_any_call(checkout_dir.resolve(), ignore_errors=True)
            self.assertEqual(mock_run.call_args_list[-1].args[0], ["git", "worktree", "prune"])
|
||||||
Reference in New Issue
Block a user