initial commit
This commit is contained in:
1
cross_eval/__init__.py
Normal file
1
cross_eval/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
__version__ = "0.1.0"
|
||||
BIN
cross_eval/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
cross_eval/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
cross_eval/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/agent.cpython-312.pyc
Normal file
BIN
cross_eval/__pycache__/agent.cpython-312.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/agent.cpython-313.pyc
Normal file
BIN
cross_eval/__pycache__/agent.cpython-313.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/cli.cpython-312.pyc
Normal file
BIN
cross_eval/__pycache__/cli.cpython-312.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/cli.cpython-313.pyc
Normal file
BIN
cross_eval/__pycache__/cli.cpython-313.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/config.cpython-312.pyc
Normal file
BIN
cross_eval/__pycache__/config.cpython-312.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/config.cpython-313.pyc
Normal file
BIN
cross_eval/__pycache__/config.cpython-313.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/models.cpython-312.pyc
Normal file
BIN
cross_eval/__pycache__/models.cpython-312.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/models.cpython-313.pyc
Normal file
BIN
cross_eval/__pycache__/models.cpython-313.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/pipeline.cpython-312.pyc
Normal file
BIN
cross_eval/__pycache__/pipeline.cpython-312.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/pipeline.cpython-313.pyc
Normal file
BIN
cross_eval/__pycache__/pipeline.cpython-313.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/prompts.cpython-312.pyc
Normal file
BIN
cross_eval/__pycache__/prompts.cpython-312.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/prompts.cpython-313.pyc
Normal file
BIN
cross_eval/__pycache__/prompts.cpython-313.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/report.cpython-312.pyc
Normal file
BIN
cross_eval/__pycache__/report.cpython-312.pyc
Normal file
Binary file not shown.
BIN
cross_eval/__pycache__/report.cpython-313.pyc
Normal file
BIN
cross_eval/__pycache__/report.cpython-313.pyc
Normal file
Binary file not shown.
162
cross_eval/agent.py
Normal file
162
cross_eval/agent.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""Agent invocation via subprocess with live spinner."""
|
||||
from __future__ import annotations
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from cross_eval.models import AgentConfig, AgentResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# CLI tools that support --system-prompt flag natively
_SYSTEM_PROMPT_AGENTS = ("claude",)
# CLI tools that accept a `-c model_reasoning_effort="..."` override
_REASONING_EFFORT_AGENTS = ("codex",)


def _executable_name(command: str) -> str:
    """Return the last path component of *command*.

    Detection is done on the executable name only, so a directory that
    happens to contain an agent name (e.g. ``/home/claude/bin/codex``) does
    not cause the wrong CLI to be detected.
    """
    return Path(command).name


def _supports_system_prompt_flag(command: str) -> bool:
    """Check if the agent CLI supports --system-prompt flag.

    Args:
        command: Executable name or path of the agent CLI.

    Returns:
        True when the executable name contains one of the names in
        ``_SYSTEM_PROMPT_AGENTS``.
    """
    executable = _executable_name(command)
    return any(name in executable for name in _SYSTEM_PROMPT_AGENTS)


def _supports_reasoning_effort(command: str) -> bool:
    """Check if the agent CLI supports reasoning effort overrides.

    Args:
        command: Executable name or path of the agent CLI.

    Returns:
        True when the executable name contains one of the names in
        ``_REASONING_EFFORT_AGENTS``.
    """
    executable = _executable_name(command)
    return any(name in executable for name in _REASONING_EFFORT_AGENTS)
|
||||
|
||||
|
||||
class _Spinner:
    """Progress indicator drawn on stderr from a background thread.

    ``start()`` launches a daemon thread that redraws one line roughly every
    0.1s with the next frame, the message and the whole seconds elapsed.
    ``stop()`` ends the animation, blanks the line and prints a final
    check-marked summary with the elapsed time rounded to 0.1s.
    """

    FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
    _CLEAR_LINE = "\r" + (" " * 160) + "\r"

    def __init__(self, message: str) -> None:
        self.message = message
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._start_time = 0.0

    def start(self) -> None:
        """Record the start time and begin animating in a daemon thread."""
        self._running = True
        self._start_time = time.monotonic()
        worker = threading.Thread(target=self._spin, daemon=True)
        self._thread = worker
        worker.start()

    def _spin(self) -> None:
        """Redraw the spinner line until ``_running`` is cleared."""
        frames = itertools.cycle(self.FRAMES)
        while True:
            frame = next(frames)
            if not self._running:
                return
            seconds = int(time.monotonic() - self._start_time)
            sys.stderr.write(f"\r {frame} {self.message} ({seconds}s)")
            sys.stderr.flush()
            time.sleep(0.1)

    def stop(self, final: str) -> None:
        """Stop animating and replace the spinner line with *final*."""
        self._running = False
        worker = self._thread
        if worker:
            # Bounded wait so a stuck writer cannot block the caller.
            worker.join(timeout=1)
        took = round(time.monotonic() - self._start_time, 1)
        sys.stderr.write(self._CLEAR_LINE)
        sys.stderr.write(f" \u2713 {final} ({took}s)\n")
        sys.stderr.flush()
|
||||
|
||||
|
||||
def invoke_agent(
    agent: AgentConfig,
    prompt: str,
    step_name: str,
    cwd: Optional[Path] = None,
    timeout: int | None = None,
    quiet: bool = False,
) -> AgentResult:
    """Invoke an agent CLI with the given prompt.

    The command line is ``agent.command``, an optional reasoning-effort
    override (for CLIs that support it), then ``agent.args``. A configured
    system prompt is passed via ``--system-prompt`` when the CLI supports the
    flag; otherwise it is prepended to the prompt inside a ``<system>`` block.
    The prompt itself is sent on the subprocess's stdin.

    Args:
        agent: Agent definition; ``command``, ``args``, ``system_prompt``,
            ``reasoning_effort`` and ``name`` are read.
        prompt: User prompt to send to the agent.
        step_name: Pipeline step label used in progress and error messages.
        cwd: Working directory for the subprocess (``None`` = inherit).
        timeout: Seconds before the call is aborted (``None`` = no limit).
        quiet: If True, suppress spinner (for parallel execution).

    Returns:
        AgentResult holding the stripped stdout, the exit code, the agent and
        step names, and the call duration rounded to 0.1s.

    Raises:
        subprocess.TimeoutExpired: The agent did not finish within *timeout*.
        RuntimeError: The agent exited with a non-zero status.
        Exception: Anything else raised by ``subprocess.run`` is re-raised
            unchanged after the spinner is stopped.
    """
    cmd = [agent.command]
    if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
        cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
    cmd.extend(agent.args)

    # Build the full prompt (system prompt + user prompt)
    if agent.system_prompt and _supports_system_prompt_flag(agent.command):
        # claude: --system-prompt flag supported natively
        cmd.extend(["--system-prompt", agent.system_prompt])
        input_data = prompt
    elif agent.system_prompt:
        # codex, others: no --system-prompt flag, prepend to prompt
        input_data = (
            f"<system>\n{agent.system_prompt}\n</system>\n\n"
            f"{prompt}"
        )
    else:
        input_data = prompt

    logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")

    spinner: Optional[_Spinner] = None
    if not quiet:
        logger.info(" cmd: %s", " ".join(cmd[:6]))
        spinner = _Spinner(f"[{step_name}] {agent.name} running...")
        spinner.start()

    try:
        start = time.monotonic()
        result = subprocess.run(
            cmd,
            input=input_data,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
        )
        duration = time.monotonic() - start
    except subprocess.TimeoutExpired:
        if spinner:
            spinner.stop(f"[{step_name}] TIMEOUT after {timeout}s")
        raise
    except KeyboardInterrupt:
        # KeyboardInterrupt is not an Exception subclass; without this branch
        # the spinner thread would keep drawing over the caller's output.
        if spinner:
            spinner.stop(f"[{step_name}] INTERRUPTED")
        raise
    except Exception:
        if spinner:
            spinner.stop(f"[{step_name}] ERROR")
        raise

    output = result.stdout.strip()
    chars = len(output)

    if result.returncode != 0:
        if spinner:
            spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
        # Prefer stderr for the diagnostic; fall back to stdout.
        err_detail = result.stderr.strip() or result.stdout.strip()
        if err_detail and len(err_detail) > 500:
            err_detail = err_detail[:500] + "..."
        cmd_preview = " ".join(cmd[:6])
        raise RuntimeError(
            f"Agent '{agent.name}' failed (exit code {result.returncode}) "
            f"at step '{step_name}':\n"
            f" cmd: {cmd_preview}\n"
            f" error: {err_detail or '(no output)'}"
        )

    if spinner:
        spinner.stop(f"[{step_name}] done — {chars} chars")

    if not output:
        logger.warning(
            "Agent '%s' produced empty output at step '%s'",
            agent.name, step_name,
        )

    return AgentResult(
        output=output,
        exit_code=result.returncode,
        agent_name=agent.name,
        step_name=step_name,
        duration_seconds=round(duration, 1),
    )
|
||||
701
cross_eval/cli.py
Normal file
701
cross_eval/cli.py
Normal file
@@ -0,0 +1,701 @@
|
||||
"""CLI entry point with argparse subcommands."""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from cross_eval import __version__
|
||||
from cross_eval.config import REASONING_EFFORT_CHOICES
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scaffolding templates for `cross-eval init`
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
DEFAULT_CONFIG_YAML = """\
|
||||
# ─── cross-eval 설정 ───────────────────────────────────────────
|
||||
#
|
||||
# 기본 제공 에이전트 (별도 정의 없이 바로 사용 가능):
|
||||
# claude-coder, claude-reviewer (Claude, opus 모델)
|
||||
# claude-senior (Claude, opus 모델)
|
||||
# codex-coder, codex-reviewer (Codex, gpt-5.4 모델)
|
||||
# codex-senior (Codex, gpt-5.4 모델)
|
||||
#
|
||||
# CLI에서 --coder claude --reviewer codex --senior codex 같이 축약해서 지정 가능
|
||||
# ────────────────────────────────────────────────────────────────
|
||||
|
||||
# 입력 파일 (이 파일 기준 상대경로)
|
||||
inputs:
|
||||
plan: plan.md
|
||||
checklist: checklist.md
|
||||
|
||||
# 에이전트 역할 지정
|
||||
coders: [claude-coder]
|
||||
reviewers: [claude-reviewer]
|
||||
# seniors: [codex-senior]
|
||||
|
||||
# 파이프라인 종류: simple | cross-review | review-only | review-fix
|
||||
pipeline: preset:{preset}
|
||||
|
||||
# 반복 설정
|
||||
max_iterations: 3
|
||||
# min_iterations: 1 # PASS여도 최소 이만큼 반복
|
||||
|
||||
# 프롬프트 언어
|
||||
language: {language}
|
||||
|
||||
# 결과 저장 경로
|
||||
output_dir: output
|
||||
|
||||
# ─── 커스텀 에이전트 (선택) ────────────────────────────────────
|
||||
# 기본 제공 에이전트를 덮어쓰거나 새 에이전트를 정의할 수 있습니다.
|
||||
#
|
||||
# agents:
|
||||
# my-reviewer:
|
||||
# command: my-tool
|
||||
# args: ["--flag"]
|
||||
# system_prompt: "..."
|
||||
# ────────────────────────────────────────────────────────────────
|
||||
"""
|
||||
|
||||
PLAN_SAMPLE_EN = """\
|
||||
# Project Plan
|
||||
|
||||
## Objective
|
||||
[Describe what you want to build]
|
||||
|
||||
## Requirements
|
||||
1. [Requirement 1]
|
||||
2. [Requirement 2]
|
||||
|
||||
## Constraints
|
||||
- [Constraint 1]
|
||||
- [Constraint 2]
|
||||
|
||||
## Out of Scope
|
||||
- [Explicitly list what should NOT be implemented]
|
||||
"""
|
||||
|
||||
PLAN_SAMPLE_KO = """\
|
||||
# 프로젝트 기획서
|
||||
|
||||
## 목표
|
||||
[구현할 내용을 설명하세요]
|
||||
|
||||
## 요구사항
|
||||
1. [요구사항 1]
|
||||
2. [요구사항 2]
|
||||
|
||||
## 제약조건
|
||||
- [제약조건 1]
|
||||
- [제약조건 2]
|
||||
|
||||
## 범위 밖 (구현하지 않을 것)
|
||||
- [명시적으로 구현하지 않을 항목 나열]
|
||||
"""
|
||||
|
||||
CHECKLIST_SAMPLE_EN = """\
|
||||
# Implementation Checklist
|
||||
|
||||
## Functional Requirements
|
||||
- [ ] [Item 1]
|
||||
- [ ] [Item 2]
|
||||
|
||||
## Code Quality
|
||||
- [ ] No unused imports or dead code
|
||||
- [ ] Error handling for edge cases
|
||||
- [ ] Follows project coding conventions
|
||||
|
||||
## Constraints
|
||||
- [ ] Does NOT add features beyond the plan
|
||||
- [ ] Does NOT introduce unnecessary abstractions
|
||||
"""
|
||||
|
||||
CHECKLIST_SAMPLE_KO = """\
|
||||
# 구현 체크리스트
|
||||
|
||||
## 기능 요구사항
|
||||
- [ ] [항목 1]
|
||||
- [ ] [항목 2]
|
||||
|
||||
## 코드 품질
|
||||
- [ ] 사용하지 않는 import나 죽은 코드 없음
|
||||
- [ ] 엣지 케이스에 대한 에러 처리
|
||||
- [ ] 프로젝트 코딩 컨벤션 준수
|
||||
|
||||
## 제약
|
||||
- [ ] 기획서 범위를 넘는 기능을 추가하지 않음
|
||||
- [ ] 불필요한 추상화를 도입하지 않음
|
||||
"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the selected subcommand.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        The exit code of ``cmd_init`` / ``cmd_run``, or 0 after printing the
        help text when no subcommand was given.
    """
    preset_choices = ["simple", "cross-review", "review-only", "review-fix"]
    lang_choices = ["en", "ko"]
    # Canonical levels plus the spelling variants accepted for "xhigh".
    effort_choices = REASONING_EFFORT_CHOICES + ("extra-high", "extra_high", "x-high")

    root = argparse.ArgumentParser(
        prog="cross-eval",
        description=(
            "AI 코딩 에이전트의 결과물을 자동으로 검증하는 CLI 도구.\n"
            "\n"
            "동작 방식:\n"
            " 1. 기획서(plan)를 바탕으로 Coder 에이전트가 코드를 생성\n"
            " 2. Reviewer 에이전트가 기획서 대비 코드를 검토하고 PASS/FAIL 판정\n"
            " 3. FAIL이면 피드백을 반영해서 1~2를 반복 (최대 N회)\n"
            "\n"
            "빠른 시작:\n"
            " cross-eval init 설정 파일 생성\n"
            " cross-eval run --plan plan.md 기획서로 바로 실행\n"
            " cross-eval run .cross-eval/config.yaml 기반 실행\n"
            "\n"
            "자세한 사용법: cross-eval <command> --help"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    root.add_argument(
        "-v", "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )
    root.add_argument("--verbose", action="store_true", help="상세 로그 출력")

    commands = root.add_subparsers(dest="command")

    # ----- init -----------------------------------------------------------
    init_cmd = commands.add_parser(
        "init",
        help="설정 파일 생성 (config.yaml, plan.md, checklist.md)",
        description=(
            "현재 디렉토리에 .cross-eval/ 폴더를 만들고 템플릿을 생성합니다.\n"
            "이미 있는 파일은 건드리지 않습니다.\n"
            "\n"
            "생성되는 파일:\n"
            " .cross-eval/config.yaml 에이전트, 파이프라인 설정\n"
            " .cross-eval/plan.md 기획서 템플릿\n"
            " .cross-eval/checklist.md 체크리스트 템플릿"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    init_cmd.add_argument(
        "--dir",
        type=Path,
        default=Path("."),
        help="초기화할 디렉토리 (기본: 현재 디렉토리)",
    )
    init_cmd.add_argument(
        "--preset",
        default="simple",
        choices=preset_choices,
        help=(
            "파이프라인 종류 (기본: simple). "
            "simple=코딩+리뷰, cross-review=교차리뷰, "
            "review-only=리뷰만, review-fix=리뷰수렴+자동수정"
        ),
    )
    init_cmd.add_argument(
        "--lang",
        default="ko",
        choices=lang_choices,
        help="프롬프트 언어 (기본: ko)",
    )

    # ----- run ------------------------------------------------------------
    run_cmd = commands.add_parser(
        "run",
        help="검증 파이프라인 실행",
        description=(
            "기획서(plan)를 기반으로 AI 에이전트가 코드 생성과 리뷰를 반복합니다.\n"
            "\n"
            "설정 파일 없이 바로 실행할 수 있고, config.yaml로도 실행할 수 있습니다.\n"
            "CLI 옵션이 config.yaml보다 우선합니다."
        ),
        epilog=(
            "파이프라인 종류 (--preset):\n"
            " ┌──────────────┬─────────────────────────────────────────────────────┐\n"
            " │ simple │ Coder가 코드 생성 → Reviewer가 리뷰 │\n"
            " │ (기본값) │ FAIL이면 피드백 반영해서 재생성, PASS까지 반복 │\n"
            " ├──────────────┼─────────────────────────────────────────────────────┤\n"
            " │ review-fix │ 2단계 파이프라인: │\n"
            " │ │ Reviewer N명 병렬 리뷰 → 취합 → 수정 → 재검증 │\n"
            " ├──────────────┼─────────────────────────────────────────────────────┤\n"
            " │ review-only │ 코드 생성 없이 Reviewer N명이 기존 코드만 검토 │\n"
            " │ │ (이미 작성된 코드의 품질 감사용) │\n"
            " ├──────────────┼─────────────────────────────────────────────────────┤\n"
            " │ cross-review │ Coder 2명이 각각 구현 → 상대방 코드를 교차 리뷰 │\n"
            " │ │ (서로 다른 에이전트의 구현 비교용) │\n"
            " └──────────────┴─────────────────────────────────────────────────────┘\n"
            "\n"
            "기본 제공 에이전트:\n"
            " ┌──────────────────┬─────────┬───────────┬──────────────────────────┐\n"
            " │ 이름 │ CLI │ 기본 모델 │ 역할 │\n"
            " ├──────────────────┼─────────┼───────────┼──────────────────────────┤\n"
            " │ claude-coder │ claude │ opus │ 코드 생성 │\n"
            " │ claude-reviewer │ claude │ opus │ 코드 리뷰 │\n"
            " │ claude-senior │ claude │ opus │ 리뷰 취합/판정 │\n"
            " │ codex-coder │ codex │ gpt-5.4 │ 코드 생성 │\n"
            " │ codex-reviewer │ codex │ gpt-5.4 │ 코드 리뷰 │\n"
            " │ codex-senior │ codex │ gpt-5.4 │ 리뷰 취합/판정 │\n"
            " └──────────────────┴─────────┴───────────┴──────────────────────────┘\n"
            " --coder, --reviewer, --senior에서 축약 가능: claude → claude-<role>\n"
            "\n"
            "사용 예시:\n"
            "\n"
            " 기본 실행 (Claude가 코딩하고 Claude가 리뷰):\n"
            " cross-eval run --plan plan.md\n"
            "\n"
            " Codex가 코딩, Claude가 리뷰:\n"
            " cross-eval run --plan plan.md --coder codex --reviewer claude\n"
            "\n"
            " 리뷰어 2명 (Claude + Codex):\n"
            " cross-eval run --plan plan.md --reviewer claude --reviewer codex\n"
            "\n"
            " 리뷰 취합용 Senior 추가:\n"
            " cross-eval run --plan plan.md --preset review-fix \\\n"
            " --reviewer claude --reviewer codex --senior codex\n"
            "\n"
            " 리뷰 수렴 후 자동 수정 (review-fix):\n"
            " cross-eval run --plan plan.md --preset review-fix \\\n"
            " --reviewer claude --reviewer codex\n"
            "\n"
            " 기존 코드 리뷰만 (review-only):\n"
            " cross-eval run --plan plan.md --preset review-only \\\n"
            " --reviewer claude --reviewer codex\n"
            "\n"
            " 모델 변경:\n"
            " cross-eval run --plan plan.md --model sonnet\n"
            "\n"
            " config.yaml 기반 실행:\n"
            " cross-eval run\n"
            " cross-eval run -c my-config.yaml"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Input files
    input_opts = run_cmd.add_argument_group("입력 파일")
    for flag, text in (
        ("--plan", "기획서 파일 경로 (필수)"),
        ("--checklist", "체크리스트 파일 경로 (선택)"),
        ("--docs", "참고 문서 폴더. 폴더 안 모든 파일을 에이전트에게 전달"),
    ):
        input_opts.add_argument(flag, type=Path, default=None, help=text)
    input_opts.add_argument(
        "--input", action="append", dest="inputs", metavar="KEY=PATH",
        help="추가 입력 파일 (예: --input spec=./api-spec.md)",
    )

    # Agent selection, reasoning effort and model overrides
    agent_opts = run_cmd.add_argument_group(
        "에이전트 설정",
        "축약 가능: claude → claude-<role>, codex → codex-<role>",
    )
    for flag, dest, text in (
        ("--coder", "coders", "코드를 생성할 에이전트 (여러 개 가능, 기본: claude)"),
        ("--reviewer", "reviewers", "코드를 리뷰할 에이전트 (여러 개 가능, 기본: claude)"),
        ("--senior", "seniors", "리뷰를 취합하고 최종 판정할 시니어 에이전트 (선택)"),
    ):
        agent_opts.add_argument(
            flag, action="append", dest=dest, metavar="NAME", help=text,
        )
    for flag, text in (
        ("--reasoning-effort", "모든 역할의 reasoning effort (minimal|low|medium|high|xhigh)"),
        ("--coder-effort", "Coder용 reasoning effort"),
        ("--reviewer-effort", "Reviewer용 reasoning effort"),
        ("--senior-effort", "Senior용 reasoning effort"),
    ):
        agent_opts.add_argument(
            flag, default=None, metavar="LEVEL", choices=effort_choices, help=text,
        )
    for flag, text in (
        ("--model", "모든 에이전트의 모델을 한번에 변경 (예: sonnet, opus)"),
        ("--generator-model", "Coder 에이전트 모델만 변경"),
        ("--reviewer-model", "Reviewer 에이전트 모델만 변경"),
    ):
        agent_opts.add_argument(flag, default=None, metavar="MODEL", help=text)

    # Pipeline behaviour
    pipeline_opts = run_cmd.add_argument_group("파이프라인")
    pipeline_opts.add_argument(
        "--preset", default=None,
        choices=preset_choices,
        help="파이프라인 종류 (기본: simple). 각 종류 설명은 아래 참조",
    )
    pipeline_opts.add_argument(
        "--max-iter", type=int, default=None,
        help="최대 반복 횟수 (기본: 3)",
    )
    pipeline_opts.add_argument(
        "--min-iter", type=int, default=None,
        help="최소 반복 횟수. PASS여도 이 횟수까지 반복 (기본: 1)",
    )
    pipeline_opts.add_argument(
        "--timeout", type=int, default=None, metavar="SEC",
        help="에이전트 1회 호출 제한 시간(초). 0=무제한 (기본: 무제한)",
    )
    pipeline_opts.add_argument(
        "--lang", default=None, choices=lang_choices,
        help="프롬프트 언어 (기본: ko)",
    )

    # Miscellaneous
    misc_opts = run_cmd.add_argument_group("기타")
    misc_opts.add_argument(
        "-c", "--config", type=Path, default=None,
        help="설정 파일 경로 (기본: .cross-eval/config.yaml)",
    )
    misc_opts.add_argument(
        "--output-dir", type=Path, default=None,
        help="결과 저장 디렉토리 (기본: output/)",
    )
    misc_opts.add_argument(
        "--dry-run", action="store_true",
        help="실제 실행 없이 에이전트에게 보낼 프롬프트만 미리보기",
    )

    args = root.parse_args(argv)

    # Logging: DEBUG with --verbose, INFO otherwise.
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
    )

    if args.command == "init":
        return cmd_init(args)
    if args.command == "run":
        return cmd_run(args)

    # No subcommand given: show usage and exit successfully.
    root.print_help()
    return 0
|
||||
|
||||
|
||||
def cmd_init(args: argparse.Namespace) -> int:
    """Create the ``.cross-eval/`` scaffold under ``args.dir``.

    Writes ``config.yaml``, ``plan.md`` and ``checklist.md`` from the bundled
    templates (Korean or English per ``args.lang``). Files that already exist
    are left untouched and reported as skipped.

    Args:
        args: Parsed ``init`` arguments; ``dir``, ``lang`` and ``preset`` are read.

    Returns:
        Always 0.
    """
    target = args.dir.resolve()
    (target / ".cross-eval").mkdir(parents=True, exist_ok=True)

    lang = args.lang
    korean = lang == "ko"
    templates = {
        ".cross-eval/config.yaml": DEFAULT_CONFIG_YAML.format(
            preset=args.preset, language=lang,
        ),
        ".cross-eval/plan.md": PLAN_SAMPLE_KO if korean else PLAN_SAMPLE_EN,
        ".cross-eval/checklist.md": (
            CHECKLIST_SAMPLE_KO if korean else CHECKLIST_SAMPLE_EN
        ),
    }

    created = []
    skipped = []
    for rel_name, text in templates.items():
        destination = target / rel_name
        if destination.exists():
            # Never overwrite a file the user may have edited.
            skipped.append(rel_name)
            continue
        destination.write_text(text, encoding="utf-8")
        created.append(rel_name)

    if created:
        print(f" 생성: {', '.join(created)}")
    if skipped:
        print(f" 이미 존재 (건너뜀): {', '.join(skipped)}")

    # Summary of the chosen settings followed by next-step guidance.
    for line in (
        f"\n 파이프라인: {args.preset}",
        f" 언어: {lang}",
        "",
        "다음 단계:",
        " 1. .cross-eval/plan.md 에 기획서 작성",
        " 2. .cross-eval/checklist.md 에 체크리스트 작성 (선택)",
        " 3. cross-eval run 으로 실행",
        "",
        "주의: 에이전트는 기본적으로 파일 읽기/쓰기/실행 권한을 가집니다.",
        " 실행 전에 .cross-eval/config.yaml 을 확인하세요.",
    ):
        print(line)
    return 0
|
||||
|
||||
|
||||
def _read_docs_dir(docs_dir: Path) -> str:
    """Concatenate the text files directly inside *docs_dir*.

    Entries are visited in sorted order. Sub-directories, dot-files and files
    that cannot be read as UTF-8 are skipped. Each included file becomes a
    ``### <filename>`` header followed by its content; sections are separated
    by a blank line.

    Args:
        docs_dir: Directory to scan (not recursive).

    Returns:
        The joined sections, or an empty string when nothing was readable.
    """
    sections: list[str] = []
    for entry in sorted(docs_dir.iterdir()):
        if not entry.is_file() or entry.name.startswith("."):
            continue
        try:
            body = entry.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # Binary or unreadable file: leave it out.
            continue
        sections.append(f"### {entry.name}\n{body}")
    return "\n\n".join(sections)
|
||||
|
||||
|
||||
def _apply_model_override(config, agent_name: str, model: str) -> None:
    """Set the ``--model`` value in an agent's argument list.

    The first model option found is rewritten in the form it already uses:
    ``--model VALUE`` has its value replaced and ``--model=VALUE`` is
    rewritten as ``--model=<model>``. A trailing ``--model`` with no value
    gets the value appended. When no model option is present,
    ``--model <model>`` is appended. Unknown agent names are ignored.

    Args:
        config: Object whose ``agents`` mapping holds the agent definitions.
        agent_name: Key of the agent to update.
        model: Model identifier to apply.
    """
    agent = config.agents.get(agent_name)
    if agent is None:
        return

    # Work on a copy so the previous list object is never mutated in place.
    new_args = list(agent.args)
    for i, arg in enumerate(new_args):
        if arg == "--model":
            if i + 1 < len(new_args):
                new_args[i + 1] = model
            else:
                # Dangling flag: supply the missing value instead of adding
                # a second "--model".
                new_args.append(model)
            break
        if isinstance(arg, str) and arg.startswith("--model="):
            new_args[i] = f"--model={model}"
            break
    else:
        # --model not found, append it
        new_args.extend(["--model", model])
    agent.args = new_args
|
||||
|
||||
|
||||
def cmd_run(args: argparse.Namespace) -> int:
    """Load config, validate, and execute the pipeline.

    Steps:
        1. Load the YAML given with ``--config``, else
           ``.cross-eval/config.yaml`` if present, else built-in defaults.
        2. Apply CLI overrides (iterations, output dir, language, roles,
           preset, reasoning effort, models, input files).
        3. Validate the resulting configuration.
        4. Run the pipeline.
        5. Print a short summary.

    Args:
        args: Parsed ``run`` arguments.

    Returns:
        0 when the final verdict is ``PASS``; 130 when interrupted by the
        user; 1 for any configuration error, pipeline error or non-PASS verdict.
    """
    import subprocess

    from cross_eval.config import (
        _default_seniors_for_preset,
        _infer_roles,
        _resolve_agents,
        apply_input_overrides,
        apply_reasoning_effort_settings,
        default_config,
        load_config,
        resolve_agent_shorthand,
        validate_config,
    )
    from cross_eval.pipeline import run_pipeline
    from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS

    # 1. Load config: YAML if exists, otherwise defaults
    config_path = args.config
    if config_path is not None:
        config_path = config_path.resolve()
        if not config_path.exists():
            print(f"Config file not found: {config_path}", file=sys.stderr)
            return 1
        try:
            config = load_config(config_path)
        except (ValueError, FileNotFoundError) as e:
            print(f"Config error: {e}", file=sys.stderr)
            return 1
        config_source = config_path.name
    else:
        # Try default location, fall back to built-in defaults
        default_path = Path(".cross-eval/config.yaml").resolve()
        if default_path.exists():
            try:
                config = load_config(default_path)
                config_source = default_path.name
            except (ValueError, FileNotFoundError) as e:
                print(f"Config error: {e}", file=sys.stderr)
                return 1
        else:
            config = default_config()
            config_source = "defaults"

    # 2. Apply CLI overrides
    if args.max_iter is not None:
        config.max_iterations = args.max_iter
    if args.min_iter is not None:
        config.min_iterations = args.min_iter
    if args.output_dir is not None:
        config.output_dir = args.output_dir
    if args.lang is not None:
        config.language = args.lang

    # --coder / --reviewer / --senior: resolve shorthands and override roles
    if args.coders or args.reviewers or args.seniors:
        coders = [resolve_agent_shorthand(c, "coder") for c in (args.coders or [])]
        reviewers = [resolve_agent_shorthand(r, "reviewer") for r in (args.reviewers or [])]
        seniors = [resolve_agent_shorthand(s, "senior") for s in (args.seniors or [])]
        # Fill defaults if only one side specified
        if not coders:
            coders = config.coders or ["claude-coder"]
        if not reviewers:
            reviewers = config.reviewers or ["claude-reviewer"]
        if not seniors:
            seniors = config.seniors
        config.coders = coders
        config.reviewers = reviewers
        config.seniors = seniors
        # Auto-merge built-in agents
        config.agents = _resolve_agents(config.agents, coders, reviewers, seniors)

    # --preset: rebuild pipeline from preset
    need_rebuild = args.preset is not None or args.coders or args.reviewers or args.seniors
    if need_rebuild:
        preset = args.preset or "simple"
        # No explicit --preset but the loaded config is phased: keep it phased.
        if args.preset is None and config.phases:
            preset = "review-fix"  # only phased preset currently
        inferred_coders, inferred_reviewers, _ = _infer_roles(
            list(config.agents.keys())
        )
        coders = config.coders or inferred_coders
        reviewers = config.reviewers or inferred_reviewers
        seniors = config.seniors or []
        if not seniors:
            seniors = _default_seniors_for_preset(
                f"preset:{preset}",
                reviewers,
                config.agents,
            )
        config.agents = _resolve_agents(config.agents, coders, reviewers, seniors)
        config.coders = coders
        config.reviewers = reviewers
        config.seniors = seniors
        config.preset_name = preset
        if preset in PHASED_PRESETS:
            config.phases = PHASED_PRESETS[preset](coders, reviewers, seniors)
            config.pipeline = []
        elif preset in PIPELINE_PRESETS:
            config.pipeline = PIPELINE_PRESETS[preset](coders, reviewers, seniors)
            config.phases = []
        # review-only runs once unless the user asked for specific iteration counts.
        if preset == "review-only" and args.max_iter is None and args.min_iter is None:
            config.max_iterations = 1

    apply_reasoning_effort_settings(
        config,
        reasoning_effort=args.reasoning_effort,
        coder_effort=args.coder_effort,
        reviewer_effort=args.reviewer_effort,
        senior_effort=args.senior_effort,
    )

    # --model: apply to ALL agents
    if args.model is not None:
        for agent_name in config.agents:
            _apply_model_override(config, agent_name, args.model)
    # --generator-model / --reviewer-model: apply by role
    if args.generator_model is not None:
        for coder_name in config.coders:
            _apply_model_override(config, coder_name, args.generator_model)
    if args.reviewer_model is not None:
        for reviewer_name in config.reviewers:
            _apply_model_override(config, reviewer_name, args.reviewer_model)

    # --plan / --checklist shortcuts
    for key, val in [("plan", args.plan), ("checklist", args.checklist)]:
        if val is not None:
            p = val.resolve()
            if not p.exists():
                print(f"File not found: {p}", file=sys.stderr)
                return 1
            config.inputs[key] = p

    # --docs: read all files in directory, inject as {docs}
    if args.docs is not None:
        docs_dir = args.docs.resolve()
        if not docs_dir.is_dir():
            print(f"Not a directory: {docs_dir}", file=sys.stderr)
            return 1
        docs_content = _read_docs_dir(docs_dir)
        if not docs_content:
            print(f"No files found in: {docs_dir}", file=sys.stderr)
            return 1
        config.inputs["docs"] = docs_content

    if args.inputs:
        overrides = {}
        for item in args.inputs:
            if "=" not in item:
                print(
                    f"Invalid --input format: '{item}'. Use KEY=PATH.",
                    file=sys.stderr,
                )
                return 1
            key, path = item.split("=", 1)
            overrides[key] = path
        apply_input_overrides(config, overrides)

    # 3. Validate after all overrides
    errors = validate_config(config)
    if errors:
        print("Config error:\n " + "\n ".join(errors), file=sys.stderr)
        return 1

    # 4. Run pipeline
    logger.info("Config: %s", config_source)
    logger.info(
        "Agents: %s",
        ", ".join(f"{n} ({a.command})" for n, a in config.agents.items()),
    )
    if config.coders or config.reviewers or config.seniors:
        logger.info("Coders: %s", config.coders)
        logger.info("Reviewers: %s", config.reviewers)
        logger.info("Seniors: %s", config.seniors)
    if config.phases:
        phase_desc = " → ".join(
            f"{p.name}(max {p.max_iterations}, {p.consecutive_pass}xPASS)"
            for p in config.phases
        )
        logger.info("Pipeline: phased [%s], lang=%s", phase_desc, config.language)
    else:
        iter_info = f"max {config.max_iterations}"
        if config.min_iterations > 1:
            iter_info = f"min {config.min_iterations}, max {config.max_iterations}"
        logger.info(
            "Pipeline: %d steps, %s iterations, lang=%s",
            len(config.pipeline), iter_info, config.language,
        )

    try:
        # 0 (or no --timeout) means "no limit".
        raw_timeout = args.timeout if args.timeout is not None else 0
        agent_timeout = None if raw_timeout == 0 else raw_timeout
        result = run_pipeline(config, dry_run=args.dry_run, timeout=agent_timeout)
    except KeyboardInterrupt:
        print("\nInterrupted by user.", file=sys.stderr)
        return 130
    except subprocess.TimeoutExpired as e:
        # invoke_agent re-raises timeouts; report them instead of a traceback.
        print(
            f"Pipeline error: agent call timed out after {e.timeout}s",
            file=sys.stderr,
        )
        return 1
    except (RuntimeError, OSError) as e:
        # OSError covers e.g. an agent CLI that is not installed.
        print(f"Pipeline error: {e}", file=sys.stderr)
        return 1

    # 5. Print summary
    print(f"\nResult: {result.final_verdict}")
    print(f"Iterations: {len(result.iterations)}")
    if not args.dry_run and result.run_dir:
        print(f"Output: {result.run_dir}/")

    return 0 if result.final_verdict == "PASS" else 1
|
||||
|
||||
|
||||
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
607
cross_eval/config.py
Normal file
607
cross_eval/config.py
Normal file
@@ -0,0 +1,607 @@
|
||||
"""Configuration loading, validation, and preset resolution."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
from cross_eval.models import AgentConfig, PhaseConfig, PipelineConfig, StepConfig
|
||||
from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Alternative user-facing spellings, all mapped to the canonical "xhigh" level.
REASONING_EFFORT_ALIASES = dict.fromkeys(
    ("extra-high", "extra_high", "x-high"),
    "xhigh",
)

# Canonical reasoning-effort values accepted after alias normalization.
REASONING_EFFORT_CHOICES = ("minimal", "low", "medium", "high", "xhigh")

# Effort applied per role when an agent does not carry its own setting.
DEFAULT_ROLE_REASONING_EFFORTS = dict(
    coder="medium",
    reviewer="medium",
    senior="high",
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Built-in agent registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_CODEX_ARGS = [
|
||||
"exec",
|
||||
"--full-auto",
|
||||
"--skip-git-repo-check",
|
||||
"--model",
|
||||
"gpt-5.4",
|
||||
"-",
|
||||
]
|
||||
|
||||
_CODER_SYSTEM_PROMPT = (
|
||||
"You are a senior software engineer implementing code changes.\n"
|
||||
"Rules:\n"
|
||||
"1. FIRST explore the project directory to understand the existing codebase, "
|
||||
"patterns, and conventions before writing any code.\n"
|
||||
"2. Implement ONLY what the plan specifies. Do NOT add extra features, "
|
||||
"unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
|
||||
"3. Follow the project's existing coding style, naming conventions, and directory structure.\n"
|
||||
"4. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
|
||||
"Do NOT refactor unrelated code.\n"
|
||||
"5. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
|
||||
"6. When in doubt about scope, do LESS, not more."
|
||||
)
|
||||
|
||||
_REVIEWER_SYSTEM_PROMPT = (
|
||||
"You are a code reviewer. You MUST NOT create, modify, or delete any files.\n"
|
||||
"Rules:\n"
|
||||
"1. Explore the project directory to understand the full codebase context.\n"
|
||||
"2. Compare the implementation against the plan and checklist ONLY.\n"
|
||||
"3. Classify every issue with BOTH severity AND category:\n"
|
||||
" - Severity: Critical (breaks functionality/security) > Major (requirement mismatch) > Minor (convention/style)\n"
|
||||
" - Category: Over-engineering / Omission\n"
|
||||
"4. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
|
||||
"or DISMISSED (false positive) with rationale.\n"
|
||||
"5. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
|
||||
"6. Order issues by severity (Critical first).\n"
|
||||
"7. Do NOT suggest improvements beyond the plan scope.\n"
|
||||
"8. End with VERDICT: PASS (all requirements met, no over-engineering) "
|
||||
"or VERDICT: FAIL (issues found)."
|
||||
)
|
||||
|
||||
_SENIOR_SYSTEM_PROMPT = (
|
||||
"You are a senior technical reviewer coordinating a review-fix-verification loop.\n"
|
||||
"Rules:\n"
|
||||
"1. Explore the project directory to understand the full codebase context.\n"
|
||||
"2. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
|
||||
"evidence-backed issues. Categorize dismissed findings as [False positive] or [Already fixed].\n"
|
||||
"3. In verification mode, judge the current implementation directly against ONLY the "
|
||||
"plan and checklist.\n"
|
||||
"4. Be skeptical of false positives, but do not lower the bar on real requirement "
|
||||
"gaps.\n"
|
||||
"5. When issues remain, produce a concise prioritized action list the coder can act on.\n"
|
||||
"6. Do NOT invent new requirements beyond the plan and checklist.\n"
|
||||
"7. End with VERDICT: PASS or VERDICT: FAIL."
|
||||
)
|
||||
|
||||
# Registry of ready-made agents, one per (CLI provider, role) pair, keyed
# "<provider>-<role>". Entries are generated in the order claude-coder,
# claude-reviewer, claude-senior, codex-coder, codex-reviewer, codex-senior.
BUILTIN_AGENTS: dict[str, AgentConfig] = {
    f"{provider}-{role}": AgentConfig(
        name=f"{provider}-{role}",
        command=provider,
        args=list(base_args),  # every agent gets its own list object
        system_prompt=role_prompt,
        reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS[role],
    )
    for provider, base_args in (
        ("claude", ["-p", "--model", "opus", "--permission-mode", "auto"]),
        ("codex", _CODEX_ARGS),
    )
    for role, role_prompt in (
        ("coder", _CODER_SYSTEM_PROMPT),
        ("reviewer", _REVIEWER_SYSTEM_PROMPT),
        ("senior", _SENIOR_SYSTEM_PROMPT),
    )
}
|
||||
|
||||
# Shorthand aliases: "claude" → "claude-coder"/"claude-reviewer", "codex" → same
|
||||
_AGENT_ALIASES: dict[str, str] = {
|
||||
"claude": "claude",
|
||||
"codex": "codex",
|
||||
}
|
||||
|
||||
|
||||
def resolve_agent_shorthand(name: str, role: str) -> str:
|
||||
"""Resolve shorthand agent name to full builtin name.
|
||||
|
||||
Examples:
|
||||
resolve_agent_shorthand("claude", "coder") → "claude-coder"
|
||||
resolve_agent_shorthand("codex", "reviewer") → "codex-reviewer"
|
||||
resolve_agent_shorthand("claude-coder", "coder") → "claude-coder" (unchanged)
|
||||
"""
|
||||
if name in _AGENT_ALIASES:
|
||||
return f"{_AGENT_ALIASES[name]}-{role}"
|
||||
return name
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Role inference (backward compatibility)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_CODER_PATTERNS = ("gen", "coder", "implement", "develop", "write")
|
||||
_SENIOR_PATTERNS = ("senior", "lead", "principal", "aggregate", "adjudicat", "synth")
|
||||
_REVIEWER_PATTERNS = ("review", "audit", "check", "verify", "inspect")
|
||||
|
||||
|
||||
def _infer_roles(agent_names: list[str]) -> tuple[list[str], list[str], list[str]]:
|
||||
"""Infer coder/reviewer/senior roles from agent names.
|
||||
|
||||
Heuristic:
|
||||
- Names containing 'gen', 'coder', etc. → coder
|
||||
- Names containing 'senior', 'lead', etc. → senior
|
||||
- Names containing 'review', 'audit', etc. → reviewer
|
||||
- If no matches: first agent → coder, rest → reviewers
|
||||
"""
|
||||
coders: list[str] = []
|
||||
reviewers: list[str] = []
|
||||
seniors: list[str] = []
|
||||
unclassified: list[str] = []
|
||||
|
||||
for name in agent_names:
|
||||
lower = name.lower()
|
||||
if any(p in lower for p in _CODER_PATTERNS):
|
||||
coders.append(name)
|
||||
elif any(p in lower for p in _SENIOR_PATTERNS):
|
||||
seniors.append(name)
|
||||
elif any(p in lower for p in _REVIEWER_PATTERNS):
|
||||
reviewers.append(name)
|
||||
else:
|
||||
unclassified.append(name)
|
||||
|
||||
# Fallback: if no classification worked, use positional convention
|
||||
if not coders and not reviewers:
|
||||
if len(agent_names) >= 2:
|
||||
coders = [agent_names[0]]
|
||||
reviewers = list(agent_names[1:])
|
||||
elif agent_names:
|
||||
# Single agent: treat as reviewer (for review-only)
|
||||
reviewers = list(agent_names)
|
||||
elif not coders and unclassified:
|
||||
coders = [unclassified.pop(0)]
|
||||
elif not reviewers and unclassified:
|
||||
reviewers = list(unclassified)
|
||||
unclassified = []
|
||||
|
||||
# Any remaining unclassified go to reviewers
|
||||
reviewers.extend(unclassified)
|
||||
|
||||
return coders, reviewers, seniors
|
||||
|
||||
|
||||
def _resolve_agents(
    user_agents: dict[str, AgentConfig],
    coders: list[str],
    reviewers: list[str],
    seniors: list[str],
) -> dict[str, AgentConfig]:
    """Ensure all referenced agents exist by merging built-in definitions.

    If a coder, reviewer or senior name references an agent not in
    ``user_agents`` but present in BUILTIN_AGENTS, a private copy of the
    built-in definition is added. Names found in neither place are left out.

    Args:
        user_agents: Agents defined explicitly by the user; never modified.
        coders: Agent names used in the coder role.
        reviewers: Agent names used in the reviewer role.
        seniors: Agent names used in the senior role.

    Returns:
        A new mapping holding the user agents followed by any built-ins that
        were pulled in, in coder → reviewer → senior reference order.
    """
    # Local import keeps this function self-contained.
    from dataclasses import replace

    result = dict(user_agents)

    # dict.fromkeys de-duplicates while keeping a stable, declared order
    # (iterating a set would make the insertion order vary between runs).
    for name in dict.fromkeys([*coders, *reviewers, *seniors]):
        if name in result:
            continue
        builtin = BUILTIN_AGENTS.get(name)
        if builtin is not None:
            # Copy the definition: reasoning-effort settings are later written
            # onto these objects and must not leak into the shared registry.
            result[name] = replace(builtin, args=list(builtin.args))

    return result
|
||||
|
||||
|
||||
def _default_seniors_for_preset(
|
||||
pipeline_raw: Any,
|
||||
reviewers: list[str],
|
||||
agents: dict[str, AgentConfig],
|
||||
) -> list[str]:
|
||||
"""Infer a default senior agent for presets that benefit from adjudication."""
|
||||
if not (
|
||||
isinstance(pipeline_raw, str)
|
||||
and pipeline_raw == "preset:review-fix"
|
||||
and reviewers
|
||||
):
|
||||
return []
|
||||
|
||||
first_reviewer = reviewers[0]
|
||||
if first_reviewer.startswith("codex-"):
|
||||
return ["codex-senior"]
|
||||
if first_reviewer.startswith("claude-"):
|
||||
return ["claude-senior"]
|
||||
|
||||
reviewer_agent = agents.get(first_reviewer)
|
||||
if reviewer_agent is None:
|
||||
return []
|
||||
|
||||
command = reviewer_agent.command.lower()
|
||||
if "codex" in command:
|
||||
return ["codex-senior"]
|
||||
if "claude" in command:
|
||||
return ["claude-senior"]
|
||||
return []
|
||||
|
||||
|
||||
def normalize_reasoning_effort(effort: str) -> str:
    """Translate a user-facing reasoning effort into its canonical value.

    Aliases listed in REASONING_EFFORT_ALIASES are mapped first; the result
    must be one of REASONING_EFFORT_CHOICES.

    Raises:
        ValueError: If the value is neither a known alias nor a valid choice.
    """
    canonical = REASONING_EFFORT_ALIASES.get(effort, effort)
    if canonical in REASONING_EFFORT_CHOICES:
        return canonical
    raise ValueError(
        f"Unsupported reasoning effort '{effort}'. "
        f"Use one of: {REASONING_EFFORT_CHOICES}"
    )
|
||||
|
||||
|
||||
def apply_reasoning_effort_settings(
    config: PipelineConfig,
    *,
    reasoning_effort: str | None = None,
    coder_effort: str | None = None,
    reviewer_effort: str | None = None,
    senior_effort: str | None = None,
) -> None:
    """Apply default and override reasoning effort settings by role.

    A role-specific effort wins over the shared ``reasoning_effort``. When
    neither is given, agents without an effort of their own receive the role
    default. All supplied values are normalized (and may raise ValueError)
    before any agent is modified.
    """
    fallback = normalize_reasoning_effort(reasoning_effort) if reasoning_effort else None

    requested = (
        ("coder", coder_effort),
        ("reviewer", reviewer_effort),
        ("senior", senior_effort),
    )
    resolved = {
        role: normalize_reasoning_effort(value) if value else fallback
        for role, value in requested
    }

    assignments = (
        ("coder", config.coders),
        ("reviewer", config.reviewers),
        ("senior", config.seniors),
    )
    for role, names in assignments:
        _apply_role_effort(config.agents, names, resolved[role], role)
|
||||
|
||||
|
||||
def _apply_role_effort(
|
||||
agents: dict[str, AgentConfig],
|
||||
agent_names: list[str],
|
||||
override_effort: str | None,
|
||||
role: str,
|
||||
) -> None:
|
||||
"""Set reasoning effort on agents for a specific role."""
|
||||
for agent_name in agent_names:
|
||||
agent = agents.get(agent_name)
|
||||
if agent is None:
|
||||
continue
|
||||
if override_effort is not None:
|
||||
agent.reasoning_effort = override_effort
|
||||
elif agent.reasoning_effort is None:
|
||||
agent.reasoning_effort = DEFAULT_ROLE_REASONING_EFFORTS[role]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Default config (no YAML)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def default_config() -> PipelineConfig:
    """Return a PipelineConfig with sensible defaults (no YAML needed).

    Every built-in agent is included as a private copy, with ``claude-coder``
    as the coder and ``claude-reviewer`` as the reviewer of the "simple"
    pipeline preset.
    """
    # Local import keeps this function self-contained.
    from dataclasses import replace

    # Copy each built-in: effort overrides are written onto these objects
    # later and must not leak into the shared BUILTIN_AGENTS registry.
    agents = {
        name: replace(agent, args=list(agent.args))
        for name, agent in BUILTIN_AGENTS.items()
    }
    coders = ["claude-coder"]
    reviewers = ["claude-reviewer"]
    seniors: list[str] = []
    pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
    return PipelineConfig(
        output_dir=Path("output"),
        max_iterations=3,
        # NOTE(review): YAML-loaded configs default to "en" while this
        # default uses "ko" — confirm the difference is intentional.
        language="ko",
        inputs={},
        agents=agents,
        coders=coders,
        reviewers=reviewers,
        seniors=seniors,
        pipeline=pipeline,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# YAML loading
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_config(path: Path) -> PipelineConfig:
    """Read a YAML config file, parse it and validate the result.

    Args:
        path: Location of the YAML file; it is resolved to an absolute path.

    Returns:
        The validated PipelineConfig.

    Raises:
        ValueError: If the document is not a mapping or validation reports
            any error.
    """
    path = path.resolve()
    with path.open(encoding="utf-8") as stream:
        raw = yaml.safe_load(stream)

    if not isinstance(raw, dict):
        raise ValueError(f"Config file must be a YAML mapping, got {type(raw).__name__}")

    config = _parse_raw(raw, path)

    problems = validate_config(config)
    if not problems:
        return config
    raise ValueError("Config validation failed:\n " + "\n ".join(problems))
|
||||
|
||||
|
||||
def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
||||
"""Parse raw YAML dict into PipelineConfig."""
|
||||
# --- agents ---
|
||||
agents: dict[str, AgentConfig] = {}
|
||||
for name, agent_data in raw.get("agents", {}).items():
|
||||
agents[name] = AgentConfig(
|
||||
name=name,
|
||||
command=agent_data.get("command", "claude"),
|
||||
args=agent_data.get("args", ["-p"]),
|
||||
system_prompt=agent_data.get("system_prompt"),
|
||||
reasoning_effort=agent_data.get("reasoning_effort"),
|
||||
stdin_mode=agent_data.get("stdin_mode", False),
|
||||
)
|
||||
|
||||
# --- roles: explicit or inferred ---
|
||||
pipeline_raw = raw.get("pipeline", "preset:simple")
|
||||
coders_raw = raw.get("coders")
|
||||
reviewers_raw = raw.get("reviewers")
|
||||
seniors_raw = raw.get("seniors")
|
||||
|
||||
if coders_raw is not None or reviewers_raw is not None or seniors_raw is not None:
|
||||
# Explicit role assignment from YAML
|
||||
coders: list[str] = coders_raw if coders_raw is not None else []
|
||||
reviewers: list[str] = reviewers_raw if reviewers_raw is not None else []
|
||||
seniors: list[str] = seniors_raw if seniors_raw is not None else []
|
||||
else:
|
||||
# Backward compat: infer from agent names
|
||||
coders, reviewers, seniors = _infer_roles(list(agents.keys()))
|
||||
|
||||
if not seniors:
|
||||
seniors = _default_seniors_for_preset(pipeline_raw, reviewers, agents)
|
||||
|
||||
# Auto-merge built-in agents for any referenced names not yet defined
|
||||
agents = _resolve_agents(agents, coders, reviewers, seniors)
|
||||
config_stub = PipelineConfig(
|
||||
agents=agents,
|
||||
coders=coders,
|
||||
reviewers=reviewers,
|
||||
seniors=seniors,
|
||||
)
|
||||
apply_reasoning_effort_settings(config_stub)
|
||||
|
||||
# --- inputs (resolve relative to config file location) ---
|
||||
config_dir = config_path.parent
|
||||
inputs: dict[str, Path | str] = {}
|
||||
for key, val in raw.get("inputs", {}).items():
|
||||
p = Path(val)
|
||||
if not p.is_absolute():
|
||||
p = config_dir / p
|
||||
inputs[key] = p
|
||||
|
||||
# --- pipeline (preset or custom) ---
|
||||
steps, phases = _resolve_pipeline(pipeline_raw, coders, reviewers, seniors)
|
||||
|
||||
# Detect preset name for output directory naming
|
||||
preset_name = "custom"
|
||||
if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
|
||||
preset_name = pipeline_raw.split(":", 1)[1]
|
||||
|
||||
return PipelineConfig(
|
||||
output_dir=Path(raw.get("output_dir", "output")),
|
||||
max_iterations=int(raw.get("max_iterations", 3)),
|
||||
min_iterations=int(raw.get("min_iterations", 1)),
|
||||
verbose=bool(raw.get("verbose", False)),
|
||||
language=raw.get("language", "en"),
|
||||
inputs=inputs,
|
||||
agents=agents,
|
||||
coders=coders,
|
||||
reviewers=reviewers,
|
||||
seniors=seniors,
|
||||
pipeline=steps,
|
||||
phases=phases,
|
||||
preset_name=preset_name,
|
||||
_config_path=config_path,
|
||||
_config_mtime=config_path.stat().st_mtime,
|
||||
)
|
||||
|
||||
|
||||
def try_reload_config(config: PipelineConfig) -> PipelineConfig:
    """Reload config if the file has been modified on disk.

    Returns the new config if reloaded, or the same config if unchanged.
    Errors during reload are logged but do not crash the pipeline; the
    previous config is kept in that case. Configs that were not loaded from a
    file (no recorded path or mtime) are returned as-is.

    NOTE(review): a reload rebuilds the config from the file alone, so any
    changes applied to the previous config object after loading are not
    carried over — confirm callers re-apply them if needed.
    """
    if config._config_path is None or config._config_mtime is None:
        return config

    try:
        current_mtime = config._config_path.stat().st_mtime
    except OSError:
        return config

    if current_mtime <= config._config_mtime:
        return config

    logger.info("Config file changed, reloading: %s", config._config_path.name)
    try:
        new_config = load_config(config._config_path)
    except (
        ValueError,
        OSError,  # unreadable file (missing, permissions, ...)
        yaml.YAMLError,
        # A half-edited file can hold structurally wrong content (missing
        # step keys, wrong node types); that must not abort a running pipeline.
        KeyError,
        TypeError,
        AttributeError,
    ) as e:
        logger.warning("Config reload failed, keeping previous config: %s", e)
        return config

    logger.info("Config reloaded successfully")
    return new_config
|
||||
|
||||
|
||||
def _resolve_pipeline(
|
||||
pipeline_raw: Any,
|
||||
coders: list[str],
|
||||
reviewers: list[str],
|
||||
seniors: list[str],
|
||||
) -> tuple[list[StepConfig], list[PhaseConfig]]:
|
||||
"""Resolve pipeline from preset string or explicit step list.
|
||||
|
||||
Returns (steps, phases) tuple. Only one will be non-empty.
|
||||
- Simple/cross-review/review-only → steps populated, phases empty.
|
||||
- Phased presets (review-fix) → steps empty, phases populated.
|
||||
"""
|
||||
# Preset: "preset:simple" or "preset:review-fix"
|
||||
if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
|
||||
preset_name = pipeline_raw.split(":", 1)[1]
|
||||
if preset_name in PIPELINE_PRESETS:
|
||||
return PIPELINE_PRESETS[preset_name](coders, reviewers, seniors), []
|
||||
if preset_name in PHASED_PRESETS:
|
||||
return [], PHASED_PRESETS[preset_name](coders, reviewers, seniors)
|
||||
all_presets = list(PIPELINE_PRESETS.keys()) + list(PHASED_PRESETS.keys())
|
||||
raise ValueError(
|
||||
f"Unknown pipeline preset '{preset_name}'. "
|
||||
f"Available: {all_presets}"
|
||||
)
|
||||
|
||||
# Explicit step list
|
||||
if isinstance(pipeline_raw, list):
|
||||
steps = []
|
||||
for step_data in pipeline_raw:
|
||||
steps.append(StepConfig(
|
||||
name=step_data["name"],
|
||||
agent=step_data["agent"],
|
||||
role=step_data.get("role", "generate"),
|
||||
prompt_template=step_data.get("prompt_template", f"default:{step_data.get('role', 'generate')}"),
|
||||
output_key=step_data["output_key"],
|
||||
verdict=step_data.get("verdict", False),
|
||||
verdict_pattern=step_data.get("verdict_pattern", r"VERDICT:\s*PASS"),
|
||||
context_override=step_data.get("context_override", {}),
|
||||
))
|
||||
return steps, []
|
||||
|
||||
raise ValueError(
|
||||
f"'pipeline' must be a preset string (e.g. 'preset:simple') "
|
||||
f"or a list of step definitions, got {type(pipeline_raw).__name__}"
|
||||
)
|
||||
|
||||
|
||||
def validate_config(config: PipelineConfig) -> list[str]:
    """Collect every validation problem found in *config*.

    A phased config has each phase checked separately; otherwise the flat
    step list is checked. Each step collection must be non-empty, reference
    only defined agents, use unique step names and output keys, contain at
    least one verdict step, and give verdict steps a compilable
    ``verdict_pattern``. Input paths must exist and the language must be
    "en" or "ko".

    Returns:
        Human-readable error strings; an empty list means the config is valid.
    """
    errors: list[str] = []

    def check_steps(steps, scope: str, step_prefix: str, empty_message: str) -> None:
        # Shared checks for one step collection; messages are built from the
        # caller-supplied scope ("Pipeline" / "Phase '<name>'") and step prefix.
        if not steps:
            errors.append(empty_message)

        for step in steps:
            if step.agent in config.agents:
                continue
            errors.append(
                f"{step_prefix}'{step.name}' references undefined agent '{step.agent}'. "
                f"Defined agents: {list(config.agents.keys())}"
            )

        _validate_unique_step_fields(steps, errors, scope=scope)

        if all(not step.verdict for step in steps):
            errors.append(f"{scope} must have at least one step with verdict: true")

        for step in steps:
            if not step.verdict:
                continue
            try:
                re.compile(step.verdict_pattern)
            except re.error as exc:
                errors.append(
                    f"{step_prefix}'{step.name}' has invalid verdict_pattern: {exc}"
                )

    if config.phases:
        # --- Phased pipeline validation ---
        for phase in config.phases:
            scope = f"Phase '{phase.name}'"
            check_steps(phase.steps, scope, f"{scope} step ", f"{scope} has no steps")
    else:
        # --- Simple pipeline validation ---
        check_steps(
            config.pipeline,
            "Pipeline",
            "Step ",
            "Pipeline must have at least one step",
        )

    # --- Common validation ---
    errors.extend(
        f"Input file '{key}' not found: {val}"
        for key, val in config.inputs.items()
        if isinstance(val, Path) and not val.exists()
    )

    if config.language not in ("en", "ko"):
        errors.append(f"Unsupported language '{config.language}'. Use 'en' or 'ko'.")

    return errors
|
||||
|
||||
|
||||
def _validate_unique_step_fields(
|
||||
steps: list[StepConfig],
|
||||
errors: list[str],
|
||||
*,
|
||||
scope: str,
|
||||
) -> None:
|
||||
"""Ensure step names and output keys are unique within a step collection."""
|
||||
seen_names: set[str] = set()
|
||||
seen_output_keys: set[str] = set()
|
||||
|
||||
for step in steps:
|
||||
if step.name in seen_names:
|
||||
errors.append(f"{scope} has duplicate step name '{step.name}'")
|
||||
seen_names.add(step.name)
|
||||
|
||||
if step.output_key in seen_output_keys:
|
||||
errors.append(f"{scope} has duplicate output_key '{step.output_key}'")
|
||||
seen_output_keys.add(step.output_key)
|
||||
|
||||
|
||||
def apply_input_overrides(
    config: PipelineConfig, overrides: dict[str, str]
) -> None:
    """Replace or add entries of ``config.inputs`` from CLI --input overrides.

    Each override value is stored as a ``Path`` built from the given string;
    the config is modified in place.
    """
    config.inputs.update(
        (key, Path(raw_path)) for key, raw_path in overrides.items()
    )
|
||||
118
cross_eval/models.py
Normal file
118
cross_eval/models.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Data models for cross-eval pipeline."""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class AgentConfig:
    """Settings describing one agent that pipeline steps can reference."""

    # Key under which the agent is registered.
    name: str
    # Command associated with the agent (e.g. "claude" or "codex").
    command: str
    # Arguments that accompany the command.
    args: list[str] = field(default_factory=list)
    # Optional role instructions for the agent.
    system_prompt: str | None = None
    # Optional reasoning-effort level; None means "not set yet".
    reasoning_effort: str | None = None
    # NOTE(review): presumably selects passing the prompt via stdin — confirm
    # against the agent invocation code.
    stdin_mode: bool = False
|
||||
|
||||
|
||||
@dataclass
class StepConfig:
    """Configuration of a single pipeline step."""

    name: str
    # Key of the agent (in the config's agents mapping) that runs this step.
    agent: str
    # "generate" or "review"
    role: str
    # "default:<role>" or file path
    prompt_template: str
    # Key under which the step's output is stored.
    output_key: str
    # Whether this step's output carries a verdict.
    verdict: bool = False
    # Regular expression used for verdict steps.
    verdict_pattern: str = r"VERDICT:\s*PASS"
    context_override: dict[str, str] = field(default_factory=dict)
    # Can run concurrently with adjacent parallel steps
    parallel: bool = False
|
||||
|
||||
|
||||
@dataclass
class PhaseConfig:
    """A named group of steps forming one phase of a multi-phase pipeline (e.g. review-fix)."""

    name: str
    # Steps belonging to this phase.
    steps: list[StepConfig] = field(default_factory=list)
    # Upper bound on iterations of this phase.
    max_iterations: int = 10
    # stop after N consecutive PASSes
    consecutive_pass: int = 1
|
||||
|
||||
|
||||
@dataclass
class PipelineConfig:
    """Complete configuration of a cross-eval run."""

    # Base directory for run output.
    output_dir: Path = field(default_factory=lambda: Path("output"))
    max_iterations: int = 3
    min_iterations: int = 1
    verbose: bool = False
    # "en" or "ko"
    language: str = "en"
    # Named inputs.
    inputs: dict[str, Path | str] = field(default_factory=dict)
    # Agent definitions keyed by agent name.
    agents: dict[str, AgentConfig] = field(default_factory=dict)
    # Agent names assigned to each role.
    coders: list[str] = field(default_factory=list)
    reviewers: list[str] = field(default_factory=list)
    seniors: list[str] = field(default_factory=list)
    # Flat step list (non-phased pipelines).
    pipeline: list[StepConfig] = field(default_factory=list)
    # Phase list (phased pipelines).
    phases: list[PhaseConfig] = field(default_factory=list)
    preset_name: str = "custom"
    # Source file and its modification time; both are left out of repr().
    _config_path: Path | None = field(default=None, repr=False)
    _config_mtime: float | None = field(default=None, repr=False)
|
||||
|
||||
|
||||
@dataclass
class AgentResult:
    """Outcome of one agent invocation."""

    # Text produced by the agent.
    output: str
    # Exit status reported for the invocation.
    exit_code: int
    # Name of the agent that ran.
    agent_name: str
    # Name of the pipeline step the invocation belonged to.
    step_name: str
    # Duration of the invocation, in seconds.
    duration_seconds: float
|
||||
|
||||
|
||||
@dataclass
class ReviewMetrics:
    """Counters extracted from one review output; every counter starts at zero."""

    # Issues per severity.
    critical: int = 0
    major: int = 0
    minor: int = 0

    # Issues per category.
    over_engineering: int = 0
    omission: int = 0

    # Items per assessment.
    confirmed: int = 0
    dismissed: int = 0
|
||||
|
||||
|
||||
@dataclass
class IterationResult:
    """Everything recorded for one pipeline iteration."""

    # Iteration number.
    iteration: int
    # Per-step agent results.
    step_results: dict[str, AgentResult] = field(default_factory=dict)
    # Per-step output text.
    step_outputs: dict[str, str] = field(default_factory=dict)
    # Verdict reached in this iteration, if any.
    verdict: str | None = None
    # Feedback gathered from this iteration, if any.
    feedback: str | None = None
    # Phase the iteration ran in; None when no phase is recorded.
    phase_name: str | None = None
    # Warning text about a repeated aggregate, if one was detected.
    repeated_aggregate_warning: str | None = None
    review_metrics: ReviewMetrics | None = None
|
||||
|
||||
|
||||
@dataclass
class PipelineResult:
    """Aggregate outcome of a complete pipeline run."""

    # Per-iteration results.
    iterations: list[IterationResult] = field(default_factory=list)
    # Overall verdict; stays at this default unless the run records another.
    final_verdict: str = "MAX_ITERATIONS_REACHED"
    # Total duration of the run.
    total_duration: float = 0.0
    # Directory of the run, when one is recorded.
    run_dir: Path | None = None
    # Repeated-aggregate warnings collected during the run.
    repeated_aggregate_warnings: list[str] = field(default_factory=list)
|
||||
700
cross_eval/pipeline.py
Normal file
700
cross_eval/pipeline.py
Normal file
@@ -0,0 +1,700 @@
|
||||
"""Main pipeline execution engine."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from cross_eval.agent import invoke_agent
|
||||
from cross_eval.config import try_reload_config
|
||||
from cross_eval.models import (
|
||||
AgentResult,
|
||||
IterationResult,
|
||||
PipelineConfig,
|
||||
PipelineResult,
|
||||
StepConfig,
|
||||
)
|
||||
from cross_eval.prompts import render_template, resolve_template, set_language
|
||||
from cross_eval.report import build_report
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def run_pipeline(
    config: PipelineConfig,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Run the cross-eval pipeline described by *config*.

    A run directory is created first; the phased runner is used when the
    config defines phases, the simple runner otherwise.
    """
    # Create run directory: output/{preset}_{datetime}/
    run_dir = _make_run_dir(config)

    runner = _run_phased_pipeline if config.phases else _run_simple_pipeline
    return runner(config, run_dir, cwd, dry_run, timeout)
|
||||
|
||||
|
||||
def _make_run_dir(config: PipelineConfig) -> Path:
|
||||
"""Create timestamped run directory under output_dir."""
|
||||
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
run_dir = config.output_dir / f"{config.preset_name}_{ts}"
|
||||
run_dir.mkdir(parents=True, exist_ok=True)
|
||||
return run_dir
|
||||
|
||||
|
||||
def _run_simple_pipeline(
    config: PipelineConfig,
    run_dir: Path,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Execute a simple (non-phased) pipeline.

    The step list is run up to ``config.max_iterations`` times. Before each
    iteration the config is reloaded if its file changed and the inputs are
    refreshed. The loop stops at the first PASS reached at or after
    ``config.min_iterations``; a dry run stops after one iteration.

    Args:
        config: Pipeline configuration.
        run_dir: Directory of this run, passed on to the step runner and
            report writer.
        cwd: Working directory handed to the steps; defaults to the
            process's current directory.
        dry_run: Passed to the step runner; also skips saving the report.
        timeout: Per-agent timeout handed to the step runner.

    Returns:
        The PipelineResult. ``final_verdict`` is "PASS" only when the last
        executed iteration passed, otherwise "MAX_ITERATIONS_REACHED".
    """
    if cwd is None:
        cwd = Path(os.getcwd())

    set_language(config.language)
    input_contents = _load_inputs(config)

    feedback = "(no feedback — first iteration)"
    iterations: list[IterationResult] = []
    start_time = time.monotonic()
    final_verdict = "MAX_ITERATIONS_REACHED"
    aggregate_history: dict[str, int] = {}
    aggregate_warnings: list[str] = []

    for i in range(1, config.max_iterations + 1):
        config = try_reload_config(config)
        set_language(config.language)
        _refresh_inputs(config, input_contents)

        logger.info("=" * 50)
        logger.info(" Iteration %d/%d", i, config.max_iterations)
        logger.info("=" * 50)

        step_outputs, step_results, verdict = _run_steps(
            config.pipeline, config, input_contents, feedback,
            i, config.max_iterations, cwd, timeout, dry_run,
            run_dir=run_dir, output_iter=i,
        )

        iter_result = IterationResult(
            iteration=i,
            step_results=step_results,
            step_outputs=step_outputs,
            verdict=verdict,
        )
        warning = _detect_repeated_aggregate(
            config.pipeline, step_outputs, aggregate_history, iteration=i,
        )
        if warning:
            iter_result.repeated_aggregate_warning = warning
            aggregate_warnings.append(warning)
            logger.warning(" %s", warning)

        iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
        # Keep the previous feedback when this iteration produced none.
        feedback = iter_result.feedback or feedback
        iterations.append(iter_result)

        if verdict == "PASS":
            final_verdict = "PASS"
            if i >= config.min_iterations:
                logger.info(" PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
                break
            logger.info(
                " PASS at iteration %d, but min_iterations=%d — continuing",
                i, config.min_iterations,
            )
        else:
            # An earlier PASS below min_iterations must not survive a later
            # non-passing iteration: report the state of the latest one.
            final_verdict = "MAX_ITERATIONS_REACHED"

        if dry_run:
            logger.info(" (dry-run: stopping after iteration 1)")
            break

    total_duration = time.monotonic() - start_time

    pipeline_result = PipelineResult(
        iterations=iterations,
        final_verdict=final_verdict,
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
    )

    if not dry_run:
        _save_report(run_dir, config, pipeline_result)

    return pipeline_result
|
||||
|
||||
|
||||
def _run_phased_pipeline(
    config: PipelineConfig,
    run_dir: Path,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Execute a multi-phase pipeline (e.g. review-fix).

    Phases run in the order they appear in ``config.phases``. Each phase
    loops up to ``phase.max_iterations`` times; every iteration re-obtains
    the config through ``try_reload_config``, re-applies the language,
    refreshes the inputs, runs the phase's steps and records an
    ``IterationResult``. A phase converges once ``phase.consecutive_pass``
    PASS verdicts occur in a row; any other verdict resets the streak.

    Args:
        config: Pipeline configuration (phases, inputs, language, ...).
        run_dir: Directory that receives step outputs and the final report.
        cwd: Working directory handed to the step runner; defaults to the
            current working directory.
        dry_run: Passed through to the step runner. Each phase stops after
            its first iteration and no report is written.
        timeout: Per-agent timeout handed to the step runner.

    Returns:
        A ``PipelineResult`` whose ``final_verdict`` is ``"PASS"`` only when
        the last phase converged, otherwise ``"MAX_ITERATIONS_REACHED"``.
    """
    if cwd is None:
        cwd = Path(os.getcwd())

    set_language(config.language)
    input_contents = _load_inputs(config)

    # ``config`` is rebound inside the loop, so freeze the phase list that is
    # actually being iterated. Comparing against ``len(config.phases)`` of a
    # rebound config could make the last-phase check below never fire.
    phases = list(config.phases)
    last_phase_idx = len(phases) - 1

    iterations: list[IterationResult] = []
    feedback = "(no feedback — first iteration)"
    start_time = time.monotonic()
    final_verdict = "MAX_ITERATIONS_REACHED"
    global_iter = 0  # iteration counter across all phases (names the v<N> dirs)
    aggregate_history_by_phase: dict[str, dict[str, int]] = {}
    aggregate_warnings: list[str] = []

    for phase_idx, phase in enumerate(phases):
        logger.info("=" * 60)
        logger.info(
            " Phase: %s (max_iter=%d, consecutive_pass=%d)",
            phase.name, phase.max_iterations, phase.consecutive_pass,
        )
        logger.info("=" * 60)

        consecutive_passes = 0
        phase_converged = False

        for pi in range(1, phase.max_iterations + 1):
            global_iter += 1

            config = try_reload_config(config)
            set_language(config.language)
            _refresh_inputs(config, input_contents)

            logger.info("-" * 50)
            logger.info(
                " [%s] Iteration %d/%d (global: v%d)",
                phase.name, pi, phase.max_iterations, global_iter,
            )
            logger.info("-" * 50)

            step_outputs, step_results, verdict = _run_steps(
                phase.steps, config, input_contents, feedback,
                pi, phase.max_iterations, cwd, timeout, dry_run,
                run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
            )

            iter_result = IterationResult(
                iteration=global_iter,
                step_results=step_results,
                step_outputs=step_outputs,
                verdict=verdict,
                phase_name=phase.name,
            )
            # Repeat detection is tracked per phase name.
            phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
            warning = _detect_repeated_aggregate(
                phase.steps, step_outputs, phase_history, iteration=global_iter,
                phase_name=phase.name,
            )
            if warning:
                iter_result.repeated_aggregate_warning = warning
                aggregate_warnings.append(warning)
                logger.warning(" %s", warning)

            iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
            # Keep the previous feedback when this iteration produced none.
            feedback = iter_result.feedback or feedback
            iterations.append(iter_result)

            if verdict == "PASS":
                consecutive_passes += 1
                logger.info(
                    " [%s] PASS (%d/%d consecutive)",
                    phase.name, consecutive_passes, phase.consecutive_pass,
                )
                if consecutive_passes >= phase.consecutive_pass:
                    logger.info(
                        " [%s] Converged! %d consecutive PASSes.",
                        phase.name, phase.consecutive_pass,
                    )
                    phase_converged = True
                    break
            else:
                consecutive_passes = 0

            if dry_run:
                break

        if phase_converged:
            logger.info(" Phase '%s' completed: CONVERGED", phase.name)
        else:
            logger.info(
                " Phase '%s' completed: max iterations (%d) reached",
                phase.name, phase.max_iterations,
            )

        # Only the outcome of the final phase decides the pipeline verdict.
        if phase_idx == last_phase_idx:
            final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"

    total_duration = time.monotonic() - start_time

    pipeline_result = PipelineResult(
        iterations=iterations,
        final_verdict=final_verdict,
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
    )

    if not dry_run:
        _save_report(run_dir, config, pipeline_result)

    return pipeline_result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Shared helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _load_inputs(config: PipelineConfig) -> dict[str, str]:
    """Read every configured input into memory.

    String values in ``config.inputs`` are used verbatim; any other value is
    asked for its text via ``read_text(encoding="utf-8")``.
    """
    return {
        name: source if isinstance(source, str) else source.read_text(encoding="utf-8")
        for name, source in config.inputs.items()
    }
|
||||
|
||||
|
||||
def _refresh_inputs(
    config: PipelineConfig, input_contents: dict[str, str],
) -> None:
    """Update ``input_contents`` in place from the current ``config.inputs``.

    Inline strings are copied as-is. ``Path`` inputs are re-read from disk
    when the file exists; otherwise the previously loaded text is kept.
    """
    for name, source in config.inputs.items():
        if isinstance(source, str):
            input_contents[name] = source
            continue
        if not isinstance(source, Path) or not source.exists():
            # Nothing readable right now: leave the cached value untouched.
            continue
        input_contents[name] = source.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parallel step grouping
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get_step_dependencies(step: StepConfig) -> set[str]:
    """Return the placeholder names used in the step's context_override values.

    A placeholder is a ``{name}`` token whose name consists only of word
    characters.
    """
    placeholder = re.compile(r"\{(\w+)\}")
    return {
        name
        for template in step.context_override.values()
        for name in placeholder.findall(template)
    }
|
||||
|
||||
|
||||
def _group_parallel_steps(steps: list[StepConfig]) -> list[list[StepConfig]]:
    """Split the step list into execution batches, preserving order.

    A step with ``parallel`` unset always forms a batch of its own. Adjacent
    parallel steps share a batch, except that a step whose dependencies
    include an ``output_key`` produced inside the open batch closes that
    batch and starts a new one.
    """
    batches: list[list[StepConfig]] = []
    pending: list[StepConfig] = []
    pending_keys: set[str] = set()

    def _flush() -> None:
        """Close the open parallel batch, if it holds any steps."""
        nonlocal pending, pending_keys
        if pending:
            batches.append(pending)
        pending = []
        pending_keys = set()

    for step in steps:
        if step.parallel:
            # A dependency on something produced in the open batch forces a split.
            if pending_keys & _get_step_dependencies(step):
                _flush()
            pending.append(step)
            pending_keys.add(step.output_key)
        else:
            _flush()
            batches.append([step])

    _flush()
    return batches
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step execution
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _run_steps(
    steps: list[StepConfig],
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
    """Run one iteration's steps batch by batch and derive the verdict.

    Returns:
        ``(step_outputs, step_results, verdict)``. ``verdict`` is ``None``
        when no step is flagged as a verdict step, ``"FAIL"`` as soon as any
        verdict step fails, and otherwise the verdict of the first verdict
        step.
    """
    step_outputs: dict[str, str] = {}
    step_results: dict[str, AgentResult] = {}

    for batch in _group_parallel_steps(steps):
        # Both runners take the same arguments after their first one.
        if len(batch) == 1:
            runner, target = _execute_step, batch[0]
        else:
            runner, target = _execute_parallel_batch, batch
        runner(
            target, config, input_contents, feedback,
            iteration, max_iterations, cwd, timeout, dry_run,
            step_outputs, step_results,
            run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
        )

    # Every verdict step must PASS for the iteration to PASS.
    verdict: str | None = None
    for step in steps:
        if not step.verdict:
            continue
        step_verdict = _extract_verdict(
            step_outputs.get(step.output_key, ""), step.verdict_pattern,
        )
        logger.info(" [%s] verdict: %s", step.name, step_verdict)
        if verdict is None or step_verdict == "FAIL":
            verdict = step_verdict

    return step_outputs, step_results, verdict
|
||||
|
||||
|
||||
def _execute_step(
    step: StepConfig,
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    step_outputs: dict[str, str],
    step_results: dict[str, AgentResult],
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
    quiet: bool = False,
) -> None:
    """Execute a single step, updating step_outputs and step_results in place.

    The prompt is rendered from the step's template and the current context.
    In dry-run mode the prompt is printed and a placeholder output is stored;
    no agent is invoked. Otherwise the agent is invoked, its output is stored
    under ``step.output_key`` and written to ``run_dir``.

    Raises:
        RuntimeError: If the agent times out (chained to the original
            ``subprocess.TimeoutExpired``) or if ``invoke_agent`` raises
            ``RuntimeError`` (re-raised unchanged). In both cases an error
            note is saved as ``<step.name>_error.md`` first.
        KeyError: If ``step.agent`` is not present in ``config.agents``.
    """

    def _as_text(stream: bytes | str | None) -> str:
        """Return captured process output as text (``None`` becomes ``""``)."""
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return stream or ""

    if not quiet:
        logger.info(" [%s] agent='%s' role='%s'", step.name, step.agent, step.role)

    # 1. Resolve template
    template = resolve_template(step.prompt_template)

    # 2. Build context
    context = _build_context(
        input_contents, step_outputs, feedback, iteration, max_iterations,
    )

    # 3. Apply context overrides
    if step.context_override:
        context = _apply_context_override(context, step.context_override)

    # 4. Render prompt
    prompt = render_template(template, context)

    # 5. Dry run: print and skip
    if dry_run:
        phase_label = f" phase={phase_name}" if phase_name else ""
        print(f"\n--- Step: {step.name} (agent={step.agent}{phase_label}) ---")
        print(prompt)
        print(f"--- end {step.name} ---\n")
        step_outputs[step.output_key] = f"(dry-run: no output for {step.output_key})"
        return

    # 6. Invoke agent
    agent_config = config.agents[step.agent]
    phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
    try:
        result = invoke_agent(
            agent_config, prompt, step.name,
            cwd=cwd, timeout=timeout, quiet=quiet,
        )
    except subprocess.TimeoutExpired as e:
        stdout = _as_text(e.stdout)
        stderr = _as_text(e.stderr)
        error_msg = (
            f"# Agent Timeout\n\n"
            f"{phase_info}"
            f"- **Step**: {step.name}\n"
            f"- **Agent**: {step.agent}\n"
            f"- **Timeout**: {timeout}s\n\n"
            f"Partial stdout ({len(stdout)} chars):\n"
            f"```\n{stdout[:2000] or '(none)'}\n```\n\n"
            f"Stderr:\n```\n{stderr[:2000] or '(none)'}\n```\n"
        )
        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
        logger.error(" [%s] TIMEOUT after %ss — saved to output", step.name, timeout)
        # Chain explicitly so the original timeout stays attached as the cause.
        raise RuntimeError(
            f"Agent '{step.agent}' timed out after {timeout}s at step '{step.name}'. "
            f"Error saved to {run_dir}/v{output_iter}/{step.name}_error.md. "
            f"Try --timeout 0 (unlimited)"
        ) from e
    except RuntimeError as e:
        error_msg = (
            f"# Agent Error\n\n{phase_info}"
            f"- **Step**: {step.name}\n- **Agent**: {step.agent}\n\n```\n{e}\n```\n"
        )
        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
        logger.error(" [%s] FAILED — saved to output", step.name)
        raise

    # 7. Store output
    step_outputs[step.output_key] = result.output
    step_results[step.output_key] = result

    if not quiet:
        logger.info(
            " [%s] completed (%.1fs, %d chars)",
            step.name, result.duration_seconds, len(result.output),
        )

    # 8. Save to disk
    _save_step_output(run_dir, output_iter, step.name, result.output)
|
||||
|
||||
|
||||
def _execute_parallel_batch(
    batch: list[StepConfig],
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    step_outputs: dict[str, str],
    step_results: dict[str, AgentResult],
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
) -> None:
    """Execute multiple steps in parallel using threads.

    In dry-run mode the steps are simply run one after another through
    ``_execute_step``. Otherwise every step renders its prompt against the
    same snapshot of inputs and earlier outputs, the agents run concurrently,
    and results are merged into ``step_outputs`` / ``step_results`` and saved
    to disk only if every step succeeded.

    Raises:
        Exception: The first failure collected from the worker threads (in
            completion order) is re-raised after the spinner has been stopped
            and every failing step has been logged.
    """
    agent_names = ", ".join(s.agent for s in batch)
    logger.info(" [parallel] %d agents: %s", len(batch), agent_names)

    if dry_run:
        for step in batch:
            _execute_step(
                step, config, input_contents, feedback,
                iteration, max_iterations, cwd, timeout, dry_run,
                step_outputs, step_results,
                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
            )
        return

    # Snapshot context before parallel execution (all steps see same state)
    context_snapshot = dict(input_contents)
    context_snapshot.update(step_outputs)

    # Collect results from parallel threads
    local_outputs: dict[str, str] = {}
    local_results: dict[str, AgentResult] = {}
    failures: list[tuple[StepConfig, Exception]] = []

    # Show a single spinner for the batch
    from cross_eval.agent import _Spinner
    spinner = _Spinner(
        f"[parallel] {len(batch)} agents running ({agent_names})..."
    )
    spinner.start()
    batch_start = time.monotonic()

    def _run_one(step: StepConfig) -> tuple[str, str, AgentResult]:
        """Run one step, return (output_key, output, result)."""
        template = resolve_template(step.prompt_template)
        context = _build_context(
            context_snapshot, {}, feedback, iteration, max_iterations,
        )
        if step.context_override:
            context = _apply_context_override(context, step.context_override)
        prompt = render_template(template, context)

        agent_config = config.agents[step.agent]
        result = invoke_agent(
            agent_config, prompt, step.name,
            cwd=cwd, timeout=timeout, quiet=True,
        )
        return step.output_key, result.output, result

    try:
        with ThreadPoolExecutor(max_workers=len(batch)) as executor:
            futures = {executor.submit(_run_one, step): step for step in batch}
            for future in as_completed(futures):
                step = futures[future]
                try:
                    output_key, output, result = future.result()
                except Exception as e:
                    # Remember which step failed; logging waits until the
                    # spinner has stopped.
                    failures.append((step, e))
                else:
                    local_results[output_key] = result
                    local_outputs[output_key] = output
    except BaseException:
        # E.g. KeyboardInterrupt while waiting: never leave the spinner running.
        interrupted_after = round(time.monotonic() - batch_start, 1)
        spinner.stop(f"[parallel] FAILED ({interrupted_after}s)")
        raise

    batch_elapsed = round(time.monotonic() - batch_start, 1)

    if failures:
        spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
        for failed_step, error in failures:
            logger.error(" [%s] FAILED: %s", failed_step.name, error)
        raise failures[0][1]

    spinner.stop(f"[parallel] {len(batch)} agents done ({batch_elapsed}s)")

    # Merge results in the batch's declared order, not completion order.
    for step in batch:
        key = step.output_key
        step_outputs[key] = local_outputs[key]
        step_results[key] = local_results[key]
        r = local_results[key]
        logger.info(
            " [%s] completed (%.1fs, %d chars)",
            step.name, r.duration_seconds, len(r.output),
        )
        _save_step_output(run_dir, output_iter, step.name, r.output)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Context and template helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_context(
    input_contents: dict[str, str],
    step_outputs: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
) -> dict[str, str]:
    """Assemble the mapping used to render a prompt template.

    Later sources win on key clashes: inputs, then step outputs, then the
    fixed ``feedback`` / ``iteration`` / ``max_iterations`` entries. The two
    counters are stored as strings.
    """
    return {
        **input_contents,
        **step_outputs,
        "feedback": feedback,
        "iteration": str(iteration),
        "max_iterations": str(max_iterations),
    }
|
||||
|
||||
|
||||
def _apply_context_override(
    context: dict[str, str],
    overrides: dict[str, str],
) -> dict[str, str]:
    """Return a copy of ``context`` with the override entries applied.

    Each override value is itself a template and is rendered against the
    original, unmodified ``context``, so overrides cannot see each other.
    """
    rendered = {
        key: render_template(value_template, context)
        for key, value_template in overrides.items()
    }
    return {**context, **rendered}
|
||||
|
||||
|
||||
def _collect_feedback(
    steps: list[StepConfig],
    step_outputs: dict[str, str],
) -> str:
    """Build the feedback text from the outputs of the verdict steps.

    With exactly one verdict step its raw output is returned. Otherwise each
    non-empty verdict output gets a ``## Review by <agent> (<step>)`` header
    and the sections are joined with horizontal rules; this yields an empty
    string when there is nothing to report.
    """
    reviewers = [step for step in steps if step.verdict]

    if len(reviewers) == 1:
        (only,) = reviewers
        return step_outputs.get(only.output_key, "")

    sections = [
        f"## Review by {step.agent} ({step.name})\n{text}"
        for step in reviewers
        if (text := step_outputs.get(step.output_key, ""))
    ]
    return "\n\n---\n\n".join(sections)
|
||||
|
||||
|
||||
def _detect_repeated_aggregate(
    steps: list[StepConfig],
    step_outputs: dict[str, str],
    history: dict[str, int],
    *,
    iteration: int,
    phase_name: str | None = None,
) -> str | None:
    """Report when the aggregate-review output matches an earlier iteration.

    Only the first step using the ``default:aggregate-review`` template is
    examined. Its normalized output is looked up in ``history``: a first
    sighting is recorded against ``iteration`` and yields ``None``; a repeat
    yields a warning naming the iteration recorded earlier. An empty
    normalized output is never recorded.
    """
    aggregate_step = next(
        (s for s in steps if s.prompt_template == "default:aggregate-review"),
        None,
    )
    if aggregate_step is None:
        return None

    fingerprint = _normalize_aggregate_output(
        step_outputs.get(aggregate_step.output_key, "")
    )
    if not fingerprint:
        return None

    if fingerprint not in history:
        history[fingerprint] = iteration
        return None

    prev_iter = history[fingerprint]
    phase_prefix = f"[{phase_name}] " if phase_name else ""
    return (
        f"{phase_prefix}Repeated aggregate_review detected at iteration {iteration} "
        f"(same as iteration {prev_iter})."
    )
|
||||
|
||||
|
||||
def _normalize_aggregate_output(output: str) -> str:
    """Lower-case ``output`` and collapse every whitespace run to one space."""
    lowered = output.lower()
    words = lowered.split()
    return " ".join(words)
|
||||
|
||||
|
||||
def _extract_verdict(output: str, pattern: str) -> str:
    """Return ``"PASS"`` when ``pattern`` matches anywhere in ``output``, else ``"FAIL"``."""
    matched = re.search(pattern, output) is not None
    return "PASS" if matched else "FAIL"
|
||||
|
||||
|
||||
def _save_step_output(
    run_dir: Path,
    iteration: int,
    step_name: str,
    content: str,
) -> Path:
    """Write ``content`` to ``run_dir/v{iteration}/{step_name}.md`` as UTF-8.

    Missing parent directories are created and an existing file is
    overwritten. Returns the path that was written.
    """
    target = run_dir.joinpath(f"v{iteration}", f"{step_name}.md")
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(content, encoding="utf-8")
    return target
|
||||
|
||||
|
||||
def _save_report(run_dir: Path, config: PipelineConfig, result: PipelineResult) -> None:
    """Render the final report and write it to ``run_dir/final-report.md``.

    The report text comes from ``build_report``; the destination directory
    is created when missing and the saved path is logged.
    """
    markdown = build_report(config, result)
    destination = run_dir / "final-report.md"
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(markdown, encoding="utf-8")
    logger.info("Report saved: %s", destination)
|
||||
845
cross_eval/prompts.py
Normal file
845
cross_eval/prompts.py
Normal file
@@ -0,0 +1,845 @@
|
||||
"""Default prompt templates and pipeline presets."""
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
from pathlib import Path
|
||||
from typing import Callable, Optional
|
||||
|
||||
from cross_eval.models import PhaseConfig, StepConfig
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Default prompt templates
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
GENERATE_TEMPLATE = """\
|
||||
You are tasked with implementing code based on a plan and checklist.
|
||||
|
||||
## Plan
|
||||
{plan}
|
||||
|
||||
## Checklist
|
||||
{checklist}
|
||||
|
||||
## Reference Documents
|
||||
{docs}
|
||||
|
||||
## Previous Review Feedback
|
||||
{feedback}
|
||||
|
||||
## Iteration
|
||||
This is iteration {iteration} of {max_iterations}.
|
||||
|
||||
## Instructions
|
||||
1. Explore the project directory to understand the existing codebase structure.
|
||||
2. Implement ONLY what the plan specifies. Do NOT add extra features, \
|
||||
unnecessary abstractions, or premature optimizations.
|
||||
3. Follow every item in the checklist.
|
||||
4. If there is previous feedback, address ONLY the specific issues mentioned.
|
||||
5. If previous feedback contains items marked as DISMISSED or false positive, \
|
||||
IGNORE those items — they have been verified as correct.
|
||||
6. Output the complete implementation.
|
||||
"""
|
||||
|
||||
REVIEW_TEMPLATE = """\
|
||||
You are tasked with reviewing code against a plan and checklist.
|
||||
|
||||
## Plan
|
||||
{plan}
|
||||
|
||||
## Checklist
|
||||
{checklist}
|
||||
|
||||
## Reference Documents
|
||||
{docs}
|
||||
|
||||
## Generated Code / Previous Step Output
|
||||
{generated_code}
|
||||
|
||||
## Previous Review Feedback
|
||||
{feedback}
|
||||
|
||||
## Review Instructions
|
||||
Explore the project directory to understand the full codebase context, \
|
||||
then evaluate the code against ONLY the plan and checklist above.
|
||||
|
||||
For each issue found, classify it with BOTH severity AND category:
|
||||
|
||||
Severity levels:
|
||||
- **Critical**: Breaks functionality, causes data loss, or introduces security vulnerabilities.
|
||||
- **Major**: Requirement mismatch, significant logic errors, or missing core functionality.
|
||||
- **Minor**: Coding convention violations, trivial omissions, or style issues.
|
||||
|
||||
Categories:
|
||||
- **Over-engineering**: Code adds features, abstractions, or complexity \
|
||||
NOT required by the plan.
|
||||
- **Omission**: A requirement from the plan or checklist that is missing or \
|
||||
incomplete in the implementation.
|
||||
|
||||
If previous review feedback is provided above, you MUST assess each item:
|
||||
- **CONFIRMED**: The issue is still present in the current code.
|
||||
- **DISMISSED (false positive)**: The flagged item is actually correct per \
|
||||
the plan requirements. Provide rationale.
|
||||
|
||||
If you find issues outside the plan/checklist scope (e.g. pre-existing bugs, \
|
||||
security concerns, performance problems), report them separately under \
|
||||
"Out of Scope Issues".
|
||||
|
||||
## Output Format
|
||||
|
||||
### Previous Feedback Assessment
|
||||
(Only include this section if previous feedback was provided.)
|
||||
- CONFIRMED: [item description] — still an issue because [reason]
|
||||
- DISMISSED (false positive): [item description] — actually correct because [reason]
|
||||
(Write "N/A" if no previous feedback was provided.)
|
||||
|
||||
### Issues Found
|
||||
List issues ordered by severity (Critical first):
|
||||
- [Critical][Over-engineering] Description (reference specific plan/checklist item)
|
||||
- [Major][Omission] Description (reference specific plan/checklist item)
|
||||
- [Minor][Omission] Description (reference specific plan/checklist item)
|
||||
|
||||
### Out of Scope Issues
|
||||
Issues found outside plan/checklist scope but worth noting:
|
||||
- [Critical] Description of issue
|
||||
- [Minor] Description of issue
|
||||
(Write "None" if no out-of-scope issues found.)
|
||||
|
||||
### Summary
|
||||
- Critical: N, Major: N, Minor: N
|
||||
- Over-engineering count: N
|
||||
- Omission count: N
|
||||
- CONFIRMED: N, DISMISSED: N
|
||||
- Overall quality: [BRIEF ASSESSMENT]
|
||||
|
||||
### Verdict
|
||||
If all checklist items are satisfied and there is no over-engineering or \
|
||||
omission, output: VERDICT: PASS
|
||||
Otherwise output: VERDICT: FAIL
|
||||
"""
|
||||
|
||||
|
||||
GENERATE_TEMPLATE_KO = """\
|
||||
당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다.
|
||||
|
||||
## 기획서
|
||||
{plan}
|
||||
|
||||
## 체크리스트
|
||||
{checklist}
|
||||
|
||||
## 참고 문서
|
||||
{docs}
|
||||
|
||||
## 이전 리뷰 피드백
|
||||
{feedback}
|
||||
|
||||
## 반복 정보
|
||||
현재 {max_iterations}회 중 {iteration}번째 반복입니다.
|
||||
|
||||
## 지침
|
||||
1. 프로젝트 디렉토리를 탐색하여 기존 코드베이스 구조를 파악하세요.
|
||||
2. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
|
||||
3. 체크리스트의 모든 항목을 충족하세요.
|
||||
4. 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
|
||||
5. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
|
||||
6. 완전한 구현을 출력하세요.
|
||||
"""
|
||||
|
||||
REVIEW_TEMPLATE_KO = """\
|
||||
당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다.
|
||||
|
||||
## 기획서
|
||||
{plan}
|
||||
|
||||
## 체크리스트
|
||||
{checklist}
|
||||
|
||||
## 참고 문서
|
||||
{docs}
|
||||
|
||||
## 검토 대상 코드
|
||||
{generated_code}
|
||||
|
||||
## 이전 리뷰 피드백
|
||||
{feedback}
|
||||
|
||||
## 검토 지침
|
||||
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스 맥락을 파악한 뒤, \
|
||||
위 기획서와 체크리스트 기준으로만 코드를 평가하세요.
|
||||
|
||||
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
|
||||
|
||||
심각도:
|
||||
- **Critical**: 기능 장애, 데이터 손실, 보안 취약점을 유발하는 문제.
|
||||
- **Major**: 요구사항 불일치, 중대한 로직 오류, 핵심 기능 누락.
|
||||
- **Minor**: 코딩 컨벤션 위반, 사소한 누락, 스타일 문제.
|
||||
|
||||
카테고리:
|
||||
- **과최적화**: 기획서에 없는 기능, 추상화, 복잡성을 추가한 경우.
|
||||
- **누락**: 기획서/체크리스트에 있지만 구현에서 빠지거나 불완전한 요구사항.
|
||||
|
||||
이전 리뷰 피드백이 제공된 경우, 각 항목을 반드시 평가하세요:
|
||||
- **CONFIRMED**: 현재 코드에 여전히 존재하는 이슈.
|
||||
- **DISMISSED (오탐)**: 기획서 요구사항상 실제로 올바른 항목. 근거를 제시하세요.
|
||||
|
||||
기획서/체크리스트 범위 밖에서 발견된 문제(기존 버그, 보안 이슈, 성능 문제 등)는 \
|
||||
"범위 밖 이슈" 섹션에 별도로 보고하세요.
|
||||
|
||||
## 출력 형식
|
||||
|
||||
### 이전 피드백 평가
|
||||
(이전 피드백이 제공된 경우에만 포함하세요.)
|
||||
- CONFIRMED: [항목 설명] — 여전히 이슈인 이유: [근거]
|
||||
- DISMISSED (오탐): [항목 설명] — 실제로 올바른 이유: [근거]
|
||||
(이전 피드백이 없으면 "해당 없음"이라고 작성하세요.)
|
||||
|
||||
### 발견된 이슈
|
||||
심각도 순서(Critical 먼저)로 나열:
|
||||
- [Critical][과최적화] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||
- [Major][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||
- [Minor][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||
|
||||
### 범위 밖 이슈
|
||||
기획서/체크리스트 범위 밖이지만 주목할 만한 이슈:
|
||||
- [Critical] 이슈 설명
|
||||
- [Minor] 이슈 설명
|
||||
(범위 밖 이슈가 없으면 "없음"이라고 작성하세요.)
|
||||
|
||||
### 요약
|
||||
- Critical: N, Major: N, Minor: N
|
||||
- 과최적화 수: N
|
||||
- 누락 수: N
|
||||
- CONFIRMED: N, DISMISSED: N
|
||||
- 전체 품질: [간략한 평가]
|
||||
|
||||
### 판정
|
||||
모든 체크리스트 항목이 충족되고 과최적화/누락이 없으면: VERDICT: PASS
|
||||
그렇지 않으면: VERDICT: FAIL
|
||||
"""
|
||||
|
||||
|
||||
REVIEW_ONLY_TEMPLATE = """\
|
||||
You are tasked with reviewing existing code against a plan and checklist.
|
||||
|
||||
## Plan
|
||||
{plan}
|
||||
|
||||
## Checklist
|
||||
{checklist}
|
||||
|
||||
## Reference Documents
|
||||
{docs}
|
||||
|
||||
## Previous Review (iteration {iteration} of {max_iterations})
|
||||
{feedback}
|
||||
|
||||
## Review Instructions
|
||||
Explore the project directory thoroughly to understand the full codebase, \
|
||||
then evaluate the EXISTING code against ONLY the plan and checklist above.
|
||||
|
||||
You are NOT generating or modifying code. You are auditing what already exists.
|
||||
|
||||
If previous review results are provided above, you MUST:
|
||||
1. Verify each previously reported issue — is it a real issue or a false positive?
|
||||
2. Look for issues the previous review MISSED.
|
||||
3. Do NOT simply repeat the previous review. Provide your own independent assessment.
|
||||
4. Explicitly mark items as CONFIRMED (still an issue) or DISMISSED (false positive).
|
||||
|
||||
For each issue found, classify it with BOTH severity AND category:
|
||||
|
||||
Severity levels:
|
||||
- **Critical**: Breaks functionality, causes data loss, or introduces security vulnerabilities.
|
||||
- **Major**: Requirement mismatch, significant logic errors, or missing core functionality.
|
||||
- **Minor**: Coding convention violations, trivial omissions, or style issues.
|
||||
|
||||
Categories:
|
||||
- **Over-engineering**: Code adds features, abstractions, or complexity \
|
||||
NOT required by the plan.
|
||||
- **Omission**: A requirement from the plan or checklist that is missing or \
|
||||
incomplete in the implementation.
|
||||
|
||||
If you find issues outside the plan/checklist scope (e.g. pre-existing bugs, \
|
||||
security concerns, performance problems), report them separately under \
|
||||
"Out of Scope Issues".
|
||||
|
||||
## Output Format
|
||||
|
||||
### Issues Found
|
||||
List issues ordered by severity (Critical first):
|
||||
- [Critical][Over-engineering] Description (reference specific plan/checklist item)
|
||||
- [Major][Omission] Description (reference specific plan/checklist item)
|
||||
- [Minor][Omission] Description (reference specific plan/checklist item)
|
||||
|
||||
### Out of Scope Issues
|
||||
Issues found outside plan/checklist scope but worth noting:
|
||||
- [Critical] Description of issue
|
||||
- [Minor] Description of issue
|
||||
(Write "None" if no out-of-scope issues found.)
|
||||
|
||||
### Summary
|
||||
- Critical: N, Major: N, Minor: N
|
||||
- Over-engineering count: N
|
||||
- Omission count: N
|
||||
- CONFIRMED: N, DISMISSED: N
|
||||
- Overall quality: [BRIEF ASSESSMENT]
|
||||
|
||||
### Verdict
|
||||
If all checklist items are satisfied and there is no over-engineering or \
|
||||
omission, output: VERDICT: PASS
|
||||
Otherwise output: VERDICT: FAIL
|
||||
"""
|
||||
|
||||
REVIEW_ONLY_TEMPLATE_KO = """\
|
||||
당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다.
|
||||
|
||||
## 기획서
|
||||
{plan}
|
||||
|
||||
## 체크리스트
|
||||
{checklist}
|
||||
|
||||
## 참고 문서
|
||||
{docs}
|
||||
|
||||
## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
|
||||
{feedback}
|
||||
|
||||
## 검토 지침
|
||||
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \
|
||||
위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요.
|
||||
|
||||
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
|
||||
|
||||
이전 리뷰 결과가 제공된 경우 반드시:
|
||||
1. 이전에 보고된 각 이슈를 검증하세요 — 진짜 이슈인지 오탐인지?
|
||||
2. 이전 리뷰가 놓친 새로운 이슈를 찾으세요.
|
||||
3. 이전 리뷰를 그대로 반복하지 마세요. 독립적인 평가를 제공하세요.
|
||||
4. 각 항목에 CONFIRMED (여전히 이슈) 또는 DISMISSED (오탐) 태그를 명시하세요.
|
||||
|
||||
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
|
||||
|
||||
심각도:
|
||||
- **Critical**: 기능 장애, 데이터 손실, 보안 취약점을 유발하는 문제.
|
||||
- **Major**: 요구사항 불일치, 중대한 로직 오류, 핵심 기능 누락.
|
||||
- **Minor**: 코딩 컨벤션 위반, 사소한 누락, 스타일 문제.
|
||||
|
||||
카테고리:
|
||||
- **과최적화**: 기획서에 없는 기능, 추상화, 복잡성을 추가한 경우.
|
||||
- **누락**: 기획서/체크리스트에 있지만 구현에서 빠지거나 불완전한 요구사항.
|
||||
|
||||
기획서/체크리스트 범위 밖에서 발견된 문제(기존 버그, 보안 이슈, 성능 문제 등)는 \
|
||||
"범위 밖 이슈" 섹션에 별도로 보고하세요.
|
||||
|
||||
## 출력 형식
|
||||
|
||||
### 발견된 이슈
|
||||
심각도 순서(Critical 먼저)로 나열:
|
||||
- [Critical][과최적화] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||
- [Major][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||
- [Minor][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
|
||||
|
||||
### 범위 밖 이슈
|
||||
기획서/체크리스트 범위 밖이지만 주목할 만한 이슈:
|
||||
- [Critical] 이슈 설명
|
||||
- [Minor] 이슈 설명
|
||||
(범위 밖 이슈가 없으면 "없음"이라고 작성하세요.)
|
||||
|
||||
### 요약
|
||||
- Critical: N, Major: N, Minor: N
|
||||
- 과최적화 수: N
|
||||
- 누락 수: N
|
||||
- CONFIRMED: N, DISMISSED: N
|
||||
- 전체 품질: [간략한 평가]
|
||||
|
||||
### 판정
|
||||
모든 체크리스트 항목이 충족되고 과최적화/누락이 없으면: VERDICT: PASS
|
||||
그렇지 않으면: VERDICT: FAIL
|
||||
"""
|
||||
|
||||
AGGREGATE_REVIEW_TEMPLATE = """\
|
||||
You are adjudicating multiple review results and turning them into an actionable decision.
|
||||
|
||||
## Plan
|
||||
{plan}
|
||||
|
||||
## Checklist
|
||||
{checklist}
|
||||
|
||||
## Reference Documents
|
||||
{docs}
|
||||
|
||||
## Candidate Outputs
|
||||
{candidate_outputs}
|
||||
|
||||
## Reviewer Findings
|
||||
{reviews_bundle}
|
||||
|
||||
## Previous Verification Feedback
|
||||
{feedback}
|
||||
|
||||
## Instructions
|
||||
Explore the project directory to confirm the current codebase state. Then:
|
||||
1. Deduplicate overlapping issues across reviewers.
|
||||
2. Resolve disagreements explicitly.
|
||||
3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
|
||||
4. When evidence is mixed, explain what was confirmed, what was dismissed, and what still needs follow-up.
|
||||
5. Produce a prioritized action list for the coder.
|
||||
6. If no confirmed issue remains, output VERDICT: PASS. Otherwise VERDICT: FAIL.
|
||||
|
||||
## Output Format
|
||||
|
||||
### Confirmed Issues
|
||||
- [Critical][Omission] Description with rationale and source reviewer(s)
|
||||
|
||||
### Dismissed Findings
|
||||
- [False positive] Claim — reason why it is actually correct (raised by: Reviewer X)
|
||||
- [Already fixed] Claim — already resolved in the current code (raised by: Reviewer X)
|
||||
(Write "None" if nothing was dismissed.)
|
||||
|
||||
### Action Items
|
||||
1. Concrete fix the coder should make
|
||||
2. Concrete fix the coder should make
|
||||
|
||||
### Summary
|
||||
- Confirmed issues: N
|
||||
- Dismissed findings: N (false positive: N, already fixed: N)
|
||||
- Overall quality: [BRIEF ASSESSMENT]
|
||||
|
||||
### Verdict
|
||||
VERDICT: PASS or VERDICT: FAIL
|
||||
"""
|
||||
|
||||
AGGREGATE_REVIEW_TEMPLATE_KO = """\
|
||||
당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다.
|
||||
|
||||
## 기획서
|
||||
{plan}
|
||||
|
||||
## 체크리스트
|
||||
{checklist}
|
||||
|
||||
## 참고 문서
|
||||
{docs}
|
||||
|
||||
## 후보 결과물
|
||||
{candidate_outputs}
|
||||
|
||||
## 개별 리뷰 결과
|
||||
{reviews_bundle}
|
||||
|
||||
## 이전 검증 피드백
|
||||
{feedback}
|
||||
|
||||
## 지침
|
||||
프로젝트 디렉토리를 탐색하여 현재 코드베이스 상태를 확인한 뒤 다음을 수행하세요.
|
||||
1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
|
||||
2. 의견 충돌은 명시적으로 정리하세요.
|
||||
3. 기획서, 체크리스트, 코드, 리뷰 근거로 뒷받침되는 이슈만 남기세요.
|
||||
4. 근거가 엇갈리면 무엇이 확정이고 무엇이 기각 또는 추가확인 대상인지 분명히 적으세요.
|
||||
5. coder가 바로 수정할 수 있는 우선순위 액션 아이템을 만드세요.
|
||||
6. 확정된 이슈가 없으면 VERDICT: PASS, 있으면 VERDICT: FAIL 을 출력하세요.
|
||||
|
||||
## 출력 형식
|
||||
|
||||
### 확정 이슈
|
||||
- [Critical][누락] 확정된 이슈 설명, 근거, 출처 리뷰어
|
||||
|
||||
### 기각된 주장
|
||||
- [오탐] 주장 내용 — 실제로 올바른 이유 (제기: 리뷰어 X)
|
||||
- [수정 완료] 주장 내용 — 현재 코드에서 이미 해결됨 (제기: 리뷰어 X)
|
||||
(기각된 항목이 없으면 "없음"이라고 작성하세요.)
|
||||
|
||||
### 액션 아이템
|
||||
1. coder가 수정해야 할 구체적인 작업
|
||||
2. coder가 수정해야 할 구체적인 작업
|
||||
|
||||
### 요약
|
||||
- 확정 이슈 수: N
|
||||
- 기각된 주장 수: N (오탐: N, 수정 완료: N)
|
||||
- 전체 품질: [간략한 평가]
|
||||
|
||||
### 판정
|
||||
VERDICT: PASS 또는 VERDICT: FAIL
|
||||
"""
|
||||
|
||||
|
||||
DEFAULT_TEMPLATES: dict[str, dict[str, str]] = {
|
||||
"en": {
|
||||
"generate": GENERATE_TEMPLATE,
|
||||
"review": REVIEW_TEMPLATE,
|
||||
"review-only": REVIEW_ONLY_TEMPLATE,
|
||||
"aggregate-review": AGGREGATE_REVIEW_TEMPLATE,
|
||||
},
|
||||
"ko": {
|
||||
"generate": GENERATE_TEMPLATE_KO,
|
||||
"review": REVIEW_TEMPLATE_KO,
|
||||
"review-only": REVIEW_ONLY_TEMPLATE_KO,
|
||||
"aggregate-review": AGGREGATE_REVIEW_TEMPLATE_KO,
|
||||
},
|
||||
}
|
||||
|
||||
# Language used when resolving "default:<name>" template references.
# The pipeline is expected to set it (via set_language) before a run.
_current_language: str = "en"


def set_language(lang: str) -> None:
    """Select which language's built-in templates are used from now on.

    Args:
        lang: A language code that is a key of ``DEFAULT_TEMPLATES``.

    Raises:
        ValueError: If ``lang`` has no entry in ``DEFAULT_TEMPLATES``; the
            current language is left unchanged in that case.
    """
    global _current_language
    if lang in DEFAULT_TEMPLATES:
        _current_language = lang
        return
    raise ValueError(
        f"Unsupported language '{lang}'. Available: {list(DEFAULT_TEMPLATES)}"
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pipeline presets
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Characters that str.format()/format_map() treat as syntax inside a
# replacement field: "." (attribute access), "[" / "]" (indexing),
# "!" (conversion), ":" (format spec) and "{" / "}" (nesting). "-" is kept in
# the set because it has always been normalised to "_".
_UNSAFE_KEY_CHARS = str.maketrans(dict.fromkeys("-.[]{}!:", "_"))


def _safe_key(name: str) -> str:
    """Sanitize an agent name for use as a template variable / output_key.

    Every character that would be parsed as format-field syntax (and the
    hyphen) is replaced by an underscore, so 'claude-coder' becomes
    'claude_coder' and 'gpt-4.1' becomes 'gpt_4_1'. The result can be wrapped
    in braces and looked up verbatim by ``str.format_map()``.

    Args:
        name: Agent name as configured by the user.

    Returns:
        The sanitized key; names without any of the replaced characters are
        returned unchanged.
    """
    return name.translate(_UNSAFE_KEY_CHARS)


def _unique_safe_keys(names: list[str]) -> list[str]:
    """Return stable, collision-free keys for agent names.

    The first occurrence of a sanitized name keeps it unchanged. Later
    occurrences receive a numeric suffix starting at their occurrence number
    (``base_2``, ``base_3``, ...). If that suffixed key is already the
    sanitized form of another name in ``names`` (or was already handed out),
    the suffix is increased until the key is free.

    Args:
        names: Agent names, possibly containing duplicates.

    Returns:
        One key per input name, in the same order, with no two keys equal.
    """
    bases = [_safe_key(name) for name in names]
    # Reserve every plain base up front so a suffixed key such as "x_2" can
    # never take the natural key of a different agent that is named "x_2".
    reserved = set(bases)
    occurrences: collections.Counter[str] = collections.Counter()
    keys: list[str] = []

    for base in bases:
        occurrences[base] += 1
        if occurrences[base] == 1:
            keys.append(base)
            continue

        suffix = occurrences[base]
        candidate = f"{base}_{suffix}"
        while candidate in reserved:
            suffix += 1
            candidate = f"{base}_{suffix}"
        reserved.add(candidate)
        keys.append(candidate)

    return keys
|
||||
|
||||
|
||||
def _build_named_bundle(
    labels: list[str],
    step_names: list[str],
    output_keys: list[str],
    title: str,
) -> str:
    """Build a template fragment that embeds several prior step outputs.

    Each entry becomes a ``## <title>: <label> (<step_name>)`` heading
    followed by a ``{<output_key>}`` placeholder that is filled in when the
    fragment is rendered later. Entries are separated by a ``---`` rule.

    Args:
        labels: Display label of each entry.
        step_names: Step name shown in parentheses after the label.
        output_keys: Context key whose value is substituted under the heading.
        title: Heading prefix shared by every entry.

    Returns:
        The joined fragment; an empty string when there are no entries. The
        three lists are walked in lockstep, stopping at the shortest one.
    """
    sections = [
        f"## {title}: {label} ({step_name})\n" + "{" + output_key + "}"
        for label, step_name, output_key in zip(labels, step_names, output_keys)
    ]
    return "\n\n---\n\n".join(sections)
|
||||
|
||||
|
||||
def _build_simple_preset(
    coders: list[str], reviewers: list[str], seniors: list[str],
) -> list[StepConfig]:
    """Build the 'simple' pipeline: first coder generates, first reviewer reviews.

    When at least one senior is given, a third ``senior_review`` step run by
    the first senior aggregates the generated code and the review, and it
    becomes the only step carrying the verdict.

    Raises:
        ValueError: If ``coders`` or ``reviewers`` is empty.
    """
    if not coders:
        raise ValueError("'simple' preset requires at least 1 coder")
    if not reviewers:
        raise ValueError("'simple' preset requires at least 1 reviewer")

    has_senior = bool(seniors)

    generate_step = StepConfig(
        name="generate",
        agent=coders[0],
        role="generate",
        prompt_template="default:generate",
        output_key="generated_code",
    )
    review_step = StepConfig(
        name="review",
        agent=reviewers[0],
        role="review",
        prompt_template="default:review",
        output_key="review_result",
        # The plain review only decides the verdict when no senior follows.
        verdict=not has_senior,
    )
    if not has_senior:
        return [generate_step, review_step]

    senior_step = StepConfig(
        name="senior_review",
        agent=seniors[0],
        role="review",
        prompt_template="default:aggregate-review",
        output_key="senior_review_result",
        verdict=True,
        context_override={
            "candidate_outputs": "## Generated code\n{generated_code}",
            "reviews_bundle": f"## Review: {reviewers[0]} (review)\n{{review_result}}",
        },
    )
    return [generate_step, review_step, senior_step]
|
||||
|
||||
|
||||
def _build_cross_review_preset(
    coders: list[str], reviewers: list[str], seniors: list[str],
) -> list[StepConfig]:
    """Build the 'cross-review' pipeline for the first two coders.

    Both coders generate in parallel, then each one reviews the other's
    output in parallel. ``reviewers`` is not used by this preset. When a
    senior is given, a final ``senior_review`` step aggregates both candidates
    and both reviews and is the only step carrying the verdict.

    Raises:
        ValueError: If fewer than two coders are supplied.
    """
    if len(coders) < 2:
        raise ValueError("'cross-review' preset requires at least 2 coders")

    first, second = coders[0], coders[1]
    first_key, second_key = _unique_safe_keys([first, second])
    # (agent, its own key, key of the peer whose code it reviews)
    pairings = [
        (first, first_key, second_key),
        (second, second_key, first_key),
    ]
    reviews_decide = not seniors

    steps = [
        StepConfig(
            name=f"generate_{own}",
            agent=agent,
            role="generate",
            prompt_template="default:generate",
            output_key=f"code_{own}",
            parallel=True,
        )
        for agent, own, _peer in pairings
    ]
    steps.extend(
        StepConfig(
            name=f"review_by_{own}",
            agent=agent,
            role="review",
            prompt_template="default:review",
            output_key=f"review_by_{own}",
            # Point the review template at the *other* coder's output.
            context_override={"generated_code": "{code_" + peer + "}"},
            parallel=True,
            verdict=reviews_decide,
        )
        for agent, own, peer in pairings
    )

    if not seniors:
        return steps

    agents = [first, second]
    own_keys = [first_key, second_key]
    generate_names = [f"generate_{key}" for key in own_keys]
    code_keys = [f"code_{key}" for key in own_keys]
    review_names = [f"review_by_{key}" for key in own_keys]

    steps.append(
        StepConfig(
            name="senior_review",
            agent=seniors[0],
            role="review",
            prompt_template="default:aggregate-review",
            output_key="senior_review_result",
            verdict=True,
            context_override={
                "candidate_outputs": _build_named_bundle(
                    agents, generate_names, code_keys, "Candidate",
                ),
                "reviews_bundle": _build_named_bundle(
                    agents, review_names, review_names, "Review",
                ),
            },
        )
    )
    return steps
|
||||
|
||||
|
||||
def _build_review_only_preset(
    coders: list[str], reviewers: list[str], seniors: list[str],
) -> list[StepConfig]:
    """Build the 'review-only' pipeline: reviewers audit existing code.

    With exactly one reviewer and no senior, a single ``review`` step is
    produced. Otherwise every reviewer gets its own parallel ``review_<key>``
    step; when a senior is given, a ``senior_review`` step aggregates those
    reviews and is the only step carrying the verdict. ``coders`` is not used
    by this preset.

    Raises:
        ValueError: If ``reviewers`` is empty.
    """
    if not reviewers:
        raise ValueError("'review-only' preset requires at least 1 reviewer")

    # Single reviewer without a senior keeps the historical step naming.
    if len(reviewers) == 1 and not seniors:
        return [
            StepConfig(
                name="review",
                agent=reviewers[0],
                role="review",
                prompt_template="default:review-only",
                output_key="review_result",
                verdict=True,
            ),
        ]

    # Step name and output key are the same string for each reviewer.
    review_ids = [f"review_{key}" for key in _unique_safe_keys(reviewers)]
    steps: list[StepConfig] = [
        StepConfig(
            name=review_id,
            agent=reviewer,
            role="review",
            prompt_template="default:review-only",
            output_key=review_id,
            verdict=not seniors,
            parallel=True,
        )
        for reviewer, review_id in zip(reviewers, review_ids)
    ]
    if not seniors:
        return steps

    steps.append(
        StepConfig(
            name="senior_review",
            agent=seniors[0],
            role="review",
            prompt_template="default:aggregate-review",
            output_key="senior_review_result",
            verdict=True,
            context_override={
                "candidate_outputs": "Current repository working tree under review.",
                "reviews_bundle": _build_named_bundle(
                    reviewers, review_ids, review_ids, "Review",
                ),
            },
        )
    )
    return steps
|
||||
|
||||
|
||||
def _build_review_fix_preset(
    coders: list[str], reviewers: list[str], seniors: list[str],
) -> list[PhaseConfig]:
    """Build the 'review-fix' preset as a single looping phase.

    The phase runs, in order: one parallel review step per reviewer, an
    ``aggregate_review`` step, a ``generate`` step in which the first coder
    receives the aggregate as feedback, and a ``verify`` step that carries the
    verdict. Aggregation and verification are run by the first senior, or by
    the first reviewer when no senior is given.

    Raises:
        ValueError: If ``coders`` or ``reviewers`` is empty.
    """
    if not coders:
        raise ValueError("'review-fix' preset requires at least 1 coder")
    if not reviewers:
        raise ValueError("'review-fix' preset requires at least 1 reviewer")

    # Step name and output key are the same string for each reviewer.
    review_ids = [f"review_{key}" for key in _unique_safe_keys(reviewers)]
    review_steps: list[StepConfig] = [
        StepConfig(
            name=review_id,
            agent=reviewer,
            role="review",
            prompt_template="default:review-only",
            output_key=review_id,
            verdict=False,
            parallel=True,
        )
        for reviewer, review_id in zip(reviewers, review_ids)
    ]

    senior_agent = seniors[0] if seniors else reviewers[0]

    aggregate_step = StepConfig(
        name="aggregate_review",
        agent=senior_agent,
        role="review",
        prompt_template="default:aggregate-review",
        output_key="aggregate_review",
        context_override={
            "candidate_outputs": "Current repository working tree under review.",
            "reviews_bundle": _build_named_bundle(
                reviewers, review_ids, review_ids, "Review",
            ),
        },
    )
    fix_step = StepConfig(
        name="generate",
        agent=coders[0],
        role="generate",
        prompt_template="default:generate",
        output_key="generated_code",
        # The aggregated findings are what the coder is asked to fix.
        context_override={"feedback": "{aggregate_review}"},
    )
    verify_step = StepConfig(
        name="verify",
        agent=senior_agent,
        role="review",
        prompt_template="default:review",
        output_key="verify_result",
        verdict=True,
    )

    return [
        PhaseConfig(
            name="review_fix",
            steps=[*review_steps, aggregate_step, fix_step, verify_step],
            max_iterations=5,
            consecutive_pass=1,
        ),
    ]
|
||||
|
||||
|
||||
# Presets whose builder returns a flat list of StepConfig objects.
PIPELINE_PRESETS: dict[str, Callable] = {
    "simple": _build_simple_preset,
    "cross-review": _build_cross_review_preset,
    "review-only": _build_review_only_preset,
}

# Presets whose builder returns a list of PhaseConfig objects.
PHASED_PRESETS: dict[str, Callable] = {
    "review-fix": _build_review_fix_preset,
}

# Every preset name: flat presets first, then phased ones, in declaration order.
ALL_PRESET_NAMES: list[str] = [*PIPELINE_PRESETS, *PHASED_PRESETS]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Template resolution and rendering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def resolve_template(template_ref: str, templates_dir: Optional[Path] = None) -> str:
    """Turn a template reference into the template text.

    Two kinds of reference are understood:
      - ``"default:<name>"`` selects a built-in template for the current
        language (falling back to the English set when the current language
        has no entry in ``DEFAULT_TEMPLATES``).
      - Anything else is treated as a file path and read as UTF-8. A relative
        path is resolved against ``templates_dir`` when one is given.

    Raises:
        ValueError: If ``<name>`` is not a built-in template name.
        FileNotFoundError: If the referenced file does not exist.
    """
    prefix = "default:"

    if not template_ref.startswith(prefix):
        candidate = Path(template_ref)
        if templates_dir and not candidate.is_absolute():
            candidate = templates_dir / candidate
        if candidate.exists():
            return candidate.read_text(encoding="utf-8")
        raise FileNotFoundError(f"Template file not found: {candidate}")

    key = template_ref[len(prefix):]
    lang_templates = DEFAULT_TEMPLATES.get(_current_language, DEFAULT_TEMPLATES["en"])
    if key in lang_templates:
        return lang_templates[key]
    raise ValueError(
        f"Unknown default template '{key}'. "
        f"Available: {list(lang_templates.keys())}"
    )
|
||||
|
||||
|
||||
class _DefaultDict(collections.defaultdict):
    """Mapping whose missing keys read as a visible "(no <key> provided)" note.

    Unlike a plain ``defaultdict``, the fallback text depends on the key that
    was looked up, and the lookup does not insert the key into the mapping.
    """

    def __missing__(self, key: str) -> str:
        return "(no {} provided)".format(key)


def render_template(template: str, context: dict[str, str]) -> str:
    """Fill the ``{variable}`` placeholders of ``template`` from ``context``.

    Placeholders without a matching key in ``context`` are rendered as
    "(no <key> provided)" instead of raising ``KeyError``. Substituted values
    are inserted verbatim and are not themselves re-scanned for placeholders.
    """
    placeholders = _DefaultDict(str, context)
    return template.format_map(placeholders)
|
||||
497
cross_eval/report.py
Normal file
497
cross_eval/report.py
Normal file
@@ -0,0 +1,497 @@
|
||||
"""Markdown report generation."""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from itertools import groupby
|
||||
|
||||
from cross_eval.models import (
|
||||
IterationResult,
|
||||
PipelineConfig,
|
||||
PipelineResult,
|
||||
ReviewMetrics,
|
||||
StepConfig,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# i18n strings
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Report UI strings, keyed by language code and then by message id. Both
# languages define the same message ids. Entries containing "{...}"
# placeholders are filled in by _t() from its keyword arguments.
_STRINGS: dict[str, dict[str, str]] = {
    "en": {
        "title": "Cross-Eval Report",
        "summary": "Summary",
        "prop": "Property",
        "val": "Value",
        "total_iter": "Total Iterations",
        "final_verdict": "Final Verdict",
        "duration": "Duration",
        "max_iter": "Max Iterations",
        "phases_label": "Phases",
        "iteration": "Iteration",
        "phase": "Phase",
        "steps": "Steps",
        "max_iterations": "Max iterations",
        "consec_pass": "Consecutive PASS required",
        "step": "Step",
        "verdict": "Verdict",
        "output_chars": "Output ({n} chars)",
        "feedback_next": "Feedback for next iteration:",
        "oos_title": "Out of Scope Issues",
        "oos_desc": (
            "The following issues were found outside the plan/checklist scope "
            "but are worth noting."
        ),
        "final_verdict_title": "Final Verdict",
        "repeat_title": "Repeated Aggregate Findings",
        "repeat_desc": "The following aggregate-review outputs repeated across iterations.",
        "pass_msg": "All checklist items satisfied. No over-engineering or omissions detected.",
        "fail_phased": "Pipeline phases ({phases}) completed without full convergence.",
        "fail_simple": "Maximum iterations ({max_iter}) reached without passing all checks.",
        "metrics_title": "Review Metrics",
        "metrics_trend_title": "Metrics Trend",
        "metrics_iter": "Iter",
        "metrics_total_issues": "Total Issues",
        "metrics_na": "N/A",
    },
    "ko": {
        "title": "교차 검증 리포트",
        "summary": "요약",
        "prop": "항목",
        "val": "값",
        "total_iter": "총 반복 횟수",
        "final_verdict": "최종 판정",
        "duration": "소요 시간",
        "max_iter": "최대 반복",
        "phases_label": "페이즈",
        "iteration": "반복",
        "phase": "페이즈",
        "steps": "단계",
        "max_iterations": "최대 반복",
        "consec_pass": "연속 PASS 필요",
        "step": "단계",
        "verdict": "판정",
        "output_chars": "출력 ({n}자)",
        "feedback_next": "다음 반복을 위한 피드백:",
        "oos_title": "범위 밖 이슈",
        "oos_desc": (
            "아래는 기획서/체크리스트 범위 밖이지만 "
            "리뷰 중 발견된 이슈입니다."
        ),
        "final_verdict_title": "최종 판정",
        "repeat_title": "반복된 Aggregate 이슈",
        "repeat_desc": "아래 aggregate-review 결과가 여러 반복에서 동일하게 다시 나타났습니다.",
        "pass_msg": "모든 체크리스트 항목 충족. 과최적화/누락 없음.",
        "fail_phased": "파이프라인 페이즈 ({phases}) 완료, 완전한 수렴에 도달하지 못함.",
        "fail_simple": "최대 반복 횟수 ({max_iter})에 도달, 모든 검증을 통과하지 못함.",
        "metrics_title": "리뷰 메트릭",
        "metrics_trend_title": "메트릭 추이",
        "metrics_iter": "반복",
        "metrics_total_issues": "총 이슈",
        "metrics_na": "해당 없음",
    },
}
|
||||
|
||||
|
||||
def _t(config: PipelineConfig, key: str, **kwargs: str) -> str:
    """Look up a report string in the language configured on ``config``.

    The language is read from ``config.language`` (``"en"`` when the attribute
    is absent); an unknown language falls back to the English table. A key
    missing from the chosen table falls back to the English entry, and finally
    to the key itself. When keyword arguments are given they are substituted
    into the string with ``str.format``.
    """
    english = _STRINGS["en"]
    table = _STRINGS.get(getattr(config, "language", "en"), english)

    if key in table:
        text = table[key]
    else:
        text = english.get(key, key)

    return text.format(**kwargs) if kwargs else text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Review output parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Assessment markers. A finding looks like "CONFIRMED: <description>" while
# the summary line looks like "CONFIRMED: N". The lookahead rejects both a
# digit and further whitespace so that "\s+" cannot backtrack to a shorter
# run of spaces and thereby let a padded summary line ("CONFIRMED:   3")
# slip through as a finding.
_CONFIRMED_RE = re.compile(r"\bCONFIRMED:\s+(?![\s\d])")
# Same shape, with an optional parenthesised reason before the colon,
# e.g. "DISMISSED (false positive): ...".
_DISMISSED_RE = re.compile(r"\bDISMISSED\b(?:\s*\([^)]*\))?\s*:\s+(?![\s\d])")


def parse_review_metrics(output: str) -> ReviewMetrics:
    """Count severity, category and assessment markers in a review's text.

    Args:
        output: Raw text produced by a review step.

    Returns:
        A ``ReviewMetrics`` whose fields hold the number of occurrences of:
        ``[Critical]`` / ``[Major]`` / ``[Minor]`` tags, over-engineering and
        omission tags (English or Korean), and ``CONFIRMED:`` / ``DISMISSED:``
        findings. Tag matching is case-insensitive; assessment matching is
        case-sensitive and skips "CONFIRMED: N"-style summary counts.
    """

    def count_tag(pattern: str) -> int:
        return len(re.findall(pattern, output, re.IGNORECASE))

    metrics = ReviewMetrics()

    # Severity tags.
    metrics.critical = count_tag(r"\[Critical\]")
    metrics.major = count_tag(r"\[Major\]")
    metrics.minor = count_tag(r"\[Minor\]")

    # Category tags (English and Korean variants).
    metrics.over_engineering = count_tag(r"\[Over-engineering\]|\[과최적화\]")
    metrics.omission = count_tag(r"\[Omission\]|\[누락\]")

    # Assessments.
    metrics.confirmed = len(_CONFIRMED_RE.findall(output))
    metrics.dismissed = len(_DISMISSED_RE.findall(output))

    return metrics
|
||||
|
||||
|
||||
def _aggregate_metrics(a: ReviewMetrics, b: ReviewMetrics) -> ReviewMetrics:
    """Return a new ``ReviewMetrics`` holding the field-wise sum of ``a`` and ``b``.

    Neither argument is modified.
    """
    counters = (
        "critical",
        "major",
        "minor",
        "over_engineering",
        "omission",
        "confirmed",
        "dismissed",
    )
    totals = {name: getattr(a, name) + getattr(b, name) for name in counters}
    return ReviewMetrics(**totals)
|
||||
|
||||
|
||||
def _extract_out_of_scope(output: str) -> str:
    """Extract the 'Out of Scope Issues' section from review output.

    Looks for a '### Out of Scope Issues' or '### 범위 밖 이슈' heading and
    captures the text up to the next line that starts with '###', or to the
    end of the string.

    Args:
        output: Raw text produced by a review step.

    Returns:
        The stripped section body, or an empty string when the heading is
        absent, the section is empty, or it contains only 'None' / '없음'.
    """
    # Only horizontal whitespace may follow the heading text: allowing "\s*"
    # there would swallow a blank line, after which an immediately following
    # "###" heading is no longer preceded by a newline inside the capture and
    # the whole rest of the review would be reported as out-of-scope content.
    # "^###" (MULTILINE) ends the section at any later heading line.
    pattern = (
        r"###\s*(?:Out of Scope Issues|범위 밖 이슈)[^\S\n]*\n"
        r"(.*?)(?=^###|\Z)"
    )
    match = re.search(pattern, output, re.DOTALL | re.MULTILINE)
    if not match:
        return ""
    content = match.group(1).strip()
    if content.lower() in ("none", "없음", ""):
        return ""
    return content
|
||||
|
||||
|
||||
def build_report(config: PipelineConfig, result: PipelineResult) -> str:
    """Render the full markdown report for a pipeline run.

    The phased layout is used as soon as any iteration carries a phase name;
    otherwise the simple (non-phased) layout is used.
    """
    for iteration in result.iterations:
        if iteration.phase_name:
            return _build_phased_report(config, result)
    return _build_simple_report(config, result)
|
||||
|
||||
|
||||
def _build_simple_report(
    config: PipelineConfig, result: PipelineResult,
) -> str:
    """Render the report for a non-phased pipeline run.

    Layout: title, summary table, one section per iteration (its steps plus
    the first 200 characters of the feedback, if any), then the out-of-scope
    issues, review metrics, repeated-aggregate warnings and final verdict.
    """
    lines: list[str] = [f"# {_t(config, 'title')}\n"]
    _append_summary_table(lines, config, result)

    # (iteration number, extracted text) pairs gathered while walking steps.
    out_of_scope_items: list[tuple[int, str]] = []

    for iteration in result.iterations:
        lines.extend([
            "---\n",
            f"## {_t(config, 'iteration')} {iteration.iteration}\n",
        ])
        _append_iteration_steps(
            lines, config, iteration, config.pipeline, out_of_scope_items,
        )

        if not iteration.feedback:
            continue
        preview = iteration.feedback[:200]
        lines.extend([f"**{_t(config, 'feedback_next')}** {preview}...", ""])

    _append_out_of_scope(lines, config, out_of_scope_items)
    _append_review_metrics_table(lines, config, result)
    _append_repeated_aggregate(lines, config, result)
    _append_final_verdict(lines, config, result)

    return "\n".join(lines)
|
||||
|
||||
|
||||
def _build_phased_report(
    config: PipelineConfig, result: PipelineResult,
) -> str:
    """Render the report for a phased pipeline run (e.g. review-fix).

    Iterations are grouped by consecutive equal ``phase_name``. Each group
    gets a phase heading (plus a one-line description when the phase is found
    in ``config.phases``) followed by its iterations, whose headings carry a
    PASS/FAIL label and, when more than one consecutive PASS is required, the
    running PASS streak.
    """
    lines: list[str] = [f"# {_t(config, 'title')}\n"]
    _append_summary_table(lines, config, result, phased=True)

    phases_by_name = {phase.name: phase for phase in config.phases}
    out_of_scope_items: list[tuple[int, str]] = []

    grouped = groupby(result.iterations, key=lambda ir: ir.phase_name)
    for phase_name, members in grouped:
        phase_iterations = list(members)
        phase_config = phases_by_name.get(phase_name or "")

        lines.append("---\n")
        lines.append(f"## {_t(config, 'phase')}: {phase_name}\n")

        if phase_config:
            step_chain = " → ".join(step.name for step in phase_config.steps)
            lines.append(
                f"{_t(config, 'steps')}: {step_chain} | "
                f"{_t(config, 'max_iterations')}: {phase_config.max_iterations} | "
                f"{_t(config, 'consec_pass')}: {phase_config.consecutive_pass}\n"
            )
            steps = phase_config.steps
        else:
            # Unknown phase: fall back to the flat pipeline's step list.
            steps = config.pipeline

        streak = 0  # consecutive PASS verdicts seen so far in this phase
        for iteration in phase_iterations:
            if not iteration.verdict:
                # No verdict: no label, and the streak is left untouched.
                label = ""
            elif iteration.verdict != "PASS":
                streak = 0
                label = " — FAIL"
            else:
                streak += 1
                if phase_config and phase_config.consecutive_pass > 1:
                    label = f" — PASS ({streak}/{phase_config.consecutive_pass})"
                    if streak >= phase_config.consecutive_pass:
                        label += " ✓"
                else:
                    label = " — PASS ✓"

            lines.append(
                f"### {_t(config, 'iteration')} {iteration.iteration}{label}\n"
            )
            _append_iteration_steps(
                lines, config, iteration, steps, out_of_scope_items,
            )

            if iteration.feedback:
                preview = iteration.feedback[:200]
                lines.append(f"**{_t(config, 'feedback_next')}** {preview}...")
                lines.append("")

    _append_out_of_scope(lines, config, out_of_scope_items)
    _append_review_metrics_table(lines, config, result)
    _append_repeated_aggregate(lines, config, result)
    _append_final_verdict(lines, config, result)

    return "\n".join(lines)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Shared helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _append_summary_table(
    lines: list[str],
    config: PipelineConfig,
    result: PipelineResult,
    phased: bool = False,
) -> None:
    """Append the summary heading and its property/value table to ``lines``.

    The table lists the iteration count, final verdict and total duration.
    For a phased report with configured phases it then lists the phase chain
    and one row per phase; otherwise it lists the configured maximum number
    of iterations. A blank line closes the table.
    """
    whole_minutes = int(result.total_duration // 60)
    leftover_seconds = int(result.total_duration % 60)
    if whole_minutes:
        duration_text = f"{whole_minutes}m {leftover_seconds}s"
    else:
        duration_text = f"{leftover_seconds}s"

    lines.extend([
        f"## {_t(config, 'summary')}\n",
        f"| {_t(config, 'prop')} | {_t(config, 'val')} |",
        "|----------|-------|",
        f"| {_t(config, 'total_iter')} | {len(result.iterations)} |",
        f"| {_t(config, 'final_verdict')} | **{result.final_verdict}** |",
        f"| {_t(config, 'duration')} | {duration_text} |",
    ])

    if not (phased and config.phases):
        lines.append(f"| {_t(config, 'max_iter')} | {config.max_iterations} |")
    else:
        chain = " → ".join(phase.name for phase in config.phases)
        lines.append(f"| {_t(config, 'phases_label')} | {chain} |")
        lines.extend(
            f"| {_t(config, 'phase')}: {phase.name} | "
            f"{_t(config, 'max_iterations')} {phase.max_iterations}, "
            f"{phase.consecutive_pass}x {_t(config, 'consec_pass')} |"
            for phase in config.phases
        )

    lines.append("")
|
||||
|
||||
|
||||
def _append_iteration_steps(
    lines: list[str],
    config: PipelineConfig,
    iter_result: IterationResult,
    steps: list[StepConfig],
    out_of_scope_items: list[tuple[int, str]],
) -> None:
    """Append the per-step sections of one iteration to ``lines``.

    For every step this writes a heading (step name, agent, duration when a
    result exists), the iteration verdict on verdict-carrying steps, and the
    step output (folded into a ``<details>`` element when longer than 500
    characters).

    Side effects for steps whose role is ``"review"``:
      - any out-of-scope section found in the output is appended to
        ``out_of_scope_items`` as ``(iteration number, text)``;
      - the metrics parsed from all review outputs are summed and stored on
        ``iter_result.review_metrics``.

    The metrics are rebuilt from the step outputs on every call and then
    assigned, rather than added onto whatever ``iter_result.review_metrics``
    already holds, so rendering the same result more than once does not
    double-count. When no step has the review role the attribute is left
    untouched.
    """
    collected = None  # running sum of this call's review-step metrics

    for step in steps:
        agent_result = iter_result.step_results.get(step.output_key)
        output = iter_result.step_outputs.get(step.output_key, "")

        agent_name = agent_result.agent_name if agent_result else step.agent
        duration = f" ({agent_result.duration_seconds}s)" if agent_result else ""

        lines.append(f"### {_t(config, 'step')}: {step.name} ({agent_name}){duration}\n")

        if step.verdict and iter_result.verdict:
            lines.append(f"**{_t(config, 'verdict')}: {iter_result.verdict}**\n")

        if len(output) > 500:
            # Long outputs are collapsed so the report stays scannable.
            lines.append("<details>")
            lines.append(
                f"<summary>{_t(config, 'output_chars', n=str(len(output)))}</summary>\n"
            )
            lines.append(output)
            lines.append("\n</details>\n")
        else:
            lines.append(output)
            lines.append("")

        if step.role != "review":
            continue

        oos = _extract_out_of_scope(output)
        if oos:
            out_of_scope_items.append((iter_result.iteration, oos))

        step_metrics = parse_review_metrics(output)
        if collected is None:
            collected = step_metrics
        else:
            collected = _aggregate_metrics(collected, step_metrics)

    if collected is not None:
        iter_result.review_metrics = collected
|
||||
|
||||
|
||||
def _append_review_metrics_table(
    lines: list[str],
    config: PipelineConfig,
    result: PipelineResult,
) -> None:
    """Append the per-iteration review metrics table and the trend summary.

    Nothing is written when no iteration has review metrics. Iterations
    without metrics still get a row, filled with the "not available" string.
    The trend summary is only written when at least two iterations have
    metrics.
    """
    measured = [
        (ir.iteration, ir.review_metrics)
        for ir in result.iterations
        if ir.review_metrics
    ]
    if not measured:
        return

    def as_row(cells: list) -> str:
        return "| " + " | ".join(str(cell) for cell in cells) + " |"

    na = _t(config, "metrics_na")

    lines.append("---\n")
    lines.append(f"## {_t(config, 'metrics_title')}\n")

    # Header and separator.
    lines.append(as_row([
        _t(config, "metrics_iter"), _t(config, "verdict"),
        "Critical", "Major", "Minor",
        "Over-eng", "Omission",
        "CONFIRMED", "DISMISSED",
    ]))
    lines.append("|------|---------|----------|-------|-------|----------|----------|-----------|-----------|")

    # One row per iteration, measured or not.
    for ir in result.iterations:
        metrics = ir.review_metrics
        if metrics:
            counts = [
                metrics.critical, metrics.major, metrics.minor,
                metrics.over_engineering, metrics.omission,
                metrics.confirmed, metrics.dismissed,
            ]
        else:
            counts = [na] * 7
        lines.append(as_row([ir.iteration, ir.verdict or "-", *counts]))

    lines.append("")

    if len(measured) < 2:
        return

    # Trend summary: label and how to read the tracked number off a metrics
    # object, in display order.
    trends = [
        ("Issues", lambda m: m.critical + m.major + m.minor),
        ("Over-engineering", lambda m: m.over_engineering),
        ("Omission", lambda m: m.omission),
        ("CONFIRMED", lambda m: m.confirmed),
        ("DISMISSED", lambda m: m.dismissed),
    ]
    lines.append(f"### {_t(config, 'metrics_trend_title')}\n")
    for label, pick in trends:
        _append_trend_line(
            lines, label, [(number, pick(m)) for number, m in measured],
        )
    lines.append("")
|
||||
|
||||
|
||||
def _append_trend_line(
    lines: list[str],
    label: str,
    values: list[tuple[int, int]],
) -> None:
    """Append one trend bullet such as '- Issues: 6 → 2 → 0 (decreasing)'.

    Args:
        lines: Report lines to append to.
        label: Name shown before the series.
        values: ``(iteration, count)`` pairs in display order; must not be
            empty. Only the counts are shown.

    The direction is decided by comparing the last count with the first one;
    intermediate values do not influence it.
    """
    series = [count for _, count in values]
    first, last = series[0], series[-1]

    if last < first:
        direction = "decreasing"
    elif last > first:
        direction = "increasing"
    else:
        direction = "stable"

    chain = " → ".join(map(str, series))
    lines.append(f"- {label}: {chain} ({direction})")
|
||||
|
||||
|
||||
def _append_out_of_scope(
    lines: list[str],
    config: PipelineConfig,
    out_of_scope_items: list[tuple[int, str]],
) -> None:
    """Append the out-of-scope issues section, if there is anything to show.

    Each ``(iteration number, text)`` item becomes its own sub-section headed
    by the iteration number. Nothing is written for an empty list.
    """
    if out_of_scope_items:
        lines.append("---\n")
        lines.append(f"## {_t(config, 'oos_title')}\n")
        lines.append(f"{_t(config, 'oos_desc')}\n")
        for number, text in out_of_scope_items:
            lines.extend([
                f"### {_t(config, 'iteration')} {number}\n",
                text,
                "",
            ])
|
||||
|
||||
|
||||
def _append_final_verdict(
    lines: list[str],
    config: PipelineConfig,
    result: PipelineResult,
) -> None:
    """Append the final verdict heading and a one-line explanation.

    A PASS gets the success message. Any other verdict gets the phased
    failure message (listing the phase chain) when phases are configured, and
    the max-iterations failure message otherwise.
    """
    lines.append("---\n")
    lines.append(f"## {_t(config, 'final_verdict_title')}: {result.final_verdict}\n")

    if result.final_verdict == "PASS":
        explanation = _t(config, "pass_msg")
    elif config.phases:
        chain = " → ".join(phase.name for phase in config.phases)
        explanation = _t(config, "fail_phased", phases=chain)
    else:
        explanation = _t(config, "fail_simple", max_iter=str(config.max_iterations))

    lines.append(explanation)
|
||||
|
||||
|
||||
def _append_repeated_aggregate(
    lines: list[str],
    config: PipelineConfig,
    result: PipelineResult,
) -> None:
    """Append the repeated-aggregate warnings section, if there are any.

    Each entry of ``result.repeated_aggregate_warnings`` becomes one bullet.
    Nothing is written when there are no warnings.
    """
    warnings = result.repeated_aggregate_warnings
    if warnings:
        lines.append("---\n")
        lines.append(f"## {_t(config, 'repeat_title')}\n")
        lines.append(f"{_t(config, 'repeat_desc')}\n")
        lines.extend(f"- {warning}" for warning in warnings)
        lines.append("")
|
||||
Reference in New Issue
Block a user