initial commit

This commit is contained in:
이충영 에이닷서비스개발
2026-03-11 21:53:14 +09:00
commit ee4f1a07ef
42 changed files with 4533 additions and 0 deletions

1
cross_eval/__init__.py Normal file
View File

@@ -0,0 +1 @@
# Package version string; the CLI reports it via `cross-eval --version`.
__version__ = "0.1.0"

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

162
cross_eval/agent.py Normal file
View File

@@ -0,0 +1,162 @@
"""Agent invocation via subprocess with live spinner."""
from __future__ import annotations
import itertools
import logging
import subprocess
import sys
import threading
import time
from pathlib import Path
from typing import Optional
from cross_eval.models import AgentConfig, AgentResult
logger = logging.getLogger(__name__)
# CLI tools that support --system-prompt flag natively
_SYSTEM_PROMPT_AGENTS = ("claude",)
_REASONING_EFFORT_AGENTS = ("codex",)
def _supports_system_prompt_flag(command: str) -> bool:
"""Check if the agent CLI supports --system-prompt flag."""
return any(name in command for name in _SYSTEM_PROMPT_AGENTS)
def _supports_reasoning_effort(command: str) -> bool:
"""Check if the agent CLI supports reasoning effort overrides."""
return any(name in command for name in _REASONING_EFFORT_AGENTS)
class _Spinner:
"""Animated spinner for long-running agent calls."""
FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
_CLEAR_LINE = "\r" + (" " * 160) + "\r"
def __init__(self, message: str) -> None:
self.message = message
self._running = False
self._thread: Optional[threading.Thread] = None
self._start_time = 0.0
def start(self) -> None:
self._running = True
self._start_time = time.monotonic()
self._thread = threading.Thread(target=self._spin, daemon=True)
self._thread.start()
def _spin(self) -> None:
for frame in itertools.cycle(self.FRAMES):
if not self._running:
break
elapsed = int(time.monotonic() - self._start_time)
line = f"\r {frame} {self.message} ({elapsed}s)"
sys.stderr.write(line)
sys.stderr.flush()
time.sleep(0.1)
def stop(self, final: str) -> None:
self._running = False
if self._thread:
self._thread.join(timeout=1)
elapsed = round(time.monotonic() - self._start_time, 1)
sys.stderr.write(self._CLEAR_LINE)
sys.stderr.write(f" \u2713 {final} ({elapsed}s)\n")
sys.stderr.flush()
def invoke_agent(
    agent: AgentConfig,
    prompt: str,
    step_name: str,
    cwd: Optional[Path] = None,
    timeout: int | None = None,
    quiet: bool = False,
) -> AgentResult:
    """Invoke an agent CLI with the given prompt, passed on stdin.

    Args:
        agent: Agent definition; its ``command``, ``args``, ``system_prompt``,
            ``reasoning_effort`` and ``name`` are read.
        prompt: User prompt text.
        step_name: Label used in spinner, log and error messages and copied
            into the result.
        cwd: Working directory for the subprocess (``None`` inherits).
        timeout: Limit in seconds handed to ``subprocess.run``; ``None``
            means no limit.
        quiet: If True, suppress spinner (for parallel execution).

    Returns:
        An ``AgentResult`` carrying the stripped stdout, the exit code, the
        agent and step names, and the duration rounded to 0.1s.

    Raises:
        subprocess.TimeoutExpired: The agent exceeded ``timeout``.
        RuntimeError: The agent exited with a non-zero status.
        Exception: Anything else raised by ``subprocess.run`` is re-raised
            after the spinner has been stopped.
    """
    cmd = [agent.command]
    if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
        cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
    cmd.extend(agent.args)

    # Build the full prompt (system prompt + user prompt)
    if agent.system_prompt and _supports_system_prompt_flag(agent.command):
        # claude: --system-prompt flag supported natively
        cmd.extend(["--system-prompt", agent.system_prompt])
        input_data = prompt
    elif agent.system_prompt:
        # codex, others: no --system-prompt flag, prepend to prompt
        input_data = (
            f"<system>\n{agent.system_prompt}\n</system>\n\n"
            f"{prompt}"
        )
    else:
        input_data = prompt

    logger.debug("Invoking agent '%s': %s", agent.name, " ".join(cmd[:5]) + " ...")
    spinner: Optional[_Spinner] = None
    if not quiet:
        logger.info(" cmd: %s", " ".join(cmd[:6]))
        spinner = _Spinner(f"[{step_name}] {agent.name} running...")
        spinner.start()

    try:
        start = time.monotonic()
        result = subprocess.run(
            cmd,
            input=input_data,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
        )
        duration = time.monotonic() - start
    except subprocess.TimeoutExpired:
        if spinner:
            spinner.stop(f"[{step_name}] TIMEOUT after {timeout}s")
        raise
    except KeyboardInterrupt:
        # KeyboardInterrupt is not an Exception subclass; without this clause
        # the spinner thread would keep drawing over the caller's output.
        if spinner:
            spinner.stop(f"[{step_name}] INTERRUPTED")
        raise
    except Exception:
        if spinner:
            spinner.stop(f"[{step_name}] ERROR")
        raise

    output = result.stdout.strip()
    chars = len(output)
    if result.returncode != 0:
        if spinner:
            spinner.stop(f"[{step_name}] FAILED (exit {result.returncode})")
        # Prefer stderr for the error detail; fall back to stdout.
        err_detail = result.stderr.strip() or result.stdout.strip()
        if err_detail and len(err_detail) > 500:
            err_detail = err_detail[:500] + "..."
        cmd_preview = " ".join(cmd[:6])
        raise RuntimeError(
            f"Agent '{agent.name}' failed (exit code {result.returncode}) "
            f"at step '{step_name}':\n"
            f" cmd: {cmd_preview}\n"
            f" error: {err_detail or '(no output)'}"
        )

    if spinner:
        spinner.stop(f"[{step_name}] done — {chars} chars")
    if not output:
        logger.warning(
            "Agent '%s' produced empty output at step '%s'",
            agent.name, step_name,
        )
    return AgentResult(
        output=output,
        exit_code=result.returncode,
        agent_name=agent.name,
        step_name=step_name,
        duration_seconds=round(duration, 1),
    )

701
cross_eval/cli.py Normal file
View File

@@ -0,0 +1,701 @@
"""CLI entry point with argparse subcommands."""
from __future__ import annotations
import argparse
import logging
import sys
from pathlib import Path
from cross_eval import __version__
from cross_eval.config import REASONING_EFFORT_CHOICES
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Scaffolding templates for `cross-eval init`
# ---------------------------------------------------------------------------
# Template for .cross-eval/config.yaml. cmd_init renders it with
# str.format(preset=..., language=...), so `{preset}` and `{language}` are the
# only placeholders; any literal brace added here would have to be doubled.
DEFAULT_CONFIG_YAML = """\
# ─── cross-eval 설정 ───────────────────────────────────────────
#
# 기본 제공 에이전트 (별도 정의 없이 바로 사용 가능):
# claude-coder, claude-reviewer (Claude, opus 모델)
# claude-senior (Claude, opus 모델)
# codex-coder, codex-reviewer (Codex, gpt-5.4 모델)
# codex-senior (Codex, gpt-5.4 모델)
#
# CLI에서 --coder claude --reviewer codex --senior codex 같이 축약해서 지정 가능
# ────────────────────────────────────────────────────────────────
# 입력 파일 (이 파일 기준 상대경로)
inputs:
plan: plan.md
checklist: checklist.md
# 에이전트 역할 지정
coders: [claude-coder]
reviewers: [claude-reviewer]
# seniors: [codex-senior]
# 파이프라인 종류: simple | cross-review | review-only | review-fix
pipeline: preset:{preset}
# 반복 설정
max_iterations: 3
# min_iterations: 1 # PASS여도 최소 이만큼 반복
# 프롬프트 언어
language: {language}
# 결과 저장 경로
output_dir: output
# ─── 커스텀 에이전트 (선택) ────────────────────────────────────
# 기본 제공 에이전트를 덮어쓰거나 새 에이전트를 정의할 수 있습니다.
#
# agents:
# my-reviewer:
# command: my-tool
# args: ["--flag"]
# system_prompt: "..."
# ────────────────────────────────────────────────────────────────
"""
# Starter plan.md written by `cross-eval init --lang en`.
PLAN_SAMPLE_EN = """\
# Project Plan
## Objective
[Describe what you want to build]
## Requirements
1. [Requirement 1]
2. [Requirement 2]
## Constraints
- [Constraint 1]
- [Constraint 2]
## Out of Scope
- [Explicitly list what should NOT be implemented]
"""
# Starter plan.md written by `cross-eval init --lang ko` (the default language).
PLAN_SAMPLE_KO = """\
# 프로젝트 기획서
## 목표
[구현할 내용을 설명하세요]
## 요구사항
1. [요구사항 1]
2. [요구사항 2]
## 제약조건
- [제약조건 1]
- [제약조건 2]
## 범위 밖 (구현하지 않을 것)
- [명시적으로 구현하지 않을 항목 나열]
"""
# Starter checklist.md written by `cross-eval init --lang en`.
CHECKLIST_SAMPLE_EN = """\
# Implementation Checklist
## Functional Requirements
- [ ] [Item 1]
- [ ] [Item 2]
## Code Quality
- [ ] No unused imports or dead code
- [ ] Error handling for edge cases
- [ ] Follows project coding conventions
## Constraints
- [ ] Does NOT add features beyond the plan
- [ ] Does NOT introduce unnecessary abstractions
"""
# Starter checklist.md written by `cross-eval init --lang ko` (the default language).
CHECKLIST_SAMPLE_KO = """\
# 구현 체크리스트
## 기능 요구사항
- [ ] [항목 1]
- [ ] [항목 2]
## 코드 품질
- [ ] 사용하지 않는 import나 죽은 코드 없음
- [ ] 엣지 케이스에 대한 에러 처리
- [ ] 프로젝트 코딩 컨벤션 준수
## 제약
- [ ] 기획서 범위를 넘는 기능을 추가하지 않음
- [ ] 불필요한 추상화를 도입하지 않음
"""
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main(argv: list[str] | None = None) -> int:
    """Main CLI entry point.

    Builds the ``cross-eval`` argument parser with its ``init`` and ``run``
    subcommands, configures logging, and dispatches to the matching handler.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        The exit code of the selected subcommand handler, or 0 after printing
        the top-level help when no subcommand was given.
    """
    parser = argparse.ArgumentParser(
        prog="cross-eval",
        description=(
            "AI 코딩 에이전트의 결과물을 자동으로 검증하는 CLI 도구.\n"
            "\n"
            "동작 방식:\n"
            " 1. 기획서(plan)를 바탕으로 Coder 에이전트가 코드를 생성\n"
            " 2. Reviewer 에이전트가 기획서 대비 코드를 검토하고 PASS/FAIL 판정\n"
            " 3. FAIL이면 피드백을 반영해서 1~2를 반복 (최대 N회)\n"
            "\n"
            "빠른 시작:\n"
            " cross-eval init 설정 파일 생성\n"
            " cross-eval run --plan plan.md 기획서로 바로 실행\n"
            " cross-eval run .cross-eval/config.yaml 기반 실행\n"
            "\n"
            "자세한 사용법: cross-eval <command> --help"
        ),
        # Raw formatter keeps the hand-laid-out line breaks in the description.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # NOTE: -v is bound to --version here; --verbose has no short form.
    parser.add_argument(
        "-v", "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="상세 로그 출력",
    )
    subparsers = parser.add_subparsers(dest="command")
    # --- init ---
    init_parser = subparsers.add_parser(
        "init",
        help="설정 파일 생성 (config.yaml, plan.md, checklist.md)",
        description=(
            "현재 디렉토리에 .cross-eval/ 폴더를 만들고 템플릿을 생성합니다.\n"
            "이미 있는 파일은 건드리지 않습니다.\n"
            "\n"
            "생성되는 파일:\n"
            " .cross-eval/config.yaml 에이전트, 파이프라인 설정\n"
            " .cross-eval/plan.md 기획서 템플릿\n"
            " .cross-eval/checklist.md 체크리스트 템플릿"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    init_parser.add_argument(
        "--dir",
        type=Path,
        default=Path("."),
        help="초기화할 디렉토리 (기본: 현재 디렉토리)",
    )
    init_parser.add_argument(
        "--preset",
        default="simple",
        choices=["simple", "cross-review", "review-only", "review-fix"],
        help=(
            "파이프라인 종류 (기본: simple). "
            "simple=코딩+리뷰, cross-review=교차리뷰, "
            "review-only=리뷰만, review-fix=리뷰수렴+자동수정"
        ),
    )
    init_parser.add_argument(
        "--lang",
        default="ko",
        choices=["en", "ko"],
        help="프롬프트 언어 (기본: ko)",
    )
    # --- run ---
    run_parser = subparsers.add_parser(
        "run",
        help="검증 파이프라인 실행",
        description=(
            "기획서(plan)를 기반으로 AI 에이전트가 코드 생성과 리뷰를 반복합니다.\n"
            "\n"
            "설정 파일 없이 바로 실행할 수 있고, config.yaml로도 실행할 수 있습니다.\n"
            "CLI 옵션이 config.yaml보다 우선합니다."
        ),
        epilog=(
            "파이프라인 종류 (--preset):\n"
            " ┌──────────────┬─────────────────────────────────────────────────────┐\n"
            " │ simple │ Coder가 코드 생성 → Reviewer가 리뷰 │\n"
            " │ (기본값) │ FAIL이면 피드백 반영해서 재생성, PASS까지 반복 │\n"
            " ├──────────────┼─────────────────────────────────────────────────────┤\n"
            " │ review-fix │ 2단계 파이프라인: │\n"
            " │ │ Reviewer N명 병렬 리뷰 → 취합 → 수정 → 재검증 │\n"
            " ├──────────────┼─────────────────────────────────────────────────────┤\n"
            " │ review-only │ 코드 생성 없이 Reviewer N명이 기존 코드만 검토 │\n"
            " │ │ (이미 작성된 코드의 품질 감사용) │\n"
            " ├──────────────┼─────────────────────────────────────────────────────┤\n"
            " │ cross-review │ Coder 2명이 각각 구현 → 상대방 코드를 교차 리뷰 │\n"
            " │ │ (서로 다른 에이전트의 구현 비교용) │\n"
            " └──────────────┴─────────────────────────────────────────────────────┘\n"
            "\n"
            "기본 제공 에이전트:\n"
            " ┌──────────────────┬─────────┬───────────┬──────────────────────────┐\n"
            " │ 이름 │ CLI │ 기본 모델 │ 역할 │\n"
            " ├──────────────────┼─────────┼───────────┼──────────────────────────┤\n"
            " │ claude-coder │ claude │ opus │ 코드 생성 │\n"
            " │ claude-reviewer │ claude │ opus │ 코드 리뷰 │\n"
            " │ claude-senior │ claude │ opus │ 리뷰 취합/판정 │\n"
            " │ codex-coder │ codex │ gpt-5.4 │ 코드 생성 │\n"
            " │ codex-reviewer │ codex │ gpt-5.4 │ 코드 리뷰 │\n"
            " │ codex-senior │ codex │ gpt-5.4 │ 리뷰 취합/판정 │\n"
            " └──────────────────┴─────────┴───────────┴──────────────────────────┘\n"
            " --coder, --reviewer, --senior에서 축약 가능: claude → claude-<role>\n"
            "\n"
            "사용 예시:\n"
            "\n"
            " 기본 실행 (Claude가 코딩하고 Claude가 리뷰):\n"
            " cross-eval run --plan plan.md\n"
            "\n"
            " Codex가 코딩, Claude가 리뷰:\n"
            " cross-eval run --plan plan.md --coder codex --reviewer claude\n"
            "\n"
            " 리뷰어 2명 (Claude + Codex):\n"
            " cross-eval run --plan plan.md --reviewer claude --reviewer codex\n"
            "\n"
            " 리뷰 취합용 Senior 추가:\n"
            " cross-eval run --plan plan.md --preset review-fix \\\n"
            " --reviewer claude --reviewer codex --senior codex\n"
            "\n"
            " 리뷰 수렴 후 자동 수정 (review-fix):\n"
            " cross-eval run --plan plan.md --preset review-fix \\\n"
            " --reviewer claude --reviewer codex\n"
            "\n"
            " 기존 코드 리뷰만 (review-only):\n"
            " cross-eval run --plan plan.md --preset review-only \\\n"
            " --reviewer claude --reviewer codex\n"
            "\n"
            " 모델 변경:\n"
            " cross-eval run --plan plan.md --model sonnet\n"
            "\n"
            " config.yaml 기반 실행:\n"
            " cross-eval run\n"
            " cross-eval run -c my-config.yaml"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # -- Input files --
    input_group = run_parser.add_argument_group("입력 파일")
    input_group.add_argument(
        "--plan", type=Path, default=None,
        help="기획서 파일 경로 (필수)",
    )
    input_group.add_argument(
        "--checklist", type=Path, default=None,
        help="체크리스트 파일 경로 (선택)",
    )
    input_group.add_argument(
        "--docs", type=Path, default=None,
        help="참고 문서 폴더. 폴더 안 모든 파일을 에이전트에게 전달",
    )
    input_group.add_argument(
        "--input", action="append", dest="inputs", metavar="KEY=PATH",
        help="추가 입력 파일 (예: --input spec=./api-spec.md)",
    )
    # -- Agent settings --
    agent_group = run_parser.add_argument_group(
        "에이전트 설정",
        "축약 가능: claude → claude-<role>, codex → codex-<role>",
    )
    # action="append" lets each role flag be repeated; unset flags stay None.
    agent_group.add_argument(
        "--coder", action="append", dest="coders", metavar="NAME",
        help="코드를 생성할 에이전트 (여러 개 가능, 기본: claude)",
    )
    agent_group.add_argument(
        "--reviewer", action="append", dest="reviewers", metavar="NAME",
        help="코드를 리뷰할 에이전트 (여러 개 가능, 기본: claude)",
    )
    agent_group.add_argument(
        "--senior", action="append", dest="seniors", metavar="NAME",
        help="리뷰를 취합하고 최종 판정할 시니어 에이전트 (선택)",
    )
    # The effort flags accept the canonical levels plus the alias spellings.
    agent_group.add_argument(
        "--reasoning-effort", default=None, metavar="LEVEL",
        choices=REASONING_EFFORT_CHOICES + ("extra-high", "extra_high", "x-high"),
        help="모든 역할의 reasoning effort (minimal|low|medium|high|xhigh)",
    )
    agent_group.add_argument(
        "--coder-effort", default=None, metavar="LEVEL",
        choices=REASONING_EFFORT_CHOICES + ("extra-high", "extra_high", "x-high"),
        help="Coder용 reasoning effort",
    )
    agent_group.add_argument(
        "--reviewer-effort", default=None, metavar="LEVEL",
        choices=REASONING_EFFORT_CHOICES + ("extra-high", "extra_high", "x-high"),
        help="Reviewer용 reasoning effort",
    )
    agent_group.add_argument(
        "--senior-effort", default=None, metavar="LEVEL",
        choices=REASONING_EFFORT_CHOICES + ("extra-high", "extra_high", "x-high"),
        help="Senior용 reasoning effort",
    )
    agent_group.add_argument(
        "--model", default=None, metavar="MODEL",
        help="모든 에이전트의 모델을 한번에 변경 (예: sonnet, opus)",
    )
    agent_group.add_argument(
        "--generator-model", default=None, metavar="MODEL",
        help="Coder 에이전트 모델만 변경",
    )
    agent_group.add_argument(
        "--reviewer-model", default=None, metavar="MODEL",
        help="Reviewer 에이전트 모델만 변경",
    )
    # -- Pipeline --
    pipe_group = run_parser.add_argument_group("파이프라인")
    pipe_group.add_argument(
        "--preset", default=None,
        choices=["simple", "cross-review", "review-only", "review-fix"],
        help="파이프라인 종류 (기본: simple). 각 종류 설명은 아래 참조",
    )
    pipe_group.add_argument(
        "--max-iter", type=int, default=None,
        help="최대 반복 횟수 (기본: 3)",
    )
    pipe_group.add_argument(
        "--min-iter", type=int, default=None,
        help="최소 반복 횟수. PASS여도 이 횟수까지 반복 (기본: 1)",
    )
    pipe_group.add_argument(
        "--timeout", type=int, default=None, metavar="SEC",
        help="에이전트 1회 호출 제한 시간(초). 0=무제한 (기본: 무제한)",
    )
    pipe_group.add_argument(
        "--lang", default=None, choices=["en", "ko"],
        help="프롬프트 언어 (기본: ko)",
    )
    # -- Miscellaneous --
    etc_group = run_parser.add_argument_group("기타")
    etc_group.add_argument(
        "-c", "--config", type=Path, default=None,
        help="설정 파일 경로 (기본: .cross-eval/config.yaml)",
    )
    etc_group.add_argument(
        "--output-dir", type=Path, default=None,
        help="결과 저장 디렉토리 (기본: output/)",
    )
    etc_group.add_argument(
        "--dry-run", action="store_true",
        help="실제 실행 없이 에이전트에게 보낼 프롬프트만 미리보기",
    )
    args = parser.parse_args(argv)
    # Setup logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
    )
    if args.command == "init":
        return cmd_init(args)
    elif args.command == "run":
        return cmd_run(args)
    else:
        # No subcommand given: show the top-level help and exit successfully.
        parser.print_help()
        return 0
def cmd_init(args: argparse.Namespace) -> int:
    """Create the .cross-eval/ scaffold (config, plan, checklist) under args.dir.

    Files that already exist are left untouched and reported as skipped.
    Reads ``args.dir``, ``args.preset`` and ``args.lang``; always returns 0.
    """
    root = args.dir.resolve()
    (root / ".cross-eval").mkdir(parents=True, exist_ok=True)

    language = args.lang
    use_korean = language == "ko"
    templates = [
        (
            ".cross-eval/config.yaml",
            DEFAULT_CONFIG_YAML.format(preset=args.preset, language=language),
        ),
        (".cross-eval/plan.md", PLAN_SAMPLE_KO if use_korean else PLAN_SAMPLE_EN),
        (
            ".cross-eval/checklist.md",
            CHECKLIST_SAMPLE_KO if use_korean else CHECKLIST_SAMPLE_EN,
        ),
    ]

    written: list[str] = []
    kept: list[str] = []
    for rel_name, text in templates:
        destination = root / rel_name
        if destination.exists():
            # Never overwrite a file the user may have edited.
            kept.append(rel_name)
            continue
        destination.write_text(text, encoding="utf-8")
        written.append(rel_name)

    if written:
        print(f" 생성: {', '.join(written)}")
    if kept:
        print(f" 이미 존재 (건너뜀): {', '.join(kept)}")
    print(f"\n 파이프라인: {args.preset}")
    print(f" 언어: {language}")
    closing_lines = (
        "",
        "다음 단계:",
        " 1. .cross-eval/plan.md 에 기획서 작성",
        " 2. .cross-eval/checklist.md 에 체크리스트 작성 (선택)",
        " 3. cross-eval run 으로 실행",
        "",
        "주의: 에이전트는 기본적으로 파일 읽기/쓰기/실행 권한을 가집니다.",
        " 실행 전에 .cross-eval/config.yaml 을 확인하세요.",
    )
    for line in closing_lines:
        print(line)
    return 0
def _read_docs_dir(docs_dir: Path) -> str:
"""Read all files in a directory and concatenate with filename headers."""
parts: list[str] = []
for f in sorted(docs_dir.iterdir()):
if f.is_file() and not f.name.startswith("."):
try:
content = f.read_text(encoding="utf-8")
parts.append(f"### {f.name}\n{content}")
except (UnicodeDecodeError, OSError):
continue # skip binary or unreadable files
return "\n\n".join(parts)
def _apply_model_override(config, agent_name: str, model: str) -> None:
"""Replace --model in agent args."""
agent = config.agents.get(agent_name)
if agent is None:
return
new_args = list(agent.args)
for i, arg in enumerate(new_args):
if arg == "--model" and i + 1 < len(new_args):
new_args[i + 1] = model
agent.args = new_args
return
# --model not found, append it
new_args.extend(["--model", model])
agent.args = new_args
def cmd_run(args: argparse.Namespace) -> int:
    """Load config, apply CLI overrides, validate, and execute the pipeline.

    Configuration comes from ``--config`` when given, otherwise from
    ``.cross-eval/config.yaml`` if it exists, otherwise from built-in
    defaults. CLI options are applied on top before validation.

    Returns:
        0 when the pipeline's final verdict is ``"PASS"``; 130 when the run
        is interrupted from the keyboard; 1 for any other verdict, a config
        or input error, or a ``RuntimeError`` raised by the pipeline.
    """
    # Imported lazily so `cross-eval --help` / `init` do not load the pipeline.
    from cross_eval.config import (
        _default_seniors_for_preset,
        _infer_roles,
        _resolve_agents,
        apply_input_overrides,
        apply_reasoning_effort_settings,
        default_config,
        load_config,
        resolve_agent_shorthand,
        validate_config,
    )
    from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS
    from cross_eval.pipeline import run_pipeline

    # 1. Load config: YAML if exists, otherwise defaults
    config_path = args.config
    if config_path is not None:
        config_path = config_path.resolve()
        if not config_path.exists():
            print(f"Config file not found: {config_path}", file=sys.stderr)
            return 1
        try:
            config = load_config(config_path)
        except (ValueError, FileNotFoundError) as e:
            print(f"Config error: {e}", file=sys.stderr)
            return 1
        config_source = config_path.name
    else:
        # Try default location, fall back to built-in defaults
        default_path = Path(".cross-eval/config.yaml").resolve()
        if default_path.exists():
            try:
                config = load_config(default_path)
                config_source = default_path.name
            except (ValueError, FileNotFoundError) as e:
                print(f"Config error: {e}", file=sys.stderr)
                return 1
        else:
            config = default_config()
            config_source = "defaults"

    # 2. Apply CLI overrides
    if args.max_iter is not None:
        config.max_iterations = args.max_iter
    if args.min_iter is not None:
        config.min_iterations = args.min_iter
    if args.output_dir is not None:
        config.output_dir = args.output_dir
    if args.lang is not None:
        config.language = args.lang

    # --coder / --reviewer / --senior: resolve shorthands and override roles
    if args.coders or args.reviewers or args.seniors:
        coders = [resolve_agent_shorthand(c, "coder") for c in (args.coders or [])]
        reviewers = [resolve_agent_shorthand(r, "reviewer") for r in (args.reviewers or [])]
        seniors = [resolve_agent_shorthand(s, "senior") for s in (args.seniors or [])]
        # Fill defaults if only one side specified
        if not coders:
            coders = config.coders or ["claude-coder"]
        if not reviewers:
            reviewers = config.reviewers or ["claude-reviewer"]
        if not seniors:
            seniors = config.seniors
        config.coders = coders
        config.reviewers = reviewers
        config.seniors = seniors
        # Auto-merge built-in agents
        config.agents = _resolve_agents(config.agents, coders, reviewers, seniors)

    # --preset or any role flag: rebuild the pipeline from a preset
    need_rebuild = args.preset is not None or args.coders or args.reviewers or args.seniors
    if need_rebuild:
        preset = args.preset or "simple"
        # Without an explicit --preset, a config that already defines phases
        # keeps the phased preset ("review-fix" is the only one currently).
        if args.preset is None and config.phases:
            preset = "review-fix"
        inferred_coders, inferred_reviewers, _ = _infer_roles(
            list(config.agents.keys())
        )
        coders = config.coders or inferred_coders
        reviewers = config.reviewers or inferred_reviewers
        seniors = config.seniors or []
        if not seniors:
            seniors = _default_seniors_for_preset(
                f"preset:{preset}",
                reviewers,
                config.agents,
            )
        config.agents = _resolve_agents(config.agents, coders, reviewers, seniors)
        config.coders = coders
        config.reviewers = reviewers
        config.seniors = seniors
        config.preset_name = preset
        if preset in PHASED_PRESETS:
            config.phases = PHASED_PRESETS[preset](coders, reviewers, seniors)
            config.pipeline = []
        elif preset in PIPELINE_PRESETS:
            config.pipeline = PIPELINE_PRESETS[preset](coders, reviewers, seniors)
            config.phases = []
        # review-only runs a single pass unless the user set iteration limits.
        if preset == "review-only" and args.max_iter is None and args.min_iter is None:
            config.max_iterations = 1

    apply_reasoning_effort_settings(
        config,
        reasoning_effort=args.reasoning_effort,
        coder_effort=args.coder_effort,
        reviewer_effort=args.reviewer_effort,
        senior_effort=args.senior_effort,
    )

    # --model: apply to ALL agents
    if args.model is not None:
        for agent_name in config.agents:
            _apply_model_override(config, agent_name, args.model)
    # --generator-model / --reviewer-model: apply by role (after --model, so they win)
    if args.generator_model is not None:
        for coder_name in config.coders:
            _apply_model_override(config, coder_name, args.generator_model)
    if args.reviewer_model is not None:
        for reviewer_name in config.reviewers:
            _apply_model_override(config, reviewer_name, args.reviewer_model)

    # --plan / --checklist shortcuts
    for key, val in [("plan", args.plan), ("checklist", args.checklist)]:
        if val is not None:
            p = val.resolve()
            if not p.exists():
                print(f"File not found: {p}", file=sys.stderr)
                return 1
            config.inputs[key] = p

    # --docs: read all files in directory, inject as {docs}
    if args.docs is not None:
        docs_dir = args.docs.resolve()
        if not docs_dir.is_dir():
            print(f"Not a directory: {docs_dir}", file=sys.stderr)
            return 1
        docs_content = _read_docs_dir(docs_dir)
        if not docs_content:
            print(f"No files found in: {docs_dir}", file=sys.stderr)
            return 1
        # NOTE(review): this stores file *content*, while plan/checklist store
        # paths — presumably the pipeline distinguishes the two; confirm.
        config.inputs["docs"] = docs_content

    if args.inputs:
        overrides = {}
        for item in args.inputs:
            if "=" not in item:
                print(
                    f"Invalid --input format: '{item}'. Use KEY=PATH.",
                    file=sys.stderr,
                )
                return 1
            key, path = item.split("=", 1)
            overrides[key] = path
        apply_input_overrides(config, overrides)

    # 3. Validate after all overrides
    errors = validate_config(config)
    if errors:
        print("Config error:\n " + "\n ".join(errors), file=sys.stderr)
        return 1

    # 4. Run pipeline
    logger.info("Config: %s", config_source)
    logger.info(
        "Agents: %s",
        ", ".join(f"{n} ({a.command})" for n, a in config.agents.items()),
    )
    if config.coders or config.reviewers or config.seniors:
        logger.info("Coders: %s", config.coders)
        logger.info("Reviewers: %s", config.reviewers)
        logger.info("Seniors: %s", config.seniors)
    if config.phases:
        # Separate the phases so consecutive descriptions do not run together.
        phase_desc = ", ".join(
            f"{p.name}(max {p.max_iterations}, {p.consecutive_pass}xPASS)"
            for p in config.phases
        )
        logger.info("Pipeline: phased [%s], lang=%s", phase_desc, config.language)
    else:
        iter_info = f"max {config.max_iterations}"
        if config.min_iterations > 1:
            iter_info = f"min {config.min_iterations}, max {config.max_iterations}"
        logger.info(
            "Pipeline: %d steps, %s iterations, lang=%s",
            len(config.pipeline), iter_info, config.language,
        )

    # --timeout 0 (or unset) means "no limit", which run_pipeline takes as None.
    raw_timeout = args.timeout if args.timeout is not None else 0
    agent_timeout = None if raw_timeout == 0 else raw_timeout
    try:
        result = run_pipeline(config, dry_run=args.dry_run, timeout=agent_timeout)
    except KeyboardInterrupt:
        print("\nInterrupted by user.", file=sys.stderr)
        return 130
    except RuntimeError as e:
        print(f"Pipeline error: {e}", file=sys.stderr)
        return 1

    # 5. Print summary
    print(f"\nResult: {result.final_verdict}")
    print(f"Iterations: {len(result.iterations)}")
    if not args.dry_run and result.run_dir:
        print(f"Output: {result.run_dir}/")
    return 0 if result.final_verdict == "PASS" else 1
# Script entry: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())

607
cross_eval/config.py Normal file
View File

@@ -0,0 +1,607 @@
"""Configuration loading, validation, and preset resolution."""
from __future__ import annotations
import logging
import re
from pathlib import Path
from typing import Any
import yaml
from cross_eval.models import AgentConfig, PhaseConfig, PipelineConfig, StepConfig
from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS
logger = logging.getLogger(__name__)
# Alternative user-facing spellings, each mapped to the canonical "xhigh".
REASONING_EFFORT_ALIASES = {
    "extra-high": "xhigh",
    "extra_high": "xhigh",
    "x-high": "xhigh",
}
# Canonical reasoning-effort levels accepted after alias normalization.
REASONING_EFFORT_CHOICES = ("minimal", "low", "medium", "high", "xhigh")
# Effort assigned per role to the built-in agent definitions (BUILTIN_AGENTS).
DEFAULT_ROLE_REASONING_EFFORTS = {
    "coder": "medium",
    "reviewer": "medium",
    "senior": "high",
}
# ---------------------------------------------------------------------------
# Built-in agent registry
# ---------------------------------------------------------------------------
# Base argument list shared by all built-in codex agents; each agent gets its
# own copy via list(_CODEX_ARGS).
# NOTE(review): the trailing "-" presumably tells codex to read the prompt
# from stdin (invoke_agent pipes the prompt there) — confirm against codex docs.
_CODEX_ARGS = [
    "exec",
    "--full-auto",
    "--skip-git-repo-check",
    "--model",
    "gpt-5.4",
    "-",
]
# System prompt for the built-in *-coder agents.
_CODER_SYSTEM_PROMPT = (
    "You are a senior software engineer implementing code changes.\n"
    "Rules:\n"
    "1. FIRST explore the project directory to understand the existing codebase, "
    "patterns, and conventions before writing any code.\n"
    "2. Implement ONLY what the plan specifies. Do NOT add extra features, "
    "unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
    "3. Follow the project's existing coding style, naming conventions, and directory structure.\n"
    "4. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
    "Do NOT refactor unrelated code.\n"
    "5. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
    "6. When in doubt about scope, do LESS, not more."
)
# System prompt for the built-in *-reviewer agents; it asks for a final
# "VERDICT: PASS" / "VERDICT: FAIL" line.
_REVIEWER_SYSTEM_PROMPT = (
    "You are a code reviewer. You MUST NOT create, modify, or delete any files.\n"
    "Rules:\n"
    "1. Explore the project directory to understand the full codebase context.\n"
    "2. Compare the implementation against the plan and checklist ONLY.\n"
    "3. Classify every issue with BOTH severity AND category:\n"
    " - Severity: Critical (breaks functionality/security) > Major (requirement mismatch) > Minor (convention/style)\n"
    " - Category: Over-engineering / Omission\n"
    "4. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
    "or DISMISSED (false positive) with rationale.\n"
    "5. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
    "6. Order issues by severity (Critical first).\n"
    "7. Do NOT suggest improvements beyond the plan scope.\n"
    "8. End with VERDICT: PASS (all requirements met, no over-engineering) "
    "or VERDICT: FAIL (issues found)."
)
# System prompt for the built-in *-senior agents (review aggregation and
# verification); it also asks for a final VERDICT line.
_SENIOR_SYSTEM_PROMPT = (
    "You are a senior technical reviewer coordinating a review-fix-verification loop.\n"
    "Rules:\n"
    "1. Explore the project directory to understand the full codebase context.\n"
    "2. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
    "evidence-backed issues. Categorize dismissed findings as [False positive] or [Already fixed].\n"
    "3. In verification mode, judge the current implementation directly against ONLY the "
    "plan and checklist.\n"
    "4. Be skeptical of false positives, but do not lower the bar on real requirement "
    "gaps.\n"
    "5. When issues remain, produce a concise prioritized action list the coder can act on.\n"
    "6. Do NOT invent new requirements beyond the plan and checklist.\n"
    "7. End with VERDICT: PASS or VERDICT: FAIL."
)
# Registry of agents usable without any user definition, keyed by
# "<cli>-<role>". Each role has a claude and a codex variant sharing the
# role's system prompt and default reasoning effort.
BUILTIN_AGENTS: dict[str, AgentConfig] = {
    "claude-coder": AgentConfig(
        name="claude-coder",
        command="claude",
        args=["-p", "--model", "opus", "--permission-mode", "auto"],
        system_prompt=_CODER_SYSTEM_PROMPT,
        reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["coder"],
    ),
    "claude-reviewer": AgentConfig(
        name="claude-reviewer",
        command="claude",
        args=["-p", "--model", "opus", "--permission-mode", "auto"],
        system_prompt=_REVIEWER_SYSTEM_PROMPT,
        reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["reviewer"],
    ),
    "claude-senior": AgentConfig(
        name="claude-senior",
        command="claude",
        args=["-p", "--model", "opus", "--permission-mode", "auto"],
        system_prompt=_SENIOR_SYSTEM_PROMPT,
        reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["senior"],
    ),
    "codex-coder": AgentConfig(
        name="codex-coder",
        command="codex",
        # list(...) gives every agent its own args list.
        args=list(_CODEX_ARGS),
        system_prompt=_CODER_SYSTEM_PROMPT,
        reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["coder"],
    ),
    "codex-reviewer": AgentConfig(
        name="codex-reviewer",
        command="codex",
        args=list(_CODEX_ARGS),
        system_prompt=_REVIEWER_SYSTEM_PROMPT,
        reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["reviewer"],
    ),
    "codex-senior": AgentConfig(
        name="codex-senior",
        command="codex",
        args=list(_CODEX_ARGS),
        system_prompt=_SENIOR_SYSTEM_PROMPT,
        reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["senior"],
    ),
}
# Shorthand aliases: "claude" → "claude-coder"/"claude-reviewer", "codex" → same
_AGENT_ALIASES: dict[str, str] = {
"claude": "claude",
"codex": "codex",
}
def resolve_agent_shorthand(name: str, role: str) -> str:
"""Resolve shorthand agent name to full builtin name.
Examples:
resolve_agent_shorthand("claude", "coder") → "claude-coder"
resolve_agent_shorthand("codex", "reviewer") → "codex-reviewer"
resolve_agent_shorthand("claude-coder", "coder") → "claude-coder" (unchanged)
"""
if name in _AGENT_ALIASES:
return f"{_AGENT_ALIASES[name]}-{role}"
return name
# ---------------------------------------------------------------------------
# Role inference (backward compatibility)
# ---------------------------------------------------------------------------
_CODER_PATTERNS = ("gen", "coder", "implement", "develop", "write")
_SENIOR_PATTERNS = ("senior", "lead", "principal", "aggregate", "adjudicat", "synth")
_REVIEWER_PATTERNS = ("review", "audit", "check", "verify", "inspect")
def _infer_roles(agent_names: list[str]) -> tuple[list[str], list[str], list[str]]:
"""Infer coder/reviewer/senior roles from agent names.
Heuristic:
- Names containing 'gen', 'coder', etc. → coder
- Names containing 'senior', 'lead', etc. → senior
- Names containing 'review', 'audit', etc. → reviewer
- If no matches: first agent → coder, rest → reviewers
"""
coders: list[str] = []
reviewers: list[str] = []
seniors: list[str] = []
unclassified: list[str] = []
for name in agent_names:
lower = name.lower()
if any(p in lower for p in _CODER_PATTERNS):
coders.append(name)
elif any(p in lower for p in _SENIOR_PATTERNS):
seniors.append(name)
elif any(p in lower for p in _REVIEWER_PATTERNS):
reviewers.append(name)
else:
unclassified.append(name)
# Fallback: if no classification worked, use positional convention
if not coders and not reviewers:
if len(agent_names) >= 2:
coders = [agent_names[0]]
reviewers = list(agent_names[1:])
elif agent_names:
# Single agent: treat as reviewer (for review-only)
reviewers = list(agent_names)
elif not coders and unclassified:
coders = [unclassified.pop(0)]
elif not reviewers and unclassified:
reviewers = list(unclassified)
unclassified = []
# Any remaining unclassified go to reviewers
reviewers.extend(unclassified)
return coders, reviewers, seniors
def _resolve_agents(
    user_agents: dict[str, AgentConfig],
    coders: list[str],
    reviewers: list[str],
    seniors: list[str],
) -> dict[str, AgentConfig]:
    """Ensure all referenced agents exist by merging built-in definitions.

    If a coder, reviewer or senior name references an agent that is not in
    *user_agents* but is present in BUILTIN_AGENTS, a copy of the built-in
    definition is added. User-defined agents always take precedence, and
    names known to neither are left out.

    Returns:
        A new dict; *user_agents* itself is not modified.
    """
    import copy

    all_referenced = set(coders) | set(reviewers) | set(seniors)
    result = dict(user_agents)
    for name in all_referenced:
        if name not in result and name in BUILTIN_AGENTS:
            # Hand out a private copy: later overrides assign to the agent's
            # attributes and must not leak into the module-level registry.
            result[name] = copy.deepcopy(BUILTIN_AGENTS[name])
    return result
def _default_seniors_for_preset(
    pipeline_raw: Any,
    reviewers: list[str],
    agents: dict[str, AgentConfig],
) -> list[str]:
    """Pick a default senior agent for presets that benefit from adjudication.

    Only the ``preset:review-fix`` preset with at least one reviewer gets a
    default. The senior's family ("codex" or "claude") is taken from the first
    reviewer: first from its name prefix, otherwise from its command string.
    Returns an empty list when no family can be determined.
    """
    if not isinstance(pipeline_raw, str):
        return []
    if pipeline_raw != "preset:review-fix" or not reviewers:
        return []

    lead_reviewer = reviewers[0]
    families = ("codex", "claude")

    # 1) Decide by the reviewer's name prefix.
    for family in families:
        if lead_reviewer.startswith(f"{family}-"):
            return [f"{family}-senior"]

    # 2) Otherwise decide by the command the reviewer runs.
    lead_agent = agents.get(lead_reviewer)
    if lead_agent is None:
        return []
    command_text = lead_agent.command.lower()
    for family in families:
        if family in command_text:
            return [f"{family}-senior"]
    return []
def normalize_reasoning_effort(effort: str) -> str:
    """Map a user-facing reasoning-effort value to its canonical form.

    Aliases listed in ``REASONING_EFFORT_ALIASES`` are translated first; any
    other value is used as given.

    Raises:
        ValueError: If the resulting value is not in
            ``REASONING_EFFORT_CHOICES``.
    """
    canonical = REASONING_EFFORT_ALIASES.get(effort, effort)
    if canonical in REASONING_EFFORT_CHOICES:
        return canonical
    raise ValueError(
        f"Unsupported reasoning effort '{effort}'. "
        f"Use one of: {REASONING_EFFORT_CHOICES}"
    )
def apply_reasoning_effort_settings(
    config: PipelineConfig,
    *,
    reasoning_effort: str | None = None,
    coder_effort: str | None = None,
    reviewer_effort: str | None = None,
    senior_effort: str | None = None,
) -> None:
    """Set reasoning effort on the config's agents, role by role.

    A role-specific value takes precedence over the shared
    ``reasoning_effort``; empty/None values count as "not given". All values
    are normalized (and therefore validated) before any agent is touched.
    The agents in ``config.agents`` are modified in place.

    Raises:
        ValueError: If any provided effort value is unsupported.
    """
    shared = normalize_reasoning_effort(reasoning_effort) if reasoning_effort else None

    def _effective(role_specific: str | None) -> str | None:
        if role_specific:
            return normalize_reasoning_effort(role_specific)
        return shared

    # Resolve every role first so that an invalid value raises before any
    # agent has been modified.
    plan = [
        ("coder", config.coders, _effective(coder_effort)),
        ("reviewer", config.reviewers, _effective(reviewer_effort)),
        ("senior", config.seniors, _effective(senior_effort)),
    ]
    for role, members, effort in plan:
        _apply_role_effort(config.agents, members, effort, role)
def _apply_role_effort(
    agents: dict[str, AgentConfig],
    agent_names: list[str],
    override_effort: str | None,
    role: str,
) -> None:
    """Assign reasoning effort to the agents that fill one role.

    With an override, every listed agent gets that value. Without one, only
    agents that have no effort yet receive the role's default from
    ``DEFAULT_ROLE_REASONING_EFFORTS``. Names missing from ``agents`` are
    skipped. Agents are modified in place.
    """
    for member in agent_names:
        target = agents.get(member)
        if target is None:
            continue
        if override_effort is None:
            if target.reasoning_effort is None:
                target.reasoning_effort = DEFAULT_ROLE_REASONING_EFFORTS[role]
        else:
            target.reasoning_effort = override_effort
# ---------------------------------------------------------------------------
# Default config (no YAML)
# ---------------------------------------------------------------------------
def default_config() -> PipelineConfig:
    """Build the configuration used when no YAML file is supplied.

    Uses all built-in agents, one coder ("claude-coder"), one reviewer
    ("claude-reviewer"), no seniors, the "simple" pipeline preset, at most
    three iterations and Korean ("ko") as the language.
    """
    coders = ["claude-coder"]
    reviewers = ["claude-reviewer"]
    seniors: list[str] = []
    build_steps = PIPELINE_PRESETS["simple"]
    return PipelineConfig(
        output_dir=Path("output"),
        max_iterations=3,
        language="ko",
        inputs={},
        # Shallow copy: the mapping is new, the agent objects are shared.
        agents=dict(BUILTIN_AGENTS),
        coders=coders,
        reviewers=reviewers,
        seniors=seniors,
        pipeline=build_steps(coders, reviewers, seniors),
    )
# ---------------------------------------------------------------------------
# YAML loading
# ---------------------------------------------------------------------------
def load_config(path: Path) -> PipelineConfig:
    """Read a YAML config file and return the validated PipelineConfig.

    Raises:
        ValueError: If the document is not a mapping, or if validation
            reports any error (all messages are included).
    """
    resolved = path.resolve()
    with open(resolved, encoding="utf-8") as handle:
        document = yaml.safe_load(handle)
    if not isinstance(document, dict):
        raise ValueError(f"Config file must be a YAML mapping, got {type(document).__name__}")

    config = _parse_raw(document, resolved)
    problems = validate_config(config)
    if not problems:
        return config
    raise ValueError("Config validation failed:\n " + "\n ".join(problems))
def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
    """Parse raw YAML dict into PipelineConfig.

    Args:
        raw: The top-level mapping loaded from the YAML file.
        config_path: Location of the YAML file. Relative input paths are
            resolved against its directory, and its modification time is
            recorded on the result.

    Returns:
        The populated (not yet validated) PipelineConfig.

    Raises:
        ValueError: If the ``pipeline`` entry cannot be resolved.
    """
    # --- agents ---
    # `or {}` also covers a key that is present but empty: YAML `agents:`
    # with no value loads as None, which has no `.items()`.
    agents: dict[str, AgentConfig] = {}
    for name, agent_data in (raw.get("agents") or {}).items():
        # An agent declared with an empty body gets all defaults.
        agent_data = agent_data or {}
        agents[name] = AgentConfig(
            name=name,
            command=agent_data.get("command", "claude"),
            args=agent_data.get("args", ["-p"]),
            system_prompt=agent_data.get("system_prompt"),
            reasoning_effort=agent_data.get("reasoning_effort"),
            stdin_mode=agent_data.get("stdin_mode", False),
        )

    # --- roles: explicit or inferred ---
    pipeline_raw = raw.get("pipeline", "preset:simple")
    coders_raw = raw.get("coders")
    reviewers_raw = raw.get("reviewers")
    seniors_raw = raw.get("seniors")
    if coders_raw is not None or reviewers_raw is not None or seniors_raw is not None:
        # Explicit role assignment from YAML
        coders: list[str] = coders_raw if coders_raw is not None else []
        reviewers: list[str] = reviewers_raw if reviewers_raw is not None else []
        seniors: list[str] = seniors_raw if seniors_raw is not None else []
    else:
        # Backward compat: infer from agent names
        coders, reviewers, seniors = _infer_roles(list(agents.keys()))
    if not seniors:
        seniors = _default_seniors_for_preset(pipeline_raw, reviewers, agents)

    # Auto-merge built-in agents for any referenced names not yet defined
    agents = _resolve_agents(agents, coders, reviewers, seniors)

    # Fill in per-role default reasoning efforts. The stub only exists to hand
    # the agents and role lists to the helper, which updates agents in place.
    config_stub = PipelineConfig(
        agents=agents,
        coders=coders,
        reviewers=reviewers,
        seniors=seniors,
    )
    apply_reasoning_effort_settings(config_stub)

    # --- inputs (resolve relative to config file location) ---
    config_dir = config_path.parent
    inputs: dict[str, Path | str] = {}
    for key, val in (raw.get("inputs") or {}).items():
        p = Path(val)
        if not p.is_absolute():
            p = config_dir / p
        inputs[key] = p

    # --- pipeline (preset or custom) ---
    steps, phases = _resolve_pipeline(pipeline_raw, coders, reviewers, seniors)

    # Detect preset name for output directory naming
    preset_name = "custom"
    if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
        preset_name = pipeline_raw.split(":", 1)[1]

    return PipelineConfig(
        output_dir=Path(raw.get("output_dir", "output")),
        max_iterations=int(raw.get("max_iterations", 3)),
        min_iterations=int(raw.get("min_iterations", 1)),
        verbose=bool(raw.get("verbose", False)),
        language=raw.get("language", "en"),
        inputs=inputs,
        agents=agents,
        coders=coders,
        reviewers=reviewers,
        seniors=seniors,
        pipeline=steps,
        phases=phases,
        preset_name=preset_name,
        _config_path=config_path,
        _config_mtime=config_path.stat().st_mtime,
    )
def try_reload_config(config: PipelineConfig) -> PipelineConfig:
    """Reload config if the file has been modified on disk.

    Returns the new config if reloaded, or the same config if unchanged.
    Errors during reload (unreadable file, invalid YAML, failed validation,
    a step definition missing a required key) are logged as a warning and the
    previous config is kept, so a bad edit does not crash a running pipeline.

    Args:
        config: The currently active configuration. Configs that were not
            loaded from a file (no recorded path or mtime) are returned as is.
    """
    if config._config_path is None or config._config_mtime is None:
        return config
    try:
        current_mtime = config._config_path.stat().st_mtime
    except OSError:
        # File vanished or is inaccessible: keep what we have.
        return config
    if current_mtime <= config._config_mtime:
        return config

    logger.info("Config file changed, reloading: %s", config._config_path.name)
    try:
        new_config = load_config(config._config_path)
    except (ValueError, OSError, KeyError, yaml.YAMLError) as e:
        # OSError (not just FileNotFoundError) covers e.g. permission errors;
        # KeyError is what a custom step without name/agent/output_key raises
        # while the pipeline is being resolved.
        logger.warning("Config reload failed, keeping previous config: %s", e)
        return config
    logger.info("Config reloaded successfully")
    return new_config
def _resolve_pipeline(
    pipeline_raw: Any,
    coders: list[str],
    reviewers: list[str],
    seniors: list[str],
) -> tuple[list[StepConfig], list[PhaseConfig]]:
    """Resolve pipeline from preset string or explicit step list.

    Returns (steps, phases) tuple. Only one will be non-empty.
    - Presets in ``PIPELINE_PRESETS`` and explicit step lists → steps
      populated, phases empty.
    - Presets in ``PHASED_PRESETS`` → steps empty, phases populated.

    For explicit steps, ``name``, ``agent`` and ``output_key`` are required;
    ``role`` defaults to "generate", ``prompt_template`` to
    ``default:<role>``, and ``parallel`` to False.

    Raises:
        ValueError: For an unknown preset name, or when ``pipeline_raw`` is
            neither a ``preset:`` string nor a list.
        KeyError: When an explicit step lacks a required key.
    """
    # Preset: "preset:simple" or "preset:review-fix"
    if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
        preset_name = pipeline_raw.split(":", 1)[1]
        if preset_name in PIPELINE_PRESETS:
            return PIPELINE_PRESETS[preset_name](coders, reviewers, seniors), []
        if preset_name in PHASED_PRESETS:
            return [], PHASED_PRESETS[preset_name](coders, reviewers, seniors)
        all_presets = list(PIPELINE_PRESETS.keys()) + list(PHASED_PRESETS.keys())
        raise ValueError(
            f"Unknown pipeline preset '{preset_name}'. "
            f"Available: {all_presets}"
        )

    # Explicit step list
    if isinstance(pipeline_raw, list):
        steps = []
        for step_data in pipeline_raw:
            role = step_data.get("role", "generate")
            steps.append(StepConfig(
                name=step_data["name"],
                agent=step_data["agent"],
                role=role,
                prompt_template=step_data.get("prompt_template", f"default:{role}"),
                output_key=step_data["output_key"],
                verdict=step_data.get("verdict", False),
                verdict_pattern=step_data.get("verdict_pattern", r"VERDICT:\s*PASS"),
                context_override=step_data.get("context_override", {}),
                # Lets hand-written pipelines opt steps into parallel batches;
                # previously the key was silently dropped.
                parallel=bool(step_data.get("parallel", False)),
            ))
        return steps, []

    raise ValueError(
        f"'pipeline' must be a preset string (e.g. 'preset:simple') "
        f"or a list of step definitions, got {type(pipeline_raw).__name__}"
    )
def validate_config(config: PipelineConfig) -> list[str]:
    """Check a configuration and return every problem found.

    Phased configs are checked phase by phase, otherwise the flat step list
    is checked: steps must exist, reference defined agents, have unique
    names/output keys, include at least one verdict step, and verdict
    patterns must be valid regular expressions. Input files must exist and
    the language must be "en" or "ko".

    Returns:
        Human-readable error strings; an empty list means the config is valid.
    """
    problems: list[str] = []

    if config.phases:
        # --- Phased pipeline validation ---
        for phase in config.phases:
            label = f"Phase '{phase.name}'"
            if not phase.steps:
                problems.append(f"{label} has no steps")
            problems.extend(
                f"{label} step '{step.name}' references "
                f"undefined agent '{step.agent}'. "
                f"Defined agents: {list(config.agents.keys())}"
                for step in phase.steps
                if step.agent not in config.agents
            )
            _validate_unique_step_fields(phase.steps, problems, scope=label)
            verdict_steps = [step for step in phase.steps if step.verdict]
            if not verdict_steps:
                problems.append(
                    f"{label} must have at least one step with verdict: true"
                )
            # Validate verdict patterns
            for step in verdict_steps:
                try:
                    re.compile(step.verdict_pattern)
                except re.error as exc:
                    problems.append(
                        f"{label} step '{step.name}' "
                        f"has invalid verdict_pattern: {exc}"
                    )
    else:
        # --- Simple pipeline validation ---
        steps = config.pipeline
        if not steps:
            problems.append("Pipeline must have at least one step")
        problems.extend(
            f"Step '{step.name}' references undefined agent '{step.agent}'. "
            f"Defined agents: {list(config.agents.keys())}"
            for step in steps
            if step.agent not in config.agents
        )
        _validate_unique_step_fields(steps, problems, scope="Pipeline")
        verdict_steps = [step for step in steps if step.verdict]
        if not verdict_steps:
            problems.append("Pipeline must have at least one step with verdict: true")
        for step in verdict_steps:
            try:
                re.compile(step.verdict_pattern)
            except re.error as exc:
                problems.append(
                    f"Step '{step.name}' has invalid verdict_pattern: {exc}"
                )

    # --- Common validation ---
    problems.extend(
        f"Input file '{key}' not found: {source}"
        for key, source in config.inputs.items()
        if isinstance(source, Path) and not source.exists()
    )
    if config.language not in ("en", "ko"):
        problems.append(f"Unsupported language '{config.language}'. Use 'en' or 'ko'.")
    return problems
def _validate_unique_step_fields(
    steps: list[StepConfig],
    errors: list[str],
    *,
    scope: str,
) -> None:
    """Report repeated step names and output keys within one step collection.

    One message is appended to ``errors`` for every repeat occurrence;
    ``scope`` is the prefix naming the collection (e.g. "Pipeline").
    """
    names_seen: set[str] = set()
    keys_seen: set[str] = set()
    for step in steps:
        if step.name in names_seen:
            errors.append(f"{scope} has duplicate step name '{step.name}'")
        else:
            names_seen.add(step.name)
        if step.output_key in keys_seen:
            errors.append(f"{scope} has duplicate output_key '{step.output_key}'")
        else:
            keys_seen.add(step.output_key)
def apply_input_overrides(
    config: PipelineConfig, overrides: dict[str, str]
) -> None:
    """Replace or add config inputs from CLI ``--input`` overrides.

    Each override value is stored as a ``Path`` exactly as given (it is not
    resolved against the config file's directory).
    """
    config.inputs.update(
        (input_name, Path(raw_path)) for input_name, raw_path in overrides.items()
    )

118
cross_eval/models.py Normal file
View File

@@ -0,0 +1,118 @@
"""Data models for cross-eval pipeline."""
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
@dataclass
class AgentConfig:
    """Definition of a single agent (a command-line tool plus its options)."""
    # Name the agent is registered under in the config's `agents` mapping.
    name: str
    # Command to run; its text is also checked for substrings such as
    # "claude" or "codex" to decide tool-specific behavior.
    command: str
    # Extra command-line arguments (empty unless provided).
    args: list[str] = field(default_factory=list)
    # Optional system prompt; None when not configured.
    system_prompt: Optional[str] = None
    # Reasoning-effort level; None until set explicitly or filled per role.
    reasoning_effort: Optional[str] = None
    # NOTE(review): presumably makes the prompt go to the tool via stdin —
    # confirm against the agent invocation code.
    stdin_mode: bool = False
@dataclass
class StepConfig:
    """One step in the pipeline: which agent runs with which prompt."""
    # Step name; used in log lines and as the saved output's file name.
    name: str
    agent: str # reference to agents key
    role: str # "generate" or "review"
    prompt_template: str # "default:<role>" or file path
    # Key under which this step's output is stored for later steps.
    output_key: str
    # True if this step's output decides PASS/FAIL for the iteration.
    verdict: bool = False
    # Regular expression searched in the output; a match means PASS.
    verdict_pattern: str = r"VERDICT:\s*PASS"
    # Context key -> template text; rendered and written over the context
    # before the prompt is rendered.
    context_override: dict[str, str] = field(default_factory=dict)
    parallel: bool = False # Can run concurrently with adjacent parallel steps
@dataclass
class PhaseConfig:
    """One phase in a multi-phase pipeline (e.g. review-fix)."""
    # Phase name; shown in logs and recorded on each iteration result.
    name: str
    # Steps executed, in order, on every iteration of this phase.
    steps: list[StepConfig] = field(default_factory=list)
    # Upper bound on iterations of this phase.
    max_iterations: int = 10
    consecutive_pass: int = 1 # stop after N consecutive PASSes
@dataclass
class PipelineConfig:
    """Full cross-eval configuration."""
    # Parent directory under which each run creates its own folder.
    output_dir: Path = field(default_factory=lambda: Path("output"))
    # Iteration limit for a simple (non-phased) pipeline.
    max_iterations: int = 3
    # A PASS before this many iterations does not stop a simple pipeline.
    min_iterations: int = 1
    verbose: bool = False
    language: str = "en" # "en" or "ko"
    # Input name -> file path (read as UTF-8 text) or literal string content.
    inputs: dict[str, Path | str] = field(default_factory=dict)
    # Agent name -> definition.
    agents: dict[str, AgentConfig] = field(default_factory=dict)
    # Agent names assigned to each role.
    coders: list[str] = field(default_factory=list)
    reviewers: list[str] = field(default_factory=list)
    seniors: list[str] = field(default_factory=list)
    # Flat step list for simple pipelines; empty when `phases` is used.
    pipeline: list[StepConfig] = field(default_factory=list)
    # Phases for phased pipelines; when non-empty the phased runner is used.
    phases: list[PhaseConfig] = field(default_factory=list)
    # Preset name (or "custom"); prefix of the run directory name.
    preset_name: str = "custom"
    # Source file and its modification time at load; both None when the
    # config did not come from a file. Used to detect on-disk changes.
    _config_path: Optional[Path] = field(default=None, repr=False)
    _config_mtime: Optional[float] = field(default=None, repr=False)
@dataclass
class AgentResult:
    """Result from an agent invocation."""
    # Text produced by the agent; stored as the step's output.
    output: str
    # NOTE(review): presumably the exit status of the agent process — confirm.
    exit_code: int
    # Agent and step this result belongs to.
    agent_name: str
    step_name: str
    # Time the invocation took, in seconds.
    duration_seconds: float
@dataclass
class ReviewMetrics:
    """Parsed metrics from a single review output.

    All fields are counters that start at zero.
    """
    # Severity counts
    critical: int = 0
    major: int = 0
    minor: int = 0
    # Category counts
    over_engineering: int = 0
    omission: int = 0
    # Assessment counts
    confirmed: int = 0
    dismissed: int = 0
@dataclass
class IterationResult:
    """Results from a single iteration."""
    # 1-based iteration number; phased runs use the run-wide counter.
    iteration: int
    # Per-step results and raw outputs, both keyed by the step's output_key.
    step_results: dict[str, AgentResult] = field(default_factory=dict)
    step_outputs: dict[str, str] = field(default_factory=dict)
    # "PASS" or "FAIL"; None when the iteration had no verdict step.
    verdict: Optional[str] = None
    # Feedback text collected from the verdict steps of this iteration.
    feedback: Optional[str] = None
    # Phase the iteration ran in; None for simple pipelines.
    phase_name: Optional[str] = None
    # Warning text when the aggregate review repeated an earlier iteration.
    repeated_aggregate_warning: Optional[str] = None
    # Parsed review metrics, if any were computed.
    review_metrics: Optional[ReviewMetrics] = None
@dataclass
class PipelineResult:
    """Results from the entire pipeline run."""
    # One entry per executed iteration, in execution order.
    iterations: list[IterationResult] = field(default_factory=list)
    # "PASS" or "MAX_ITERATIONS_REACHED".
    final_verdict: str = "MAX_ITERATIONS_REACHED"
    # Wall-clock duration of the run in seconds.
    total_duration: float = 0.0
    # Directory the run wrote its outputs to.
    run_dir: Optional[Path] = None
    # All repeated-aggregate warnings raised during the run.
    repeated_aggregate_warnings: list[str] = field(default_factory=list)

700
cross_eval/pipeline.py Normal file
View File

@@ -0,0 +1,700 @@
"""Main pipeline execution engine."""
from __future__ import annotations
import logging
import os
import re
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from cross_eval.agent import invoke_agent
from cross_eval.config import try_reload_config
from cross_eval.models import (
AgentResult,
IterationResult,
PipelineConfig,
PipelineResult,
StepConfig,
)
from cross_eval.prompts import render_template, resolve_template, set_language
from cross_eval.report import build_report
logger = logging.getLogger(__name__)
def run_pipeline(
    config: PipelineConfig,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Run the cross-eval pipeline described by ``config``.

    A fresh run directory (output/{preset}_{datetime}/) is created first;
    configs with phases go to the phased runner, all others to the simple
    runner.
    """
    run_dir = _make_run_dir(config)
    runner = _run_phased_pipeline if config.phases else _run_simple_pipeline
    return runner(config, run_dir, cwd, dry_run, timeout)
def _make_run_dir(config: PipelineConfig) -> Path:
    """Create and return ``<output_dir>/<preset_name>_<YYYYmmdd_HHMMSS>``.

    Missing parent directories are created; an already existing directory is
    reused without error.
    """
    started = datetime.now()
    stamp = started.strftime("%Y%m%d_%H%M%S")
    target = config.output_dir / f"{config.preset_name}_{stamp}"
    target.mkdir(parents=True, exist_ok=True)
    return target
def _run_simple_pipeline(
    config: PipelineConfig,
    run_dir: Path,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Execute a simple (non-phased) pipeline.

    Runs the step list up to ``config.max_iterations`` times, feeding the
    verdict steps' output back as feedback. The loop stops at the first PASS
    reached at or after ``config.min_iterations``. Before every iteration the
    config file is re-read if it changed and the input files are refreshed.

    Args:
        config: Pipeline configuration.
        run_dir: Directory receiving step outputs and the final report.
        cwd: Working directory for agents; defaults to the current directory.
        dry_run: Print prompts instead of invoking agents; stops after one
            iteration and writes no report.
        timeout: Per-agent timeout passed through to the invocation.

    Returns:
        The run's PipelineResult. ``final_verdict`` is "PASS" only when the
        last executed iteration passed.
    """
    if cwd is None:
        cwd = Path(os.getcwd())
    set_language(config.language)
    input_contents = _load_inputs(config)
    feedback = "(no feedback — first iteration)"
    iterations: list[IterationResult] = []
    start_time = time.monotonic()
    final_verdict = "MAX_ITERATIONS_REACHED"
    aggregate_history: dict[str, int] = {}
    aggregate_warnings: list[str] = []

    # NOTE: the iteration count is fixed from the config as it is here; a
    # reload inside the loop does not extend or shorten the range.
    for i in range(1, config.max_iterations + 1):
        config = try_reload_config(config)
        set_language(config.language)
        _refresh_inputs(config, input_contents)
        logger.info("=" * 50)
        logger.info(" Iteration %d/%d", i, config.max_iterations)
        logger.info("=" * 50)

        step_outputs, step_results, verdict = _run_steps(
            config.pipeline, config, input_contents, feedback,
            i, config.max_iterations, cwd, timeout, dry_run,
            run_dir=run_dir, output_iter=i,
        )
        iter_result = IterationResult(
            iteration=i,
            step_results=step_results,
            step_outputs=step_outputs,
            verdict=verdict,
        )
        warning = _detect_repeated_aggregate(
            config.pipeline, step_outputs, aggregate_history, iteration=i,
        )
        if warning:
            iter_result.repeated_aggregate_warning = warning
            aggregate_warnings.append(warning)
            logger.warning(" %s", warning)

        iter_result.feedback = _collect_feedback(config.pipeline, step_outputs)
        # Keep the previous feedback when this iteration produced none.
        feedback = iter_result.feedback or feedback
        iterations.append(iter_result)

        if verdict == "PASS":
            final_verdict = "PASS"
            if i >= config.min_iterations:
                logger.info(" PASS at iteration %d (min=%d reached)!", i, config.min_iterations)
                break
            else:
                logger.info(
                    " PASS at iteration %d, but min_iterations=%d — continuing",
                    i, config.min_iterations,
                )
        else:
            # A later non-PASS iteration supersedes an earlier PASS that was
            # only continued to satisfy min_iterations; otherwise the run
            # would report PASS although its last iteration failed.
            final_verdict = "MAX_ITERATIONS_REACHED"

        if dry_run:
            logger.info(" (dry-run: stopping after iteration 1)")
            break

    total_duration = time.monotonic() - start_time
    pipeline_result = PipelineResult(
        iterations=iterations,
        final_verdict=final_verdict,
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
    )
    if not dry_run:
        _save_report(run_dir, config, pipeline_result)
    return pipeline_result
def _run_phased_pipeline(
    config: PipelineConfig,
    run_dir: Path,
    cwd: Path | None = None,
    dry_run: bool = False,
    timeout: int | None = None,
) -> PipelineResult:
    """Execute a multi-phase pipeline (e.g. review-fix).

    Phases run in order. Each phase repeats its steps until it has seen
    ``consecutive_pass`` PASS verdicts in a row or reaches its
    ``max_iterations``; the next phase then starts regardless. Feedback
    carries over between iterations and phases. Outputs are saved under a
    run-wide iteration counter.

    Args:
        config: Pipeline configuration (must contain phases).
        run_dir: Directory receiving step outputs and the final report.
        cwd: Working directory for agents; defaults to the current directory.
        dry_run: Print prompts instead of invoking agents; each phase runs a
            single iteration and no report is written.
        timeout: Per-agent timeout passed through to the invocation.

    Returns:
        The run's PipelineResult. ``final_verdict`` is "PASS" only when the
        last phase converged.
    """
    if cwd is None:
        cwd = Path(os.getcwd())
    set_language(config.language)
    input_contents = _load_inputs(config)
    iterations: list[IterationResult] = []
    feedback = "(no feedback — first iteration)"
    start_time = time.monotonic()
    final_verdict = "MAX_ITERATIONS_REACHED"
    global_iter = 0
    aggregate_history_by_phase: dict[str, dict[str, int]] = {}
    aggregate_warnings: list[str] = []

    # The phases to run are fixed at start. `config` may be replaced by a
    # reload inside the loop, so the last-phase check below must use this
    # same list rather than the reloaded config's phases.
    phases = list(config.phases)
    for phase_idx, phase in enumerate(phases):
        logger.info("=" * 60)
        logger.info(
            " Phase: %s (max_iter=%d, consecutive_pass=%d)",
            phase.name, phase.max_iterations, phase.consecutive_pass,
        )
        logger.info("=" * 60)
        consecutive_passes = 0
        phase_converged = False

        for pi in range(1, phase.max_iterations + 1):
            global_iter += 1
            config = try_reload_config(config)
            set_language(config.language)
            _refresh_inputs(config, input_contents)
            logger.info("-" * 50)
            logger.info(
                " [%s] Iteration %d/%d (global: v%d)",
                phase.name, pi, phase.max_iterations, global_iter,
            )
            logger.info("-" * 50)

            step_outputs, step_results, verdict = _run_steps(
                phase.steps, config, input_contents, feedback,
                pi, phase.max_iterations, cwd, timeout, dry_run,
                run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
            )
            iter_result = IterationResult(
                iteration=global_iter,
                step_results=step_results,
                step_outputs=step_outputs,
                verdict=verdict,
                phase_name=phase.name,
            )
            # Repeat detection is tracked separately for every phase.
            phase_history = aggregate_history_by_phase.setdefault(phase.name, {})
            warning = _detect_repeated_aggregate(
                phase.steps, step_outputs, phase_history, iteration=global_iter,
                phase_name=phase.name,
            )
            if warning:
                iter_result.repeated_aggregate_warning = warning
                aggregate_warnings.append(warning)
                logger.warning(" %s", warning)

            iter_result.feedback = _collect_feedback(phase.steps, step_outputs)
            # Keep the previous feedback when this iteration produced none.
            feedback = iter_result.feedback or feedback
            iterations.append(iter_result)

            if verdict == "PASS":
                consecutive_passes += 1
                logger.info(
                    " [%s] PASS (%d/%d consecutive)",
                    phase.name, consecutive_passes, phase.consecutive_pass,
                )
                if consecutive_passes >= phase.consecutive_pass:
                    logger.info(
                        " [%s] Converged! %d consecutive PASSes.",
                        phase.name, phase.consecutive_pass,
                    )
                    phase_converged = True
                    break
            else:
                # Any non-PASS breaks the streak.
                consecutive_passes = 0

            if dry_run:
                break

        if phase_converged:
            logger.info(" Phase '%s' completed: CONVERGED", phase.name)
        else:
            logger.info(
                " Phase '%s' completed: max iterations (%d) reached",
                phase.name, phase.max_iterations,
            )
        # Only the last phase decides the overall verdict.
        if phase_idx == len(phases) - 1:
            final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"

    total_duration = time.monotonic() - start_time
    pipeline_result = PipelineResult(
        iterations=iterations,
        final_verdict=final_verdict,
        total_duration=round(total_duration, 1),
        run_dir=run_dir,
        repeated_aggregate_warnings=aggregate_warnings,
    )
    if not dry_run:
        _save_report(run_dir, config, pipeline_result)
    return pipeline_result
# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------
def _load_inputs(config: PipelineConfig) -> dict[str, str]:
    """Return the text of every configured input, keyed by input name.

    String inputs are used verbatim; all other inputs are read from disk as
    UTF-8 text.
    """
    return {
        key: source if isinstance(source, str) else source.read_text(encoding="utf-8")
        for key, source in config.inputs.items()
    }
def _refresh_inputs(
    config: PipelineConfig, input_contents: dict[str, str],
) -> None:
    """Update ``input_contents`` in place from the configured inputs.

    String inputs are copied as is and existing files are re-read (UTF-8).
    A file that no longer exists keeps whatever content was loaded before.
    """
    for key, source in config.inputs.items():
        if isinstance(source, str):
            input_contents[key] = source
            continue
        if isinstance(source, Path) and source.exists():
            input_contents[key] = source.read_text(encoding="utf-8")
# ---------------------------------------------------------------------------
# Parallel step grouping
# ---------------------------------------------------------------------------
def _get_step_dependencies(step: StepConfig) -> set[str]:
    """Collect every ``{name}`` placeholder used in the step's context overrides.

    Only the override values are scanned; the returned names are the
    placeholder identifiers without braces.
    """
    return {
        placeholder.group(1)
        for template in step.context_override.values()
        for placeholder in re.finditer(r"\{(\w+)\}", template)
    }
def _group_parallel_steps(steps: list[StepConfig]) -> list[list[StepConfig]]:
    """Split the step list into batches that are executed one after another.

    A non-parallel step always forms a batch of its own. Adjacent steps with
    ``parallel=True`` share a batch, except that a step referring to an
    output key produced inside the batch being built starts a new batch.
    Step order is preserved.
    """
    batches: list[list[StepConfig]] = []
    pending: list[StepConfig] = []
    pending_keys: set[str] = set()

    def _flush() -> None:
        """Close the batch under construction, if it has any steps."""
        nonlocal pending, pending_keys
        if pending:
            batches.append(pending)
        pending = []
        pending_keys = set()

    for step in steps:
        if step.parallel:
            # A dependency on a sibling's output forces a batch boundary.
            if _get_step_dependencies(step) & pending_keys:
                _flush()
            pending.append(step)
            pending_keys.add(step.output_key)
        else:
            _flush()
            batches.append([step])
    _flush()
    return batches
# ---------------------------------------------------------------------------
# Step execution
# ---------------------------------------------------------------------------
def _run_steps(
    steps: list[StepConfig],
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
) -> tuple[dict[str, str], dict[str, AgentResult], str | None]:
    """Run one iteration's steps and derive the iteration verdict.

    Steps are grouped into batches; single-step batches run directly and
    larger batches run in parallel.

    Returns:
        ``(step_outputs, step_results, verdict)``. The verdict is "PASS" only
        if every verdict step passed, "FAIL" if any failed, and None when the
        steps contain no verdict step.
    """
    step_outputs: dict[str, str] = {}
    step_results: dict[str, AgentResult] = {}

    for batch in _group_parallel_steps(steps):
        if len(batch) > 1:
            # Parallel batch — run with ThreadPoolExecutor
            _execute_parallel_batch(
                batch, config, input_contents, feedback,
                iteration, max_iterations, cwd, timeout, dry_run,
                step_outputs, step_results,
                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
            )
            continue
        # Single step — run directly
        _execute_step(
            batch[0], config, input_contents, feedback,
            iteration, max_iterations, cwd, timeout, dry_run,
            step_outputs, step_results,
            run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
        )

    # Combine the verdict steps: the first one sets the verdict, and any
    # later FAIL overrides it (ALL must PASS).
    verdict: str | None = None
    for step in steps:
        if not step.verdict:
            continue
        step_verdict = _extract_verdict(
            step_outputs.get(step.output_key, ""), step.verdict_pattern,
        )
        logger.info(" [%s] verdict: %s", step.name, step_verdict)
        if verdict is None or step_verdict == "FAIL":
            verdict = step_verdict
    return step_outputs, step_results, verdict
def _execute_step(
    step: StepConfig,
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    step_outputs: dict[str, str],
    step_results: dict[str, AgentResult],
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
    quiet: bool = False,
) -> None:
    """Render one step's prompt, run its agent and record the output.

    ``step_outputs`` and ``step_results`` are updated in place and the output
    is written to the run directory. In dry-run mode the prompt is printed
    and a placeholder output is stored instead of invoking the agent.

    Raises:
        RuntimeError: When the agent times out or fails; an error file for
            the step is saved before raising.
    """
    if not quiet:
        logger.info(" [%s] agent='%s' role='%s'", step.name, step.agent, step.role)

    # Build the prompt: template + context (+ overrides).
    template = resolve_template(step.prompt_template)
    context = _build_context(
        input_contents, step_outputs, feedback, iteration, max_iterations,
    )
    if step.context_override:
        context = _apply_context_override(context, step.context_override)
    prompt = render_template(template, context)

    # Dry run: show the prompt, store a placeholder, do not call the agent.
    if dry_run:
        phase_label = f" phase={phase_name}" if phase_name else ""
        print(f"\n--- Step: {step.name} (agent={step.agent}{phase_label}) ---")
        print(prompt)
        print(f"--- end {step.name} ---\n")
        step_outputs[step.output_key] = f"(dry-run: no output for {step.output_key})"
        return

    def _as_text(stream: object) -> object:
        """Decode captured bytes as UTF-8; map a missing stream to ''."""
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return stream or ""

    agent_config = config.agents[step.agent]
    try:
        result = invoke_agent(
            agent_config, prompt, step.name,
            cwd=cwd, timeout=timeout, quiet=quiet,
        )
    except subprocess.TimeoutExpired as e:
        stdout = _as_text(e.stdout)
        stderr = _as_text(e.stderr)
        phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
        error_msg = (
            f"# Agent Timeout\n\n"
            f"{phase_info}"
            f"- **Step**: {step.name}\n"
            f"- **Agent**: {step.agent}\n"
            f"- **Timeout**: {timeout}s\n\n"
            f"Partial stdout ({len(stdout)} chars):\n"
            f"```\n{stdout[:2000] or '(none)'}\n```\n\n"
            f"Stderr:\n```\n{stderr[:2000] or '(none)'}\n```\n"
        )
        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
        logger.error(" [%s] TIMEOUT after %ss — saved to output", step.name, timeout)
        raise RuntimeError(
            f"Agent '{step.agent}' timed out after {timeout}s at step '{step.name}'. "
            f"Error saved to {run_dir}/v{output_iter}/{step.name}_error.md. "
            f"Try --timeout 0 (unlimited)"
        )
    except RuntimeError as e:
        phase_info = f"- **Phase**: {phase_name}\n" if phase_name else ""
        error_msg = (
            f"# Agent Error\n\n{phase_info}"
            f"- **Step**: {step.name}\n- **Agent**: {step.agent}\n\n```\n{e}\n```\n"
        )
        _save_step_output(run_dir, output_iter, f"{step.name}_error", error_msg)
        logger.error(" [%s] FAILED — saved to output", step.name)
        raise

    # Record the result in memory, then persist it.
    step_outputs[step.output_key] = result.output
    step_results[step.output_key] = result
    if not quiet:
        logger.info(
            " [%s] completed (%.1fs, %d chars)",
            step.name, result.duration_seconds, len(result.output),
        )
    _save_step_output(run_dir, output_iter, step.name, result.output)
def _execute_parallel_batch(
    batch: list[StepConfig],
    config: PipelineConfig,
    input_contents: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
    cwd: Path,
    timeout: int | None,
    dry_run: bool,
    step_outputs: dict[str, str],
    step_results: dict[str, AgentResult],
    *,
    run_dir: Path,
    output_iter: int,
    phase_name: str | None = None,
) -> None:
    """Execute multiple steps in parallel using threads.

    Every step in the batch is rendered against the same snapshot of inputs
    and earlier outputs, so steps in a batch never see each other's results.
    On success, ``step_outputs`` and ``step_results`` are updated in place and
    each output is saved to the run directory, in batch order.

    In dry-run mode the steps are simply executed one after another through
    ``_execute_step``.

    Raises:
        Exception: The first error collected from the worker threads (in
            completion order). In that case nothing from the batch is merged
            or saved, and no per-step error file is written here.
    """
    agent_names = ", ".join(s.agent for s in batch)
    logger.info(" [parallel] %d agents: %s", len(batch), agent_names)
    if dry_run:
        # Nothing to parallelize: just print each prompt in order.
        for step in batch:
            _execute_step(
                step, config, input_contents, feedback,
                iteration, max_iterations, cwd, timeout, dry_run,
                step_outputs, step_results,
                run_dir=run_dir, output_iter=output_iter, phase_name=phase_name,
            )
        return
    # Snapshot context before parallel execution (all steps see same state)
    context_snapshot = dict(input_contents)
    context_snapshot.update(step_outputs)
    # Collect results from parallel threads
    local_outputs: dict[str, str] = {}
    local_results: dict[str, AgentResult] = {}
    errors: list[Exception] = []
    # Show a single spinner for the batch
    from cross_eval.agent import _Spinner
    spinner = _Spinner(
        f"[parallel] {len(batch)} agents running ({agent_names})..."
    )
    spinner.start()
    batch_start = time.monotonic()

    def _run_one(step: StepConfig) -> tuple[str, str, AgentResult]:
        """Run one step, return (output_key, output, result)."""
        template = resolve_template(step.prompt_template)
        # The snapshot already contains earlier step outputs, hence the empty
        # dict in the step_outputs position.
        context = _build_context(
            context_snapshot, {}, feedback, iteration, max_iterations,
        )
        if step.context_override:
            context = _apply_context_override(context, step.context_override)
        prompt = render_template(template, context)
        agent_config = config.agents[step.agent]
        # quiet=True: the batch-level spinner above is shown instead.
        result = invoke_agent(
            agent_config, prompt, step.name,
            cwd=cwd, timeout=timeout, quiet=True,
        )
        return step.output_key, result.output, result

    # One worker per step, so the whole batch starts at once.
    with ThreadPoolExecutor(max_workers=len(batch)) as executor:
        futures = {executor.submit(_run_one, step): step for step in batch}
        for future in as_completed(futures):
            step = futures[future]
            try:
                output_key, output, result = future.result()
                local_results[output_key] = result
                local_outputs[output_key] = output
            except Exception as e:
                # Keep draining the remaining futures; errors are raised
                # after the pool has finished.
                errors.append(e)
    batch_elapsed = round(time.monotonic() - batch_start, 1)
    if errors:
        spinner.stop(f"[parallel] FAILED ({batch_elapsed}s)")
        raise errors[0]
    spinner.stop(f"[parallel] {len(batch)} agents done ({batch_elapsed}s)")
    # Merge results (iterating the batch keeps logs and files in step order,
    # independent of which agent finished first)
    for step in batch:
        key = step.output_key
        step_outputs[key] = local_outputs[key]
        step_results[key] = local_results[key]
        r = local_results[key]
        logger.info(
            " [%s] completed (%.1fs, %d chars)",
            step.name, r.duration_seconds, len(r.output),
        )
        _save_step_output(run_dir, output_iter, step.name, r.output)
# ---------------------------------------------------------------------------
# Context and template helpers
# ---------------------------------------------------------------------------
def _build_context(
    input_contents: dict[str, str],
    step_outputs: dict[str, str],
    feedback: str,
    iteration: int,
    max_iterations: int,
) -> dict[str, str]:
    """Assemble the values available to a prompt template.

    Later sources win on key clashes: inputs, then step outputs, then the
    reserved keys ``feedback``, ``iteration`` and ``max_iterations`` (the two
    counters are stored as strings). A new dict is returned.
    """
    return {
        **input_contents,
        **step_outputs,
        "feedback": feedback,
        "iteration": str(iteration),
        "max_iterations": str(max_iterations),
    }
def _apply_context_override(
    context: dict[str, str],
    overrides: dict[str, str],
) -> dict[str, str]:
    """Return a copy of ``context`` with the override entries applied.

    Each override value is a template rendered against the original context
    (not against earlier overrides), which allows e.g. cross-review steps to
    remap one key to another step's output. ``context`` is left unchanged.
    """
    merged = dict(context)
    merged.update(
        (key, render_template(value_template, context))
        for key, value_template in overrides.items()
    )
    return merged
def _collect_feedback(
    steps: list[StepConfig],
    step_outputs: dict[str, str],
) -> str:
    """Build the feedback text from the outputs of the verdict steps.

    With exactly one verdict step its raw output is returned (backward
    compatible). Otherwise every non-empty verdict output gets a
    "Review by <agent> (<step>)" heading and the sections are joined with a
    horizontal rule; the result is empty when there is nothing to report.
    """
    verdict_steps = [step for step in steps if step.verdict]
    if len(verdict_steps) == 1:
        (only_step,) = verdict_steps
        return step_outputs.get(only_step.output_key, "")

    reviews = ((step, step_outputs.get(step.output_key, "")) for step in verdict_steps)
    sections = [
        f"## Review by {step.agent} ({step.name})\n{text}"
        for step, text in reviews
        if text
    ]
    return "\n\n---\n\n".join(sections)
def _detect_repeated_aggregate(
    steps: list[StepConfig],
    step_outputs: dict[str, str],
    history: dict[str, int],
    *,
    iteration: int,
    phase_name: str | None = None,
) -> str | None:
    """Report when the aggregate review repeats an earlier iteration verbatim.

    Only the first step whose template is ``default:aggregate-review`` is
    inspected.  Its output is normalized and looked up in *history*, which
    maps normalized outputs to the iteration that first produced them.

    Returns:
        A human-readable message when the normalized output was already
        recorded; ``None`` when there is no aggregate step, the output is
        empty, or the output is new (in which case it is recorded in
        *history* under *iteration*).
    """
    aggregate_step = next(
        (s for s in steps if s.prompt_template == "default:aggregate-review"),
        None,
    )
    if aggregate_step is None:
        return None

    fingerprint = _normalize_aggregate_output(
        step_outputs.get(aggregate_step.output_key, "")
    )
    if not fingerprint:
        return None

    if fingerprint not in history:
        # First sighting: remember which iteration produced it.
        history[fingerprint] = iteration
        return None

    prev_iter = history[fingerprint]
    phase_prefix = f"[{phase_name}] " if phase_name else ""
    return (
        f"{phase_prefix}Repeated aggregate_review detected at iteration {iteration} "
        f"(same as iteration {prev_iter})."
    )
def _normalize_aggregate_output(output: str) -> str:
"""Normalize aggregate output for repeat detection."""
return " ".join(output.lower().split())
def _extract_verdict(output: str, pattern: str) -> str:
"""Extract PASS or FAIL from output using regex pattern."""
if re.search(pattern, output):
return "PASS"
return "FAIL"
def _save_step_output(
run_dir: Path,
iteration: int,
step_name: str,
content: str,
) -> Path:
"""Save step output to run_dir/v{iteration}/{step_name}.md"""
path = run_dir / f"v{iteration}" / f"{step_name}.md"
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(content, encoding="utf-8")
return path
def _save_report(run_dir: Path, config: PipelineConfig, result: PipelineResult) -> None:
    """Render the final markdown report and write it to ``run_dir/final-report.md``.

    The report text comes from ``build_report``.  The target directory is
    created when missing, the file is written as UTF-8, and the saved
    location is logged at INFO level.
    """
    markdown = build_report(config, result)
    destination = run_dir / "final-report.md"
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(markdown, encoding="utf-8")
    logger.info("Report saved: %s", destination)

845
cross_eval/prompts.py Normal file
View File

@@ -0,0 +1,845 @@
"""Default prompt templates and pipeline presets."""
from __future__ import annotations
import collections
from pathlib import Path
from typing import Callable, Optional
from cross_eval.models import PhaseConfig, StepConfig
# ---------------------------------------------------------------------------
# Default prompt templates
# ---------------------------------------------------------------------------
# English prompt for a code-generation step (registered as "generate").
# Placeholders: {plan}, {checklist}, {docs}, {feedback}, {iteration},
# {max_iterations}.
GENERATE_TEMPLATE = """\
You are tasked with implementing code based on a plan and checklist.
## Plan
{plan}
## Checklist
{checklist}
## Reference Documents
{docs}
## Previous Review Feedback
{feedback}
## Iteration
This is iteration {iteration} of {max_iterations}.
## Instructions
1. Explore the project directory to understand the existing codebase structure.
2. Implement ONLY what the plan specifies. Do NOT add extra features, \
unnecessary abstractions, or premature optimizations.
3. Follow every item in the checklist.
4. If there is previous feedback, address ONLY the specific issues mentioned.
5. If previous feedback contains items marked as DISMISSED or false positive, \
IGNORE those items — they have been verified as correct.
6. Output the complete implementation.
"""
# English prompt for reviewing generated code (registered as "review").
# Placeholders: {plan}, {checklist}, {docs}, {generated_code}, {feedback}.
# The reviewer is asked to tag issues as [Severity][Category] and to finish
# with "VERDICT: PASS" or "VERDICT: FAIL".
REVIEW_TEMPLATE = """\
You are tasked with reviewing code against a plan and checklist.
## Plan
{plan}
## Checklist
{checklist}
## Reference Documents
{docs}
## Generated Code / Previous Step Output
{generated_code}
## Previous Review Feedback
{feedback}
## Review Instructions
Explore the project directory to understand the full codebase context, \
then evaluate the code against ONLY the plan and checklist above.
For each issue found, classify it with BOTH severity AND category:
Severity levels:
- **Critical**: Breaks functionality, causes data loss, or introduces security vulnerabilities.
- **Major**: Requirement mismatch, significant logic errors, or missing core functionality.
- **Minor**: Coding convention violations, trivial omissions, or style issues.
Categories:
- **Over-engineering**: Code adds features, abstractions, or complexity \
NOT required by the plan.
- **Omission**: A requirement from the plan or checklist that is missing or \
incomplete in the implementation.
If previous review feedback is provided above, you MUST assess each item:
- **CONFIRMED**: The issue is still present in the current code.
- **DISMISSED (false positive)**: The flagged item is actually correct per \
the plan requirements. Provide rationale.
If you find issues outside the plan/checklist scope (e.g. pre-existing bugs, \
security concerns, performance problems), report them separately under \
"Out of Scope Issues".
## Output Format
### Previous Feedback Assessment
(Only include this section if previous feedback was provided.)
- CONFIRMED: [item description] — still an issue because [reason]
- DISMISSED (false positive): [item description] — actually correct because [reason]
(Write "N/A" if no previous feedback was provided.)
### Issues Found
List issues ordered by severity (Critical first):
- [Critical][Over-engineering] Description (reference specific plan/checklist item)
- [Major][Omission] Description (reference specific plan/checklist item)
- [Minor][Omission] Description (reference specific plan/checklist item)
### Out of Scope Issues
Issues found outside plan/checklist scope but worth noting:
- [Critical] Description of issue
- [Minor] Description of issue
(Write "None" if no out-of-scope issues found.)
### Summary
- Critical: N, Major: N, Minor: N
- Over-engineering count: N
- Omission count: N
- CONFIRMED: N, DISMISSED: N
- Overall quality: [BRIEF ASSESSMENT]
### Verdict
If all checklist items are satisfied and there is no over-engineering or \
omission, output: VERDICT: PASS
Otherwise output: VERDICT: FAIL
"""
# Korean prompt for a code-generation step (registered as "generate" for "ko").
# Placeholders: {plan}, {checklist}, {docs}, {feedback}, {iteration},
# {max_iterations}.
GENERATE_TEMPLATE_KO = """\
당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 이전 리뷰 피드백
{feedback}
## 반복 정보
현재 {max_iterations}회 중 {iteration}번째 반복입니다.
## 지침
1. 프로젝트 디렉토리를 탐색하여 기존 코드베이스 구조를 파악하세요.
2. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
3. 체크리스트의 모든 항목을 충족하세요.
4. 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
5. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
6. 완전한 구현을 출력하세요.
"""
# Korean prompt for reviewing generated code (registered as "review" for "ko").
# Placeholders: {plan}, {checklist}, {docs}, {generated_code}, {feedback}.
# Severity tags and the "VERDICT: PASS" / "VERDICT: FAIL" line stay in English;
# category tags are the Korean [과최적화] / [누락].
REVIEW_TEMPLATE_KO = """\
당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 검토 대상 코드
{generated_code}
## 이전 리뷰 피드백
{feedback}
## 검토 지침
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스 맥락을 파악한 뒤, \
위 기획서와 체크리스트 기준으로만 코드를 평가하세요.
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
심각도:
- **Critical**: 기능 장애, 데이터 손실, 보안 취약점을 유발하는 문제.
- **Major**: 요구사항 불일치, 중대한 로직 오류, 핵심 기능 누락.
- **Minor**: 코딩 컨벤션 위반, 사소한 누락, 스타일 문제.
카테고리:
- **과최적화**: 기획서에 없는 기능, 추상화, 복잡성을 추가한 경우.
- **누락**: 기획서/체크리스트에 있지만 구현에서 빠지거나 불완전한 요구사항.
이전 리뷰 피드백이 제공된 경우, 각 항목을 반드시 평가하세요:
- **CONFIRMED**: 현재 코드에 여전히 존재하는 이슈.
- **DISMISSED (오탐)**: 기획서 요구사항상 실제로 올바른 항목. 근거를 제시하세요.
기획서/체크리스트 범위 밖에서 발견된 문제(기존 버그, 보안 이슈, 성능 문제 등)는 \
"범위 밖 이슈" 섹션에 별도로 보고하세요.
## 출력 형식
### 이전 피드백 평가
(이전 피드백이 제공된 경우에만 포함하세요.)
- CONFIRMED: [항목 설명] — 여전히 이슈인 이유: [근거]
- DISMISSED (오탐): [항목 설명] — 실제로 올바른 이유: [근거]
(이전 피드백이 없으면 "해당 없음"이라고 작성하세요.)
### 발견된 이슈
심각도 순서(Critical 먼저)로 나열:
- [Critical][과최적화] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
- [Major][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
- [Minor][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
### 범위 밖 이슈
기획서/체크리스트 범위 밖이지만 주목할 만한 이슈:
- [Critical] 이슈 설명
- [Minor] 이슈 설명
(범위 밖 이슈가 없으면 "없음"이라고 작성하세요.)
### 요약
- Critical: N, Major: N, Minor: N
- 과최적화 수: N
- 누락 수: N
- CONFIRMED: N, DISMISSED: N
- 전체 품질: [간략한 평가]
### 판정
모든 체크리스트 항목이 충족되고 과최적화/누락이 없으면: VERDICT: PASS
그렇지 않으면: VERDICT: FAIL
"""
# English prompt for auditing code that already exists, with no generation
# step (registered as "review-only").
# Placeholders: {plan}, {checklist}, {docs}, {iteration}, {max_iterations},
# {feedback}.
REVIEW_ONLY_TEMPLATE = """\
You are tasked with reviewing existing code against a plan and checklist.
## Plan
{plan}
## Checklist
{checklist}
## Reference Documents
{docs}
## Previous Review (iteration {iteration} of {max_iterations})
{feedback}
## Review Instructions
Explore the project directory thoroughly to understand the full codebase, \
then evaluate the EXISTING code against ONLY the plan and checklist above.
You are NOT generating or modifying code. You are auditing what already exists.
If previous review results are provided above, you MUST:
1. Verify each previously reported issue — is it a real issue or a false positive?
2. Look for issues the previous review MISSED.
3. Do NOT simply repeat the previous review. Provide your own independent assessment.
4. Explicitly mark items as CONFIRMED (still an issue) or DISMISSED (false positive).
For each issue found, classify it with BOTH severity AND category:
Severity levels:
- **Critical**: Breaks functionality, causes data loss, or introduces security vulnerabilities.
- **Major**: Requirement mismatch, significant logic errors, or missing core functionality.
- **Minor**: Coding convention violations, trivial omissions, or style issues.
Categories:
- **Over-engineering**: Code adds features, abstractions, or complexity \
NOT required by the plan.
- **Omission**: A requirement from the plan or checklist that is missing or \
incomplete in the implementation.
If you find issues outside the plan/checklist scope (e.g. pre-existing bugs, \
security concerns, performance problems), report them separately under \
"Out of Scope Issues".
## Output Format
### Issues Found
List issues ordered by severity (Critical first):
- [Critical][Over-engineering] Description (reference specific plan/checklist item)
- [Major][Omission] Description (reference specific plan/checklist item)
- [Minor][Omission] Description (reference specific plan/checklist item)
### Out of Scope Issues
Issues found outside plan/checklist scope but worth noting:
- [Critical] Description of issue
- [Minor] Description of issue
(Write "None" if no out-of-scope issues found.)
### Summary
- Critical: N, Major: N, Minor: N
- Over-engineering count: N
- Omission count: N
- CONFIRMED: N, DISMISSED: N
- Overall quality: [BRIEF ASSESSMENT]
### Verdict
If all checklist items are satisfied and there is no over-engineering or \
omission, output: VERDICT: PASS
Otherwise output: VERDICT: FAIL
"""
# Korean prompt for auditing code that already exists (registered as
# "review-only" for "ko").
# Placeholders: {plan}, {checklist}, {docs}, {iteration}, {max_iterations},
# {feedback}.
REVIEW_ONLY_TEMPLATE_KO = """\
당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
{feedback}
## 검토 지침
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \
위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요.
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
이전 리뷰 결과가 제공된 경우 반드시:
1. 이전에 보고된 각 이슈를 검증하세요 — 진짜 이슈인지 오탐인지?
2. 이전 리뷰가 놓친 새로운 이슈를 찾으세요.
3. 이전 리뷰를 그대로 반복하지 마세요. 독립적인 평가를 제공하세요.
4. 각 항목에 CONFIRMED (여전히 이슈) 또는 DISMISSED (오탐) 태그를 명시하세요.
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
심각도:
- **Critical**: 기능 장애, 데이터 손실, 보안 취약점을 유발하는 문제.
- **Major**: 요구사항 불일치, 중대한 로직 오류, 핵심 기능 누락.
- **Minor**: 코딩 컨벤션 위반, 사소한 누락, 스타일 문제.
카테고리:
- **과최적화**: 기획서에 없는 기능, 추상화, 복잡성을 추가한 경우.
- **누락**: 기획서/체크리스트에 있지만 구현에서 빠지거나 불완전한 요구사항.
기획서/체크리스트 범위 밖에서 발견된 문제(기존 버그, 보안 이슈, 성능 문제 등)는 \
"범위 밖 이슈" 섹션에 별도로 보고하세요.
## 출력 형식
### 발견된 이슈
심각도 순서(Critical 먼저)로 나열:
- [Critical][과최적화] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
- [Major][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
- [Minor][누락] 이슈 설명 (관련 기획서/체크리스트 항목 참조)
### 범위 밖 이슈
기획서/체크리스트 범위 밖이지만 주목할 만한 이슈:
- [Critical] 이슈 설명
- [Minor] 이슈 설명
(범위 밖 이슈가 없으면 "없음"이라고 작성하세요.)
### 요약
- Critical: N, Major: N, Minor: N
- 과최적화 수: N
- 누락 수: N
- CONFIRMED: N, DISMISSED: N
- 전체 품질: [간략한 평가]
### 판정
모든 체크리스트 항목이 충족되고 과최적화/누락이 없으면: VERDICT: PASS
그렇지 않으면: VERDICT: FAIL
"""
# English prompt for adjudicating several reviews into one decision
# (registered as "aggregate-review").
# Placeholders: {plan}, {checklist}, {docs}, {candidate_outputs},
# {reviews_bundle}, {feedback}.  The preset builders in this module supply
# {candidate_outputs} and {reviews_bundle} through a step's context_override.
AGGREGATE_REVIEW_TEMPLATE = """\
You are adjudicating multiple review results and turning them into an actionable decision.
## Plan
{plan}
## Checklist
{checklist}
## Reference Documents
{docs}
## Candidate Outputs
{candidate_outputs}
## Reviewer Findings
{reviews_bundle}
## Previous Verification Feedback
{feedback}
## Instructions
Explore the project directory to confirm the current codebase state. Then:
1. Deduplicate overlapping issues across reviewers.
2. Resolve disagreements explicitly.
3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
4. When evidence is mixed, explain what was confirmed, what was dismissed, and what still needs follow-up.
5. Produce a prioritized action list for the coder.
6. If no confirmed issue remains, output VERDICT: PASS. Otherwise VERDICT: FAIL.
## Output Format
### Confirmed Issues
- [Critical][Omission] Description with rationale and source reviewer(s)
### Dismissed Findings
- [False positive] Claim — reason why it is actually correct (raised by: Reviewer X)
- [Already fixed] Claim — already resolved in the current code (raised by: Reviewer X)
(Write "None" if nothing was dismissed.)
### Action Items
1. Concrete fix the coder should make
2. Concrete fix the coder should make
### Summary
- Confirmed issues: N
- Dismissed findings: N (false positive: N, already fixed: N)
- Overall quality: [BRIEF ASSESSMENT]
### Verdict
VERDICT: PASS or VERDICT: FAIL
"""
# Korean prompt for adjudicating several reviews into one decision
# (registered as "aggregate-review" for "ko").
# Placeholders: {plan}, {checklist}, {docs}, {candidate_outputs},
# {reviews_bundle}, {feedback}.
AGGREGATE_REVIEW_TEMPLATE_KO = """\
당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 후보 결과물
{candidate_outputs}
## 개별 리뷰 결과
{reviews_bundle}
## 이전 검증 피드백
{feedback}
## 지침
프로젝트 디렉토리를 탐색하여 현재 코드베이스 상태를 확인한 뒤 다음을 수행하세요.
1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
2. 의견 충돌은 명시적으로 정리하세요.
3. 기획서, 체크리스트, 코드, 리뷰 근거로 뒷받침되는 이슈만 남기세요.
4. 근거가 엇갈리면 무엇이 확정이고 무엇이 기각 또는 추가확인 대상인지 분명히 적으세요.
5. coder가 바로 수정할 수 있는 우선순위 액션 아이템을 만드세요.
6. 확정된 이슈가 없으면 VERDICT: PASS, 있으면 VERDICT: FAIL 을 출력하세요.
## 출력 형식
### 확정 이슈
- [Critical][누락] 확정된 이슈 설명, 근거, 출처 리뷰어
### 기각된 주장
- [오탐] 주장 내용 — 실제로 올바른 이유 (제기: 리뷰어 X)
- [수정 완료] 주장 내용 — 현재 코드에서 이미 해결됨 (제기: 리뷰어 X)
(기각된 항목이 없으면 "없음"이라고 작성하세요.)
### 액션 아이템
1. coder가 수정해야 할 구체적인 작업
2. coder가 수정해야 할 구체적인 작업
### 요약
- 확정 이슈 수: N
- 기각된 주장 수: N (오탐: N, 수정 완료: N)
- 전체 품질: [간략한 평가]
### 판정
VERDICT: PASS 또는 VERDICT: FAIL
"""
# Built-in templates, indexed first by language code and then by template key.
# resolve_template() looks a "default:<key>" reference up in the table of the
# current language; set_language() accepts only the language codes listed here.
DEFAULT_TEMPLATES: dict[str, dict[str, str]] = {
    "en": {
        "generate": GENERATE_TEMPLATE,
        "review": REVIEW_TEMPLATE,
        "review-only": REVIEW_ONLY_TEMPLATE,
        "aggregate-review": AGGREGATE_REVIEW_TEMPLATE,
    },
    "ko": {
        "generate": GENERATE_TEMPLATE_KO,
        "review": REVIEW_TEMPLATE_KO,
        "review-only": REVIEW_ONLY_TEMPLATE_KO,
        "aggregate-review": AGGREGATE_REVIEW_TEMPLATE_KO,
    },
}
# Current language (set by pipeline before run)
# Module-level state: changed through set_language(), read by resolve_template().
_current_language: str = "en"
def set_language(lang: str) -> None:
    """Select which language's built-in templates later lookups use.

    Raises:
        ValueError: if *lang* has no entry in ``DEFAULT_TEMPLATES``; the
            current language is left unchanged in that case.
    """
    global _current_language
    if lang in DEFAULT_TEMPLATES:
        _current_language = lang
        return
    available = list(DEFAULT_TEMPLATES.keys())
    raise ValueError(f"Unsupported language '{lang}'. Available: {available}")
# ---------------------------------------------------------------------------
# Pipeline presets
# ---------------------------------------------------------------------------
def _safe_key(name: str) -> str:
"""Sanitize agent name for use as template variable / output_key.
Replaces hyphens with underscores so names like 'claude-coder'
become 'claude_coder', which is valid in format_map().
"""
return name.replace("-", "_")
def _unique_safe_keys(names: list[str]) -> list[str]:
    """Return stable, collision-free keys for agent names.

    The first occurrence of each sanitized name keeps its key unchanged.
    Later occurrences receive a numeric suffix starting at their occurrence
    number (``_2`` for the second, ``_3`` for the third, ...).

    A suffixed key is never allowed to coincide with a key already in use or
    with the plain key of another name: for ``["a", "a", "a-2"]`` the second
    ``"a"`` becomes ``a_3`` because ``a_2`` belongs to ``"a-2"``.  Previously
    both ended up as ``a_2``, so one step's output overwrote the other's.
    Inputs without such a clash produce exactly the same keys as before.

    Args:
        names: Agent names, possibly containing duplicates.

    Returns:
        One key per input name, in the same order, all distinct.
    """
    bases = [_safe_key(name) for name in names]
    # Reserve every plain key up front so a generated suffix can never steal
    # the key that belongs to a differently named agent.
    taken: set[str] = set(bases)
    seen: collections.defaultdict[str, int] = collections.defaultdict(int)
    keys: list[str] = []
    for base in bases:
        seen[base] += 1
        if seen[base] == 1:
            keys.append(base)
            continue
        suffix = seen[base]
        candidate = f"{base}_{suffix}"
        while candidate in taken:
            suffix += 1
            candidate = f"{base}_{suffix}"
        taken.add(candidate)
        keys.append(candidate)
    return keys
def _build_named_bundle(
labels: list[str],
step_names: list[str],
output_keys: list[str],
title: str,
) -> str:
"""Build a templated bundle from prior step outputs."""
parts: list[str] = []
for label, step_name, output_key in zip(labels, step_names, output_keys):
parts.append(
f"## {title}: {label} ({step_name})\n"
f"{{{output_key}}}"
)
return "\n\n---\n\n".join(parts)
def _build_simple_preset(
    coders: list[str], reviewers: list[str], seniors: list[str],
) -> list[StepConfig]:
    """Build the 'simple' pipeline: one coder generates, one reviewer reviews.

    Only the first coder, reviewer and senior are used.  Without a senior the
    review step carries the verdict.  With a senior, an extra aggregate-review
    step is appended and carries the verdict instead.

    Raises:
        ValueError: if *coders* or *reviewers* is empty.
    """
    if not coders:
        raise ValueError("'simple' preset requires at least 1 coder")
    if not reviewers:
        raise ValueError("'simple' preset requires at least 1 reviewer")

    coder, reviewer = coders[0], reviewers[0]
    has_senior = bool(seniors)

    generate_step = StepConfig(
        name="generate",
        agent=coder,
        role="generate",
        prompt_template="default:generate",
        output_key="generated_code",
    )
    review_step = StepConfig(
        name="review",
        agent=reviewer,
        role="review",
        prompt_template="default:review",
        output_key="review_result",
        verdict=not has_senior,
    )
    if not has_senior:
        return [generate_step, review_step]

    senior_step = StepConfig(
        name="senior_review",
        agent=seniors[0],
        role="review",
        prompt_template="default:aggregate-review",
        output_key="senior_review_result",
        verdict=True,
        context_override={
            "candidate_outputs": "## Generated code\n{generated_code}",
            "reviews_bundle": f"## Review: {reviewer} (review)\n{{review_result}}",
        },
    )
    return [generate_step, review_step, senior_step]
def _build_cross_review_preset(
    coders: list[str], reviewers: list[str], seniors: list[str],
) -> list[StepConfig]:
    """Build the 'cross-review' pipeline for the first two coders.

    Both coders generate in parallel, then each reviews the other's output
    (also in parallel).  Without a senior the two review steps carry the
    verdict.  With a senior, an aggregate-review step by the first senior is
    appended and carries the verdict instead.

    Raises:
        ValueError: if fewer than two coders are given.
    """
    if len(coders) < 2:
        raise ValueError("'cross-review' preset requires at least 2 coders")

    first, second = coders[0], coders[1]
    first_key, second_key = _unique_safe_keys([first, second])
    # (agent, own key, key of the peer whose code this agent reviews)
    pairings = [(first, first_key, second_key), (second, second_key, first_key)]
    reviews_decide = not seniors

    generate_steps = [
        StepConfig(
            name=f"generate_{own}",
            agent=agent,
            role="generate",
            prompt_template="default:generate",
            output_key=f"code_{own}",
            parallel=True,
        )
        for agent, own, _peer in pairings
    ]
    review_steps = [
        StepConfig(
            name=f"review_by_{own}",
            agent=agent,
            role="review",
            prompt_template="default:review",
            output_key=f"review_by_{own}",
            context_override={"generated_code": f"{{code_{peer}}}"},
            parallel=True,
            verdict=reviews_decide,
        )
        for agent, own, peer in pairings
    ]
    steps = generate_steps + review_steps
    if not seniors:
        return steps

    agents = [first, second]
    generate_names = [f"generate_{first_key}", f"generate_{second_key}"]
    code_keys = [f"code_{first_key}", f"code_{second_key}"]
    review_names = [f"review_by_{first_key}", f"review_by_{second_key}"]
    steps.append(
        StepConfig(
            name="senior_review",
            agent=seniors[0],
            role="review",
            prompt_template="default:aggregate-review",
            output_key="senior_review_result",
            verdict=True,
            context_override={
                "candidate_outputs": _build_named_bundle(
                    agents, generate_names, code_keys, "Candidate",
                ),
                "reviews_bundle": _build_named_bundle(
                    agents, review_names, review_names, "Review",
                ),
            },
        ),
    )
    return steps
def _build_review_only_preset(
    coders: list[str], reviewers: list[str], seniors: list[str],
) -> list[StepConfig]:
    """Build the 'review-only' pipeline: reviewers audit existing code.

    A lone reviewer without a senior yields the single-step form (step
    ``review``, output key ``review_result``).  Otherwise every reviewer gets
    its own parallel step.  Those steps carry the verdict unless a senior is
    configured, in which case an aggregate-review step by the first senior is
    appended and carries it.

    Raises:
        ValueError: if *reviewers* is empty.
    """
    if not reviewers:
        raise ValueError("'review-only' preset requires at least 1 reviewer")

    has_senior = bool(seniors)
    if len(reviewers) == 1 and not has_senior:
        # Single reviewer: keep the backward-compatible one-step shape.
        return [
            StepConfig(
                name="review",
                agent=reviewers[0],
                role="review",
                prompt_template="default:review-only",
                output_key="review_result",
                verdict=True,
            ),
        ]

    # Step name and output key are the same string for each reviewer.
    review_names = [f"review_{key}" for key in _unique_safe_keys(reviewers)]
    steps: list[StepConfig] = [
        StepConfig(
            name=review_name,
            agent=reviewer,
            role="review",
            prompt_template="default:review-only",
            output_key=review_name,
            verdict=not has_senior,
            parallel=True,
        )
        for reviewer, review_name in zip(reviewers, review_names)
    ]
    if has_senior:
        steps.append(
            StepConfig(
                name="senior_review",
                agent=seniors[0],
                role="review",
                prompt_template="default:aggregate-review",
                output_key="senior_review_result",
                verdict=True,
                context_override={
                    "candidate_outputs": "Current repository working tree under review.",
                    "reviews_bundle": _build_named_bundle(
                        reviewers, review_names, review_names, "Review",
                    ),
                },
            ),
        )
    return steps
def _build_review_fix_preset(
    coders: list[str], reviewers: list[str], seniors: list[str],
) -> list[PhaseConfig]:
    """Build the 'review-fix' preset as a single looping phase.

    The phase runs every reviewer in parallel, aggregates their findings,
    lets the first coder fix them (receiving the aggregate as feedback), and
    finishes with a verify step that carries the verdict.  Aggregation and
    verification are done by the first senior, or by the first reviewer when
    no senior is configured.  The phase allows up to 5 iterations and needs
    one PASS.

    Raises:
        ValueError: if *coders* or *reviewers* is empty.
    """
    if not coders:
        raise ValueError("'review-fix' preset requires at least 1 coder")
    if not reviewers:
        raise ValueError("'review-fix' preset requires at least 1 reviewer")

    # Step name and output key are the same string for each reviewer.
    review_names = [f"review_{key}" for key in _unique_safe_keys(reviewers)]
    review_steps = [
        StepConfig(
            name=review_name,
            agent=reviewer,
            role="review",
            prompt_template="default:review-only",
            output_key=review_name,
            verdict=False,
            parallel=True,
        )
        for reviewer, review_name in zip(reviewers, review_names)
    ]

    fixer = coders[0]
    adjudicator = seniors[0] if seniors else reviewers[0]

    aggregate_step = StepConfig(
        name="aggregate_review",
        agent=adjudicator,
        role="review",
        prompt_template="default:aggregate-review",
        output_key="aggregate_review",
        context_override={
            "candidate_outputs": "Current repository working tree under review.",
            "reviews_bundle": _build_named_bundle(
                reviewers, review_names, review_names, "Review",
            ),
        },
    )
    fix_step = StepConfig(
        name="generate",
        agent=fixer,
        role="generate",
        prompt_template="default:generate",
        output_key="generated_code",
        context_override={"feedback": "{aggregate_review}"},
    )
    verify_step = StepConfig(
        name="verify",
        agent=adjudicator,
        role="review",
        prompt_template="default:review",
        output_key="verify_result",
        verdict=True,
    )

    phase = PhaseConfig(
        name="review_fix",
        steps=review_steps + [aggregate_step, fix_step, verify_step],
        max_iterations=5,
        consecutive_pass=1,
    )
    return [phase]
# Presets whose builder returns a flat list of StepConfig.  Every builder
# takes (coders, reviewers, seniors) as lists of agent names.
PIPELINE_PRESETS: dict[str, Callable] = {
    "simple": _build_simple_preset,
    "cross-review": _build_cross_review_preset,
    "review-only": _build_review_only_preset,
}
# Presets whose builder returns a list of PhaseConfig instead of flat steps.
PHASED_PRESETS: dict[str, Callable] = {
    "review-fix": _build_review_fix_preset,
}
# Every preset name: flat presets first, then phased ones, in definition order.
ALL_PRESET_NAMES: list[str] = list(PIPELINE_PRESETS.keys()) + list(PHASED_PRESETS.keys())
# ---------------------------------------------------------------------------
# Template resolution and rendering
# ---------------------------------------------------------------------------
def resolve_template(template_ref: str, templates_dir: Optional[Path] = None) -> str:
    """Turn a template reference into the template text.

    Two reference forms are understood:
    - ``"default:<key>"`` selects a built-in template for the current
      language (falling back to the English table when the current language
      has none), e.g. ``"default:generate"`` or ``"default:review"``.
    - Anything else is a file path.  A relative path is resolved against
      *templates_dir* when one is given; the file is read as UTF-8.

    Raises:
        ValueError: if a ``default:`` reference names an unknown template.
        FileNotFoundError: if the referenced file does not exist.
    """
    scheme, colon, key = template_ref.partition(":")
    if colon and scheme == "default":
        builtin = DEFAULT_TEMPLATES.get(_current_language, DEFAULT_TEMPLATES["en"])
        if key in builtin:
            return builtin[key]
        raise ValueError(
            f"Unknown default template '{key}'. "
            f"Available: {list(builtin.keys())}"
        )

    # Everything else is treated as a path on disk.
    path = Path(template_ref)
    if templates_dir and not path.is_absolute():
        path = templates_dir / path
    if path.exists():
        return path.read_text(encoding="utf-8")
    raise FileNotFoundError(f"Template file not found: {path}")
class _DefaultDict(collections.defaultdict):
"""defaultdict that uses the missing key name in the default value."""
def __missing__(self, key: str) -> str:
return f"(no {key} provided)"
def render_template(template: str, context: dict[str, str]) -> str:
    """Fill the ``{variable}`` placeholders of *template* from *context*.

    A placeholder without a matching key renders as "(no <key> provided)"
    rather than raising ``KeyError``.  *context* itself is not modified.
    """
    # Copy the context into the placeholder-producing mapping in one step.
    lenient_context = _DefaultDict(str, context)
    return template.format_map(lenient_context)

497
cross_eval/report.py Normal file
View File

@@ -0,0 +1,497 @@
"""Markdown report generation."""
from __future__ import annotations
import re
from itertools import groupby
from cross_eval.models import (
IterationResult,
PipelineConfig,
PipelineResult,
ReviewMetrics,
StepConfig,
)
# ---------------------------------------------------------------------------
# i18n strings
# ---------------------------------------------------------------------------
_STRINGS: dict[str, dict[str, str]] = {
"en": {
"title": "Cross-Eval Report",
"summary": "Summary",
"prop": "Property",
"val": "Value",
"total_iter": "Total Iterations",
"final_verdict": "Final Verdict",
"duration": "Duration",
"max_iter": "Max Iterations",
"phases_label": "Phases",
"iteration": "Iteration",
"phase": "Phase",
"steps": "Steps",
"max_iterations": "Max iterations",
"consec_pass": "Consecutive PASS required",
"step": "Step",
"verdict": "Verdict",
"output_chars": "Output ({n} chars)",
"feedback_next": "Feedback for next iteration:",
"oos_title": "Out of Scope Issues",
"oos_desc": (
"The following issues were found outside the plan/checklist scope "
"but are worth noting."
),
"final_verdict_title": "Final Verdict",
"repeat_title": "Repeated Aggregate Findings",
"repeat_desc": "The following aggregate-review outputs repeated across iterations.",
"pass_msg": "All checklist items satisfied. No over-engineering or omissions detected.",
"fail_phased": "Pipeline phases ({phases}) completed without full convergence.",
"fail_simple": "Maximum iterations ({max_iter}) reached without passing all checks.",
"metrics_title": "Review Metrics",
"metrics_trend_title": "Metrics Trend",
"metrics_iter": "Iter",
"metrics_total_issues": "Total Issues",
"metrics_na": "N/A",
},
"ko": {
"title": "교차 검증 리포트",
"summary": "요약",
"prop": "항목",
"val": "",
"total_iter": "총 반복 횟수",
"final_verdict": "최종 판정",
"duration": "소요 시간",
"max_iter": "최대 반복",
"phases_label": "페이즈",
"iteration": "반복",
"phase": "페이즈",
"steps": "단계",
"max_iterations": "최대 반복",
"consec_pass": "연속 PASS 필요",
"step": "단계",
"verdict": "판정",
"output_chars": "출력 ({n}자)",
"feedback_next": "다음 반복을 위한 피드백:",
"oos_title": "범위 밖 이슈",
"oos_desc": (
"아래는 기획서/체크리스트 범위 밖이지만 "
"리뷰 중 발견된 이슈입니다."
),
"final_verdict_title": "최종 판정",
"repeat_title": "반복된 Aggregate 이슈",
"repeat_desc": "아래 aggregate-review 결과가 여러 반복에서 동일하게 다시 나타났습니다.",
"pass_msg": "모든 체크리스트 항목 충족. 과최적화/누락 없음.",
"fail_phased": "파이프라인 페이즈 ({phases}) 완료, 완전한 수렴에 도달하지 못함.",
"fail_simple": "최대 반복 횟수 ({max_iter})에 도달, 모든 검증을 통과하지 못함.",
"metrics_title": "리뷰 메트릭",
"metrics_trend_title": "메트릭 추이",
"metrics_iter": "반복",
"metrics_total_issues": "총 이슈",
"metrics_na": "해당 없음",
},
}
def _t(config: PipelineConfig, key: str, **kwargs: str) -> str:
    """Look up the report string *key* in the language of *config*.

    The language is read from ``config.language`` (``"en"`` when the
    attribute is absent); an unknown language uses the English table.  A key
    missing from the selected table falls back to English, and a key unknown
    there too is returned unchanged.  When keyword arguments are given they
    are substituted into the string with ``str.format``.
    """
    english = _STRINGS["en"]
    table = _STRINGS.get(getattr(config, "language", "en"), english)
    text = table[key] if key in table else english.get(key, key)
    return text.format(**kwargs) if kwargs else text
# ---------------------------------------------------------------------------
# Review output parsing
# ---------------------------------------------------------------------------
def parse_review_metrics(output: str) -> ReviewMetrics:
    """Count severity, category and assessment markers in review output.

    Severity and category tags are counted case-insensitively wherever they
    occur in the text; categories accept the English and Korean spellings.
    CONFIRMED / DISMISSED entries are counted case-sensitively and only when
    the text after the colon does not start with a digit.
    """
    def count(pattern: str, flags: int = 0) -> int:
        return len(re.findall(pattern, output, flags))

    metrics = ReviewMetrics()

    # Severity tags such as "[Critical]", "[Major]", "[Minor]".
    metrics.critical = count(r"\[Critical\]", re.IGNORECASE)
    metrics.major = count(r"\[Major\]", re.IGNORECASE)
    metrics.minor = count(r"\[Minor\]", re.IGNORECASE)

    # Category tags, English or Korean.
    metrics.over_engineering = count(r"\[Over-engineering\]|\[과최적화\]", re.IGNORECASE)
    metrics.omission = count(r"\[Omission\]|\[누락\]", re.IGNORECASE)

    # Assessment entries; the negative lookahead rejects a digit right after
    # the whitespace that follows the colon.
    metrics.confirmed = count(r"\bCONFIRMED:\s+(?!\d)")
    metrics.dismissed = count(r"\bDISMISSED\b(?:\s*\([^)]*\))?\s*:\s+(?!\d)")
    return metrics
def _aggregate_metrics(a: ReviewMetrics, b: ReviewMetrics) -> ReviewMetrics:
    """Return a new ReviewMetrics holding the field-wise sum of *a* and *b*."""
    counters = (
        "critical",
        "major",
        "minor",
        "over_engineering",
        "omission",
        "confirmed",
        "dismissed",
    )
    summed = {name: getattr(a, name) + getattr(b, name) for name in counters}
    return ReviewMetrics(**summed)
def _extract_out_of_scope(output: str) -> str:
"""Extract the 'Out of Scope Issues' section from review output.
Looks for '### Out of Scope Issues' or '### 범위 밖 이슈' heading,
captures text until the next '###' heading or end of string.
Returns empty string if not found or contains only 'None'/'없음'.
"""
pattern = r"###\s*(?:Out of Scope Issues|범위 밖 이슈)\s*\n(.*?)(?=\n###|\Z)"
match = re.search(pattern, output, re.DOTALL)
if not match:
return ""
content = match.group(1).strip()
if content.lower() in ("none", "없음", ""):
return ""
return content
def build_report(config: PipelineConfig, result: PipelineResult) -> str:
    """Build the complete markdown report string.

    The phased layout is used as soon as any iteration carries a phase name;
    otherwise the simple layout is used.
    """
    builder = (
        _build_phased_report
        if any(ir.phase_name for ir in result.iterations)
        else _build_simple_report
    )
    return builder(config, result)
def _build_simple_report(
    config: PipelineConfig, result: PipelineResult,
) -> str:
    """Build the report for a non-phased (simple) pipeline run.

    Layout: title, summary table, one section per iteration (step details
    plus a preview of the feedback handed to the next iteration), then the
    out-of-scope issues, review metrics, repeated aggregate findings and the
    final verdict.

    The feedback preview is limited to 200 characters; the trailing "..." is
    added only when the feedback was actually cut, so short feedback is no
    longer shown as if it had been truncated.
    """
    lines: list[str] = []
    lines.append(f"# {_t(config, 'title')}\n")
    _append_summary_table(lines, config, result)
    out_of_scope_items: list[tuple[int, str]] = []
    for iter_result in result.iterations:
        lines.append("---\n")
        lines.append(f"## {_t(config, 'iteration')} {iter_result.iteration}\n")
        _append_iteration_steps(lines, config, iter_result, config.pipeline, out_of_scope_items)
        if iter_result.feedback:
            preview = iter_result.feedback[:200]
            suffix = "..." if len(iter_result.feedback) > len(preview) else ""
            lines.append(f"**{_t(config, 'feedback_next')}** {preview}{suffix}")
            lines.append("")
    _append_out_of_scope(lines, config, out_of_scope_items)
    _append_review_metrics_table(lines, config, result)
    _append_repeated_aggregate(lines, config, result)
    _append_final_verdict(lines, config, result)
    return "\n".join(lines)
def _build_phased_report(
    config: PipelineConfig, result: PipelineResult,
) -> str:
    """Build the report for a phased pipeline run (e.g. review-fix).

    Iterations are grouped by consecutive runs of the same phase name.  Each
    phase gets a heading and, when its configuration is known, a line with
    its step sequence, iteration limit and required number of consecutive
    PASS verdicts.  Each iteration heading carries a verdict label; for
    phases that need more than one consecutive PASS the label shows the
    running streak.

    Fixes over the previous version:
    - step names were joined with an empty separator and ran together; they
      are now joined with " → ";
    - reaching the required streak appended an empty string; it now appends
      the same " ✓" mark used for single-PASS phases;
    - the feedback preview gets "..." only when it was actually truncated.
    """
    lines: list[str] = []
    lines.append(f"# {_t(config, 'title')}\n")
    _append_summary_table(lines, config, result, phased=True)
    phase_map = {p.name: p for p in config.phases}
    out_of_scope_items: list[tuple[int, str]] = []
    for phase_name, phase_iters_iter in groupby(
        result.iterations, key=lambda ir: ir.phase_name,
    ):
        phase_iters = list(phase_iters_iter)
        phase_config = phase_map.get(phase_name or "")
        lines.append("---\n")
        lines.append(f"## {_t(config, 'phase')}: {phase_name}\n")
        if phase_config:
            step_desc = " → ".join(s.name for s in phase_config.steps)
            lines.append(
                f"{_t(config, 'steps')}: {step_desc} | "
                f"{_t(config, 'max_iterations')}: {phase_config.max_iterations} | "
                f"{_t(config, 'consec_pass')}: {phase_config.consecutive_pass}\n"
            )
        # Phases without a known configuration fall back to the flat pipeline.
        steps = phase_config.steps if phase_config else config.pipeline
        consecutive = 0
        for iter_result in phase_iters:
            verdict_label = ""
            if iter_result.verdict:
                if iter_result.verdict == "PASS":
                    consecutive += 1
                    if phase_config and phase_config.consecutive_pass > 1:
                        verdict_label = f" — PASS ({consecutive}/{phase_config.consecutive_pass})"
                        if consecutive >= phase_config.consecutive_pass:
                            verdict_label += " ✓"
                    else:
                        verdict_label = " — PASS ✓"
                else:
                    # Any non-PASS verdict breaks the streak.
                    consecutive = 0
                    verdict_label = " — FAIL"
            lines.append(
                f"### {_t(config, 'iteration')} {iter_result.iteration}{verdict_label}\n"
            )
            _append_iteration_steps(lines, config, iter_result, steps, out_of_scope_items)
            if iter_result.feedback:
                preview = iter_result.feedback[:200]
                suffix = "..." if len(iter_result.feedback) > len(preview) else ""
                lines.append(
                    f"**{_t(config, 'feedback_next')}** {preview}{suffix}"
                )
                lines.append("")
    _append_out_of_scope(lines, config, out_of_scope_items)
    _append_review_metrics_table(lines, config, result)
    _append_repeated_aggregate(lines, config, result)
    _append_final_verdict(lines, config, result)
    return "\n".join(lines)
# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------
def _append_summary_table(
    lines: list[str],
    config: PipelineConfig,
    result: PipelineResult,
    phased: bool = False,
) -> None:
    """Append the summary table to *lines*.

    The table lists the number of iterations, the final verdict and the total
    duration (shown as ``Xm Ys``, or just ``Ys`` under a minute).  For a
    phased run with configured phases it adds a row with all phase names and
    one row per phase with its limits; otherwise it adds the global iteration
    limit.

    Phase names are joined with " → "; they were previously joined with an
    empty separator and ran together into a single word.
    """
    total_iter = len(result.iterations)
    minutes = int(result.total_duration // 60)
    seconds = int(result.total_duration % 60)
    duration_str = f"{minutes}m {seconds}s" if minutes else f"{seconds}s"
    lines.append(f"## {_t(config, 'summary')}\n")
    lines.append(f"| {_t(config, 'prop')} | {_t(config, 'val')} |")
    lines.append("|----------|-------|")
    lines.append(f"| {_t(config, 'total_iter')} | {total_iter} |")
    lines.append(f"| {_t(config, 'final_verdict')} | **{result.final_verdict}** |")
    lines.append(f"| {_t(config, 'duration')} | {duration_str} |")
    if phased and config.phases:
        phase_names = " → ".join(p.name for p in config.phases)
        lines.append(f"| {_t(config, 'phases_label')} | {phase_names} |")
        for p in config.phases:
            lines.append(
                f"| {_t(config, 'phase')}: {p.name} | "
                f"{_t(config, 'max_iterations')} {p.max_iterations}, "
                f"{p.consecutive_pass}x {_t(config, 'consec_pass')} |"
            )
    else:
        lines.append(f"| {_t(config, 'max_iter')} | {config.max_iterations} |")
    lines.append("")
def _append_iteration_steps(
lines: list[str],
config: PipelineConfig,
iter_result: IterationResult,
steps: list[StepConfig],
out_of_scope_items: list[tuple[int, str]],
) -> None:
"""Append step details for one iteration.

For every step this writes a heading (step name, agent name, and the
duration when an agent result exists), an optional verdict line, and the
step output looked up by ``step.output_key``.

Besides ``lines``, two other arguments are mutated: out-of-scope text
extracted from review output is appended to ``out_of_scope_items`` as
``(iteration, text)`` tuples, and metrics parsed from the output are stored
on (or merged into) ``iter_result.review_metrics``.
"""
for step in steps:
# A step without a recorded result still gets a heading: fall back to the
# configured agent name, an empty output, and no duration suffix.
agent_result = iter_result.step_results.get(step.output_key)
output = iter_result.step_outputs.get(step.output_key, "")
agent_name = agent_result.agent_name if agent_result else step.agent
duration = f" ({agent_result.duration_seconds}s)" if agent_result else ""
lines.append(f"### {_t(config, 'step')}: {step.name} ({agent_name}){duration}\n")
# The iteration verdict is shown only under steps whose ``verdict`` flag is
# truthy, and only when the iteration actually has a verdict.
if step.verdict and iter_result.verdict:
lines.append(f"**{_t(config, 'verdict')}: {iter_result.verdict}**\n")
# Outputs longer than 500 characters go inside a <details> element whose
# summary is built from the output length; shorter outputs are emitted as-is.
if len(output) > 500:
lines.append("<details>")
lines.append(
f"<summary>{_t(config, 'output_chars', n=str(len(output)))}</summary>\n"
)
lines.append(output)
lines.append("\n</details>\n")
else:
lines.append(output)
lines.append("")
# Review steps may carry an out-of-scope section; it is collected here,
# tagged with the iteration number.
if step.role == "review":
oos = _extract_out_of_scope(output)
if oos:
out_of_scope_items.append((iter_result.iteration, oos))
# Parse and accumulate review metrics for this iteration
# NOTE(review): this writes to iter_result while rendering; if the same
# result is rendered twice the second pass takes the aggregate branch —
# confirm callers render each result only once.
step_metrics = parse_review_metrics(output)
if iter_result.review_metrics is None:
iter_result.review_metrics = step_metrics
else:
iter_result.review_metrics = _aggregate_metrics(
iter_result.review_metrics, step_metrics,
)
def _append_review_metrics_table(
    lines: list[str],
    config: PipelineConfig,
    result: PipelineResult,
) -> None:
    """Append the per-iteration review metrics table and a trend summary.

    Nothing is written when no iteration carries review metrics. The trend
    summary is only produced when at least two iterations have metrics.

    Args:
        lines: Report buffer, extended in place.
        config: Pipeline configuration, passed to ``_t`` for label lookup.
        result: Pipeline result whose ``iterations`` are tabulated.
    """
    # (iteration number, metrics) for every iteration that has metrics.
    measured = [
        (entry.iteration, entry.review_metrics)
        for entry in result.iterations
        if entry.review_metrics
    ]
    if not measured:
        return

    placeholder = _t(config, "metrics_na")
    lines.append("---\n")
    lines.append(f"## {_t(config, 'metrics_title')}\n")

    header = (
        f"| {_t(config, 'metrics_iter')} | {_t(config, 'verdict')} "
        f"| Critical | Major | Minor "
        f"| Over-eng | Omission "
        f"| CONFIRMED | DISMISSED |"
    )
    lines.append(header)
    lines.append("|------|---------|----------|-------|-------|----------|----------|-----------|-----------|")

    # One row per iteration; iterations without metrics get placeholders.
    for entry in result.iterations:
        metrics = entry.review_metrics
        verdict = entry.verdict or "-"
        if metrics:
            cells = (
                metrics.critical,
                metrics.major,
                metrics.minor,
                metrics.over_engineering,
                metrics.omission,
                metrics.confirmed,
                metrics.dismissed,
            )
        else:
            cells = (placeholder,) * 7
        rendered = " | ".join(f"{cell}" for cell in cells)
        lines.append(f"| {entry.iteration} | {verdict} | {rendered} |")
    lines.append("")

    if len(measured) < 2:
        return

    # Label -> value extractor for each trend line, in output order.
    trend_series = (
        ("Issues", lambda m: m.critical + m.major + m.minor),
        ("Over-engineering", lambda m: m.over_engineering),
        ("Omission", lambda m: m.omission),
        ("CONFIRMED", lambda m: m.confirmed),
        ("DISMISSED", lambda m: m.dismissed),
    )
    lines.append(f"### {_t(config, 'metrics_trend_title')}\n")
    for label, pick in trend_series:
        _append_trend_line(
            lines,
            label,
            [(number, pick(metrics)) for number, metrics in measured],
        )
    lines.append("")
def _append_trend_line(
lines: list[str],
label: str,
values: list[tuple[int, int]],
) -> None:
"""Append a single trend line like '- Issues: 6 -> 2 -> 0 (decreasing)'."""
nums = [v for _, v in values]
arrow = "".join(str(n) for n in nums)
if nums[-1] < nums[0]:
direction = "decreasing"
elif nums[-1] > nums[0]:
direction = "increasing"
else:
direction = "stable"
lines.append(f"- {label}: {arrow} ({direction})")
def _append_out_of_scope(
    lines: list[str],
    config: PipelineConfig,
    out_of_scope_items: list[tuple[int, str]],
) -> None:
    """Append the out-of-scope issues section, if there is anything to show.

    Args:
        lines: Report buffer, extended in place.
        config: Pipeline configuration, passed to ``_t`` for label lookup.
        out_of_scope_items: ``(iteration number, text)`` pairs; an empty list
            produces no output at all.
    """
    if out_of_scope_items:
        lines.append("---\n")
        lines.append(f"## {_t(config, 'oos_title')}\n")
        lines.append(f"{_t(config, 'oos_desc')}\n")
        # One sub-section per collected item, each followed by a blank line.
        for number, body in out_of_scope_items:
            lines.extend(
                [
                    f"### {_t(config, 'iteration')} {number}\n",
                    body,
                    "",
                ]
            )
def _append_final_verdict(
    lines: list[str],
    config: PipelineConfig,
    result: PipelineResult,
) -> None:
    """Append the final verdict heading followed by an explanatory message.

    The message is the pass text when the verdict is ``"PASS"``. Otherwise it
    is the phased failure text (listing the phase names) when the config
    defines phases, or the simple failure text (with the iteration limit)
    when it does not.

    Args:
        lines: Report buffer, extended in place.
        config: Pipeline configuration; ``phases`` and ``max_iterations`` are
            read, and it is passed to ``_t`` for message lookup.
        result: Pipeline result whose ``final_verdict`` is reported.
    """
    lines.append("---\n")
    lines.append(f"## {_t(config, 'final_verdict_title')}: {result.final_verdict}\n")

    if result.final_verdict == "PASS":
        message = _t(config, "pass_msg")
    elif config.phases:
        # Join with an explicit separator so adjacent phase names stay
        # readable instead of running together in the message.
        phase_names = " -> ".join(p.name for p in config.phases)
        message = _t(config, "fail_phased", phases=phase_names)
    else:
        message = _t(config, "fail_simple", max_iter=str(config.max_iterations))
    lines.append(message)
def _append_repeated_aggregate(
    lines: list[str],
    config: PipelineConfig,
    result: PipelineResult,
) -> None:
    """Append the repeated-aggregate warnings section when warnings exist.

    Args:
        lines: Report buffer, extended in place.
        config: Pipeline configuration, passed to ``_t`` for label lookup.
        result: Pipeline result; its ``repeated_aggregate_warnings`` are
            rendered as a bullet list. A falsy value produces no output.
    """
    warnings = result.repeated_aggregate_warnings
    if warnings:
        lines.append("---\n")
        lines.append(f"## {_t(config, 'repeat_title')}\n")
        lines.append(f"{_t(config, 'repeat_desc')}\n")
        # One Markdown bullet per warning, closed by a blank line.
        lines.extend(f"- {item}" for item in warnings)
        lines.append("")