release: cut 0.2.0 baseline

This commit is contained in:
chungyeong
2026-03-13 21:47:54 +09:00
parent 204e071b74
commit 941304398d
15 changed files with 1930 additions and 270 deletions

View File

@@ -1,6 +1,7 @@
"""Configuration loading, validation, and preset resolution."""
from __future__ import annotations
import copy
import logging
import re
from pathlib import Path
@@ -8,7 +9,13 @@ from typing import Any
import yaml
from cross_eval.models import AgentConfig, PhaseConfig, PipelineConfig, StepConfig
from cross_eval.models import (
AgentConfig,
ExecutionConfig,
PhaseConfig,
PipelineConfig,
StepConfig,
)
from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS
logger = logging.getLogger(__name__)
@@ -24,6 +31,7 @@ DEFAULT_ROLE_REASONING_EFFORTS = {
"reviewer": "medium",
"senior": "high",
}
FIX_STYLE_PRESETS = {"review-fix", "coding-review-fix"}
# ---------------------------------------------------------------------------
@@ -54,7 +62,12 @@ _CLAUDE_CODER_ARGS = list(_CLAUDE_BASE_ARGS) + [
"bypassPermissions",
]
_CLAUDE_REVIEW_ARGS = list(_CLAUDE_BASE_ARGS) + [
_CLAUDE_REVIEW_ARGS = [
"--setting-sources",
"user",
"--disable-slash-commands",
"--model",
"opus",
"--permission-mode",
"plan",
]
@@ -64,29 +77,37 @@ _CODER_SYSTEM_PROMPT = (
"Rules:\n"
"1. FIRST explore the project directory to understand the existing codebase, "
"patterns, and conventions before writing any code.\n"
"2. Implement ONLY what the plan specifies. Do NOT add extra features, "
"2. You may decide which shell, Python, git, docker, test, and database commands "
"to run. The user does not need to pre-specify exact commands.\n"
"3. Environment variables from configured .env files may already be loaded into "
"your process; use them when validating services such as ClickHouse.\n"
"4. Implement ONLY what the plan specifies. Do NOT add extra features, "
"unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
"3. Follow the project's existing coding style, naming conventions, and directory structure.\n"
"4. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
"5. Follow the project's existing coding style, naming conventions, and directory structure.\n"
"6. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
"Do NOT refactor unrelated code.\n"
"5. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
"6. When in doubt about scope, do LESS, not more."
"7. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
"8. When in doubt about scope, do LESS, not more."
)
_REVIEWER_SYSTEM_PROMPT = (
"You are a code reviewer. You MUST NOT create, modify, or delete any files.\n"
"Rules:\n"
"1. Explore the project directory to understand the full codebase context.\n"
"2. Compare the implementation against the plan and checklist ONLY.\n"
"3. Classify every issue with BOTH severity AND category:\n"
"2. You may decide which shell, Python, test, git, docker, and database read commands "
"to run in order to verify behavior. The user does not need to pre-specify exact commands.\n"
"3. Environment variables from configured .env files may already be loaded into "
"your process; use them for verification when relevant.\n"
"4. Compare the implementation against the plan and checklist ONLY.\n"
"5. Classify every issue with BOTH severity AND category:\n"
" - Severity: Critical (breaks functionality/security) > Major (requirement mismatch) > Minor (convention/style)\n"
" - Category: Over-engineering / Omission\n"
"4. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
"6. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
"or DISMISSED (false positive) with rationale.\n"
"5. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
"6. Order issues by severity (Critical first).\n"
"7. Do NOT suggest improvements beyond the plan scope.\n"
"8. End with VERDICT: PASS (all requirements met, no over-engineering) "
"7. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
"8. Order issues by severity (Critical first).\n"
"9. Do NOT suggest improvements beyond the plan scope.\n"
"10. End with VERDICT: PASS (all requirements met, no over-engineering) "
"or VERDICT: FAIL (issues found)."
)
@@ -94,16 +115,20 @@ _SENIOR_SYSTEM_PROMPT = (
"You are a senior technical reviewer coordinating a review-fix-verification loop.\n"
"Rules:\n"
"1. Explore the project directory to understand the full codebase context.\n"
"2. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
"2. You may decide which shell, Python, test, git, docker, and database read commands "
"to run to verify disputed issues. The user does not need to pre-specify exact commands.\n"
"3. Environment variables from configured .env files may already be loaded into "
"your process; use them when validating service integrations.\n"
"4. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
"evidence-backed issues. Categorize dismissed findings as [False positive] or [Already fixed].\n"
"3. In verification mode, judge the current implementation directly against ONLY the "
"5. In verification mode, judge the current implementation directly against ONLY the "
"plan and checklist.\n"
"4. Be skeptical of false positives, but do not lower the bar on real requirement "
"6. Be skeptical of false positives, but do not lower the bar on real requirement "
"gaps.\n"
"5. When issues remain, produce a concise prioritized action list the coder can act on.\n"
"6. Maintain an Issue Tracker table across iterations to track issue status.\n"
"7. Do NOT invent new requirements beyond the plan and checklist.\n"
"8. End with one of three verdicts:\n"
"7. When issues remain, produce a concise prioritized action list the coder can act on.\n"
"8. Maintain an Issue Tracker table across iterations to track issue status.\n"
"9. Do NOT invent new requirements beyond the plan and checklist.\n"
"10. End with one of three verdicts:\n"
" - VERDICT: PASS — all requirements met, no issues remain.\n"
" - VERDICT: FAIL — issues found that the coder can fix.\n"
" - VERDICT: ESCALATE — issues that require human intervention. Use ESCALATE when:\n"
@@ -263,7 +288,7 @@ def _resolve_agents(
for name in all_referenced:
if name not in result and name in BUILTIN_AGENTS:
result[name] = BUILTIN_AGENTS[name]
result[name] = copy.deepcopy(BUILTIN_AGENTS[name])
return result
@@ -354,15 +379,16 @@ def _apply_role_effort(
def default_config() -> PipelineConfig:
"""Return a PipelineConfig with sensible defaults (no YAML needed)."""
agents = dict(BUILTIN_AGENTS)
agents = copy.deepcopy(BUILTIN_AGENTS)
coders = ["claude-coder"]
reviewers = ["claude-reviewer"]
seniors: list[str] = []
pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
return PipelineConfig(
output_dir=Path("output"),
output_dir=Path(".cross-eval/output"),
max_iterations=3,
language="ko",
execution=ExecutionConfig(),
inputs={},
agents=agents,
coders=coders,
@@ -406,6 +432,7 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
system_prompt=agent_data.get("system_prompt"),
reasoning_effort=agent_data.get("reasoning_effort"),
stdin_mode=agent_data.get("stdin_mode", False),
agentic=agent_data.get("agentic", False),
)
# --- roles: explicit or inferred ---
@@ -445,6 +472,17 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
p = config_dir / p
inputs[key] = p
execution_raw = raw.get("execution", {}) or {}
execution = ExecutionConfig(
mode=execution_raw.get("mode", "agent-decides"),
command_policy=execution_raw.get("command_policy", "broad"),
inherit_env=bool(execution_raw.get("inherit_env", True)),
auto_env_files=list(execution_raw.get("auto_env_files", [".env", ".env.local"])),
env_files=list(execution_raw.get("env_files", [])),
expose_env_names=bool(execution_raw.get("expose_env_names", True)),
auto_context_targets=list(execution_raw.get("auto_context_targets", [])),
)
# --- pipeline (preset or custom) ---
steps, phases = _resolve_pipeline(pipeline_raw, coders, reviewers, seniors)
@@ -453,12 +491,13 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
preset_name = pipeline_raw.split(":", 1)[1]
return PipelineConfig(
output_dir=Path(raw.get("output_dir", "output")),
config = PipelineConfig(
output_dir=Path(raw.get("output_dir", ".cross-eval/output")),
max_iterations=int(raw.get("max_iterations", 3)),
min_iterations=int(raw.get("min_iterations", 1)),
verbose=bool(raw.get("verbose", False)),
language=raw.get("language", "en"),
execution=execution,
inputs=inputs,
agents=agents,
coders=coders,
@@ -470,6 +509,9 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
_config_path=config_path,
_config_mtime=config_path.stat().st_mtime,
)
sync_phased_iterations(config)
ensure_fix_preset_agentic(config)
return config
def try_reload_config(config: PipelineConfig) -> PipelineConfig:
@@ -619,6 +661,16 @@ def validate_config(config: PipelineConfig) -> list[str]:
if config.language not in ("en", "ko"):
errors.append(f"Unsupported language '{config.language}'. Use 'en' or 'ko'.")
if config.execution.mode not in {"agent-decides"}:
errors.append(
f"Unsupported execution.mode '{config.execution.mode}'. Use 'agent-decides'."
)
if config.execution.command_policy not in {"broad", "restricted"}:
errors.append(
"Unsupported execution.command_policy "
f"'{config.execution.command_policy}'. Use 'broad' or 'restricted'."
)
return errors
@@ -642,6 +694,37 @@ def _validate_unique_step_fields(
seen_output_keys.add(step.output_key)
def _make_agentic(agent: AgentConfig) -> None:
    """Switch *agent* to agentic mode, mutating it in place.

    Sets ``agent.agentic`` to ``True`` and rebinds ``agent.args`` to a new
    list from which every ``"-p"`` entry has been dropped. The remaining
    arguments keep their original relative order.

    Args:
        agent: The agent configuration to update. It is modified directly;
            nothing is returned.
    """
    agent.agentic = True

    # Build a fresh list instead of removing in place, so a list object that
    # is still referenced elsewhere is left untouched.
    kept_args = []
    for arg in agent.args:
        if arg != "-p":
            kept_args.append(arg)
    agent.args = kept_args
def sync_phased_iterations(
    config: PipelineConfig,
    max_iter: int | None = None,
) -> None:
    """Push the effective iteration limit onto every phase that has a verdict step.

    A phase is updated only when at least one of its steps has a truthy
    ``verdict``; phases without such a step keep their current
    ``max_iterations``. Does nothing when ``config.phases`` is empty or unset.

    Args:
        config: Pipeline configuration whose phases are updated in place.
        max_iter: Limit to apply. When ``None``, ``config.max_iterations`` is
            used instead. An explicit ``0`` is honoured as given.
    """
    phases = config.phases
    if not phases:
        return

    # Compare against None rather than truthiness so that 0 is a valid override.
    if max_iter is None:
        limit = config.max_iterations
    else:
        limit = max_iter

    for phase in phases:
        for step in phase.steps:
            if step.verdict:
                phase.max_iterations = limit
                break
def ensure_fix_preset_agentic(config: PipelineConfig) -> None:
    """Make the coders of a fix-style preset agentic.

    Fix-style presets are expected to modify code, so their coder agents run
    agentically by default. When ``config.preset_name`` is one of
    ``FIX_STYLE_PRESETS``, each name in ``config.coders`` is looked up in
    ``config.agents`` and converted with ``_make_agentic`` unless it is
    already agentic. Coder names with no entry in ``config.agents`` are
    skipped. For any other preset the config is left untouched.

    Args:
        config: Pipeline configuration whose coder agents may be updated in
            place.
    """
    if config.preset_name in FIX_STYLE_PRESETS:
        for coder_name in config.coders:
            coder = config.agents.get(coder_name)
            # Skip unknown coder names and agents that are already agentic.
            if coder is None or coder.agentic:
                continue
            _make_agentic(coder)
def apply_input_overrides(
config: PipelineConfig, overrides: dict[str, str]
) -> None: