release: cut 0.2.0 baseline
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
"""Configuration loading, validation, and preset resolution."""
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
@@ -8,7 +9,13 @@ from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
from cross_eval.models import AgentConfig, PhaseConfig, PipelineConfig, StepConfig
|
||||
from cross_eval.models import (
|
||||
AgentConfig,
|
||||
ExecutionConfig,
|
||||
PhaseConfig,
|
||||
PipelineConfig,
|
||||
StepConfig,
|
||||
)
|
||||
from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -24,6 +31,7 @@ DEFAULT_ROLE_REASONING_EFFORTS = {
|
||||
"reviewer": "medium",
|
||||
"senior": "high",
|
||||
}
|
||||
FIX_STYLE_PRESETS = {"review-fix", "coding-review-fix"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -54,7 +62,12 @@ _CLAUDE_CODER_ARGS = list(_CLAUDE_BASE_ARGS) + [
|
||||
"bypassPermissions",
|
||||
]
|
||||
|
||||
_CLAUDE_REVIEW_ARGS = list(_CLAUDE_BASE_ARGS) + [
|
||||
_CLAUDE_REVIEW_ARGS = [
|
||||
"--setting-sources",
|
||||
"user",
|
||||
"--disable-slash-commands",
|
||||
"--model",
|
||||
"opus",
|
||||
"--permission-mode",
|
||||
"plan",
|
||||
]
|
||||
@@ -64,29 +77,37 @@ _CODER_SYSTEM_PROMPT = (
|
||||
"Rules:\n"
|
||||
"1. FIRST explore the project directory to understand the existing codebase, "
|
||||
"patterns, and conventions before writing any code.\n"
|
||||
"2. Implement ONLY what the plan specifies. Do NOT add extra features, "
|
||||
"2. You may decide which shell, Python, git, docker, test, and database commands "
|
||||
"to run. The user does not need to pre-specify exact commands.\n"
|
||||
"3. Environment variables from configured .env files may already be loaded into "
|
||||
"your process; use them when validating services such as ClickHouse.\n"
|
||||
"4. Implement ONLY what the plan specifies. Do NOT add extra features, "
|
||||
"unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
|
||||
"3. Follow the project's existing coding style, naming conventions, and directory structure.\n"
|
||||
"4. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
|
||||
"5. Follow the project's existing coding style, naming conventions, and directory structure.\n"
|
||||
"6. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
|
||||
"Do NOT refactor unrelated code.\n"
|
||||
"5. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
|
||||
"6. When in doubt about scope, do LESS, not more."
|
||||
"7. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
|
||||
"8. When in doubt about scope, do LESS, not more."
|
||||
)
|
||||
|
||||
_REVIEWER_SYSTEM_PROMPT = (
|
||||
"You are a code reviewer. You MUST NOT create, modify, or delete any files.\n"
|
||||
"Rules:\n"
|
||||
"1. Explore the project directory to understand the full codebase context.\n"
|
||||
"2. Compare the implementation against the plan and checklist ONLY.\n"
|
||||
"3. Classify every issue with BOTH severity AND category:\n"
|
||||
"2. You may decide which shell, Python, test, git, docker, and database read commands "
|
||||
"to run in order to verify behavior. The user does not need to pre-specify exact commands.\n"
|
||||
"3. Environment variables from configured .env files may already be loaded into "
|
||||
"your process; use them for verification when relevant.\n"
|
||||
"4. Compare the implementation against the plan and checklist ONLY.\n"
|
||||
"5. Classify every issue with BOTH severity AND category:\n"
|
||||
" - Severity: Critical (breaks functionality/security) > Major (requirement mismatch) > Minor (convention/style)\n"
|
||||
" - Category: Over-engineering / Omission\n"
|
||||
"4. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
|
||||
"6. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
|
||||
"or DISMISSED (false positive) with rationale.\n"
|
||||
"5. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
|
||||
"6. Order issues by severity (Critical first).\n"
|
||||
"7. Do NOT suggest improvements beyond the plan scope.\n"
|
||||
"8. End with VERDICT: PASS (all requirements met, no over-engineering) "
|
||||
"7. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
|
||||
"8. Order issues by severity (Critical first).\n"
|
||||
"9. Do NOT suggest improvements beyond the plan scope.\n"
|
||||
"10. End with VERDICT: PASS (all requirements met, no over-engineering) "
|
||||
"or VERDICT: FAIL (issues found)."
|
||||
)
|
||||
|
||||
@@ -94,16 +115,20 @@ _SENIOR_SYSTEM_PROMPT = (
|
||||
"You are a senior technical reviewer coordinating a review-fix-verification loop.\n"
|
||||
"Rules:\n"
|
||||
"1. Explore the project directory to understand the full codebase context.\n"
|
||||
"2. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
|
||||
"2. You may decide which shell, Python, test, git, docker, and database read commands "
|
||||
"to run to verify disputed issues. The user does not need to pre-specify exact commands.\n"
|
||||
"3. Environment variables from configured .env files may already be loaded into "
|
||||
"your process; use them when validating service integrations.\n"
|
||||
"4. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
|
||||
"evidence-backed issues. Categorize dismissed findings as [False positive] or [Already fixed].\n"
|
||||
"3. In verification mode, judge the current implementation directly against ONLY the "
|
||||
"5. In verification mode, judge the current implementation directly against ONLY the "
|
||||
"plan and checklist.\n"
|
||||
"4. Be skeptical of false positives, but do not lower the bar on real requirement "
|
||||
"6. Be skeptical of false positives, but do not lower the bar on real requirement "
|
||||
"gaps.\n"
|
||||
"5. When issues remain, produce a concise prioritized action list the coder can act on.\n"
|
||||
"6. Maintain an Issue Tracker table across iterations to track issue status.\n"
|
||||
"7. Do NOT invent new requirements beyond the plan and checklist.\n"
|
||||
"8. End with one of three verdicts:\n"
|
||||
"7. When issues remain, produce a concise prioritized action list the coder can act on.\n"
|
||||
"8. Maintain an Issue Tracker table across iterations to track issue status.\n"
|
||||
"9. Do NOT invent new requirements beyond the plan and checklist.\n"
|
||||
"10. End with one of three verdicts:\n"
|
||||
" - VERDICT: PASS — all requirements met, no issues remain.\n"
|
||||
" - VERDICT: FAIL — issues found that the coder can fix.\n"
|
||||
" - VERDICT: ESCALATE — issues that require human intervention. Use ESCALATE when:\n"
|
||||
@@ -263,7 +288,7 @@ def _resolve_agents(
|
||||
|
||||
for name in all_referenced:
|
||||
if name not in result and name in BUILTIN_AGENTS:
|
||||
result[name] = BUILTIN_AGENTS[name]
|
||||
result[name] = copy.deepcopy(BUILTIN_AGENTS[name])
|
||||
|
||||
return result
|
||||
|
||||
@@ -354,15 +379,16 @@ def _apply_role_effort(
|
||||
|
||||
def default_config() -> PipelineConfig:
|
||||
"""Return a PipelineConfig with sensible defaults (no YAML needed)."""
|
||||
agents = dict(BUILTIN_AGENTS)
|
||||
agents = copy.deepcopy(BUILTIN_AGENTS)
|
||||
coders = ["claude-coder"]
|
||||
reviewers = ["claude-reviewer"]
|
||||
seniors: list[str] = []
|
||||
pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
|
||||
return PipelineConfig(
|
||||
output_dir=Path("output"),
|
||||
output_dir=Path(".cross-eval/output"),
|
||||
max_iterations=3,
|
||||
language="ko",
|
||||
execution=ExecutionConfig(),
|
||||
inputs={},
|
||||
agents=agents,
|
||||
coders=coders,
|
||||
@@ -406,6 +432,7 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
||||
system_prompt=agent_data.get("system_prompt"),
|
||||
reasoning_effort=agent_data.get("reasoning_effort"),
|
||||
stdin_mode=agent_data.get("stdin_mode", False),
|
||||
agentic=agent_data.get("agentic", False),
|
||||
)
|
||||
|
||||
# --- roles: explicit or inferred ---
|
||||
@@ -445,6 +472,17 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
||||
p = config_dir / p
|
||||
inputs[key] = p
|
||||
|
||||
execution_raw = raw.get("execution", {}) or {}
|
||||
execution = ExecutionConfig(
|
||||
mode=execution_raw.get("mode", "agent-decides"),
|
||||
command_policy=execution_raw.get("command_policy", "broad"),
|
||||
inherit_env=bool(execution_raw.get("inherit_env", True)),
|
||||
auto_env_files=list(execution_raw.get("auto_env_files", [".env", ".env.local"])),
|
||||
env_files=list(execution_raw.get("env_files", [])),
|
||||
expose_env_names=bool(execution_raw.get("expose_env_names", True)),
|
||||
auto_context_targets=list(execution_raw.get("auto_context_targets", [])),
|
||||
)
|
||||
|
||||
# --- pipeline (preset or custom) ---
|
||||
steps, phases = _resolve_pipeline(pipeline_raw, coders, reviewers, seniors)
|
||||
|
||||
@@ -453,12 +491,13 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
||||
if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
|
||||
preset_name = pipeline_raw.split(":", 1)[1]
|
||||
|
||||
return PipelineConfig(
|
||||
output_dir=Path(raw.get("output_dir", "output")),
|
||||
config = PipelineConfig(
|
||||
output_dir=Path(raw.get("output_dir", ".cross-eval/output")),
|
||||
max_iterations=int(raw.get("max_iterations", 3)),
|
||||
min_iterations=int(raw.get("min_iterations", 1)),
|
||||
verbose=bool(raw.get("verbose", False)),
|
||||
language=raw.get("language", "en"),
|
||||
execution=execution,
|
||||
inputs=inputs,
|
||||
agents=agents,
|
||||
coders=coders,
|
||||
@@ -470,6 +509,9 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
|
||||
_config_path=config_path,
|
||||
_config_mtime=config_path.stat().st_mtime,
|
||||
)
|
||||
sync_phased_iterations(config)
|
||||
ensure_fix_preset_agentic(config)
|
||||
return config
|
||||
|
||||
|
||||
def try_reload_config(config: PipelineConfig) -> PipelineConfig:
|
||||
@@ -619,6 +661,16 @@ def validate_config(config: PipelineConfig) -> list[str]:
|
||||
if config.language not in ("en", "ko"):
|
||||
errors.append(f"Unsupported language '{config.language}'. Use 'en' or 'ko'.")
|
||||
|
||||
if config.execution.mode not in {"agent-decides"}:
|
||||
errors.append(
|
||||
f"Unsupported execution.mode '{config.execution.mode}'. Use 'agent-decides'."
|
||||
)
|
||||
if config.execution.command_policy not in {"broad", "restricted"}:
|
||||
errors.append(
|
||||
"Unsupported execution.command_policy "
|
||||
f"'{config.execution.command_policy}'. Use 'broad' or 'restricted'."
|
||||
)
|
||||
|
||||
return errors
|
||||
|
||||
|
||||
@@ -642,6 +694,37 @@ def _validate_unique_step_fields(
|
||||
seen_output_keys.add(step.output_key)
|
||||
|
||||
|
||||
def _make_agentic(agent: AgentConfig) -> None:
|
||||
"""Convert an agent to agentic mode in-place (remove -p, set agentic=True)."""
|
||||
agent.agentic = True
|
||||
agent.args = [a for a in agent.args if a != "-p"]
|
||||
|
||||
|
||||
def sync_phased_iterations(
    config: PipelineConfig,
    max_iter: int | None = None,
) -> None:
    """Propagate the effective iteration cap to phases that carry a verdict step.

    Args:
        config: Pipeline configuration whose ``phases`` are updated in place.
        max_iter: Explicit cap to apply. When ``None``, ``config.max_iterations``
            is used instead.

    A phase is updated only if at least one of its steps has a truthy
    ``verdict``; phases without such a step keep their existing
    ``max_iterations``. When ``config.phases`` is empty or falsy the call is
    a no-op.
    """
    phases = config.phases
    if phases:
        # An explicit max_iter (including 0) takes precedence over the config value.
        limit = max_iter if max_iter is not None else config.max_iterations
        for phase in phases:
            for step in phase.steps:
                if step.verdict:
                    phase.max_iterations = limit
                    break
|
||||
|
||||
|
||||
def ensure_fix_preset_agentic(config: PipelineConfig) -> None:
    """Make the coders of a fix-style preset run agentically.

    Applies only when ``config.preset_name`` is one of ``FIX_STYLE_PRESETS``.
    In that case each name in ``config.coders`` is looked up in
    ``config.agents``; an agent that exists and is not already agentic is
    converted in place via ``_make_agentic``. Coder names with no matching
    agent entry are skipped.
    """
    if config.preset_name in FIX_STYLE_PRESETS:
        for coder_name in config.coders:
            coder = config.agents.get(coder_name)
            if coder is None or coder.agentic:
                # Missing agent or already agentic: nothing to convert.
                continue
            _make_agentic(coder)
|
||||
|
||||
|
||||
def apply_input_overrides(
|
||||
config: PipelineConfig, overrides: dict[str, str]
|
||||
) -> None:
|
||||
|
||||
Reference in New Issue
Block a user