release: cut 0.2.0 baseline

2026-03-13 21:47:54 +09:00
parent 204e071b74
commit 941304398d
15 changed files with 1930 additions and 270 deletions
--- a/cross_eval/config.py
+++ b/cross_eval/config.py
@@ -1,6 +1,7 @@
 """Configuration loading, validation, and preset resolution."""
 from __future__ import annotations

+import copy
 import logging
 import re
 from pathlib import Path
@@ -8,7 +9,13 @@ from typing import Any

 import yaml

-from cross_eval.models import AgentConfig, PhaseConfig, PipelineConfig, StepConfig
+from cross_eval.models import (
+    AgentConfig,
+    ExecutionConfig,
+    PhaseConfig,
+    PipelineConfig,
+    StepConfig,
+)
 from cross_eval.prompts import PHASED_PRESETS, PIPELINE_PRESETS

 logger = logging.getLogger(__name__)
@@ -24,6 +31,7 @@ DEFAULT_ROLE_REASONING_EFFORTS = {
    "reviewer": "medium",
    "senior": "high",
 }
+FIX_STYLE_PRESETS = {"review-fix", "coding-review-fix"}


 # ---------------------------------------------------------------------------
@@ -54,7 +62,12 @@ _CLAUDE_CODER_ARGS = list(_CLAUDE_BASE_ARGS) + [
    "bypassPermissions",
 ]

-_CLAUDE_REVIEW_ARGS = list(_CLAUDE_BASE_ARGS) + [
+_CLAUDE_REVIEW_ARGS = [
+    "--setting-sources",
+    "user",
+    "--disable-slash-commands",
+    "--model",
+    "opus",
    "--permission-mode",
    "plan",
 ]
@@ -64,29 +77,37 @@ _CODER_SYSTEM_PROMPT = (
    "Rules:\n"
    "1. FIRST explore the project directory to understand the existing codebase, "
    "patterns, and conventions before writing any code.\n"
-    "2. Implement ONLY what the plan specifies. Do NOT add extra features, "
+    "2. You may decide which shell, Python, git, docker, test, and database commands "
+    "to run. The user does not need to pre-specify exact commands.\n"
+    "3. Environment variables from configured .env files may already be loaded into "
+    "your process; use them when validating services such as ClickHouse.\n"
+    "4. Implement ONLY what the plan specifies. Do NOT add extra features, "
    "unnecessary abstractions, premature optimizations, or \"nice-to-have\" improvements.\n"
-    "3. Follow the project's existing coding style, naming conventions, and directory structure.\n"
-    "4. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
+    "5. Follow the project's existing coding style, naming conventions, and directory structure.\n"
+    "6. If previous review feedback is provided, fix ONLY the specific issues mentioned. "
    "Do NOT refactor unrelated code.\n"
-    "5. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
-    "6. When in doubt about scope, do LESS, not more."
+    "7. Ignore any items from previous feedback that were marked as DISMISSED or false positive.\n"
+    "8. When in doubt about scope, do LESS, not more."
 )

 _REVIEWER_SYSTEM_PROMPT = (
    "You are a code reviewer. You MUST NOT create, modify, or delete any files.\n"
    "Rules:\n"
    "1. Explore the project directory to understand the full codebase context.\n"
-    "2. Compare the implementation against the plan and checklist ONLY.\n"
-    "3. Classify every issue with BOTH severity AND category:\n"
+    "2. You may decide which shell, Python, test, git, docker, and database read commands "
+    "to run in order to verify behavior. The user does not need to pre-specify exact commands.\n"
+    "3. Environment variables from configured .env files may already be loaded into "
+    "your process; use them for verification when relevant.\n"
+    "4. Compare the implementation against the plan and checklist ONLY.\n"
+    "5. Classify every issue with BOTH severity AND category:\n"
    "   - Severity: Critical (breaks functionality/security) > Major (requirement mismatch) > Minor (convention/style)\n"
    "   - Category: Over-engineering / Omission\n"
-    "4. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
+    "6. When reviewing with previous feedback, mark items as CONFIRMED (still an issue) "
    "or DISMISSED (false positive) with rationale.\n"
-    "5. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
-    "6. Order issues by severity (Critical first).\n"
-    "7. Do NOT suggest improvements beyond the plan scope.\n"
-    "8. End with VERDICT: PASS (all requirements met, no over-engineering) "
+    "7. Report out-of-scope issues separately — problems found outside plan/checklist scope.\n"
+    "8. Order issues by severity (Critical first).\n"
+    "9. Do NOT suggest improvements beyond the plan scope.\n"
+    "10. End with VERDICT: PASS (all requirements met, no over-engineering) "
    "or VERDICT: FAIL (issues found)."
 )

@@ -94,16 +115,20 @@ _SENIOR_SYSTEM_PROMPT = (
    "You are a senior technical reviewer coordinating a review-fix-verification loop.\n"
    "Rules:\n"
    "1. Explore the project directory to understand the full codebase context.\n"
-    "2. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
+    "2. You may decide which shell, Python, test, git, docker, and database read commands "
+    "to run to verify disputed issues. The user does not need to pre-specify exact commands.\n"
+    "3. Environment variables from configured .env files may already be loaded into "
+    "your process; use them when validating service integrations.\n"
+    "4. In aggregation mode, deduplicate overlaps, resolve disagreements, and keep only "
    "evidence-backed issues. Categorize dismissed findings as [False positive] or [Already fixed].\n"
-    "3. In verification mode, judge the current implementation directly against ONLY the "
+    "5. In verification mode, judge the current implementation directly against ONLY the "
    "plan and checklist.\n"
-    "4. Be skeptical of false positives, but do not lower the bar on real requirement "
+    "6. Be skeptical of false positives, but do not lower the bar on real requirement "
    "gaps.\n"
-    "5. When issues remain, produce a concise prioritized action list the coder can act on.\n"
-    "6. Maintain an Issue Tracker table across iterations to track issue status.\n"
-    "7. Do NOT invent new requirements beyond the plan and checklist.\n"
-    "8. End with one of three verdicts:\n"
+    "7. When issues remain, produce a concise prioritized action list the coder can act on.\n"
+    "8. Maintain an Issue Tracker table across iterations to track issue status.\n"
+    "9. Do NOT invent new requirements beyond the plan and checklist.\n"
+    "10. End with one of three verdicts:\n"
    "   - VERDICT: PASS — all requirements met, no issues remain.\n"
    "   - VERDICT: FAIL — issues found that the coder can fix.\n"
    "   - VERDICT: ESCALATE — issues that require human intervention. Use ESCALATE when:\n"
@@ -263,7 +288,7 @@ def _resolve_agents(

    for name in all_referenced:
        if name not in result and name in BUILTIN_AGENTS:
-            result[name] = BUILTIN_AGENTS[name]
+            result[name] = copy.deepcopy(BUILTIN_AGENTS[name])

    return result

@@ -354,15 +379,16 @@ def _apply_role_effort(

 def default_config() -> PipelineConfig:
    """Return a PipelineConfig with sensible defaults (no YAML needed)."""
-    agents = dict(BUILTIN_AGENTS)
+    agents = copy.deepcopy(BUILTIN_AGENTS)
    coders = ["claude-coder"]
    reviewers = ["claude-reviewer"]
    seniors: list[str] = []
    pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
    return PipelineConfig(
-        output_dir=Path("output"),
+        output_dir=Path(".cross-eval/output"),
        max_iterations=3,
        language="ko",
+        execution=ExecutionConfig(),
        inputs={},
        agents=agents,
        coders=coders,
@@ -406,6 +432,7 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
            system_prompt=agent_data.get("system_prompt"),
            reasoning_effort=agent_data.get("reasoning_effort"),
            stdin_mode=agent_data.get("stdin_mode", False),
+            agentic=agent_data.get("agentic", False),
        )

    # --- roles: explicit or inferred ---
@@ -445,6 +472,17 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
            p = config_dir / p
        inputs[key] = p

+    execution_raw = raw.get("execution", {}) or {}
+    execution = ExecutionConfig(
+        mode=execution_raw.get("mode", "agent-decides"),
+        command_policy=execution_raw.get("command_policy", "broad"),
+        inherit_env=bool(execution_raw.get("inherit_env", True)),
+        auto_env_files=list(execution_raw.get("auto_env_files", [".env", ".env.local"])),
+        env_files=list(execution_raw.get("env_files", [])),
+        expose_env_names=bool(execution_raw.get("expose_env_names", True)),
+        auto_context_targets=list(execution_raw.get("auto_context_targets", [])),
+    )
+
    # --- pipeline (preset or custom) ---
    steps, phases = _resolve_pipeline(pipeline_raw, coders, reviewers, seniors)

@@ -453,12 +491,13 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
    if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
        preset_name = pipeline_raw.split(":", 1)[1]

-    return PipelineConfig(
-        output_dir=Path(raw.get("output_dir", "output")),
+    config = PipelineConfig(
+        output_dir=Path(raw.get("output_dir", ".cross-eval/output")),
        max_iterations=int(raw.get("max_iterations", 3)),
        min_iterations=int(raw.get("min_iterations", 1)),
        verbose=bool(raw.get("verbose", False)),
        language=raw.get("language", "en"),
+        execution=execution,
        inputs=inputs,
        agents=agents,
        coders=coders,
@@ -470,6 +509,9 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
        _config_path=config_path,
        _config_mtime=config_path.stat().st_mtime,
    )
+    sync_phased_iterations(config)
+    ensure_fix_preset_agentic(config)
+    return config


 def try_reload_config(config: PipelineConfig) -> PipelineConfig:
@@ -619,6 +661,16 @@ def validate_config(config: PipelineConfig) -> list[str]:
    if config.language not in ("en", "ko"):
        errors.append(f"Unsupported language '{config.language}'. Use 'en' or 'ko'.")

+    if config.execution.mode not in {"agent-decides"}:
+        errors.append(
+            f"Unsupported execution.mode '{config.execution.mode}'. Use 'agent-decides'."
+        )
+    if config.execution.command_policy not in {"broad", "restricted"}:
+        errors.append(
+            "Unsupported execution.command_policy "
+            f"'{config.execution.command_policy}'. Use 'broad' or 'restricted'."
+        )
+
    return errors


@@ -642,6 +694,37 @@ def _validate_unique_step_fields(
        seen_output_keys.add(step.output_key)


+def _make_agentic(agent: AgentConfig) -> None:  # mutates `agent`; nothing is returned
+    """Convert an agent to agentic mode in-place (remove -p, set agentic=True)."""
+    agent.agentic = True  # set unconditionally, even if the flag was already True
+    agent.args = [a for a in agent.args if a != "-p"]  # rebinds args to a new list; every exact "-p" entry is dropped, order of the rest is kept
+
+
+def sync_phased_iterations(
+    config: PipelineConfig,  # its phases are updated in place
+    max_iter: int | None = None,  # optional override; None means "use config.max_iterations"
+) -> None:
+    """Apply effective max iterations to converging phases while preserving setup phases."""
+    if not config.phases:  # nothing to do when phases is empty or otherwise falsy
+        return
+
+    effective_max_iter = config.max_iterations if max_iter is None else max_iter  # `is None` test: an explicit 0 override is honoured
+    for phase in config.phases:
+        if any(step.verdict for step in phase.steps):  # only phases with at least one step whose `verdict` is truthy
+            phase.max_iterations = effective_max_iter  # phases failing the test above keep their existing max_iterations
+
+
+def ensure_fix_preset_agentic(config: PipelineConfig) -> None:  # mutates agents held in config.agents; nothing is returned
+    """Fix-style presets should modify code, so coders run agentically by default."""
+    if config.preset_name not in FIX_STYLE_PRESETS:  # any preset_name outside FIX_STYLE_PRESETS leaves the config untouched
+        return
+
+    for coder_name in config.coders:  # only names listed as coders are considered
+        agent = config.agents.get(coder_name)  # .get(): a coder name missing from config.agents is skipped silently
+        if agent is not None and not agent.agentic:  # agents already flagged agentic are left as they are
+            _make_agentic(agent)  # in-place: sets agentic=True and removes "-p" from agent.args
+
+
 def apply_input_overrides(
    config: PipelineConfig, overrides: dict[str, str]
 ) -> None: