continue

2026-03-15 17:54:30 +09:00
parent 28efd5bb8f
commit 0bbe0f6f7b
14 changed files with 871 additions and 183 deletions
--- a/cross_eval/cli.py
+++ b/cross_eval/cli.py
@@ -38,7 +38,7 @@ coders: [claude-coder]
 reviewers: [claude-reviewer]
 # seniors: [codex-senior]

-# 파이프라인 종류: simple | cross-review | plan-review | review-only | review-fix | coding-review-fix
+# 파이프라인 종류: plan-review | coding-plan-review
 pipeline: preset:{preset}

 # 반복 설정
@@ -194,20 +194,12 @@ def main(argv: list[str] | None = None) -> int:
    )
    init_parser.add_argument(
        "--preset",
-        default="simple",
-        choices=[
-            "simple",
-            "cross-review",
-            "plan-review",
-            "review-only",
-            "review-fix",
-            "coding-review-fix",
-        ],
+        default="coding-plan-review",
+        choices=["plan-review", "coding-plan-review"],
        help=(
-            "파이프라인 종류 (기본: simple). "
-            "simple=코딩+리뷰, cross-review=교차리뷰, plan-review=문서리뷰수정재검증, "
-            "review-only=리뷰만, review-fix=리뷰수렴+자동수정, "
-            "coding-review-fix=초기코딩후리뷰수렴"
+            "파이프라인 종류 (기본: coding-plan-review). "
+            "plan-review=문서리뷰수정재검증, "
+            "coding-plan-review=문서기반구현후 코드+문서 리뷰/수정/재검증"
        ),
    )
    init_parser.add_argument(
@@ -252,9 +244,9 @@ def main(argv: list[str] | None = None) -> int:
    )
    demo_parser.add_argument(
        "--preset",
-        default="simple",
-        choices=["simple", "review-fix", "coding-review-fix"],
-        help="데모할 파이프라인 종류 (기본: simple)",
+        default="coding-plan-review",
+        choices=["plan-review", "coding-plan-review"],
+        help="데모할 파이프라인 종류 (기본: coding-plan-review)",
    )
    demo_parser.add_argument(
        "--escalate",
@@ -281,25 +273,12 @@ def main(argv: list[str] | None = None) -> int:
        ),
        epilog=(
            "파이프라인 종류 (--preset):\n"
-            "  ┌──────────────┬─────────────────────────────────────────────────────┐\n"
-            "  │ simple       │ Coder가 코드 작성 → Reviewer가 리뷰               │\n"
-            "  │ (기본값)     │ FAIL이면 피드백 반영해서 재코딩, PASS까지 반복     │\n"
-            "  ├──────────────┼─────────────────────────────────────────────────────┤\n"
-            "  │ review-fix   │ 2단계 파이프라인:                                  │\n"
-            "  │              │  Reviewer N명 병렬 리뷰 → 취합 → 수정 → 재검증   │\n"
-            "  ├──────────────┼─────────────────────────────────────────────────────┤\n"
-            "  │ coding-      │ 3단계 파이프라인:                                  │\n"
-            "  │ review-fix   │  초기 코딩 1회 → 리뷰 취합 → 수정 → 재검증 반복   │\n"
-            "  ├──────────────┼─────────────────────────────────────────────────────┤\n"
-            "  │ plan-review  │ 구현 전 기획서/체크리스트/문서를 검토하고       │\n"
-            "  │              │ 수정한 뒤 시니어가 재검증할 때까지 반복         │\n"
-            "  ├──────────────┼─────────────────────────────────────────────────────┤\n"
-            "  │ review-only  │ 코드 작성 없이 Reviewer N명이 기존 코드만 검토    │\n"
-            "  │              │ (이미 작성된 코드의 품질 감사용)                   │\n"
-            "  ├──────────────┼─────────────────────────────────────────────────────┤\n"
-            "  │ cross-review │ Coder 2명이 각각 구현 → 상대방 코드를 교차 리뷰   │\n"
-            "  │              │ (서로 다른 에이전트의 구현 비교용)                 │\n"
-            "  └──────────────┴─────────────────────────────────────────────────────┘\n"
+            "  ┌─────────────────────┬──────────────────────────────────────────────┐\n"
+            "  │ coding-plan-review  │ 입력 문서 기반 구현 → 코드+문서 리뷰/수정   │\n"
+            "  │ (기본값)            │ → 재검증 반복                                │\n"
+            "  ├─────────────────────┼──────────────────────────────────────────────┤\n"
+            "  │ plan-review         │ 구현 전 문서 리뷰 → 문서 수정 → 재검증 반복 │\n"
+            "  └─────────────────────┴──────────────────────────────────────────────┘\n"
            "\n"
            "기본 제공 에이전트:\n"
            "  ┌──────────────────┬─────────┬───────────┬──────────────────────────┐\n"
@@ -316,34 +295,13 @@ def main(argv: list[str] | None = None) -> int:
            "\n"
            "사용 예시:\n"
            "\n"
-            "  기본 실행 (Claude가 코딩하고 Claude가 리뷰):\n"
-            "    cross-eval run --plan plan.md\n"
-            "\n"
-            "  Codex가 코딩, Claude가 리뷰:\n"
-            "    cross-eval run --plan plan.md --coder codex --reviewer claude\n"
-            "\n"
-            "  리뷰어 2명 (Claude + Codex):\n"
-            "    cross-eval run --plan plan.md --reviewer claude --reviewer codex\n"
-            "\n"
-            "  리뷰 취합용 Senior 추가:\n"
-            "    cross-eval run --plan plan.md --preset review-fix \\\n"
-            "      --reviewer claude --reviewer codex --senior codex\n"
-            "\n"
-            "  리뷰 수렴 후 자동 수정 (review-fix):\n"
-            "    cross-eval run --plan plan.md --preset review-fix \\\n"
-            "      --reviewer claude --reviewer codex\n"
-            "\n"
-            "  초기 코딩 후 리뷰 수렴 + 자동 수정 (coding-review-fix):\n"
-            "    cross-eval run --plan plan.md --preset coding-review-fix \\\n"
-            "      --reviewer claude --reviewer codex\n"
-            "\n"
-            "  기존 코드 리뷰만 (review-only):\n"
-            "    cross-eval run --plan plan.md --preset review-only \\\n"
-            "      --reviewer claude --reviewer codex\n"
+            "  코드 + 문서 구현/리뷰 루프 (coding-plan-review):\n"
+            "    cross-eval run --plan plan.md --preset coding-plan-review \\\n"
+            "      --coder claude --reviewer codex --reviewer claude --senior codex\n"
            "\n"
            "  문서 리뷰 + 수정 + 재검증 반복 (plan-review):\n"
            "    cross-eval run --plan plan.md --preset plan-review \\\n"
-            "      --coder codex --reviewer codex\n"
+            "      --coder claude --reviewer codex --reviewer claude --senior codex\n"
            "\n"
            "  모델 변경:\n"
            "    cross-eval run --plan plan.md --model sonnet\n"
@@ -420,7 +378,11 @@ def main(argv: list[str] | None = None) -> int:
    )
    agent_group.add_argument(
        "--agentic", action="store_true", default=False,
-        help="Coder를 agentic 모드로 실행 (worktree에서 파일 직접 수정, git diff로 결과 캡처)",
+        help="Coder를 agentic 모드로 실행 (파일 직접 수정, git diff로 결과 캡처)",
+    )
+    agent_group.add_argument(
+        "--worktree", action="store_true", default=False,
+        help="기본 direct mode 대신 isolated git worktree에서 실행",
    )
    agent_group.add_argument(
        "--model", default=None, metavar="MODEL",
@@ -443,15 +405,8 @@ def main(argv: list[str] | None = None) -> int:
    pipe_group = run_parser.add_argument_group("파이프라인")
    pipe_group.add_argument(
        "--preset", default=None,
-        choices=[
-            "simple",
-            "cross-review",
-            "plan-review",
-            "review-only",
-            "review-fix",
-            "coding-review-fix",
-        ],
-        help="파이프라인 종류 (기본: simple). 각 종류 설명은 아래 참조",
+        choices=["plan-review", "coding-plan-review"],
+        help="파이프라인 종류 (기본: coding-plan-review). 각 종류 설명은 아래 참조",
    )
    pipe_group.add_argument(
        "--max-iter", type=int, default=None,
@@ -560,18 +515,11 @@ def cmd_demo(args: argparse.Namespace) -> int:
 # ---------------------------------------------------------------------------

 _PRESET_DESCRIPTIONS = {
-    "simple": "코딩 + 리뷰 (가장 기본)",
-    "review-fix": "리뷰 → 취합 → 수정 → 재검증 반복",
-    "coding-review-fix": "초기 코딩 + 리뷰 수렴 반복",
+    "coding-plan-review": "입력 문서 기반 구현 후 코드+문서 리뷰/수정 반복",
    "plan-review": "문서 리뷰 → 수정 → 재검증 반복",
-    "review-only": "기존 코드만 리뷰 (코딩 없음)",
-    "cross-review": "2명이 각각 구현 후 교차 리뷰",
 }

-_PRESET_ORDER = [
-    "simple", "review-fix", "coding-review-fix",
-    "plan-review", "review-only", "cross-review",
-]
+_PRESET_ORDER = ["coding-plan-review", "plan-review"]


 def _prompt_choice(
@@ -640,7 +588,7 @@ def _run_guided_init(target: Path) -> dict:
    coder = _prompt_text("  Coder 에이전트", default="claude")
    reviewer = _prompt_text("  Reviewer 에이전트", default="claude")

-    needs_senior = preset in ("review-fix", "coding-review-fix")
+    needs_senior = preset in ("coding-plan-review", "plan-review")
    senior = ""
    if needs_senior:
        senior = _prompt_text("  Senior 에이전트", default=reviewer)
@@ -899,10 +847,10 @@ def cmd_run(args: argparse.Namespace) -> int:
    need_rebuild = args.preset is not None or args.coders or args.reviewers or args.seniors
    if need_rebuild:
        from cross_eval.prompts import PHASED_PRESETS
-        preset = args.preset or "simple"
+        preset = args.preset or "coding-plan-review"
        # Determine which preset was configured (from YAML or defaults)
        if args.preset is None and config.phases:
-            preset = config.preset_name if config.preset_name != "custom" else "review-fix"
+            preset = config.preset_name if config.preset_name != "custom" else "coding-plan-review"
        elif args.preset is None and not args.coders and not args.reviewers and not args.seniors:
            pass  # no changes needed
        inferred_coders, inferred_reviewers, inferred_seniors = _infer_roles(
@@ -929,8 +877,6 @@ def cmd_run(args: argparse.Namespace) -> int:
        elif preset in PIPELINE_PRESETS:
            config.pipeline = PIPELINE_PRESETS[preset](coders, reviewers, seniors)
            config.phases = []
-            if preset == "review-only" and args.max_iter is None and args.min_iter is None:
-                config.max_iterations = 1

    sync_phased_iterations(config)
    if args.max_iter is not None:
@@ -951,6 +897,9 @@ def cmd_run(args: argparse.Namespace) -> int:
            if coder_name in config.agents:
                _make_agentic(config.agents[coder_name])

+    if args.worktree:
+        config.use_worktree = True
+
    ensure_fix_preset_agentic(config)

    # --model: apply to ALL agents
@@ -988,7 +937,7 @@ def cmd_run(args: argparse.Namespace) -> int:
            print(f"No files found in: {docs_dir}", file=sys.stderr)
            return 1
        config.inputs["docs"] = docs_content
-        config.inputs["docs_ref"] = str(docs_dir)
+        config.inputs["docs_ref"] = docs_dir

    if args.env_files:
        for env_file in args.env_files:
@@ -1062,6 +1011,9 @@ def cmd_run(args: argparse.Namespace) -> int:
    if not args.dry_run and result.run_dir:
        print(f"Output: {result.run_dir}/")

+    if args.dry_run:
+        return 0
+
    if result.final_verdict == "ESCALATE":
        from cross_eval.report import print_escalation_report
        print_escalation_report(config, result)
--- a/cross_eval/config.py
+++ b/cross_eval/config.py
@@ -31,7 +31,10 @@ DEFAULT_ROLE_REASONING_EFFORTS = {
    "reviewer": "medium",
    "senior": "high",
 }
-FIX_STYLE_PRESETS = {"plan-review", "review-fix", "coding-review-fix"}
+FIX_STYLE_PRESETS = {
+    "plan-review",
+    "coding-plan-review",
+}


 # ---------------------------------------------------------------------------
@@ -298,8 +301,7 @@ def _default_seniors_for_preset(
        isinstance(pipeline_raw, str)
        and pipeline_raw in {
            "preset:plan-review",
-            "preset:review-fix",
-            "preset:coding-review-fix",
+            "preset:coding-plan-review",
        }
        and reviewers
    ):
@@ -382,9 +384,11 @@ def default_config() -> PipelineConfig:
    coders = ["claude-coder"]
    reviewers = ["claude-reviewer"]
    seniors: list[str] = []
-    pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
+    pipeline: list[StepConfig] = []
+    phases = PHASED_PRESETS["coding-plan-review"](coders, reviewers, seniors)
    return PipelineConfig(
        output_dir=Path(".cross-eval/output"),
+        use_worktree=False,
        max_iterations=3,
        language="ko",
        execution=ExecutionConfig(),
@@ -394,6 +398,8 @@ def default_config() -> PipelineConfig:
        reviewers=reviewers,
        seniors=seniors,
        pipeline=pipeline,
+        phases=phases,
+        preset_name="coding-plan-review",
    )


@@ -437,7 +443,7 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:
        )

    # --- roles: explicit or inferred ---
-    pipeline_raw = raw.get("pipeline", "preset:simple")
+    pipeline_raw = raw.get("pipeline", "preset:coding-plan-review")
    coders_raw = raw.get("coders")
    reviewers_raw = raw.get("reviewers")
    seniors_raw = raw.get("seniors")
@@ -498,6 +504,7 @@ def _parse_raw(raw: dict[str, Any], config_path: Path) -> PipelineConfig:

    config = PipelineConfig(
        output_dir=output_dir,
+        use_worktree=bool(raw.get("use_worktree", False)),
        max_iterations=int(raw.get("max_iterations", 3)),
        min_iterations=int(raw.get("min_iterations", 1)),
        verbose=bool(raw.get("verbose", False)),
@@ -555,10 +562,10 @@ def _resolve_pipeline(
    """Resolve pipeline from preset string or explicit step list.

    Returns (steps, phases) tuple.  Only one will be non-empty.
-    - Simple/cross-review/plan-review/review-only → steps populated, phases empty.
-    - Phased presets (review-fix) → steps empty, phases populated.
+    - plan-review → steps populated, phases empty.
+    - coding-plan-review → steps empty, phases populated.
    """
-    # Preset: "preset:simple" or "preset:review-fix"
+    # Preset: "preset:plan-review" or "preset:coding-plan-review"
    if isinstance(pipeline_raw, str) and pipeline_raw.startswith("preset:"):
        preset_name = pipeline_raw.split(":", 1)[1]
        if preset_name in PIPELINE_PRESETS:
@@ -592,7 +599,7 @@ def _resolve_pipeline(
        return steps, []

    raise ValueError(
-        f"'pipeline' must be a preset string (e.g. 'preset:simple') "
+        f"'pipeline' must be a preset string (e.g. 'preset:plan-review') "
        f"or a list of step definitions, got {type(pipeline_raw).__name__}"
    )

--- a/cross_eval/demo.py
+++ b/cross_eval/demo.py
@@ -165,7 +165,7 @@ CYAN = "\033[36m"
 RESET = "\033[0m"


-def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None:
+def run_mock_demo(preset: str = "coding-plan-review", show_escalate: bool = False) -> None:
    """Run a simulated demo showing the full pipeline lifecycle."""
    steps = _MOCK_ESCALATE_STEPS if show_escalate else _MOCK_STEPS

@@ -229,7 +229,7 @@ def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None:


 def run_live_demo(
-    preset: str = "simple",
+    preset: str = "coding-plan-review",
    timeout: int | None = None,
 ) -> PipelineResult:
    """Run a live demo with real agents using the built-in plan."""
@@ -255,8 +255,9 @@ def run_live_demo(
        pipeline = []
        phases = PHASED_PRESETS[preset](coders, reviewers, seniors)
    else:
-        pipeline = PIPELINE_PRESETS["simple"](coders, reviewers, seniors)
-        phases = []
+        pipeline = []
+        phases = PHASED_PRESETS["coding-plan-review"](coders, reviewers, seniors)
+        

    with tempfile.TemporaryDirectory() as tmpdir:
        plan_path = Path(tmpdir) / "plan.md"
--- a/cross_eval/models.py
+++ b/cross_eval/models.py
@@ -62,6 +62,7 @@ class PipelineConfig:
    """Full cross-eval configuration."""

    output_dir: Path = field(default_factory=lambda: Path(".cross-eval/output"))
+    use_worktree: bool = False
    max_iterations: int = 3
    min_iterations: int = 1
    verbose: bool = False
--- a/cross_eval/pipeline.py
+++ b/cross_eval/pipeline.py
@@ -4,6 +4,7 @@ from __future__ import annotations
 import logging
 import os
 import re
+import shutil
 import subprocess
 import time
 from hashlib import sha256
@@ -34,6 +35,19 @@ from cross_eval.runtime_env import (
 logger = logging.getLogger(__name__)


+def _get_current_head(cwd: Path) -> str | None:
+    """Look up the commit SHA that ``HEAD`` points to in the repository at *cwd*.
+
+    Returns:
+        The stripped output of ``git rev-parse HEAD``, or ``None`` when the
+        command exits with a non-zero status or prints nothing.
+    """
+    proc = subprocess.run(
+        ["git", "rev-parse", "HEAD"],
+        cwd=cwd,
+        capture_output=True,
+        text=True,
+    )
+    head_sha = proc.stdout.strip() if proc.returncode == 0 else ""
+    # Collapse an empty result to None so callers never see a blank SHA.
+    return head_sha if head_sha else None
+
+
 def run_pipeline(
    config: PipelineConfig,
    cwd: Path | None = None,
@@ -124,8 +138,6 @@ def _copy_inputs_to_worktree(
    Updates ``config.inputs`` in-place so subsequent reference refreshes use
    worktree-local paths.
    """
-    import shutil
-
    base_root = base_cwd.resolve()
    track_external_inputs = config.preset_name == "plan-review"
    inputs_dir = worktree_path / ".cross-eval-inputs"
@@ -134,7 +146,7 @@ def _copy_inputs_to_worktree(
        # Exclude read-only input copies from git so they don't pollute code diffs.
        (inputs_dir / ".gitignore").write_text("*\n", encoding="utf-8")
    for key, val in list(config.inputs.items()):
-        if key.endswith("_ref") or not isinstance(val, Path):
+        if not isinstance(val, Path):
            continue
        if not val.exists():
            continue
@@ -143,17 +155,71 @@ def _copy_inputs_to_worktree(
            rel_path = resolved.relative_to(base_root)
        except ValueError:
            dest = inputs_dir / val.name
-            shutil.copy2(resolved, dest)
+            _copy_path(resolved, dest)
            config.inputs[key] = dest
            continue

        worktree_target = worktree_path / rel_path
        if not worktree_target.exists():
-            worktree_target.parent.mkdir(parents=True, exist_ok=True)
-            shutil.copy2(resolved, worktree_target)
+            _copy_path(resolved, worktree_target)
        config.inputs[key] = worktree_target


+def _snapshot_input_paths(config: PipelineConfig) -> dict[str, Path]:
+    """Record which inputs are currently ``Path`` objects, keyed by input name.
+
+    Inputs holding anything other than a ``Path`` (for example inline string
+    content) are left out. The result is a new dict, so later reassignment of
+    entries in ``config.inputs`` does not alter it.
+    """
+    path_inputs: dict[str, Path] = {}
+    for name, value in config.inputs.items():
+        if isinstance(value, Path):
+            path_inputs[name] = value
+    return path_inputs
+
+
+def _apply_worktree_inputs_to_base(
+    config: PipelineConfig,
+    original_inputs: dict[str, Path],
+    *,
+    cwd: Path,
+) -> list[Path]:
+    """Copy the final worktree-edited inputs back onto the user-provided paths.
+
+    Args:
+        config: Pipeline configuration whose ``inputs`` currently point at the
+            (possibly remapped) location of each input.
+        original_inputs: Input name -> path as captured before remapping.
+        cwd: Base repository directory. Accepted for interface compatibility;
+            this function does not read it.
+
+    Returns:
+        The original paths that were overwritten, in iteration order. An input
+        is left out when its current value is not an existing ``Path``, when
+        it resolves to the same location as the original, or when the copy
+        fails.
+    """
+    restored: list[Path] = []
+    for key, original_path in original_inputs.items():
+        current_path = config.inputs.get(key)
+        if not isinstance(current_path, Path) or not current_path.exists():
+            continue
+        if current_path.resolve() == original_path.resolve():
+            # Input was never remapped; nothing to copy back.
+            continue
+        try:
+            _copy_path(current_path, original_path)
+        except OSError:
+            # The pipeline runners call this from a bare ``finally`` block. An
+            # exception escaping here would replace the in-flight error and
+            # skip the remaining cleanup, so one failed copy (shutil.Error is
+            # an OSError subclass) is logged and the other inputs still run.
+            logger.warning(
+                "  Failed to copy input %s back to %s",
+                current_path,
+                original_path,
+                exc_info=True,
+            )
+            continue
+        restored.append(original_path)
+    return restored
+
+
+def _commit_base_repo_paths(cwd: Path, paths: list[Path], message: str) -> bool:
+    """Stage and commit the given paths in the repository at *cwd*.
+
+    Only paths that resolve to a location under *cwd* are considered; the
+    rest are ignored. The commit is limited to those paths, so anything else
+    already staged in the repository is left out of it.
+
+    Args:
+        cwd: Repository directory in which git is run.
+        paths: Candidate files or directories to commit.
+        message: Commit message.
+
+    Returns:
+        ``True`` when ``git commit`` exits with status 0. ``False`` when no
+        path lies under *cwd* or when the commit exits non-zero (for example
+        because there was nothing to commit).
+
+    Raises:
+        subprocess.CalledProcessError: If ``git add`` fails.
+    """
+    repo_root = cwd.resolve()
+    rel_paths: list[str] = []
+    for path in paths:
+        try:
+            rel_paths.append(str(path.resolve().relative_to(repo_root)))
+        except ValueError:
+            # relative_to() raises when the path is not under repo_root.
+            continue
+
+    if not rel_paths:
+        return False
+
+    subprocess.run(
+        ["git", "add", "--", *rel_paths],
+        cwd=cwd,
+        capture_output=True,
+        check=True,
+    )
+    # Pass the pathspec to commit as well: without it git would also commit
+    # unrelated changes the user had staged before the run.
+    result = subprocess.run(
+        ["git", "commit", "-m", message, "--", *rel_paths],
+        cwd=cwd,
+        capture_output=True,
+        text=True,
+    )
+    return result.returncode == 0
+
+
 def _snapshot_repo_state(cwd: Path) -> dict[str, str]:
    """Capture the base repository working-tree state.

@@ -344,18 +410,26 @@ def _run_simple_pipeline(

    # Setup shared worktree for agentic mode
    worktree_path: Path | None = None
+    agent_execution_path: Path | None = None
    agentic_branch_name: str | None = None
    agentic_base_commit: str | None = None
+    original_input_paths: dict[str, Path] = {}
    base_repo_state: dict[str, str] | None = None
    base_repo_status: str | None = None
    if not dry_run and _has_agentic_steps(config, config.pipeline):
-        worktree_path, agentic_branch_name, agentic_base_commit = _setup_worktree(
-            cwd, run_dir, config.preset_name,
-        )
-        _copy_inputs_to_worktree(config, worktree_path, base_cwd=cwd)
-        _refresh_input_references(config, input_contents)
-        base_repo_state = _snapshot_repo_state(cwd)
-        base_repo_status = _snapshot_repo_status(cwd)
+        if config.use_worktree:
+            worktree_path, agentic_branch_name, agentic_base_commit = _setup_worktree(
+                cwd, run_dir, config.preset_name,
+            )
+            original_input_paths = _snapshot_input_paths(config)
+            _copy_inputs_to_worktree(config, worktree_path, base_cwd=cwd)
+            _refresh_input_references(config, input_contents)
+            base_repo_state = _snapshot_repo_state(cwd)
+            base_repo_status = _snapshot_repo_status(cwd)
+            agent_execution_path = worktree_path
+        else:
+            agent_execution_path = cwd
+            agentic_base_commit = _get_current_head(cwd)

    feedback = "(no feedback — first iteration)"
    iterations: list[IterationResult] = []
@@ -381,7 +455,7 @@ def _run_simple_pipeline(
                config.pipeline, config, input_contents, feedback,
                i, config.max_iterations, cwd, timeout, dry_run,
                run_dir=run_dir, output_iter=i,
-                worktree_path=worktree_path,
+                worktree_path=agent_execution_path,
                runtime_env=runtime_env,
                base_repo_state=base_repo_state,
                base_repo_status=base_repo_status,
@@ -389,7 +463,7 @@ def _run_simple_pipeline(
            )

            # Intermediate commit so next iteration's diff only shows new changes
-            if worktree_path is not None:
+            if config.use_worktree and worktree_path is not None:
                agentic_base_commit = _commit_iteration(worktree_path, config.preset_name, i, verdict)

            iter_result = IterationResult(
@@ -480,8 +554,25 @@ def _run_simple_pipeline(
                break

    finally:
+        if config.use_worktree and worktree_path is not None and original_input_paths:
+            restored_paths = _apply_worktree_inputs_to_base(
+                config, original_input_paths, cwd=cwd,
+            )
+            if restored_paths:
+                try:
+                    committed = _commit_base_repo_paths(
+                        cwd,
+                        restored_paths,
+                        f"cross-eval: {config.preset_name} ({final_verdict})",
+                    )
+                    if committed:
+                        logger.info("  Applied and committed final input changes in base repo.")
+                    else:
+                        logger.info("  Applied final input changes in base repo (no commit created).")
+                except Exception:
+                    logger.warning("  Failed to commit final input changes in base repo", exc_info=True)
        agentic_branch: str | None = None
-        if worktree_path is not None and agentic_branch_name is not None:
+        if config.use_worktree and worktree_path is not None and agentic_branch_name is not None:
            agentic_branch = _finalize_worktree(
                cwd, worktree_path, agentic_branch_name,
                config.preset_name, final_verdict,
@@ -523,18 +614,26 @@ def _run_phased_pipeline(
    # Setup shared worktree for agentic mode
    all_phase_steps = [s for p in config.phases for s in p.steps]
    worktree_path: Path | None = None
+    agent_execution_path: Path | None = None
    agentic_branch_name: str | None = None
    agentic_base_commit: str | None = None
+    original_input_paths: dict[str, Path] = {}
    base_repo_state: dict[str, str] | None = None
    base_repo_status: str | None = None
    if not dry_run and _has_agentic_steps(config, all_phase_steps):
-        worktree_path, agentic_branch_name, agentic_base_commit = _setup_worktree(
-            cwd, run_dir, config.preset_name,
-        )
-        _copy_inputs_to_worktree(config, worktree_path, base_cwd=cwd)
-        _refresh_input_references(config, input_contents)
-        base_repo_state = _snapshot_repo_state(cwd)
-        base_repo_status = _snapshot_repo_status(cwd)
+        if config.use_worktree:
+            worktree_path, agentic_branch_name, agentic_base_commit = _setup_worktree(
+                cwd, run_dir, config.preset_name,
+            )
+            original_input_paths = _snapshot_input_paths(config)
+            _copy_inputs_to_worktree(config, worktree_path, base_cwd=cwd)
+            _refresh_input_references(config, input_contents)
+            base_repo_state = _snapshot_repo_state(cwd)
+            base_repo_status = _snapshot_repo_status(cwd)
+            agent_execution_path = worktree_path
+        else:
+            agent_execution_path = cwd
+            agentic_base_commit = _get_current_head(cwd)

    iterations: list[IterationResult] = []
    feedback = "(no feedback — first iteration)"
@@ -581,7 +680,7 @@ def _run_phased_pipeline(
                    phase.steps, config, input_contents, feedback,
                    pi, phase.max_iterations, cwd, timeout, dry_run,
                    run_dir=run_dir, output_iter=global_iter, phase_name=phase.name,
-                    worktree_path=worktree_path,
+                    worktree_path=agent_execution_path,
                    runtime_env=runtime_env,
                    base_repo_state=base_repo_state,
                    base_repo_status=base_repo_status,
@@ -589,7 +688,7 @@ def _run_phased_pipeline(
                )

                # Intermediate commit so next iteration's diff only shows new changes
-                if worktree_path is not None:
+                if config.use_worktree and worktree_path is not None:
                    agentic_base_commit = _commit_iteration(
                        worktree_path, f"{config.preset_name}/{phase.name}",
                        global_iter, verdict,
@@ -717,8 +816,25 @@ def _run_phased_pipeline(
                final_verdict = "PASS" if phase_converged else "MAX_ITERATIONS_REACHED"

    finally:
+        if config.use_worktree and worktree_path is not None and original_input_paths:
+            restored_paths = _apply_worktree_inputs_to_base(
+                config, original_input_paths, cwd=cwd,
+            )
+            if restored_paths:
+                try:
+                    committed = _commit_base_repo_paths(
+                        cwd,
+                        restored_paths,
+                        f"cross-eval: {config.preset_name} ({final_verdict})",
+                    )
+                    if committed:
+                        logger.info("  Applied and committed final input changes in base repo.")
+                    else:
+                        logger.info("  Applied final input changes in base repo (no commit created).")
+                except Exception:
+                    logger.warning("  Failed to commit final input changes in base repo", exc_info=True)
        agentic_branch: str | None = None
-        if worktree_path is not None and agentic_branch_name is not None:
+        if config.use_worktree and worktree_path is not None and agentic_branch_name is not None:
            agentic_branch = _finalize_worktree(
                cwd, worktree_path, agentic_branch_name,
                config.preset_name, final_verdict,
@@ -752,6 +868,8 @@ def _load_inputs(config: PipelineConfig) -> dict[str, str]:
    for key, val in config.inputs.items():
        if key.endswith("_ref"):
            input_contents[key] = str(val)
+        elif key == "docs":
+            input_contents[key] = _load_docs_input(config, current_value=val)
        elif isinstance(val, str):
            input_contents[key] = val
        else:
@@ -767,6 +885,8 @@ def _refresh_inputs(
    for key, val in config.inputs.items():
        if key.endswith("_ref"):
            input_contents[key] = str(val)
+        elif key == "docs":
+            input_contents[key] = _load_docs_input(config, current_value=val)
        elif isinstance(val, str):
            input_contents[key] = val
        elif isinstance(val, Path) and val.exists():
@@ -774,6 +894,40 @@ def _refresh_inputs(
    _refresh_input_references(config, input_contents)


+def _load_docs_input(config: PipelineConfig, *, current_value: Path | str) -> str:
+    """Load docs content from docs_ref when available so edits are visible next iteration.
+
+    Args:
+        config: Pipeline configuration; ``config.inputs["docs_ref"]`` is used
+            as the source when it is a ``Path`` that exists on disk.
+        current_value: The value stored under the ``docs`` input, used as a
+            fallback when no usable ``docs_ref`` is present.
+
+    Returns:
+        The concatenated docs tree when ``docs_ref`` is a directory, the file
+        text when it is a file, otherwise *current_value* itself (if it is a
+        string) or the text of the file it points to. An empty string is
+        returned when nothing can be read as UTF-8.
+    """
+    docs_ref = config.inputs.get("docs_ref")
+    docs_path = docs_ref if isinstance(docs_ref, Path) else None
+    if docs_path is not None and docs_path.exists():
+        if docs_path.is_dir():
+            return _read_docs_tree(docs_path)
+        try:
+            return docs_path.read_text(encoding="utf-8")
+        except (UnicodeDecodeError, OSError):
+            return ""
+    if isinstance(current_value, str):
+        return current_value
+    # is_file() is False for missing paths, so no separate exists() check.
+    if not current_value.is_file():
+        return ""
+    try:
+        return current_value.read_text(encoding="utf-8")
+    except (UnicodeDecodeError, OSError):
+        # Same policy as the docs_ref branch: unreadable docs become empty
+        # content rather than an exception.
+        return ""
+
+
+def _read_docs_tree(docs_dir: Path) -> str:
+    """Concatenate every readable, non-hidden file below *docs_dir*.
+
+    Entries are visited in sorted path order. A file is skipped when any
+    component of its path relative to *docs_dir* starts with ``.``, or when it
+    cannot be read as UTF-8. Each file contributes a ``### <relative path>``
+    heading followed by its text; sections are separated by a blank line.
+    """
+    sections: list[str] = []
+    for entry in sorted(docs_dir.rglob("*")):
+        if not entry.is_file():
+            continue
+        relative = entry.relative_to(docs_dir)
+        if any(part.startswith(".") for part in relative.parts):
+            continue
+        try:
+            text = entry.read_text(encoding="utf-8")
+        except (UnicodeDecodeError, OSError):
+            continue
+        sections.append(f"### {relative.as_posix()}\n{text}")
+    return "\n\n".join(sections)
+
+
 def _refresh_input_references(
    config: PipelineConfig,
    input_contents: dict[str, str],
@@ -1703,3 +1857,12 @@ def _save_report(run_dir: Path, config: PipelineConfig, result: PipelineResult)
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(report, encoding="utf-8")
    logger.info("Report saved: %s", report_path)
+
+
+def _copy_path(src: Path, dest: Path) -> None:
+    """Copy *src* to *dest*, whether it is a single file or a directory tree.
+
+    A directory is copied recursively and merged into *dest* if that already
+    exists. For a file, missing parent directories of *dest* are created
+    first and the copy is made with ``shutil.copy2``.
+    """
+    if not src.is_dir():
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(src, dest)
+    else:
+        shutil.copytree(src, dest, dirs_exist_ok=True)
--- a/cross_eval/prompts.py
+++ b/cross_eval/prompts.py
@@ -512,6 +512,218 @@ PLAN_FIX_TEMPLATE_KO = """\
 8. 수정이 끝나면 무엇을 바꿨는지와 아직 사람 판단이 필요한 blocker가 있는지 짧게 정리하세요.
 """

+# English prompt for re-checking a planning package after plan-only revisions.
+# Placeholders: {plan}, {checklist}, {docs}, {iteration}, {max_iterations},
+# {feedback}, {execution_evidence}.
+# The prompt asks the reviewer to finish with "VERDICT: PASS" or "VERDICT: FAIL".
+PLAN_VERIFY_TEMPLATE = """\
+You are verifying the latest planning package after plan-only revisions.
+
+## Plan
+{plan}
+
+## Checklist
+{checklist}
+
+## Reference Documents
+{docs}
+
+## Previous Review (iteration {iteration} of {max_iterations})
+{feedback}
+
+## Execution Evidence
+{execution_evidence}
+
+## Verify Instructions
+Review the latest planning package itself: the plan, checklist, and reference documents.
+You MAY inspect the current repository to confirm that the documents describe the current reality accurately enough.
+Do NOT require production code, scripts, infrastructure, or external environments to already be fixed.
+
+For `plan-review`, PASS means the documents are now clear enough to execute without further document edits.
+A known implementation gap, repo mismatch, legacy script problem, external dependency, or environment blocker is NOT a FAIL by itself if:
+- the issue is described accurately in the planning package,
+- the affected scope or gate is documented clearly,
+- the required follow-up action or non-go condition is documented clearly, and
+- the package does not misrepresent unresolved work as already complete.
+
+Only mark FAIL when the planning package still needs correction, such as:
+- unresolved ambiguity or contradiction in the documents,
+- missing prerequisite, dependency, gate, ownership, or evidence rule,
+- a known blocker that is still described inaccurately or misleadingly,
+- conflicting source-of-truth rules across the planning documents,
+- checklist or status criteria that would cause an operator to make the wrong decision.
+
+Report implementation/repository problems that are already documented correctly under "Out of Scope Issues" or note them as documented risks, not as FAIL reasons.
+
+## Output Format
+
+### Remaining Document Issues
+- [Major][Omission] Description (reference specific plan/checklist/doc item)
+(Write "None" if no document issue remains.)
+
+### Documented Risks / Out of Scope
+- Description of a real implementation/repository/environment risk that is already documented correctly
+(Write "None" if nothing notable remains.)
+
+### Summary
+- Remaining document issues: N
+- Documented risks / out-of-scope items: N
+- Overall quality: [BRIEF ASSESSMENT]
+
+### Verdict
+If the planning package no longer needs document changes, output: VERDICT: PASS
+Otherwise output: VERDICT: FAIL
+"""
+
+# Korean-language prompt for re-checking a planning package after plan-only revisions.
+# Placeholders: {plan}, {checklist}, {docs}, {iteration}, {max_iterations},
+# {feedback}, {execution_evidence}.
+# The prompt asks the reviewer to finish with "VERDICT: PASS" or "VERDICT: FAIL".
+PLAN_VERIFY_TEMPLATE_KO = """\
+당신은 plan-only 수정 이후 최신 기획 패키지를 재검증하는 검토자입니다.
+
+## 기획서
+{plan}
+
+## 체크리스트
+{checklist}
+
+## 참고 문서
+{docs}
+
+## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
+{feedback}
+
+## 실행 증거
+{execution_evidence}
+
+## 검증 지침
+최신 기획 패키지 자체를 다시 검토하세요: 기획서, 체크리스트, 참고 문서를 함께 봅니다.
+현재 저장소를 살펴보며 문서가 현실을 정확히 설명하는지 확인할 수는 있지만, 프로덕션 코드, 스크립트, 인프라, 외부 환경이 이미 수정되어 있을 것을 요구하면 안 됩니다.
+
+`plan-review`에서 PASS의 뜻은 "이제 문서를 더 고칠 필요 없이 이 계획을 실행할 수 있다"입니다.
+즉 구현 공백, 저장소 불일치, legacy 스크립트 문제, 외부 의존성, 환경 blocker가 남아 있어도 아래 조건을 만족하면 FAIL 사유가 아닙니다.
+- 그 문제가 기획 패키지에 정확히 기록되어 있고
+- 어떤 범위/게이트에 영향을 주는지 분명히 적혀 있고
+- 필요한 후속 조치나 non-go 조건이 명확히 적혀 있고
+- 아직 해결되지 않은 일을 이미 해결된 것처럼 오해하게 만들지 않는 경우
+
+반대로 아래와 같은 경우에만 FAIL로 판정하세요.
+- 문서 안에 아직 모호성이나 모순이 남아 있는 경우
+- 선행조건, 의존성, 게이트, 담당 주체, evidence 규칙이 빠진 경우
+- 알려진 blocker가 여전히 부정확하거나 오해를 부르는 방식으로 서술된 경우
+- 기획 문서들 사이에서 source-of-truth 규칙이 충돌하는 경우
+- 체크리스트나 상태 판정 기준 때문에 실행자가 잘못된 결정을 내릴 수 있는 경우
+
+이미 문서에 정확히 기록된 구현/저장소 문제는 "범위 밖 이슈" 또는 "문서화된 리스크"로만 남기고, 그 자체를 FAIL 사유로 삼지 마세요.
+
+## 출력 형식
+
+### 남은 문서 이슈
+- [Major][누락] 이슈 설명 (관련 기획서/체크리스트/참고 문서 항목 참조)
+(남은 문서 이슈가 없으면 "없음"이라고 작성하세요.)
+
+### 문서화된 리스크 / 범위 밖 이슈
+- 실제 구현/저장소/환경 리스크이지만 문서에는 이미 정확히 반영된 항목
+(해당 사항이 없으면 "없음"이라고 작성하세요.)
+
+### 요약
+- 남은 문서 이슈 수: N
+- 문서화된 리스크 / 범위 밖 항목 수: N
+- 전체 품질: [간략한 평가]
+
+### 판정
+기획 패키지를 더 수정할 필요가 없으면: VERDICT: PASS
+그렇지 않으면: VERDICT: FAIL
+"""
+
+# English prompt for reviewing the implementation and the planning package together.
+# Placeholders: {artifact_references}, {execution_evidence}.
+# NOTE(review): the text mentions previous review feedback "provided above", but this
+# template has no {feedback} placeholder; presumably prior reviews arrive through
+# {artifact_references} -- confirm against the caller.
+CODING_PLAN_REVIEW_TEMPLATE = """\
+You are reviewing both the implementation and the planning package together.
+
+## Artifact References
+{artifact_references}
+
+## Execution Evidence
+{execution_evidence}
+
+## Review Instructions
+Read the referenced plan/checklist/docs/review artifacts directly from disk. \
+Inspect the current repository and evaluate BOTH:
+1. whether the implementation matches the plan/checklist/docs, and
+2. whether the planning package still accurately describes the implementation target and constraints.
+
+Report only issues that matter to delivering the original plan correctly. \
+Do not invent new scope. Distinguish between code issues, document issues, and consistency gaps between them.
+
+For each issue found, classify it with BOTH severity AND category:
+- Severity: Critical / Major / Minor
+- Category: Over-engineering / Omission
+
+If previous review feedback is provided above, mark each prior item as CONFIRMED or DISMISSED.
+If you find issues outside the original plan scope, report them separately under "Out of Scope Issues".
+
+### Verdict
+If the implementation satisfies the plan/checklist and the planning package no longer needs correction, output: VERDICT: PASS
+Otherwise output: VERDICT: FAIL
+"""
+
+# Korean-language prompt for reviewing the implementation and the planning package together.
+# Placeholders: {artifact_references}, {execution_evidence}.
+# NOTE(review): the text asks for prior review items to be marked CONFIRMED/DISMISSED,
+# but this template has no {feedback} placeholder; presumably prior reviews arrive
+# through {artifact_references} -- confirm against the caller.
+CODING_PLAN_REVIEW_TEMPLATE_KO = """\
+당신은 구현 결과와 기획 문서 패키지를 함께 검토하는 리뷰어입니다.
+
+## 참조 아티팩트
+{artifact_references}
+
+## 실행 증거
+{execution_evidence}
+
+## 검토 지침
+참조된 plan/checklist/docs/review markdown를 직접 읽고 현재 저장소를 확인한 뒤, 아래 두 가지를 함께 평가하세요.
+1. 현재 구현이 plan/checklist/docs와 일치하는가
+2. 기획 문서 패키지가 현재 구현 목표와 제약을 여전히 정확하게 설명하는가
+
+원래 계획을 제대로 완수하는 데 필요한 이슈만 보고하세요. 새로운 범위를 만들지 마세요.
+코드 이슈, 문서 이슈, 코드-문서 불일치를 구분해서 적으세요.
+
+발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요.
+- 심각도: Critical / Major / Minor
+- 카테고리: 과최적화 / 누락
+
+이전 리뷰 피드백이 있으면 각 항목을 CONFIRMED 또는 DISMISSED로 판정하세요.
+원래 계획 범위 밖 이슈는 "범위 밖 이슈"로 별도 분리하세요.
+
+### 판정
+구현이 plan/checklist를 충족하고 기획 문서 패키지도 더 이상 수정할 필요가 없으면: VERDICT: PASS
+그렇지 않으면: VERDICT: FAIL
+"""
+
+# English prompt for fixing confirmed review issues in both code and planning documents.
+# Placeholders: {artifact_references}, {feedback}.
+# Unlike the review prompts, this one asks for a change summary and not a VERDICT line.
+CODING_PLAN_FIX_TEMPLATE = """\
+You are fixing confirmed issues in both the implementation and the planning package.
+
+## Artifact References
+{artifact_references}
+
+## Current Review Feedback
+{feedback}
+
+## Instructions
+1. Read the referenced plan/checklist/docs/review artifacts directly from disk.
+2. Fix ONLY the confirmed issues from the current review feedback.
+3. You may update both implementation files and planning artifacts when needed.
+4. Preserve the original plan intent and scope. Do not silently broaden requirements.
+5. Keep code, plan, checklist, and supporting docs consistent after edits.
+6. After editing, briefly summarize what you changed and any blocker that still needs human input.
+"""
+
+# Korean-language prompt for fixing confirmed review issues in both code and planning documents.
+# Placeholders: {artifact_references}, {feedback}.
+# Unlike the review prompts, this one asks for a change summary and not a VERDICT line.
+CODING_PLAN_FIX_TEMPLATE_KO = """\
+당신은 현재 리뷰에서 확정된 이슈를 코드와 기획 문서 패키지에 함께 반영하는 수정 담당자입니다.
+
+## 참조 아티팩트
+{artifact_references}
+
+## 현재 리뷰 피드백
+{feedback}
+
+## 지침
+1. 참조된 plan/checklist/docs/review markdown를 직접 읽으세요.
+2. 현재 리뷰 피드백에서 확정된 이슈만 수정하세요.
+3. 필요하면 코드와 기획 문서를 모두 수정할 수 있습니다.
+4. 최초 plan의 의도와 범위를 유지하세요. 요구사항을 몰래 넓히지 마세요.
+5. 수정 후 코드, plan, checklist, 참고 문서가 서로 모순되지 않게 유지하세요.
+6. 수정이 끝나면 무엇을 바꿨는지와 아직 사람 판단이 필요한 blocker가 있는지 짧게 정리하세요.
+"""
+
 AGGREGATE_REVIEW_TEMPLATE = """\
 You are adjudicating multiple review results and turning them into an actionable decision.

@@ -645,6 +857,9 @@ DEFAULT_TEMPLATES: dict[str, dict[str, str]] = {
        "review": REVIEW_TEMPLATE,
        "plan-review": PLAN_REVIEW_TEMPLATE,
        "plan-fix": PLAN_FIX_TEMPLATE,
+        "plan-verify": PLAN_VERIFY_TEMPLATE,
+        "coding-plan-review": CODING_PLAN_REVIEW_TEMPLATE,
+        "coding-plan-fix": CODING_PLAN_FIX_TEMPLATE,
        "review-only": REVIEW_ONLY_TEMPLATE,
        "aggregate-review": AGGREGATE_REVIEW_TEMPLATE,
    },
@@ -653,6 +868,9 @@ DEFAULT_TEMPLATES: dict[str, dict[str, str]] = {
        "review": REVIEW_TEMPLATE_KO,
        "plan-review": PLAN_REVIEW_TEMPLATE_KO,
        "plan-fix": PLAN_FIX_TEMPLATE_KO,
+        "plan-verify": PLAN_VERIFY_TEMPLATE_KO,
+        "coding-plan-review": CODING_PLAN_REVIEW_TEMPLATE_KO,
+        "coding-plan-fix": CODING_PLAN_FIX_TEMPLATE_KO,
        "review-only": REVIEW_ONLY_TEMPLATE_KO,
        "aggregate-review": AGGREGATE_REVIEW_TEMPLATE_KO,
    },
@@ -961,7 +1179,7 @@ def _build_plan_review_preset(
            name="verify",
            agent=senior_agent,
            role="review",
-            prompt_template="default:plan-review",
+            prompt_template="default:plan-verify",
            output_key="verify_result",
            verdict=True,
        ),
@@ -1065,16 +1283,97 @@ def _build_coding_review_fix_preset(
    ]


+def _build_coding_plan_review_preset(
+    coders: list[str], reviewers: list[str], seniors: list[str],
+) -> list[PhaseConfig]:
+    """Build the two-phase ``coding-plan-review`` preset.
+
+    Phase ``initial_coding`` holds a single coding step for the first coder and
+    is limited to one iteration. Phase ``coding_plan_review`` holds one review
+    step per reviewer (flagged ``parallel``), an aggregate-review step, a fix
+    step for the first coder fed with the aggregate review, and a verify step
+    that carries the verdict; it allows up to 5 iterations.
+
+    The aggregate and verify steps are assigned to the first senior, or to the
+    first reviewer when ``seniors`` is empty.
+
+    Raises:
+        ValueError: If ``coders`` or ``reviewers`` is empty.
+    """
+    if not coders:
+        raise ValueError("'coding-plan-review' preset requires at least 1 coder")
+    if not reviewers:
+        raise ValueError("'coding-plan-review' preset requires at least 1 reviewer")
+
+    primary_coder = coders[0]
+    adjudicator = seniors[0] if seniors else reviewers[0]
+
+    # Step names double as output keys, so both lists carry the same values.
+    review_step_names = [f"review_{key}" for key in _unique_safe_keys(reviewers)]
+    review_output_keys = list(review_step_names)
+
+    per_reviewer_steps: list[StepConfig] = [
+        StepConfig(
+            name=step_name,
+            agent=reviewer,
+            role="review",
+            prompt_template="default:coding-plan-review",
+            output_key=step_name,
+            verdict=False,
+            parallel=True,
+        )
+        for reviewer, step_name in zip(reviewers, review_step_names)
+    ]
+
+    aggregate_step = StepConfig(
+        name="aggregate_review",
+        agent=adjudicator,
+        role="review",
+        prompt_template="default:aggregate-review",
+        output_key="aggregate_review",
+        context_override={
+            "candidate_outputs": (
+                "Current implementation and planning package under review "
+                "(code + plan/checklist/reference docs)."
+            ),
+            "reviews_bundle": _build_named_bundle(
+                reviewers, review_step_names, review_output_keys, "Review",
+            ),
+        },
+    )
+    fix_step = StepConfig(
+        name="coding_plan_fix",
+        agent=primary_coder,
+        role="coding",
+        prompt_template="default:coding-plan-fix",
+        output_key="coding_plan_fix_output",
+        # The fixer receives the adjudicated review as its feedback.
+        context_override={"feedback": "{aggregate_review}"},
+    )
+    verify_step = StepConfig(
+        name="verify",
+        agent=adjudicator,
+        role="review",
+        prompt_template="default:coding-plan-review",
+        output_key="verify_result",
+        verdict=True,
+    )
+
+    initial_phase = PhaseConfig(
+        name="initial_coding",
+        steps=[
+            StepConfig(
+                name="coding",
+                agent=primary_coder,
+                role="coding",
+                prompt_template="default:coding",
+                output_key="coding_output",
+            ),
+        ],
+        max_iterations=1,
+        consecutive_pass=1,
+    )
+    review_phase = PhaseConfig(
+        name="coding_plan_review",
+        steps=[*per_reviewer_steps, aggregate_step, fix_step, verify_step],
+        max_iterations=5,
+        consecutive_pass=1,
+    )
+    return [initial_phase, review_phase]
+
+
+# Preset name -> builder function.
+# NOTE(review): presumably these builders return a flat step list, unlike the
+# phase-based builders in PHASED_PRESETS -- confirm against the builder signatures.
 PIPELINE_PRESETS: dict[str, Callable] = {
-    "simple": _build_simple_preset,
-    "cross-review": _build_cross_review_preset,
     "plan-review": _build_plan_review_preset,
-    "review-only": _build_review_only_preset,
 }
 
+# Preset name -> builder function that returns a list of PhaseConfig phases.
 PHASED_PRESETS: dict[str, Callable] = {
-    "review-fix": _build_review_fix_preset,
-    "coding-review-fix": _build_coding_review_fix_preset,
+    "coding-plan-review": _build_coding_plan_review_preset,
 }
 
+# Every selectable preset name: pipeline presets first, then phased presets.
 ALL_PRESET_NAMES: list[str] = list(PIPELINE_PRESETS.keys()) + list(PHASED_PRESETS.keys())