continue

2026-03-15 17:54:30 +09:00
parent 28efd5bb8f
commit 0bbe0f6f7b
14 changed files with 871 additions and 183 deletions
--- a/cross_eval/prompts.py
+++ b/cross_eval/prompts.py
@@ -512,6 +512,218 @@ PLAN_FIX_TEMPLATE_KO = """\
 8. 수정이 끝나면 무엇을 바꿨는지와 아직 사람 판단이 필요한 blocker가 있는지 짧게 정리하세요.
 """

+PLAN_VERIFY_TEMPLATE = """\
+You are verifying the latest planning package after plan-only revisions.
+
+## Plan
+{plan}
+
+## Checklist
+{checklist}
+
+## Reference Documents
+{docs}
+
+## Previous Review (iteration {iteration} of {max_iterations})
+{feedback}
+
+## Execution Evidence
+{execution_evidence}
+
+## Verify Instructions
+Review the latest planning package itself: the plan, checklist, and reference documents.
+You MAY inspect the current repository to confirm that the documents describe the current reality accurately enough.
+Do NOT require production code, scripts, infrastructure, or external environments to already be fixed.
+
+For `plan-review`, PASS means the documents are now clear enough to execute without further document edits.
+A known implementation gap, repo mismatch, legacy script problem, external dependency, or environment blocker is NOT a FAIL by itself if:
+- the issue is described accurately in the planning package,
+- the affected scope or gate is documented clearly,
+- the required follow-up action or non-go condition is documented clearly, and
+- the package does not misrepresent unresolved work as already complete.
+
+Only mark FAIL when the planning package still needs correction, such as:
+- unresolved ambiguity or contradiction in the documents,
+- missing prerequisite, dependency, gate, ownership, or evidence rule,
+- a known blocker that is still described inaccurately or misleadingly,
+- conflicting source-of-truth rules across the planning documents,
+- checklist or status criteria that would cause an operator to make the wrong decision.
+
+Report implementation/repository problems that are already documented correctly under "Out of Scope Issues" or note them as documented risks, not as FAIL reasons.
+
+## Output Format
+
+### Remaining Document Issues
+- [Major][Omission] Description (reference specific plan/checklist/doc item)
+(Write "None" if no document issue remains.)
+
+### Documented Risks / Out of Scope
+- Description of a real implementation/repository/environment risk that is already documented correctly
+(Write "None" if nothing notable remains.)
+
+### Summary
+- Remaining document issues: N
+- Documented risks / out-of-scope items: N
+- Overall quality: [BRIEF ASSESSMENT]
+
+### Verdict
+If the planning package no longer needs document changes, output: VERDICT: PASS
+Otherwise output: VERDICT: FAIL
+"""  # English plan-verify prompt. Placeholders: plan, checklist, docs, iteration, max_iterations, feedback, execution_evidence. NOTE(review): the instructions cite an "Out of Scope Issues" heading, but the output format names that section "Documented Risks / Out of Scope" - confirm which wording is intended.
+
+PLAN_VERIFY_TEMPLATE_KO = """\
+당신은 plan-only 수정 이후 최신 기획 패키지를 재검증하는 검토자입니다.
+
+## 기획서
+{plan}
+
+## 체크리스트
+{checklist}
+
+## 참고 문서
+{docs}
+
+## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
+{feedback}
+
+## 실행 증거
+{execution_evidence}
+
+## 검증 지침
+최신 기획 패키지 자체를 다시 검토하세요: 기획서, 체크리스트, 참고 문서를 함께 봅니다.
+현재 저장소를 살펴보며 문서가 현실을 정확히 설명하는지 확인할 수는 있지만, 프로덕션 코드, 스크립트, 인프라, 외부 환경이 이미 수정되어 있을 것을 요구하면 안 됩니다.
+
+`plan-review`에서 PASS의 뜻은 "이제 문서를 더 고칠 필요 없이 이 계획을 실행할 수 있다"입니다.
+즉 구현 공백, 저장소 불일치, legacy 스크립트 문제, 외부 의존성, 환경 blocker가 남아 있어도 아래 조건을 만족하면 FAIL 사유가 아닙니다.
+- 그 문제가 기획 패키지에 정확히 기록되어 있고
+- 어떤 범위/게이트에 영향을 주는지 분명히 적혀 있고
+- 필요한 후속 조치나 non-go 조건이 명확히 적혀 있고
+- 아직 해결되지 않은 일을 이미 해결된 것처럼 오해하게 만들지 않는 경우
+
+반대로 아래와 같은 경우에만 FAIL로 판정하세요.
+- 문서 안에 아직 모호성이나 모순이 남아 있는 경우
+- 선행조건, 의존성, 게이트, 담당 주체, evidence 규칙이 빠진 경우
+- 알려진 blocker가 여전히 부정확하거나 오해를 부르는 방식으로 서술된 경우
+- 기획 문서들 사이에서 source-of-truth 규칙이 충돌하는 경우
+- 체크리스트나 상태 판정 기준 때문에 실행자가 잘못된 결정을 내릴 수 있는 경우
+
+이미 문서에 정확히 기록된 구현/저장소 문제는 "범위 밖 이슈" 또는 "문서화된 리스크"로만 남기고, 그 자체를 FAIL 사유로 삼지 마세요.
+
+## 출력 형식
+
+### 남은 문서 이슈
+- [Major][누락] 이슈 설명 (관련 기획서/체크리스트/참고 문서 항목 참조)
+(남은 문서 이슈가 없으면 "없음"이라고 작성하세요.)
+
+### 문서화된 리스크 / 범위 밖 이슈
+- 실제 구현/저장소/환경 리스크이지만 문서에는 이미 정확히 반영된 항목
+(해당 사항이 없으면 "없음"이라고 작성하세요.)
+
+### 요약
+- 남은 문서 이슈 수: N
+- 문서화된 리스크 / 범위 밖 항목 수: N
+- 전체 품질: [간략한 평가]
+
+### 판정
+기획 패키지를 더 수정할 필요가 없으면: VERDICT: PASS
+그렇지 않으면: VERDICT: FAIL
+"""  # Korean-language plan-verify prompt. Placeholders: plan, checklist, docs, max_iterations, iteration, feedback, execution_evidence. The verdict markers stay in English ("VERDICT: PASS" / "VERDICT: FAIL").
+
+CODING_PLAN_REVIEW_TEMPLATE = """\
+You are reviewing both the implementation and the planning package together.
+
+## Artifact References
+{artifact_references}
+
+## Execution Evidence
+{execution_evidence}
+
+## Review Instructions
+Read the referenced plan/checklist/docs/review artifacts directly from disk. \
+Inspect the current repository and evaluate BOTH:
+1. whether the implementation matches the plan/checklist/docs, and
+2. whether the planning package still accurately describes the implementation target and constraints.
+
+Report only issues that matter to delivering the original plan correctly. \
+Do not invent new scope. Distinguish between code issues, document issues, and consistency gaps between them.
+
+For each issue found, classify it with BOTH severity AND category:
+- Severity: Critical / Major / Minor
+- Category: Over-engineering / Omission
+
+If previous review feedback is provided above, mark each prior item as CONFIRMED or DISMISSED.
+If you find issues outside the original plan scope, report them separately under "Out of Scope Issues".
+
+### Verdict
+If the implementation satisfies the plan/checklist and the planning package no longer needs correction, output: VERDICT: PASS
+Otherwise output: VERDICT: FAIL
+"""  # English code+plan review prompt. Placeholders: artifact_references, execution_evidence. NOTE(review): the text refers to previous feedback "provided above", but this template has no {feedback} placeholder - presumably prior reviews arrive via artifact_references; confirm.
+
+CODING_PLAN_REVIEW_TEMPLATE_KO = """\
+당신은 구현 결과와 기획 문서 패키지를 함께 검토하는 리뷰어입니다.
+
+## 참조 아티팩트
+{artifact_references}
+
+## 실행 증거
+{execution_evidence}
+
+## 검토 지침
+참조된 plan/checklist/docs/review markdown를 직접 읽고 현재 저장소를 확인한 뒤, 아래 두 가지를 함께 평가하세요.
+1. 현재 구현이 plan/checklist/docs와 일치하는가
+2. 기획 문서 패키지가 현재 구현 목표와 제약을 여전히 정확하게 설명하는가
+
+원래 계획을 제대로 완수하는 데 필요한 이슈만 보고하세요. 새로운 범위를 만들지 마세요.
+코드 이슈, 문서 이슈, 코드-문서 불일치를 구분해서 적으세요.
+
+발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요.
+- 심각도: Critical / Major / Minor
+- 카테고리: 과최적화 / 누락
+
+이전 리뷰 피드백이 있으면 각 항목을 CONFIRMED 또는 DISMISSED로 판정하세요.
+원래 계획 범위 밖 이슈는 "범위 밖 이슈"로 별도 분리하세요.
+
+### 판정
+구현이 plan/checklist를 충족하고 기획 문서 패키지도 더 이상 수정할 필요가 없으면: VERDICT: PASS
+그렇지 않으면: VERDICT: FAIL
+"""  # Korean-language code+plan review prompt. Placeholders: artifact_references, execution_evidence. The CONFIRMED/DISMISSED and VERDICT markers stay in English.
+
+CODING_PLAN_FIX_TEMPLATE = """\
+You are fixing confirmed issues in both the implementation and the planning package.
+
+## Artifact References
+{artifact_references}
+
+## Current Review Feedback
+{feedback}
+
+## Instructions
+1. Read the referenced plan/checklist/docs/review artifacts directly from disk.
+2. Fix ONLY the confirmed issues from the current review feedback.
+3. You may update both implementation files and planning artifacts when needed.
+4. Preserve the original plan intent and scope. Do not silently broaden requirements.
+5. Keep code, plan, checklist, and supporting docs consistent after edits.
+6. After editing, briefly summarize what you changed and any blocker that still needs human input.
+"""  # English code+plan fix prompt. Placeholders: artifact_references, feedback. It asks for no VERDICT line.
+
+CODING_PLAN_FIX_TEMPLATE_KO = """\
+당신은 현재 리뷰에서 확정된 이슈를 코드와 기획 문서 패키지에 함께 반영하는 수정 담당자입니다.
+
+## 참조 아티팩트
+{artifact_references}
+
+## 현재 리뷰 피드백
+{feedback}
+
+## 지침
+1. 참조된 plan/checklist/docs/review markdown를 직접 읽으세요.
+2. 현재 리뷰 피드백에서 확정된 이슈만 수정하세요.
+3. 필요하면 코드와 기획 문서를 모두 수정할 수 있습니다.
+4. 최초 plan의 의도와 범위를 유지하세요. 요구사항을 몰래 넓히지 마세요.
+5. 수정 후 코드, plan, checklist, 참고 문서가 서로 모순되지 않게 유지하세요.
+6. 수정이 끝나면 무엇을 바꿨는지와 아직 사람 판단이 필요한 blocker가 있는지 짧게 정리하세요.
+"""  # Korean-language code+plan fix prompt. Placeholders: artifact_references, feedback. It asks for no VERDICT line.
+
 AGGREGATE_REVIEW_TEMPLATE = """\
 You are adjudicating multiple review results and turning them into an actionable decision.

@@ -645,6 +857,9 @@ DEFAULT_TEMPLATES: dict[str, dict[str, str]] = {
        "review": REVIEW_TEMPLATE,
        "plan-review": PLAN_REVIEW_TEMPLATE,
        "plan-fix": PLAN_FIX_TEMPLATE,
+        "plan-verify": PLAN_VERIFY_TEMPLATE,
+        "coding-plan-review": CODING_PLAN_REVIEW_TEMPLATE,
+        "coding-plan-fix": CODING_PLAN_FIX_TEMPLATE,
        "review-only": REVIEW_ONLY_TEMPLATE,
        "aggregate-review": AGGREGATE_REVIEW_TEMPLATE,
    },
@@ -653,6 +868,9 @@ DEFAULT_TEMPLATES: dict[str, dict[str, str]] = {
        "review": REVIEW_TEMPLATE_KO,
        "plan-review": PLAN_REVIEW_TEMPLATE_KO,
        "plan-fix": PLAN_FIX_TEMPLATE_KO,
+        "plan-verify": PLAN_VERIFY_TEMPLATE_KO,
+        "coding-plan-review": CODING_PLAN_REVIEW_TEMPLATE_KO,
+        "coding-plan-fix": CODING_PLAN_FIX_TEMPLATE_KO,
        "review-only": REVIEW_ONLY_TEMPLATE_KO,
        "aggregate-review": AGGREGATE_REVIEW_TEMPLATE_KO,
    },
@@ -961,7 +1179,7 @@ def _build_plan_review_preset(
            name="verify",
            agent=senior_agent,
            role="review",
-            prompt_template="default:plan-review",
+            prompt_template="default:plan-verify",
            output_key="verify_result",
            verdict=True,
        ),
@@ -1065,16 +1283,97 @@ def _build_coding_review_fix_preset(
    ]


+def _build_coding_plan_review_preset(  # returns two phases: "initial_coding", then "coding_plan_review"
+    coders: list[str], reviewers: list[str], seniors: list[str],  # agent identifiers per role; seniors may be empty
+) -> list[PhaseConfig]:
+    """Implement from plan/docs, then review and fix code+docs together."""
+    if not coders:  # raises ValueError: at least one coder is mandatory
+        raise ValueError("'coding-plan-review' preset requires at least 1 coder")
+    if not reviewers:  # raises ValueError: at least one reviewer is mandatory
+        raise ValueError("'coding-plan-review' preset requires at least 1 reviewer")
+
+    review_steps: list[StepConfig] = []  # one review step per reviewer, in the order given
+    reviewer_keys = _unique_safe_keys(reviewers)  # presumably one collision-free key per reviewer; helper is defined elsewhere - confirm
+    for reviewer, rk in zip(reviewers, reviewer_keys):
+        review_steps.append(
+            StepConfig(
+                name=f"review_{rk}",
+                agent=reviewer,
+                role="review",
+                prompt_template="default:coding-plan-review",
+                output_key=f"review_{rk}",  # same string as the step name
+                verdict=False,  # per-reviewer steps carry no verdict; only "verify" below does
+                parallel=True,  # flagged parallel; how that is scheduled is decided by the runner (not shown)
+            ),
+        )
+
+    senior_agent = seniors[0] if seniors else reviewers[0]  # no senior supplied -> first reviewer aggregates and verifies
+    review_step_names = [f"review_{rk}" for rk in reviewer_keys]
+    review_output_keys = [f"review_{rk}" for rk in reviewer_keys]  # identical to review_step_names, since name == output_key above
+
+    return [
+        PhaseConfig(
+            name="initial_coding",
+            steps=[
+                StepConfig(
+                    name="coding",
+                    agent=coders[0],  # only the first coder is used by this preset
+                    role="coding",
+                    prompt_template="default:coding",
+                    output_key="coding_output",
+                ),
+            ],
+            max_iterations=1,  # presumably a single pass with no retry loop - confirm against the phase runner
+            consecutive_pass=1,
+        ),
+        PhaseConfig(
+            name="coding_plan_review",
+            steps=review_steps + [  # step order: reviews -> aggregate_review -> coding_plan_fix -> verify
+                StepConfig(
+                    name="aggregate_review",
+                    agent=senior_agent,
+                    role="review",
+                    prompt_template="default:aggregate-review",
+                    output_key="aggregate_review",
+                    context_override={  # presumably fills the aggregate template's fields - TODO confirm how overrides are merged
+                        "candidate_outputs": (
+                            "Current implementation and planning package under review "
+                            "(code + plan/checklist/reference docs)."
+                        ),
+                        "reviews_bundle": _build_named_bundle(
+                            reviewers, review_step_names, review_output_keys, "Review",
+                        ),
+                    },
+                ),
+                StepConfig(
+                    name="coding_plan_fix",
+                    agent=coders[0],  # the same coder that did the initial implementation
+                    role="coding",
+                    prompt_template="default:coding-plan-fix",
+                    output_key="coding_plan_fix_output",
+                    context_override={"feedback": "{aggregate_review}"},  # "{aggregate_review}" matches the aggregate step's output_key; presumably resolved by the runner
+                ),
+                StepConfig(
+                    name="verify",
+                    agent=senior_agent,
+                    role="review",
+                    prompt_template="default:coding-plan-review",  # verify reuses the reviewer template rather than a dedicated one
+                    output_key="verify_result",
+                    verdict=True,  # the only step in this phase with verdict enabled
+                ),
+            ],
+            max_iterations=5,  # presumably caps the review/fix/verify rounds at 5 - confirm against the phase runner
+            consecutive_pass=1,
+        ),
+    ]
+
+
 PIPELINE_PRESETS: dict[str, Callable] = {
-    "simple": _build_simple_preset,
-    "cross-review": _build_cross_review_preset,
    "plan-review": _build_plan_review_preset,
-    "review-only": _build_review_only_preset,
 }

 PHASED_PRESETS: dict[str, Callable] = {
-    "review-fix": _build_review_fix_preset,
-    "coding-review-fix": _build_coding_review_fix_preset,
+    "coding-plan-review": _build_coding_plan_review_preset,
 }

 ALL_PRESET_NAMES: list[str] = list(PIPELINE_PRESETS.keys()) + list(PHASED_PRESETS.keys())