Make plan-review a review-fix-verify loop

This commit is contained in:
chungyeong
2026-03-15 00:01:26 +09:00
parent 60c7b07939
commit a85a490a9b
7 changed files with 289 additions and 73 deletions

View File

@@ -42,6 +42,8 @@ from cross_eval.prompts import (
REVIEW_TEMPLATE_KO,
PLAN_REVIEW_TEMPLATE,
PLAN_REVIEW_TEMPLATE_KO,
PLAN_FIX_TEMPLATE,
PLAN_FIX_TEMPLATE_KO,
REVIEW_ONLY_TEMPLATE,
REVIEW_ONLY_TEMPLATE_KO,
AGGREGATE_REVIEW_TEMPLATE,
@@ -310,7 +312,23 @@ class BuiltinAgentConfigTest(unittest.TestCase):
self.assertIn("Repeated Aggregate Findings", report)
self.assertIn("same as iteration 3", report)
def test_review_fix_defaults_senior_from_reviewer_family(self) -> None:
def test_fix_and_plan_presets_default_senior_from_reviewer_family(self) -> None:
self.assertEqual(
_default_seniors_for_preset(
"preset:plan-review",
["codex-reviewer"],
BUILTIN_AGENTS,
),
["codex-senior"],
)
self.assertEqual(
_default_seniors_for_preset(
"preset:plan-review",
["claude-reviewer"],
BUILTIN_AGENTS,
),
["claude-senior"],
)
self.assertEqual(
_default_seniors_for_preset(
"preset:review-fix",
@@ -421,23 +439,49 @@ class BuiltinAgentConfigTest(unittest.TestCase):
)
self.assertEqual(
[step.output_key for step in steps],
[step.output_key for step in steps[:2]],
["plan_review_codex_reviewer", "plan_review_codex_reviewer_2"],
)
def test_plan_review_with_senior_adds_aggregate_step(self) -> None:
def test_plan_review_builds_review_fix_verify_loop(self) -> None:
steps = _build_plan_review_preset(
["codex-coder"],
["claude-reviewer", "codex-reviewer"],
["claude-senior"],
)
self.assertEqual(steps[-1].name, "senior_review")
self.assertEqual(steps[-1].agent, "claude-senior")
self.assertTrue(steps[-1].verdict)
self.assertEqual(
[step.name for step in steps],
[
"plan_review_claude_reviewer",
"plan_review_codex_reviewer",
"aggregate_review",
"plan_fix",
"verify",
],
)
self.assertEqual(steps[2].agent, "claude-senior")
self.assertEqual(steps[3].agent, "codex-coder")
self.assertEqual(steps[4].agent, "claude-senior")
self.assertTrue(steps[4].verdict)
self.assertFalse(steps[0].verdict)
self.assertFalse(steps[1].verdict)
def test_plan_review_single_reviewer_uses_default_loop_steps(self) -> None:
steps = _build_plan_review_preset(
["codex-coder"],
["codex-reviewer"],
[],
)
self.assertEqual(
[step.name for step in steps],
["plan_review", "aggregate_review", "plan_fix", "verify"],
)
self.assertEqual(steps[1].agent, "codex-reviewer")
self.assertEqual(steps[2].prompt_template, "default:plan-fix")
self.assertTrue(steps[3].verdict)
def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None:
steps = _build_cross_review_preset(
["codex-coder", "codex-coder"],
@@ -576,6 +620,8 @@ class PromptTemplateTest(unittest.TestCase):
"""Coding templates should tell coder to ignore DISMISSED items."""
self.assertIn("DISMISSED", CODING_TEMPLATE)
self.assertIn("DISMISSED", CODING_TEMPLATE_KO)
self.assertIn("DISMISSED", PLAN_FIX_TEMPLATE)
self.assertIn("DISMISSED", PLAN_FIX_TEMPLATE_KO)
def test_aggregate_templates_dismissed_structure(self) -> None:
"""Aggregate templates should use [False positive] / [Already fixed] tags."""
@@ -583,6 +629,10 @@ class PromptTemplateTest(unittest.TestCase):
self.assertIn("[Already fixed]", AGGREGATE_REVIEW_TEMPLATE)
self.assertIn("[오탐]", AGGREGATE_REVIEW_TEMPLATE_KO)
self.assertIn("[수정 완료]", AGGREGATE_REVIEW_TEMPLATE_KO)
self.assertIn("{candidate_outputs}", AGGREGATE_REVIEW_TEMPLATE)
self.assertIn("{reviews_bundle}", AGGREGATE_REVIEW_TEMPLATE)
self.assertIn("{candidate_outputs}", AGGREGATE_REVIEW_TEMPLATE_KO)
self.assertIn("{reviews_bundle}", AGGREGATE_REVIEW_TEMPLATE_KO)
class ReviewMetricsParsingTest(unittest.TestCase):
@@ -1033,6 +1083,34 @@ class FixPresetBehaviorTest(unittest.TestCase):
self.assertTrue(captured["agentic"])
self.assertEqual(captured["phase_max"], 3)
def test_run_preset_plan_review_auto_enables_agentic_without_flag(self) -> None:
captured: dict[str, object] = {}
def _fake_run_pipeline(config, **kwargs):
captured["preset"] = config.preset_name
captured["agentic"] = config.agents[config.coders[0]].agentic
captured["seniors"] = list(config.seniors)
captured["steps"] = [step.name for step in config.pipeline]
captured["max_iter"] = config.max_iterations
return PipelineResult(
iterations=[],
final_verdict="PASS",
run_dir=Path(".cross-eval/output"),
)
with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline):
exit_code = main(["run", "--preset", "plan-review", "--dry-run"])
self.assertEqual(exit_code, 0)
self.assertEqual(captured["preset"], "plan-review")
self.assertTrue(captured["agentic"])
self.assertEqual(captured["seniors"], ["claude-senior"])
self.assertEqual(
captured["steps"],
["plan_review", "aggregate_review", "plan_fix", "verify"],
)
self.assertEqual(captured["max_iter"], 3)
def test_run_senior_model_override_applies_only_to_seniors(self) -> None:
captured: dict[str, list[str]] = {}

View File

@@ -13,7 +13,11 @@ from cross_eval.models import (
StepConfig,
)
from cross_eval.pipeline import run_pipeline
from cross_eval.prompts import _build_review_fix_preset, _build_simple_preset
from cross_eval.prompts import (
_build_plan_review_preset,
_build_review_fix_preset,
_build_simple_preset,
)
def _make_mock_agent(outputs: list[str]):
@@ -262,6 +266,60 @@ class TestPhasedPipelineEscalateBreaksPhase(unittest.TestCase):
self.assertTrue(len(result.escalated_issues) > 0)
class TestPlanReviewPipelineLoopsUntilVerifyPass(unittest.TestCase):
"""Document plan-review should revise docs and re-verify across iterations."""
def test_plan_review_fail_then_pass(self) -> None:
with tempfile.TemporaryDirectory() as tmpdir:
coders = ["claude-coder"]
reviewers = ["claude-reviewer"]
seniors = ["claude-senior"]
steps = _build_plan_review_preset(coders, reviewers, seniors)
config = PipelineConfig(
output_dir=Path(tmpdir),
max_iterations=4,
min_iterations=1,
language="en",
inputs={
"plan": "Test plan",
"checklist": "Test checklist",
"docs": "Reference docs",
},
agents=dict(BUILTIN_AGENTS),
coders=coders,
reviewers=reviewers,
seniors=seniors,
pipeline=steps,
preset_name="plan-review",
)
mock = _make_step_mock({
"plan_review": [
"Requirements are ambiguous\n\nVERDICT: FAIL",
"Looks aligned\n\nVERDICT: PASS",
],
"aggregate_review": [
"### Confirmed Issues\n- Clarify acceptance criteria\n\n"
"### Action Items\n1. Tighten the checklist\n\nVERDICT: FAIL",
"### Confirmed Issues\nNone\n\n"
"### Dismissed Findings\nNone\n\n"
"### Action Items\n1. No document changes needed\n\nVERDICT: PASS",
],
"plan_fix": ["Updated plan and checklist", "No-op"],
"verify": [
"Still missing edge-case criteria\n\nVERDICT: FAIL",
"Planning package is now implementable\n\nVERDICT: PASS",
],
})
with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
result = run_pipeline(config)
self.assertEqual(result.final_verdict, "PASS")
self.assertEqual(len(result.iterations), 2)
class TestAutoEscalateFiresWithoutSenior(unittest.TestCase):
"""Test 6: simple pipeline without senior, same FAIL feedback 3 times -> auto-escalate."""