Make plan-review a review-fix-verify loop
This commit is contained in:
@@ -42,6 +42,8 @@ from cross_eval.prompts import (
|
||||
REVIEW_TEMPLATE_KO,
|
||||
PLAN_REVIEW_TEMPLATE,
|
||||
PLAN_REVIEW_TEMPLATE_KO,
|
||||
PLAN_FIX_TEMPLATE,
|
||||
PLAN_FIX_TEMPLATE_KO,
|
||||
REVIEW_ONLY_TEMPLATE,
|
||||
REVIEW_ONLY_TEMPLATE_KO,
|
||||
AGGREGATE_REVIEW_TEMPLATE,
|
||||
@@ -310,7 +312,23 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
self.assertIn("Repeated Aggregate Findings", report)
|
||||
self.assertIn("same as iteration 3", report)
|
||||
|
||||
def test_review_fix_defaults_senior_from_reviewer_family(self) -> None:
|
||||
def test_fix_and_plan_presets_default_senior_from_reviewer_family(self) -> None:
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:plan-review",
|
||||
["codex-reviewer"],
|
||||
BUILTIN_AGENTS,
|
||||
),
|
||||
["codex-senior"],
|
||||
)
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:plan-review",
|
||||
["claude-reviewer"],
|
||||
BUILTIN_AGENTS,
|
||||
),
|
||||
["claude-senior"],
|
||||
)
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:review-fix",
|
||||
@@ -421,23 +439,49 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
[step.output_key for step in steps],
|
||||
[step.output_key for step in steps[:2]],
|
||||
["plan_review_codex_reviewer", "plan_review_codex_reviewer_2"],
|
||||
)
|
||||
|
||||
def test_plan_review_with_senior_adds_aggregate_step(self) -> None:
|
||||
def test_plan_review_builds_review_fix_verify_loop(self) -> None:
|
||||
steps = _build_plan_review_preset(
|
||||
["codex-coder"],
|
||||
["claude-reviewer", "codex-reviewer"],
|
||||
["claude-senior"],
|
||||
)
|
||||
|
||||
self.assertEqual(steps[-1].name, "senior_review")
|
||||
self.assertEqual(steps[-1].agent, "claude-senior")
|
||||
self.assertTrue(steps[-1].verdict)
|
||||
self.assertEqual(
|
||||
[step.name for step in steps],
|
||||
[
|
||||
"plan_review_claude_reviewer",
|
||||
"plan_review_codex_reviewer",
|
||||
"aggregate_review",
|
||||
"plan_fix",
|
||||
"verify",
|
||||
],
|
||||
)
|
||||
self.assertEqual(steps[2].agent, "claude-senior")
|
||||
self.assertEqual(steps[3].agent, "codex-coder")
|
||||
self.assertEqual(steps[4].agent, "claude-senior")
|
||||
self.assertTrue(steps[4].verdict)
|
||||
self.assertFalse(steps[0].verdict)
|
||||
self.assertFalse(steps[1].verdict)
|
||||
|
||||
def test_plan_review_single_reviewer_uses_default_loop_steps(self) -> None:
|
||||
steps = _build_plan_review_preset(
|
||||
["codex-coder"],
|
||||
["codex-reviewer"],
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
[step.name for step in steps],
|
||||
["plan_review", "aggregate_review", "plan_fix", "verify"],
|
||||
)
|
||||
self.assertEqual(steps[1].agent, "codex-reviewer")
|
||||
self.assertEqual(steps[2].prompt_template, "default:plan-fix")
|
||||
self.assertTrue(steps[3].verdict)
|
||||
|
||||
def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None:
|
||||
steps = _build_cross_review_preset(
|
||||
["codex-coder", "codex-coder"],
|
||||
@@ -576,6 +620,8 @@ class PromptTemplateTest(unittest.TestCase):
|
||||
"""Coding templates should tell coder to ignore DISMISSED items."""
|
||||
self.assertIn("DISMISSED", CODING_TEMPLATE)
|
||||
self.assertIn("DISMISSED", CODING_TEMPLATE_KO)
|
||||
self.assertIn("DISMISSED", PLAN_FIX_TEMPLATE)
|
||||
self.assertIn("DISMISSED", PLAN_FIX_TEMPLATE_KO)
|
||||
|
||||
def test_aggregate_templates_dismissed_structure(self) -> None:
|
||||
"""Aggregate templates should use [False positive] / [Already fixed] tags."""
|
||||
@@ -583,6 +629,10 @@ class PromptTemplateTest(unittest.TestCase):
|
||||
self.assertIn("[Already fixed]", AGGREGATE_REVIEW_TEMPLATE)
|
||||
self.assertIn("[오탐]", AGGREGATE_REVIEW_TEMPLATE_KO)
|
||||
self.assertIn("[수정 완료]", AGGREGATE_REVIEW_TEMPLATE_KO)
|
||||
self.assertIn("{candidate_outputs}", AGGREGATE_REVIEW_TEMPLATE)
|
||||
self.assertIn("{reviews_bundle}", AGGREGATE_REVIEW_TEMPLATE)
|
||||
self.assertIn("{candidate_outputs}", AGGREGATE_REVIEW_TEMPLATE_KO)
|
||||
self.assertIn("{reviews_bundle}", AGGREGATE_REVIEW_TEMPLATE_KO)
|
||||
|
||||
|
||||
class ReviewMetricsParsingTest(unittest.TestCase):
|
||||
@@ -1033,6 +1083,34 @@ class FixPresetBehaviorTest(unittest.TestCase):
|
||||
self.assertTrue(captured["agentic"])
|
||||
self.assertEqual(captured["phase_max"], 3)
|
||||
|
||||
def test_run_preset_plan_review_auto_enables_agentic_without_flag(self) -> None:
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
def _fake_run_pipeline(config, **kwargs):
|
||||
captured["preset"] = config.preset_name
|
||||
captured["agentic"] = config.agents[config.coders[0]].agentic
|
||||
captured["seniors"] = list(config.seniors)
|
||||
captured["steps"] = [step.name for step in config.pipeline]
|
||||
captured["max_iter"] = config.max_iterations
|
||||
return PipelineResult(
|
||||
iterations=[],
|
||||
final_verdict="PASS",
|
||||
run_dir=Path(".cross-eval/output"),
|
||||
)
|
||||
|
||||
with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline):
|
||||
exit_code = main(["run", "--preset", "plan-review", "--dry-run"])
|
||||
|
||||
self.assertEqual(exit_code, 0)
|
||||
self.assertEqual(captured["preset"], "plan-review")
|
||||
self.assertTrue(captured["agentic"])
|
||||
self.assertEqual(captured["seniors"], ["claude-senior"])
|
||||
self.assertEqual(
|
||||
captured["steps"],
|
||||
["plan_review", "aggregate_review", "plan_fix", "verify"],
|
||||
)
|
||||
self.assertEqual(captured["max_iter"], 3)
|
||||
|
||||
def test_run_senior_model_override_applies_only_to_seniors(self) -> None:
|
||||
captured: dict[str, list[str]] = {}
|
||||
|
||||
|
||||
@@ -13,7 +13,11 @@ from cross_eval.models import (
|
||||
StepConfig,
|
||||
)
|
||||
from cross_eval.pipeline import run_pipeline
|
||||
from cross_eval.prompts import _build_review_fix_preset, _build_simple_preset
|
||||
from cross_eval.prompts import (
|
||||
_build_plan_review_preset,
|
||||
_build_review_fix_preset,
|
||||
_build_simple_preset,
|
||||
)
|
||||
|
||||
|
||||
def _make_mock_agent(outputs: list[str]):
|
||||
@@ -262,6 +266,60 @@ class TestPhasedPipelineEscalateBreaksPhase(unittest.TestCase):
|
||||
self.assertTrue(len(result.escalated_issues) > 0)
|
||||
|
||||
|
||||
class TestPlanReviewPipelineLoopsUntilVerifyPass(unittest.TestCase):
|
||||
"""Document plan-review should revise docs and re-verify across iterations."""
|
||||
|
||||
def test_plan_review_fail_then_pass(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
coders = ["claude-coder"]
|
||||
reviewers = ["claude-reviewer"]
|
||||
seniors = ["claude-senior"]
|
||||
steps = _build_plan_review_preset(coders, reviewers, seniors)
|
||||
|
||||
config = PipelineConfig(
|
||||
output_dir=Path(tmpdir),
|
||||
max_iterations=4,
|
||||
min_iterations=1,
|
||||
language="en",
|
||||
inputs={
|
||||
"plan": "Test plan",
|
||||
"checklist": "Test checklist",
|
||||
"docs": "Reference docs",
|
||||
},
|
||||
agents=dict(BUILTIN_AGENTS),
|
||||
coders=coders,
|
||||
reviewers=reviewers,
|
||||
seniors=seniors,
|
||||
pipeline=steps,
|
||||
preset_name="plan-review",
|
||||
)
|
||||
|
||||
mock = _make_step_mock({
|
||||
"plan_review": [
|
||||
"Requirements are ambiguous\n\nVERDICT: FAIL",
|
||||
"Looks aligned\n\nVERDICT: PASS",
|
||||
],
|
||||
"aggregate_review": [
|
||||
"### Confirmed Issues\n- Clarify acceptance criteria\n\n"
|
||||
"### Action Items\n1. Tighten the checklist\n\nVERDICT: FAIL",
|
||||
"### Confirmed Issues\nNone\n\n"
|
||||
"### Dismissed Findings\nNone\n\n"
|
||||
"### Action Items\n1. No document changes needed\n\nVERDICT: PASS",
|
||||
],
|
||||
"plan_fix": ["Updated plan and checklist", "No-op"],
|
||||
"verify": [
|
||||
"Still missing edge-case criteria\n\nVERDICT: FAIL",
|
||||
"Planning package is now implementable\n\nVERDICT: PASS",
|
||||
],
|
||||
})
|
||||
|
||||
with patch("cross_eval.pipeline.invoke_agent", side_effect=mock):
|
||||
result = run_pipeline(config)
|
||||
|
||||
self.assertEqual(result.final_verdict, "PASS")
|
||||
self.assertEqual(len(result.iterations), 2)
|
||||
|
||||
|
||||
class TestAutoEscalateFiresWithoutSenior(unittest.TestCase):
|
||||
"""Test 6: simple pipeline without senior, same FAIL feedback 3 times -> auto-escalate."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user