Make plan-review a review-fix-verify loop
This commit is contained in:
@@ -42,6 +42,8 @@ from cross_eval.prompts import (
|
||||
REVIEW_TEMPLATE_KO,
|
||||
PLAN_REVIEW_TEMPLATE,
|
||||
PLAN_REVIEW_TEMPLATE_KO,
|
||||
PLAN_FIX_TEMPLATE,
|
||||
PLAN_FIX_TEMPLATE_KO,
|
||||
REVIEW_ONLY_TEMPLATE,
|
||||
REVIEW_ONLY_TEMPLATE_KO,
|
||||
AGGREGATE_REVIEW_TEMPLATE,
|
||||
@@ -310,7 +312,23 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
self.assertIn("Repeated Aggregate Findings", report)
|
||||
self.assertIn("same as iteration 3", report)
|
||||
|
||||
def test_review_fix_defaults_senior_from_reviewer_family(self) -> None:
|
||||
def test_fix_and_plan_presets_default_senior_from_reviewer_family(self) -> None:
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:plan-review",
|
||||
["codex-reviewer"],
|
||||
BUILTIN_AGENTS,
|
||||
),
|
||||
["codex-senior"],
|
||||
)
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:plan-review",
|
||||
["claude-reviewer"],
|
||||
BUILTIN_AGENTS,
|
||||
),
|
||||
["claude-senior"],
|
||||
)
|
||||
self.assertEqual(
|
||||
_default_seniors_for_preset(
|
||||
"preset:review-fix",
|
||||
@@ -421,23 +439,49 @@ class BuiltinAgentConfigTest(unittest.TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
[step.output_key for step in steps],
|
||||
[step.output_key for step in steps[:2]],
|
||||
["plan_review_codex_reviewer", "plan_review_codex_reviewer_2"],
|
||||
)
|
||||
|
||||
def test_plan_review_with_senior_adds_aggregate_step(self) -> None:
|
||||
def test_plan_review_builds_review_fix_verify_loop(self) -> None:
|
||||
steps = _build_plan_review_preset(
|
||||
["codex-coder"],
|
||||
["claude-reviewer", "codex-reviewer"],
|
||||
["claude-senior"],
|
||||
)
|
||||
|
||||
self.assertEqual(steps[-1].name, "senior_review")
|
||||
self.assertEqual(steps[-1].agent, "claude-senior")
|
||||
self.assertTrue(steps[-1].verdict)
|
||||
self.assertEqual(
|
||||
[step.name for step in steps],
|
||||
[
|
||||
"plan_review_claude_reviewer",
|
||||
"plan_review_codex_reviewer",
|
||||
"aggregate_review",
|
||||
"plan_fix",
|
||||
"verify",
|
||||
],
|
||||
)
|
||||
self.assertEqual(steps[2].agent, "claude-senior")
|
||||
self.assertEqual(steps[3].agent, "codex-coder")
|
||||
self.assertEqual(steps[4].agent, "claude-senior")
|
||||
self.assertTrue(steps[4].verdict)
|
||||
self.assertFalse(steps[0].verdict)
|
||||
self.assertFalse(steps[1].verdict)
|
||||
|
||||
def test_plan_review_single_reviewer_uses_default_loop_steps(self) -> None:
|
||||
steps = _build_plan_review_preset(
|
||||
["codex-coder"],
|
||||
["codex-reviewer"],
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
[step.name for step in steps],
|
||||
["plan_review", "aggregate_review", "plan_fix", "verify"],
|
||||
)
|
||||
self.assertEqual(steps[1].agent, "codex-reviewer")
|
||||
self.assertEqual(steps[2].prompt_template, "default:plan-fix")
|
||||
self.assertTrue(steps[3].verdict)
|
||||
|
||||
def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None:
|
||||
steps = _build_cross_review_preset(
|
||||
["codex-coder", "codex-coder"],
|
||||
@@ -576,6 +620,8 @@ class PromptTemplateTest(unittest.TestCase):
|
||||
"""Coding templates should tell coder to ignore DISMISSED items."""
|
||||
self.assertIn("DISMISSED", CODING_TEMPLATE)
|
||||
self.assertIn("DISMISSED", CODING_TEMPLATE_KO)
|
||||
self.assertIn("DISMISSED", PLAN_FIX_TEMPLATE)
|
||||
self.assertIn("DISMISSED", PLAN_FIX_TEMPLATE_KO)
|
||||
|
||||
def test_aggregate_templates_dismissed_structure(self) -> None:
|
||||
"""Aggregate templates should use [False positive] / [Already fixed] tags."""
|
||||
@@ -583,6 +629,10 @@ class PromptTemplateTest(unittest.TestCase):
|
||||
self.assertIn("[Already fixed]", AGGREGATE_REVIEW_TEMPLATE)
|
||||
self.assertIn("[오탐]", AGGREGATE_REVIEW_TEMPLATE_KO)
|
||||
self.assertIn("[수정 완료]", AGGREGATE_REVIEW_TEMPLATE_KO)
|
||||
self.assertIn("{candidate_outputs}", AGGREGATE_REVIEW_TEMPLATE)
|
||||
self.assertIn("{reviews_bundle}", AGGREGATE_REVIEW_TEMPLATE)
|
||||
self.assertIn("{candidate_outputs}", AGGREGATE_REVIEW_TEMPLATE_KO)
|
||||
self.assertIn("{reviews_bundle}", AGGREGATE_REVIEW_TEMPLATE_KO)
|
||||
|
||||
|
||||
class ReviewMetricsParsingTest(unittest.TestCase):
|
||||
@@ -1033,6 +1083,34 @@ class FixPresetBehaviorTest(unittest.TestCase):
|
||||
self.assertTrue(captured["agentic"])
|
||||
self.assertEqual(captured["phase_max"], 3)
|
||||
|
||||
def test_run_preset_plan_review_auto_enables_agentic_without_flag(self) -> None:
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
def _fake_run_pipeline(config, **kwargs):
|
||||
captured["preset"] = config.preset_name
|
||||
captured["agentic"] = config.agents[config.coders[0]].agentic
|
||||
captured["seniors"] = list(config.seniors)
|
||||
captured["steps"] = [step.name for step in config.pipeline]
|
||||
captured["max_iter"] = config.max_iterations
|
||||
return PipelineResult(
|
||||
iterations=[],
|
||||
final_verdict="PASS",
|
||||
run_dir=Path(".cross-eval/output"),
|
||||
)
|
||||
|
||||
with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline):
|
||||
exit_code = main(["run", "--preset", "plan-review", "--dry-run"])
|
||||
|
||||
self.assertEqual(exit_code, 0)
|
||||
self.assertEqual(captured["preset"], "plan-review")
|
||||
self.assertTrue(captured["agentic"])
|
||||
self.assertEqual(captured["seniors"], ["claude-senior"])
|
||||
self.assertEqual(
|
||||
captured["steps"],
|
||||
["plan_review", "aggregate_review", "plan_fix", "verify"],
|
||||
)
|
||||
self.assertEqual(captured["max_iter"], 3)
|
||||
|
||||
def test_run_senior_model_override_applies_only_to_seniors(self) -> None:
|
||||
captured: dict[str, list[str]] = {}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user