initial commit
This commit is contained in:
515
tests/test_config.py
Normal file
515
tests/test_config.py
Normal file
@@ -0,0 +1,515 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
from cross_eval.agent import _supports_reasoning_effort
|
||||
from cross_eval.agent import invoke_agent
|
||||
from cross_eval.config import (
|
||||
BUILTIN_AGENTS,
|
||||
_default_seniors_for_preset,
|
||||
apply_reasoning_effort_settings,
|
||||
normalize_reasoning_effort,
|
||||
validate_config,
|
||||
)
|
||||
from cross_eval.models import (
|
||||
AgentConfig,
|
||||
IterationResult,
|
||||
PhaseConfig,
|
||||
PipelineConfig,
|
||||
PipelineResult,
|
||||
ReviewMetrics,
|
||||
StepConfig,
|
||||
)
|
||||
from cross_eval.pipeline import _detect_repeated_aggregate
|
||||
from cross_eval.prompts import (
|
||||
GENERATE_TEMPLATE,
|
||||
GENERATE_TEMPLATE_KO,
|
||||
REVIEW_TEMPLATE,
|
||||
REVIEW_TEMPLATE_KO,
|
||||
REVIEW_ONLY_TEMPLATE,
|
||||
REVIEW_ONLY_TEMPLATE_KO,
|
||||
AGGREGATE_REVIEW_TEMPLATE,
|
||||
AGGREGATE_REVIEW_TEMPLATE_KO,
|
||||
_build_cross_review_preset,
|
||||
_build_review_fix_preset,
|
||||
_build_review_only_preset,
|
||||
_build_simple_preset,
|
||||
)
|
||||
from cross_eval.report import build_report, parse_review_metrics
|
||||
|
||||
|
||||
class BuiltinAgentConfigTest(unittest.TestCase):
    """Tests for built-in agents, reasoning effort, preset builders, and config validation."""

    def test_codex_builtin_agents_skip_git_repo_check(self) -> None:
        """Each codex built-in agent lists ``--skip-git-repo-check`` among its args."""
        codex_agent_names = ("codex-coder", "codex-reviewer", "codex-senior")
        for agent_name in codex_agent_names:
            with self.subTest(agent=agent_name):
                agent_args = BUILTIN_AGENTS[agent_name].args
                self.assertIn("--skip-git-repo-check", agent_args)

    def test_senior_builtin_agents_exist(self) -> None:
        """Both senior agents are registered in ``BUILTIN_AGENTS``."""
        for senior_name in ("claude-senior", "codex-senior"):
            self.assertIn(senior_name, BUILTIN_AGENTS)

    def test_builtin_reasoning_effort_defaults_match_recommended_levels(self) -> None:
        """Codex built-ins ship with the expected per-role reasoning effort."""
        expected_levels = {
            "codex-coder": "medium",
            "codex-reviewer": "medium",
            "codex-senior": "high",
        }
        for agent_name, level in expected_levels.items():
            self.assertEqual(BUILTIN_AGENTS[agent_name].reasoning_effort, level)

    def test_normalize_reasoning_effort_aliases(self) -> None:
        """All spelled-out "extra high" aliases normalize to ``xhigh``."""
        for alias in ("extra-high", "extra_high", "x-high"):
            self.assertEqual(normalize_reasoning_effort(alias), "xhigh")

    def test_apply_reasoning_effort_settings_uses_defaults_and_role_overrides(self) -> None:
        """With only reviewer/senior efforts passed, the coder ends up at ``medium``."""
        agent_names = ("codex-coder", "codex-reviewer", "codex-senior")
        config = PipelineConfig(
            agents={name: AgentConfig(name=name, command="codex") for name in agent_names},
            coders=["codex-coder"],
            reviewers=["codex-reviewer"],
            seniors=["codex-senior"],
        )

        apply_reasoning_effort_settings(config, reviewer_effort="high", senior_effort="xhigh")

        expected_efforts = {
            "codex-coder": "medium",
            "codex-reviewer": "high",
            "codex-senior": "xhigh",
        }
        for agent_name, effort in expected_efforts.items():
            self.assertEqual(config.agents[agent_name].reasoning_effort, effort)

    def test_codex_supports_reasoning_effort_override(self) -> None:
        """The ``codex`` command reports support for the override; ``claude`` does not."""
        supported_command, unsupported_command = "codex", "claude"
        self.assertTrue(_supports_reasoning_effort(supported_command))
        self.assertFalse(_supports_reasoning_effort(unsupported_command))

    def test_invoke_agent_passes_reasoning_effort_to_codex(self) -> None:
        """The command handed to ``subprocess.run`` starts with the ``-c`` effort override."""
        recorded: dict[str, list[str]] = {}

        class _Completed:
            # Minimal stand-in exposing the attributes set on the original fake result.
            returncode = 0
            stdout = "VERDICT: PASS"
            stderr = ""

        def _record_and_succeed(cmd, **kwargs):
            recorded["cmd"] = cmd
            return _Completed()

        reviewer = AgentConfig(
            name="codex-reviewer",
            command="codex",
            args=["exec", "--model", "gpt-5.4", "-"],
            reasoning_effort="high",
        )

        with patch("subprocess.run", side_effect=_record_and_succeed):
            invoke_agent(reviewer, "prompt", "review", quiet=True)

        leading_args = recorded["cmd"][:3]
        self.assertEqual(leading_args, ["codex", "-c", 'model_reasoning_effort="high"'])

    def test_detect_repeated_aggregate_warns_on_same_output(self) -> None:
        """A second aggregate output differing only in case/padding yields a warning."""
        aggregate_step = StepConfig(
            name="aggregate_review",
            agent="codex-senior",
            role="review",
            prompt_template="default:aggregate-review",
            output_key="aggregate_review",
        )
        steps = [aggregate_step]
        seen: dict[str, int] = {}

        first_warning = _detect_repeated_aggregate(
            steps, {"aggregate_review": "Same issue list"}, seen, iteration=1,
        )
        second_warning = _detect_repeated_aggregate(
            steps, {"aggregate_review": " same issue list "}, seen, iteration=2,
        )

        self.assertIsNone(first_warning)
        expected_message = (
            "Repeated aggregate_review detected at iteration 2 (same as iteration 1)."
        )
        self.assertEqual(second_warning, expected_message)

    def test_report_includes_repeated_aggregate_section(self) -> None:
        """Repeated-aggregate warnings stored on the result show up in the report."""
        warning = "Repeated aggregate_review detected at iteration 4 (same as iteration 3)."
        pipeline_config = PipelineConfig(language="en")
        pipeline_result = PipelineResult(repeated_aggregate_warnings=[warning])

        report_text = build_report(pipeline_config, pipeline_result)

        for fragment in ("Repeated Aggregate Findings", "same as iteration 3"):
            self.assertIn(fragment, report_text)

    def test_review_fix_defaults_senior_from_reviewer_family(self) -> None:
        """Check default seniors for two review-fix reviewer lists and the simple preset."""
        cases = [
            ("preset:review-fix", ["codex-reviewer", "claude-reviewer"], ["codex-senior"]),
            ("preset:review-fix", ["claude-reviewer"], ["claude-senior"]),
            ("preset:simple", ["codex-reviewer"], []),
        ]
        for preset, reviewers, expected_seniors in cases:
            self.assertEqual(
                _default_seniors_for_preset(preset, reviewers, BUILTIN_AGENTS),
                expected_seniors,
            )

    def test_review_fix_duplicate_reviewers_get_unique_step_keys(self) -> None:
        """Three identical reviewers produce suffixed, distinct step names and output keys."""
        phases = _build_review_fix_preset(
            ["codex-coder"],
            ["codex-reviewer"] * 3,
            [],
        )

        converge = phases[0]
        unique_keys = [
            "review_codex_reviewer",
            "review_codex_reviewer_2",
            "review_codex_reviewer_3",
        ]
        self.assertEqual([step.name for step in converge.steps[:3]], unique_keys)
        self.assertEqual([step.output_key for step in converge.steps[:3]], unique_keys)
        self.assertEqual(
            [step.name for step in converge.steps[3:]],
            ["aggregate_review", "generate", "verify"],
        )

    def test_review_only_duplicate_reviewers_get_unique_step_keys(self) -> None:
        """Two identical reviewers in the review-only preset get distinct output keys."""
        steps = _build_review_only_preset(["codex-coder"], ["codex-reviewer"] * 2, [])

        output_keys = [step.output_key for step in steps]
        self.assertEqual(output_keys, ["review_codex_reviewer", "review_codex_reviewer_2"])

    def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None:
        """Two identical coders in the cross-review preset get distinct output keys."""
        steps = _build_cross_review_preset(["codex-coder"] * 2, ["codex-reviewer"], [])

        output_keys = [step.output_key for step in steps]
        expected_keys = [
            "code_codex_coder",
            "code_codex_coder_2",
            "review_by_codex_coder",
            "review_by_codex_coder_2",
        ]
        self.assertEqual(output_keys, expected_keys)

    def test_review_fix_uses_senior_for_aggregate_and_verify(self) -> None:
        """The senior agent runs the aggregate and verify steps; verify carries the verdict."""
        phases = _build_review_fix_preset(
            ["codex-coder"],
            ["claude-reviewer", "codex-reviewer"],
            ["codex-senior"],
        )

        steps = phases[0].steps
        # (step index, attribute, expected value) — steps are indexed lazily, in this order.
        expectations = [
            (2, "name", "aggregate_review"),
            (2, "agent", "codex-senior"),
            (3, "name", "generate"),
            (4, "name", "verify"),
            (4, "agent", "codex-senior"),
        ]
        for index, attribute, expected in expectations:
            self.assertEqual(getattr(steps[index], attribute), expected)
        self.assertTrue(steps[4].verdict)

    def test_review_only_with_senior_adds_aggregate_step(self) -> None:
        """A senior appends a final ``senior_review`` step that alone carries the verdict."""
        steps = _build_review_only_preset(
            ["codex-coder"],
            ["claude-reviewer", "codex-reviewer"],
            ["claude-senior"],
        )

        self.assertEqual(steps[-1].name, "senior_review")
        self.assertEqual(steps[-1].agent, "claude-senior")
        self.assertTrue(steps[-1].verdict)
        for reviewer_index in (0, 1):
            self.assertFalse(steps[reviewer_index].verdict)

    def test_simple_with_senior_adds_final_aggregate_step(self) -> None:
        """The simple preset with a senior is generate → review → senior_review."""
        steps = _build_simple_preset(["codex-coder"], ["codex-reviewer"], ["codex-senior"])

        step_names = [step.name for step in steps]
        self.assertEqual(step_names, ["generate", "review", "senior_review"])
        self.assertFalse(steps[1].verdict)
        self.assertTrue(steps[2].verdict)

    def test_validate_config_rejects_duplicate_phase_step_names_and_output_keys(self) -> None:
        """Two steps sharing a name and output key in one phase produce both errors."""

        def _duplicate_step() -> StepConfig:
            # Each call returns a fresh, field-for-field identical step.
            return StepConfig(
                name="review_dup",
                agent="codex-reviewer",
                role="review",
                prompt_template="default:review-only",
                output_key="same_key",
                verdict=True,
            )

        reviewer = AgentConfig(name="codex-reviewer", command="codex")
        converge_phase = PhaseConfig(
            name="converge",
            steps=[_duplicate_step(), _duplicate_step()],
        )
        config = PipelineConfig(
            agents={"codex-reviewer": reviewer},
            phases=[converge_phase],
        )

        errors = validate_config(config)

        expected_errors = (
            "Phase 'converge' has duplicate step name 'review_dup'",
            "Phase 'converge' has duplicate output_key 'same_key'",
        )
        for message in expected_errors:
            self.assertIn(message, errors)
|
||||
|
||||
|
||||
class PromptTemplateTest(unittest.TestCase):
    """Content checks on the prompt templates (categories, assessments, dismissal tags)."""

    def test_review_templates_no_false_positive_category(self) -> None:
        """No review template may list a bolded false-positive category."""
        review_templates = {
            "REVIEW_TEMPLATE": REVIEW_TEMPLATE,
            "REVIEW_TEMPLATE_KO": REVIEW_TEMPLATE_KO,
            "REVIEW_ONLY_TEMPLATE": REVIEW_ONLY_TEMPLATE,
            "REVIEW_ONLY_TEMPLATE_KO": REVIEW_ONLY_TEMPLATE_KO,
        }
        for label, template_text in review_templates.items():
            with self.subTest(template=label):
                # The English bullet must be absent from every variant.
                self.assertNotIn(
                    "**False positive**",
                    template_text,
                    f"{label} still lists False positive as a category",
                )
                # Korean variants are additionally checked for the Korean bullet.
                if not label.endswith("_KO"):
                    continue
                self.assertNotIn(
                    "**오탐**",
                    template_text,
                    f"{label} still lists 오탐 as a category",
                )

    def test_review_templates_have_confirmed_dismissed(self) -> None:
        """Both review templates mention the CONFIRMED and DISMISSED assessments."""
        review_templates = {
            "REVIEW_TEMPLATE": REVIEW_TEMPLATE,
            "REVIEW_TEMPLATE_KO": REVIEW_TEMPLATE_KO,
        }
        for label, template_text in review_templates.items():
            with self.subTest(template=label):
                for keyword in ("CONFIRMED", "DISMISSED"):
                    self.assertIn(keyword, template_text)

    def test_generate_templates_ignore_dismissed(self) -> None:
        """Both generate templates mention DISMISSED."""
        for template_text in (GENERATE_TEMPLATE, GENERATE_TEMPLATE_KO):
            self.assertIn("DISMISSED", template_text)

    def test_aggregate_templates_dismissed_structure(self) -> None:
        """Aggregate templates carry the false-positive and already-fixed tags."""
        tagged_templates = [
            ("[False positive]", AGGREGATE_REVIEW_TEMPLATE),
            ("[Already fixed]", AGGREGATE_REVIEW_TEMPLATE),
            ("[오탐]", AGGREGATE_REVIEW_TEMPLATE_KO),
            ("[수정 완료]", AGGREGATE_REVIEW_TEMPLATE_KO),
        ]
        for tag, template_text in tagged_templates:
            self.assertIn(tag, template_text)
|
||||
|
||||
|
||||
class ReviewMetricsParsingTest(unittest.TestCase):
    """Tests for ``parse_review_metrics`` and the metrics sections of ``build_report``."""

    def test_parse_review_metrics_basic(self) -> None:
        """Severity and category tags in an English issue list are counted."""
        review_text = """\
### Issues Found
- [Critical][Over-engineering] Added unnecessary caching layer
- [Major][Omission] Missing input validation for user_id
- [Major][Omission] Missing error handling for DB calls
- [Minor][Omission] No docstring on public API

### Summary
- Critical: 1, Major: 2, Minor: 1
- Over-engineering count: 1
- Omission count: 3
- CONFIRMED: 0, DISMISSED: 0
"""
        metrics = parse_review_metrics(review_text)

        expected_counts = {
            "critical": 1,
            "major": 2,
            "minor": 1,
            "over_engineering": 1,
            "omission": 3,
            "confirmed": 0,
            "dismissed": 0,
        }
        for field_name, count in expected_counts.items():
            self.assertEqual(getattr(metrics, field_name), count)

    def test_parse_review_metrics_korean(self) -> None:
        """Korean category tags and assessment lines are counted."""
        review_text = """\
### 발견된 이슈
- [Critical][과최적화] 불필요한 캐시 레이어 추가
- [Major][누락] user_id 입력 검증 누락

### 이전 피드백 평가
- CONFIRMED: DB 에러 핸들링 — 여전히 미구현
- DISMISSED (오탐): 타입 힌트 누락 — 기획서에 없는 요구사항
"""
        metrics = parse_review_metrics(review_text)

        expected_counts = {
            "critical": 1,
            "major": 1,
            "over_engineering": 1,
            "omission": 1,
            "confirmed": 1,
            "dismissed": 1,
        }
        for field_name, count in expected_counts.items():
            self.assertEqual(getattr(metrics, field_name), count)

    def test_parse_review_metrics_with_assessment(self) -> None:
        """CONFIRMED / DISMISSED assessment lines are counted alongside the issue list."""
        review_text = """\
### Previous Feedback Assessment
- CONFIRMED: Missing auth check — still not implemented
- CONFIRMED: SQL injection risk — still present
- DISMISSED (false positive): Unused import — actually used in tests

### Issues Found
- [Critical][Omission] Missing auth check
- [Critical][Omission] SQL injection risk
"""
        metrics = parse_review_metrics(review_text)

        expected_counts = {
            "confirmed": 2,
            "dismissed": 1,
            "critical": 2,
            "omission": 2,
        }
        for field_name, count in expected_counts.items():
            self.assertEqual(getattr(metrics, field_name), count)

    def test_report_includes_metrics_table(self) -> None:
        """A two-iteration review run yields the metrics table and trend sections."""
        review_step = StepConfig(
            name="review",
            agent="claude-reviewer",
            role="review",
            prompt_template="default:review",
            output_key="review_result",
            verdict=True,
        )
        first_review = (
            "### Issues Found\n"
            "- [Critical][Omission] Missing auth\n"
            "- [Major][Over-engineering] Extra abstraction\n"
            "### Verdict\nVERDICT: FAIL"
        )
        second_review = (
            "### Previous Feedback Assessment\n"
            "- CONFIRMED: Missing auth — still missing\n"
            "- DISMISSED (false positive): Extra abstraction — needed per plan\n"
            "### Issues Found\n"
            "- [Major][Omission] Missing auth\n"
            "### Verdict\nVERDICT: FAIL"
        )
        config = PipelineConfig(language="en", pipeline=[review_step])
        result = PipelineResult(
            iterations=[
                IterationResult(
                    iteration=number,
                    step_outputs={"review_result": review_output},
                    verdict="FAIL",
                )
                for number, review_output in enumerate((first_review, second_review), start=1)
            ],
            final_verdict="FAIL",
        )

        report_text = build_report(config, result)

        expected_fragments = (
            "Review Metrics",
            # Table headers
            "Critical",
            "CONFIRMED",
            "DISMISSED",
            # Trend section
            "Metrics Trend",
            "decreasing",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, report_text)

    def test_report_no_metrics_table_without_review_steps(self) -> None:
        """A pipeline with only a generate step produces no "Review Metrics" section."""
        generate_step = StepConfig(
            name="generate",
            agent="claude-coder",
            role="generate",
            prompt_template="default:generate",
            output_key="generated_code",
            verdict=True,
        )
        config = PipelineConfig(language="en", pipeline=[generate_step])
        only_iteration = IterationResult(
            iteration=1,
            step_outputs={"generated_code": "some code"},
            verdict="PASS",
        )
        result = PipelineResult(iterations=[only_iteration], final_verdict="PASS")

        report_text = build_report(config, result)

        self.assertNotIn("Review Metrics", report_text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this test module directly as a script.
    unittest.main()
|
||||
Reference in New Issue
Block a user