# Files
# cross-eval/tests/test_config.py
# 이충영 에이닷서비스개발 ee4f1a07ef initial commit
# 2026-03-11 21:53:14 +09:00
#
# 516 lines
# 18 KiB
# Python

from __future__ import annotations
import unittest
from unittest.mock import patch
from cross_eval.agent import _supports_reasoning_effort
from cross_eval.agent import invoke_agent
from cross_eval.config import (
BUILTIN_AGENTS,
_default_seniors_for_preset,
apply_reasoning_effort_settings,
normalize_reasoning_effort,
validate_config,
)
from cross_eval.models import (
AgentConfig,
IterationResult,
PhaseConfig,
PipelineConfig,
PipelineResult,
ReviewMetrics,
StepConfig,
)
from cross_eval.pipeline import _detect_repeated_aggregate
from cross_eval.prompts import (
GENERATE_TEMPLATE,
GENERATE_TEMPLATE_KO,
REVIEW_TEMPLATE,
REVIEW_TEMPLATE_KO,
REVIEW_ONLY_TEMPLATE,
REVIEW_ONLY_TEMPLATE_KO,
AGGREGATE_REVIEW_TEMPLATE,
AGGREGATE_REVIEW_TEMPLATE_KO,
_build_cross_review_preset,
_build_review_fix_preset,
_build_review_only_preset,
_build_simple_preset,
)
from cross_eval.report import build_report, parse_review_metrics
class BuiltinAgentConfigTest(unittest.TestCase):
    """Tests for built-in agents, reasoning-effort settings, presets, and validation."""

    def test_codex_builtin_agents_skip_git_repo_check(self) -> None:
        """Every built-in codex agent lists ``--skip-git-repo-check`` in its args."""
        codex_agents = ("codex-coder", "codex-reviewer", "codex-senior")
        for name in codex_agents:
            with self.subTest(agent=name):
                agent_args = BUILTIN_AGENTS[name].args
                self.assertIn("--skip-git-repo-check", agent_args)

    def test_senior_builtin_agents_exist(self) -> None:
        """Both senior agents are registered in ``BUILTIN_AGENTS``."""
        for name in ("claude-senior", "codex-senior"):
            self.assertIn(name, BUILTIN_AGENTS)

    def test_builtin_reasoning_effort_defaults_match_recommended_levels(self) -> None:
        """Built-in codex agents carry the expected default reasoning effort."""
        expected_effort = {
            "codex-coder": "medium",
            "codex-reviewer": "medium",
            "codex-senior": "high",
        }
        for name, effort in expected_effort.items():
            self.assertEqual(BUILTIN_AGENTS[name].reasoning_effort, effort)

    def test_normalize_reasoning_effort_aliases(self) -> None:
        """All spelled-out "extra high" aliases normalize to ``xhigh``."""
        for alias in ("extra-high", "extra_high", "x-high"):
            self.assertEqual(normalize_reasoning_effort(alias), "xhigh")

    def test_apply_reasoning_effort_settings_uses_defaults_and_role_overrides(self) -> None:
        """Role overrides win for reviewer/senior; the coder ends up at ``medium``."""
        agent_names = ("codex-coder", "codex-reviewer", "codex-senior")
        config = PipelineConfig(
            agents={
                name: AgentConfig(name=name, command="codex")
                for name in agent_names
            },
            coders=["codex-coder"],
            reviewers=["codex-reviewer"],
            seniors=["codex-senior"],
        )

        apply_reasoning_effort_settings(
            config,
            reviewer_effort="high",
            senior_effort="xhigh",
        )

        # Read back through config.agents so the check sees whatever the call left there.
        expected_effort = {
            "codex-coder": "medium",
            "codex-reviewer": "high",
            "codex-senior": "xhigh",
        }
        for name, effort in expected_effort.items():
            self.assertEqual(config.agents[name].reasoning_effort, effort)

    def test_codex_supports_reasoning_effort_override(self) -> None:
        """Only the ``codex`` command is reported as supporting the override."""
        codex_supported = _supports_reasoning_effort("codex")
        self.assertTrue(codex_supported)
        claude_supported = _supports_reasoning_effort("claude")
        self.assertFalse(claude_supported)

    def test_invoke_agent_passes_reasoning_effort_to_codex(self) -> None:
        """``invoke_agent`` puts the reasoning-effort ``-c`` override right after ``codex``."""
        recorded: dict[str, list[str]] = {}

        class _FakeCompleted:
            # Minimal stand-in exposing the three attributes the original fake provided.
            returncode = 0
            stdout = "VERDICT: PASS"
            stderr = ""

        def _record_and_succeed(cmd, **kwargs):
            # Remember the command line instead of spawning a process.
            recorded["cmd"] = cmd
            return _FakeCompleted()

        reviewer = AgentConfig(
            name="codex-reviewer",
            command="codex",
            args=["exec", "--model", "gpt-5.4", "-"],
            reasoning_effort="high",
        )

        with patch("subprocess.run", side_effect=_record_and_succeed):
            invoke_agent(reviewer, "prompt", "review", quiet=True)

        command_prefix = recorded["cmd"][:3]
        self.assertEqual(
            command_prefix,
            ["codex", "-c", 'model_reasoning_effort="high"'],
        )

    def test_detect_repeated_aggregate_warns_on_same_output(self) -> None:
        """A second aggregate differing only in case/padding triggers the warning."""
        aggregate_step = StepConfig(
            name="aggregate_review",
            agent="codex-senior",
            role="review",
            prompt_template="default:aggregate-review",
            output_key="aggregate_review",
        )
        steps = [aggregate_step]
        seen: dict[str, int] = {}

        first_warning = _detect_repeated_aggregate(
            steps, {"aggregate_review": "Same issue list"}, seen, iteration=1,
        )
        second_warning = _detect_repeated_aggregate(
            steps, {"aggregate_review": " same issue list "}, seen, iteration=2,
        )

        self.assertIsNone(first_warning)
        self.assertEqual(
            second_warning,
            "Repeated aggregate_review detected at iteration 2 (same as iteration 1).",
        )

    def test_report_includes_repeated_aggregate_section(self) -> None:
        """Repeated-aggregate warnings show up in the built report."""
        warning = "Repeated aggregate_review detected at iteration 4 (same as iteration 3)."
        config = PipelineConfig(language="en")
        result = PipelineResult(repeated_aggregate_warnings=[warning])

        report = build_report(config, result)

        for fragment in ("Repeated Aggregate Findings", "same as iteration 3"):
            self.assertIn(fragment, report)

    def test_review_fix_defaults_senior_from_reviewer_family(self) -> None:
        """Default seniors for each (preset, reviewers) combination match expectations."""
        cases = [
            ("preset:review-fix", ["codex-reviewer", "claude-reviewer"], ["codex-senior"]),
            ("preset:review-fix", ["claude-reviewer"], ["claude-senior"]),
            ("preset:simple", ["codex-reviewer"], []),
        ]
        for preset, reviewers, expected_seniors in cases:
            chosen = _default_seniors_for_preset(preset, reviewers, BUILTIN_AGENTS)
            self.assertEqual(chosen, expected_seniors)

    def test_review_fix_duplicate_reviewers_get_unique_step_keys(self) -> None:
        """Repeated reviewers in review-fix get suffixed step names and output keys."""
        phases = _build_review_fix_preset(
            ["codex-coder"],
            ["codex-reviewer"] * 3,
            [],
        )
        converge_steps = phases[0].steps
        review_steps = converge_steps[:3]
        unique_keys = [
            "review_codex_reviewer",
            "review_codex_reviewer_2",
            "review_codex_reviewer_3",
        ]

        self.assertEqual([step.name for step in review_steps], unique_keys)
        self.assertEqual([step.output_key for step in review_steps], unique_keys)
        self.assertEqual(
            [step.name for step in converge_steps[3:]],
            ["aggregate_review", "generate", "verify"],
        )

    def test_review_only_duplicate_reviewers_get_unique_step_keys(self) -> None:
        """Repeated reviewers in review-only get distinct output keys."""
        steps = _build_review_only_preset(
            ["codex-coder"],
            ["codex-reviewer"] * 2,
            [],
        )
        output_keys = [step.output_key for step in steps]
        self.assertEqual(
            output_keys,
            ["review_codex_reviewer", "review_codex_reviewer_2"],
        )

    def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None:
        """Repeated coders in cross-review get distinct code and review output keys."""
        steps = _build_cross_review_preset(
            ["codex-coder"] * 2,
            ["codex-reviewer"],
            [],
        )
        output_keys = [step.output_key for step in steps]
        self.assertEqual(
            output_keys,
            [
                "code_codex_coder",
                "code_codex_coder_2",
                "review_by_codex_coder",
                "review_by_codex_coder_2",
            ],
        )

    def test_review_fix_uses_senior_for_aggregate_and_verify(self) -> None:
        """With a senior configured, review-fix assigns it to aggregate and verify."""
        phases = _build_review_fix_preset(
            ["codex-coder"],
            ["claude-reviewer", "codex-reviewer"],
            ["codex-senior"],
        )
        steps = phases[0].steps

        aggregate_step = steps[2]
        self.assertEqual(aggregate_step.name, "aggregate_review")
        self.assertEqual(aggregate_step.agent, "codex-senior")

        self.assertEqual(steps[3].name, "generate")

        verify_step = steps[4]
        self.assertEqual(verify_step.name, "verify")
        self.assertEqual(verify_step.agent, "codex-senior")
        self.assertTrue(verify_step.verdict)

    def test_review_only_with_senior_adds_aggregate_step(self) -> None:
        """Review-only with a senior ends in ``senior_review``, the only verdict step checked."""
        steps = _build_review_only_preset(
            ["codex-coder"],
            ["claude-reviewer", "codex-reviewer"],
            ["claude-senior"],
        )

        final_step = steps[-1]
        self.assertEqual(final_step.name, "senior_review")
        self.assertEqual(final_step.agent, "claude-senior")
        self.assertTrue(final_step.verdict)

        # The two reviewer steps must not carry the verdict themselves.
        for reviewer_step in (steps[0], steps[1]):
            self.assertFalse(reviewer_step.verdict)

    def test_simple_with_senior_adds_final_aggregate_step(self) -> None:
        """Simple preset with a senior yields generate -> review -> senior_review."""
        steps = _build_simple_preset(
            ["codex-coder"],
            ["codex-reviewer"],
            ["codex-senior"],
        )
        step_names = [step.name for step in steps]

        self.assertEqual(step_names, ["generate", "review", "senior_review"])
        self.assertFalse(steps[1].verdict)
        self.assertTrue(steps[2].verdict)

    def test_validate_config_rejects_duplicate_phase_step_names_and_output_keys(self) -> None:
        """``validate_config`` reports duplicate step names and output keys in a phase."""
        reviewer = AgentConfig(
            name="codex-reviewer",
            command="codex",
        )
        # Two separate but identical steps: same name and same output_key.
        duplicated_steps = [
            StepConfig(
                name="review_dup",
                agent="codex-reviewer",
                role="review",
                prompt_template="default:review-only",
                output_key="same_key",
                verdict=True,
            )
            for _ in range(2)
        ]
        config = PipelineConfig(
            agents={"codex-reviewer": reviewer},
            phases=[PhaseConfig(name="converge", steps=duplicated_steps)],
        )

        problems = validate_config(config)

        expected_messages = (
            "Phase 'converge' has duplicate step name 'review_dup'",
            "Phase 'converge' has duplicate output_key 'same_key'",
        )
        for message in expected_messages:
            self.assertIn(message, problems)
class PromptTemplateTest(unittest.TestCase):
    """Verify prompt template content after category/assessment refactor."""

    def test_review_templates_no_false_positive_category(self) -> None:
        """Review templates must not list "False positive" as a category bullet."""
        review_templates = (
            ("REVIEW_TEMPLATE", REVIEW_TEMPLATE),
            ("REVIEW_TEMPLATE_KO", REVIEW_TEMPLATE_KO),
            ("REVIEW_ONLY_TEMPLATE", REVIEW_ONLY_TEMPLATE),
            ("REVIEW_ONLY_TEMPLATE_KO", REVIEW_ONLY_TEMPLATE_KO),
        )
        for label, template in review_templates:
            with self.subTest(template=label):
                # The English bullet is checked in every template.
                self.assertNotIn(
                    "**False positive**",
                    template,
                    f"{label} still lists False positive as a category",
                )
                if not label.endswith("_KO"):
                    continue
                # Korean templates are additionally checked for the Korean bullet.
                self.assertNotIn(
                    "**오탐**",
                    template,
                    f"{label} still lists 오탐 as a category",
                )

    def test_review_templates_have_confirmed_dismissed(self) -> None:
        """Both review templates mention the CONFIRMED and DISMISSED markers."""
        review_templates = (
            ("REVIEW_TEMPLATE", REVIEW_TEMPLATE),
            ("REVIEW_TEMPLATE_KO", REVIEW_TEMPLATE_KO),
        )
        for label, template in review_templates:
            with self.subTest(template=label):
                for marker in ("CONFIRMED", "DISMISSED"):
                    self.assertIn(marker, template)

    def test_generate_templates_ignore_dismissed(self) -> None:
        """Both generate templates mention DISMISSED."""
        for template in (GENERATE_TEMPLATE, GENERATE_TEMPLATE_KO):
            self.assertIn("DISMISSED", template)

    def test_aggregate_templates_dismissed_structure(self) -> None:
        """Aggregate templates contain the false-positive / already-fixed tags."""
        expected_tags = (
            ("[False positive]", AGGREGATE_REVIEW_TEMPLATE),
            ("[Already fixed]", AGGREGATE_REVIEW_TEMPLATE),
            ("[오탐]", AGGREGATE_REVIEW_TEMPLATE_KO),
            ("[수정 완료]", AGGREGATE_REVIEW_TEMPLATE_KO),
        )
        for tag, template in expected_tags:
            self.assertIn(tag, template)
class ReviewMetricsParsingTest(unittest.TestCase):
    """Test review output metrics parsing."""

    def _assert_metric_counts(self, metrics, **expected: int) -> None:
        """Assert each named attribute of ``metrics`` equals its expected count, in order."""
        for field, count in expected.items():
            self.assertEqual(getattr(metrics, field), count)

    def test_parse_review_metrics_basic(self) -> None:
        """Severity, category and assessment counts are parsed from an English review."""
        review_text = (
            "### Issues Found\n"
            "- [Critical][Over-engineering] Added unnecessary caching layer\n"
            "- [Major][Omission] Missing input validation for user_id\n"
            "- [Major][Omission] Missing error handling for DB calls\n"
            "- [Minor][Omission] No docstring on public API\n"
            "### Summary\n"
            "- Critical: 1, Major: 2, Minor: 1\n"
            "- Over-engineering count: 1\n"
            "- Omission count: 3\n"
            "- CONFIRMED: 0, DISMISSED: 0\n"
        )

        metrics = parse_review_metrics(review_text)

        self._assert_metric_counts(
            metrics,
            critical=1,
            major=2,
            minor=1,
            over_engineering=1,
            omission=3,
            confirmed=0,
            dismissed=0,
        )

    def test_parse_review_metrics_korean(self) -> None:
        """Korean category tags and assessment lines are counted as well."""
        review_text = (
            "### 발견된 이슈\n"
            "- [Critical][과최적화] 불필요한 캐시 레이어 추가\n"
            "- [Major][누락] user_id 입력 검증 누락\n"
            "### 이전 피드백 평가\n"
            "- CONFIRMED: DB 에러 핸들링 — 여전히 미구현\n"
            "- DISMISSED (오탐): 타입 힌트 누락 — 기획서에 없는 요구사항\n"
        )

        metrics = parse_review_metrics(review_text)

        self._assert_metric_counts(
            metrics,
            critical=1,
            major=1,
            over_engineering=1,
            omission=1,
            confirmed=1,
            dismissed=1,
        )

    def test_parse_review_metrics_with_assessment(self) -> None:
        """CONFIRMED / DISMISSED assessment lines are tallied alongside issues."""
        review_text = (
            "### Previous Feedback Assessment\n"
            "- CONFIRMED: Missing auth check — still not implemented\n"
            "- CONFIRMED: SQL injection risk — still present\n"
            "- DISMISSED (false positive): Unused import — actually used in tests\n"
            "### Issues Found\n"
            "- [Critical][Omission] Missing auth check\n"
            "- [Critical][Omission] SQL injection risk\n"
        )

        metrics = parse_review_metrics(review_text)

        self._assert_metric_counts(
            metrics,
            confirmed=2,
            dismissed=1,
            critical=2,
            omission=2,
        )

    def test_report_includes_metrics_table(self) -> None:
        """A report built from review-step outputs includes the metrics table and trend."""
        review_step = StepConfig(
            name="review",
            agent="claude-reviewer",
            role="review",
            prompt_template="default:review",
            output_key="review_result",
            verdict=True,
        )
        config = PipelineConfig(language="en", pipeline=[review_step])

        first_review = (
            "### Issues Found\n"
            "- [Critical][Omission] Missing auth\n"
            "- [Major][Over-engineering] Extra abstraction\n"
            "### Verdict\nVERDICT: FAIL"
        )
        second_review = (
            "### Previous Feedback Assessment\n"
            "- CONFIRMED: Missing auth — still missing\n"
            "- DISMISSED (false positive): Extra abstraction — needed per plan\n"
            "### Issues Found\n"
            "- [Major][Omission] Missing auth\n"
            "### Verdict\nVERDICT: FAIL"
        )
        iterations = [
            IterationResult(
                iteration=1,
                step_outputs={"review_result": first_review},
                verdict="FAIL",
            ),
            IterationResult(
                iteration=2,
                step_outputs={"review_result": second_review},
                verdict="FAIL",
            ),
        ]
        result = PipelineResult(iterations=iterations, final_verdict="FAIL")

        report = build_report(config, result)

        expected_fragments = (
            "Review Metrics",
            # Table headers
            "Critical",
            "CONFIRMED",
            "DISMISSED",
            # Trend section
            "Metrics Trend",
            "decreasing",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, report)

    def test_report_no_metrics_table_without_review_steps(self) -> None:
        """A pipeline with only a generate step produces no "Review Metrics" section."""
        generate_step = StepConfig(
            name="generate",
            agent="claude-coder",
            role="generate",
            prompt_template="default:generate",
            output_key="generated_code",
            verdict=True,
        )
        config = PipelineConfig(language="en", pipeline=[generate_step])
        only_iteration = IterationResult(
            iteration=1,
            step_outputs={"generated_code": "some code"},
            verdict="PASS",
        )
        result = PipelineResult(iterations=[only_iteration], final_verdict="PASS")

        report = build_report(config, result)

        self.assertNotIn("Review Metrics", report)
# Run the test classes in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()