"""Unit tests for cross_eval agents, presets, prompt templates and reports."""

from __future__ import annotations

import unittest
from unittest.mock import patch

from cross_eval.agent import _supports_reasoning_effort
from cross_eval.agent import invoke_agent
from cross_eval.config import (
    BUILTIN_AGENTS,
    _default_seniors_for_preset,
    apply_reasoning_effort_settings,
    normalize_reasoning_effort,
    validate_config,
)
from cross_eval.models import (
    AgentConfig,
    IterationResult,
    PhaseConfig,
    PipelineConfig,
    PipelineResult,
    ReviewMetrics,
    StepConfig,
)
from cross_eval.pipeline import _detect_repeated_aggregate
from cross_eval.prompts import (
    GENERATE_TEMPLATE,
    GENERATE_TEMPLATE_KO,
    REVIEW_TEMPLATE,
    REVIEW_TEMPLATE_KO,
    REVIEW_ONLY_TEMPLATE,
    REVIEW_ONLY_TEMPLATE_KO,
    AGGREGATE_REVIEW_TEMPLATE,
    AGGREGATE_REVIEW_TEMPLATE_KO,
    _build_cross_review_preset,
    _build_review_fix_preset,
    _build_review_only_preset,
    _build_simple_preset,
)
from cross_eval.report import build_report, parse_review_metrics


class BuiltinAgentConfigTest(unittest.TestCase):
    """Checks on builtin agents, reasoning effort, presets and validation."""

    def test_codex_builtin_agents_skip_git_repo_check(self) -> None:
        """Each builtin codex agent carries ``--skip-git-repo-check``."""
        codex_agents = ("codex-coder", "codex-reviewer", "codex-senior")
        for name in codex_agents:
            with self.subTest(agent=name):
                agent_args = BUILTIN_AGENTS[name].args
                self.assertIn("--skip-git-repo-check", agent_args)

    def test_senior_builtin_agents_exist(self) -> None:
        """Both senior agents are registered in ``BUILTIN_AGENTS``."""
        for name in ("claude-senior", "codex-senior"):
            self.assertIn(name, BUILTIN_AGENTS)

    def test_builtin_reasoning_effort_defaults_match_recommended_levels(self) -> None:
        """Builtin codex agents ship with the expected effort levels."""
        expected_levels = {
            "codex-coder": "medium",
            "codex-reviewer": "medium",
            "codex-senior": "high",
        }
        for name, level in expected_levels.items():
            self.assertEqual(BUILTIN_AGENTS[name].reasoning_effort, level)

    def test_normalize_reasoning_effort_aliases(self) -> None:
        """All spelled-out "extra high" aliases normalize to ``xhigh``."""
        for alias in ("extra-high", "extra_high", "x-high"):
            self.assertEqual(normalize_reasoning_effort(alias), "xhigh")

    def test_apply_reasoning_effort_settings_uses_defaults_and_role_overrides(self) -> None:
        """Role overrides win; the coder, given no override, ends at medium."""
        agent_names = ("codex-coder", "codex-reviewer", "codex-senior")
        config = PipelineConfig(
            agents={
                name: AgentConfig(name=name, command="codex")
                for name in agent_names
            },
            coders=["codex-coder"],
            reviewers=["codex-reviewer"],
            seniors=["codex-senior"],
        )

        apply_reasoning_effort_settings(
            config,
            reviewer_effort="high",
            senior_effort="xhigh",
        )

        expected_levels = {
            "codex-coder": "medium",
            "codex-reviewer": "high",
            "codex-senior": "xhigh",
        }
        for name, level in expected_levels.items():
            self.assertEqual(config.agents[name].reasoning_effort, level)

    def test_codex_supports_reasoning_effort_override(self) -> None:
        """Only the ``codex`` command reports reasoning-effort support."""
        codex_supported = _supports_reasoning_effort("codex")
        self.assertTrue(codex_supported)
        claude_supported = _supports_reasoning_effort("claude")
        self.assertFalse(claude_supported)

    def test_invoke_agent_passes_reasoning_effort_to_codex(self) -> None:
        """The effort setting is injected right after the command name."""
        captured: dict[str, list[str]] = {}

        class _StubCompleted:
            # Minimal stand-in exposing the attributes set by the original stub.
            returncode = 0
            stdout = "VERDICT: PASS"
            stderr = ""

        def _record_run(cmd, **kwargs):
            # Remember the command line instead of spawning a process.
            captured["cmd"] = cmd
            return _StubCompleted()

        reviewer = AgentConfig(
            name="codex-reviewer",
            command="codex",
            args=["exec", "--model", "gpt-5.4", "-"],
            reasoning_effort="high",
        )

        with patch("subprocess.run", side_effect=_record_run):
            invoke_agent(reviewer, "prompt", "review", quiet=True)

        leading_args = captured["cmd"][:3]
        self.assertEqual(
            leading_args,
            ["codex", "-c", 'model_reasoning_effort="high"'],
        )

    def test_detect_repeated_aggregate_warns_on_same_output(self) -> None:
        """A repeat differing only in case and padding yields a warning."""
        aggregate_step = StepConfig(
            name="aggregate_review",
            agent="codex-senior",
            role="review",
            prompt_template="default:aggregate-review",
            output_key="aggregate_review",
        )
        steps = [aggregate_step]
        history: dict[str, int] = {}

        first = _detect_repeated_aggregate(
            steps,
            {"aggregate_review": "Same issue list"},
            history,
            iteration=1,
        )
        second = _detect_repeated_aggregate(
            steps,
            {"aggregate_review": " same issue list "},
            history,
            iteration=2,
        )

        self.assertIsNone(first)
        expected_warning = (
            "Repeated aggregate_review detected at iteration 2 (same as iteration 1)."
        )
        self.assertEqual(second, expected_warning)

    def test_report_includes_repeated_aggregate_section(self) -> None:
        """Repeated-aggregate warnings show up in the built report."""
        warning = (
            "Repeated aggregate_review detected at iteration 4 (same as iteration 3)."
        )
        config = PipelineConfig(language="en")
        result = PipelineResult(repeated_aggregate_warnings=[warning])

        report = build_report(config, result)

        for fragment in ("Repeated Aggregate Findings", "same as iteration 3"):
            self.assertIn(fragment, report)

    def test_review_fix_defaults_senior_from_reviewer_family(self) -> None:
        """Default seniors follow the first reviewer; ``simple`` gets none."""
        cases = [
            (
                "preset:review-fix",
                ["codex-reviewer", "claude-reviewer"],
                ["codex-senior"],
            ),
            ("preset:review-fix", ["claude-reviewer"], ["claude-senior"]),
            ("preset:simple", ["codex-reviewer"], []),
        ]
        for preset, reviewers, expected_seniors in cases:
            seniors = _default_seniors_for_preset(preset, reviewers, BUILTIN_AGENTS)
            self.assertEqual(seniors, expected_seniors)

    def test_review_fix_duplicate_reviewers_get_unique_step_keys(self) -> None:
        """Duplicate reviewers get numbered step names and output keys."""
        phases = _build_review_fix_preset(
            ["codex-coder"],
            ["codex-reviewer"] * 3,
            [],
        )
        converge = phases[0]
        review_steps = converge.steps[:3]
        expected_keys = [
            "review_codex_reviewer",
            "review_codex_reviewer_2",
            "review_codex_reviewer_3",
        ]

        self.assertEqual([step.name for step in review_steps], expected_keys)
        self.assertEqual([step.output_key for step in review_steps], expected_keys)
        trailing_names = [step.name for step in converge.steps[3:]]
        self.assertEqual(trailing_names, ["aggregate_review", "generate", "verify"])

    def test_review_only_duplicate_reviewers_get_unique_step_keys(self) -> None:
        """The review-only preset numbers a duplicated reviewer's key."""
        steps = _build_review_only_preset(
            ["codex-coder"],
            ["codex-reviewer"] * 2,
            [],
        )
        output_keys = [step.output_key for step in steps]
        self.assertEqual(
            output_keys,
            ["review_codex_reviewer", "review_codex_reviewer_2"],
        )

    def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None:
        """The cross-review preset numbers keys for a duplicated coder."""
        steps = _build_cross_review_preset(
            ["codex-coder"] * 2,
            ["codex-reviewer"],
            [],
        )
        output_keys = [step.output_key for step in steps]
        self.assertEqual(
            output_keys,
            [
                "code_codex_coder",
                "code_codex_coder_2",
                "review_by_codex_coder",
                "review_by_codex_coder_2",
            ],
        )

    def test_review_fix_uses_senior_for_aggregate_and_verify(self) -> None:
        """The senior agent runs both aggregate_review and verify."""
        phases = _build_review_fix_preset(
            ["codex-coder"],
            ["claude-reviewer", "codex-reviewer"],
            ["codex-senior"],
        )
        steps = phases[0].steps

        aggregate = steps[2]
        self.assertEqual(aggregate.name, "aggregate_review")
        self.assertEqual(aggregate.agent, "codex-senior")

        self.assertEqual(steps[3].name, "generate")

        verify = steps[4]
        self.assertEqual(verify.name, "verify")
        self.assertEqual(verify.agent, "codex-senior")
        self.assertTrue(verify.verdict)

    def test_review_only_with_senior_adds_aggregate_step(self) -> None:
        """With a senior, only the final senior_review step has the verdict."""
        steps = _build_review_only_preset(
            ["codex-coder"],
            ["claude-reviewer", "codex-reviewer"],
            ["claude-senior"],
        )

        senior_step = steps[-1]
        self.assertEqual(senior_step.name, "senior_review")
        self.assertEqual(senior_step.agent, "claude-senior")
        self.assertTrue(senior_step.verdict)

        # The individual reviewer steps no longer decide the verdict.
        for index in (0, 1):
            self.assertFalse(steps[index].verdict)

    def test_simple_with_senior_adds_final_aggregate_step(self) -> None:
        """The simple preset appends a verdict-bearing senior_review step."""
        steps = _build_simple_preset(
            ["codex-coder"],
            ["codex-reviewer"],
            ["codex-senior"],
        )

        step_names = [step.name for step in steps]
        self.assertEqual(step_names, ["generate", "review", "senior_review"])
        self.assertFalse(steps[1].verdict)
        self.assertTrue(steps[2].verdict)

    def test_validate_config_rejects_duplicate_phase_step_names_and_output_keys(self) -> None:
        """Duplicate step names and output keys in a phase are both reported."""
        step_fields = dict(
            name="review_dup",
            agent="codex-reviewer",
            role="review",
            prompt_template="default:review-only",
            output_key="same_key",
            verdict=True,
        )
        # Two separate but identical steps inside the same phase.
        duplicate_steps = [StepConfig(**step_fields) for _ in range(2)]
        config = PipelineConfig(
            agents={
                "codex-reviewer": AgentConfig(
                    name="codex-reviewer",
                    command="codex",
                ),
            },
            phases=[PhaseConfig(name="converge", steps=duplicate_steps)],
        )

        errors = validate_config(config)

        expected_errors = (
            "Phase 'converge' has duplicate step name 'review_dup'",
            "Phase 'converge' has duplicate output_key 'same_key'",
        )
        for message in expected_errors:
            self.assertIn(message, errors)


class PromptTemplateTest(unittest.TestCase):
    """Content checks on the review, generate and aggregate prompt templates."""

    def test_review_templates_no_false_positive_category(self) -> None:
        """No review template lists false positive as a bold category."""
        templates = {
            "REVIEW_TEMPLATE": REVIEW_TEMPLATE,
            "REVIEW_TEMPLATE_KO": REVIEW_TEMPLATE_KO,
            "REVIEW_ONLY_TEMPLATE": REVIEW_ONLY_TEMPLATE,
            "REVIEW_ONLY_TEMPLATE_KO": REVIEW_ONLY_TEMPLATE_KO,
        }
        for label, tmpl in templates.items():
            with self.subTest(template=label):
                # The English bold bullet must be absent from every template.
                self.assertNotIn(
                    "**False positive**",
                    tmpl,
                    f"{label} still lists False positive as a category",
                )
                if not label.endswith("_KO"):
                    continue
                # Korean templates must also drop the Korean bold bullet.
                self.assertNotIn(
                    "**오탐**",
                    tmpl,
                    f"{label} still lists 오탐 as a category",
                )

    def test_review_templates_have_confirmed_dismissed(self) -> None:
        """Both review templates mention CONFIRMED and DISMISSED."""
        templates = {
            "REVIEW_TEMPLATE": REVIEW_TEMPLATE,
            "REVIEW_TEMPLATE_KO": REVIEW_TEMPLATE_KO,
        }
        for label, tmpl in templates.items():
            with self.subTest(template=label):
                for keyword in ("CONFIRMED", "DISMISSED"):
                    self.assertIn(keyword, tmpl)

    def test_generate_templates_ignore_dismissed(self) -> None:
        """Both generate templates mention DISMISSED."""
        for tmpl in (GENERATE_TEMPLATE, GENERATE_TEMPLATE_KO):
            self.assertIn("DISMISSED", tmpl)

    def test_aggregate_templates_dismissed_structure(self) -> None:
        """Aggregate templates carry the bracketed dismissal tags."""
        expected_tags = [
            ("[False positive]", AGGREGATE_REVIEW_TEMPLATE),
            ("[Already fixed]", AGGREGATE_REVIEW_TEMPLATE),
            ("[오탐]", AGGREGATE_REVIEW_TEMPLATE_KO),
            ("[수정 완료]", AGGREGATE_REVIEW_TEMPLATE_KO),
        ]
        for tag, tmpl in expected_tags:
            self.assertIn(tag, tmpl)


class ReviewMetricsParsingTest(unittest.TestCase):
    """Checks for ``parse_review_metrics`` and the report's metrics output."""

    def test_parse_review_metrics_basic(self) -> None:
        """Counts come from the issue list; the summary adds no assessments."""
        output = """\
### Issues Found
- [Critical][Over-engineering] Added unnecessary caching layer
- [Major][Omission] Missing input validation for user_id
- [Major][Omission] Missing error handling for DB calls
- [Minor][Omission] No docstring on public API
### Summary
- Critical: 1, Major: 2, Minor: 1
- Over-engineering count: 1
- Omission count: 3
- CONFIRMED: 0, DISMISSED: 0
"""
        m = parse_review_metrics(output)

        expected = {
            "critical": 1,
            "major": 2,
            "minor": 1,
            "over_engineering": 1,
            "omission": 3,
            "confirmed": 0,
            "dismissed": 0,
        }
        for field, count in expected.items():
            self.assertEqual(getattr(m, field), count)

    def test_parse_review_metrics_korean(self) -> None:
        """Korean category labels and assessment lines are counted too."""
        output = """\
### 발견된 이슈
- [Critical][과최적화] 불필요한 캐시 레이어 추가
- [Major][누락] user_id 입력 검증 누락
### 이전 피드백 평가
- CONFIRMED: DB 에러 핸들링 — 여전히 미구현
- DISMISSED (오탐): 타입 힌트 누락 — 기획서에 없는 요구사항
"""
        m = parse_review_metrics(output)

        expected = {
            "critical": 1,
            "major": 1,
            "over_engineering": 1,
            "omission": 1,
            "confirmed": 1,
            "dismissed": 1,
        }
        for field, count in expected.items():
            self.assertEqual(getattr(m, field), count)

    def test_parse_review_metrics_with_assessment(self) -> None:
        """Assessment lines are tallied alongside the issue counts."""
        output = """\
### Previous Feedback Assessment
- CONFIRMED: Missing auth check — still not implemented
- CONFIRMED: SQL injection risk — still present
- DISMISSED (false positive): Unused import — actually used in tests
### Issues Found
- [Critical][Omission] Missing auth check
- [Critical][Omission] SQL injection risk
"""
        m = parse_review_metrics(output)

        expected = {
            "confirmed": 2,
            "dismissed": 1,
            "critical": 2,
            "omission": 2,
        }
        for field, count in expected.items():
            self.assertEqual(getattr(m, field), count)

    def test_report_includes_metrics_table(self) -> None:
        """A pipeline with a review step yields a metrics table and trend."""
        review_step = StepConfig(
            name="review",
            agent="claude-reviewer",
            role="review",
            prompt_template="default:review",
            output_key="review_result",
            verdict=True,
        )
        config = PipelineConfig(language="en", pipeline=[review_step])

        first_review = (
            "### Issues Found\n"
            "- [Critical][Omission] Missing auth\n"
            "- [Major][Over-engineering] Extra abstraction\n"
            "### Verdict\nVERDICT: FAIL"
        )
        second_review = (
            "### Previous Feedback Assessment\n"
            "- CONFIRMED: Missing auth — still missing\n"
            "- DISMISSED (false positive): Extra abstraction — needed per plan\n"
            "### Issues Found\n"
            "- [Major][Omission] Missing auth\n"
            "### Verdict\nVERDICT: FAIL"
        )
        result = PipelineResult(
            iterations=[
                IterationResult(
                    iteration=1,
                    step_outputs={"review_result": first_review},
                    verdict="FAIL",
                ),
                IterationResult(
                    iteration=2,
                    step_outputs={"review_result": second_review},
                    verdict="FAIL",
                ),
            ],
            final_verdict="FAIL",
        )

        report = build_report(config, result)

        expected_fragments = (
            "Review Metrics",
            # Table headers.
            "Critical",
            "CONFIRMED",
            "DISMISSED",
            # Trend section.
            "Metrics Trend",
            "decreasing",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, report)

    def test_report_no_metrics_table_without_review_steps(self) -> None:
        """A generate-only pipeline produces no metrics section."""
        generate_step = StepConfig(
            name="generate",
            agent="claude-coder",
            role="generate",
            prompt_template="default:generate",
            output_key="generated_code",
            verdict=True,
        )
        config = PipelineConfig(language="en", pipeline=[generate_step])
        only_iteration = IterationResult(
            iteration=1,
            step_outputs={"generated_code": "some code"},
            verdict="PASS",
        )
        result = PipelineResult(
            iterations=[only_iteration],
            final_verdict="PASS",
        )

        report = build_report(config, result)

        self.assertNotIn("Review Metrics", report)


if __name__ == "__main__":
    unittest.main()