from __future__ import annotations import tempfile import unittest from pathlib import Path from unittest.mock import patch from cross_eval.agent import AgentInvocationError, _supports_reasoning_effort from cross_eval.cli import _apply_phased_iteration_override, main from cross_eval.agent import invoke_agent from cross_eval.config import ( BUILTIN_AGENTS, _SENIOR_SYSTEM_PROMPT, _default_seniors_for_preset, apply_reasoning_effort_settings, load_config, normalize_reasoning_effort, normalize_prompt_template, normalize_step_role, validate_config, ) from cross_eval.models import ( AgentConfig, AgentResult, IterationResult, PhaseConfig, PipelineConfig, PipelineResult, StepConfig, ) from cross_eval.pipeline import ( _detect_auto_escalate, _detect_repeated_aggregate, _execute_parallel_batch, _extract_senior_tracker, _extract_verdict, ) from cross_eval.prompts import ( CODING_TEMPLATE, CODING_TEMPLATE_KO, REVIEW_TEMPLATE, REVIEW_TEMPLATE_KO, PLAN_REVIEW_TEMPLATE, PLAN_REVIEW_TEMPLATE_KO, PLAN_FIX_TEMPLATE, PLAN_FIX_TEMPLATE_KO, REVIEW_ONLY_TEMPLATE, REVIEW_ONLY_TEMPLATE_KO, AGGREGATE_REVIEW_TEMPLATE, AGGREGATE_REVIEW_TEMPLATE_KO, _build_cross_review_preset, _build_coding_review_fix_preset, _build_plan_review_preset, _build_review_fix_preset, _build_review_only_preset, _build_simple_preset, ) from cross_eval.report import build_report, parse_review_metrics class BuiltinAgentConfigTest(unittest.TestCase): def test_claude_builtin_agents_use_user_settings_and_disable_slash_commands(self) -> None: for agent_name in ("claude-coder", "claude-reviewer", "claude-senior"): with self.subTest(agent=agent_name): args = BUILTIN_AGENTS[agent_name].args self.assertIn("--setting-sources", args) self.assertIn("user", args) self.assertIn("--disable-slash-commands", args) def test_claude_builtin_agents_use_role_specific_permission_modes(self) -> None: coder_args = BUILTIN_AGENTS["claude-coder"].args reviewer_args = BUILTIN_AGENTS["claude-reviewer"].args senior_args = BUILTIN_AGENTS["claude-senior"].args self.assertIn("--dangerously-skip-permissions", coder_args) self.assertIn("bypassPermissions", coder_args) # Reviewers/seniors use -p without --permission-mode plan self.assertIn("-p", reviewer_args) self.assertIn("-p", senior_args) self.assertNotIn("plan", reviewer_args) self.assertNotIn("plan", senior_args) def test_codex_builtin_agents_skip_git_repo_check(self) -> None: for agent_name in ("codex-coder", "codex-reviewer", "codex-senior"): with self.subTest(agent=agent_name): self.assertIn( "--skip-git-repo-check", BUILTIN_AGENTS[agent_name].args, ) def test_senior_builtin_agents_exist(self) -> None: self.assertIn("claude-senior", BUILTIN_AGENTS) self.assertIn("codex-senior", BUILTIN_AGENTS) def test_builtin_reasoning_effort_defaults_match_recommended_levels(self) -> None: self.assertEqual(BUILTIN_AGENTS["codex-coder"].reasoning_effort, "medium") self.assertEqual(BUILTIN_AGENTS["codex-reviewer"].reasoning_effort, "medium") self.assertEqual(BUILTIN_AGENTS["codex-senior"].reasoning_effort, "high") def test_normalize_reasoning_effort_aliases(self) -> None: self.assertEqual(normalize_reasoning_effort("extra-high"), "xhigh") self.assertEqual(normalize_reasoning_effort("extra_high"), "xhigh") self.assertEqual(normalize_reasoning_effort("x-high"), "xhigh") def test_normalize_step_role_and_template_aliases(self) -> None: self.assertEqual(normalize_step_role("coding"), "coding") self.assertEqual(normalize_prompt_template("default:coding"), "default:coding") def test_apply_reasoning_effort_settings_uses_defaults_and_role_overrides(self) -> None: config = PipelineConfig( agents={ "codex-coder": AgentConfig(name="codex-coder", command="codex"), "codex-reviewer": AgentConfig(name="codex-reviewer", command="codex"), "codex-senior": AgentConfig(name="codex-senior", command="codex"), }, coders=["codex-coder"], reviewers=["codex-reviewer"], seniors=["codex-senior"], ) apply_reasoning_effort_settings( config, reviewer_effort="high", senior_effort="xhigh", ) self.assertEqual(config.agents["codex-coder"].reasoning_effort, "medium") self.assertEqual(config.agents["codex-reviewer"].reasoning_effort, "high") self.assertEqual(config.agents["codex-senior"].reasoning_effort, "xhigh") def test_codex_supports_reasoning_effort_override(self) -> None: self.assertTrue(_supports_reasoning_effort("codex")) self.assertFalse(_supports_reasoning_effort("claude")) def test_invoke_agent_passes_reasoning_effort_to_codex(self) -> None: captured: dict[str, list[str]] = {} def _fake_run(cmd, **kwargs): captured["cmd"] = cmd class _Result: returncode = 0 stdout = "VERDICT: PASS" stderr = "" return _Result() agent = AgentConfig( name="codex-reviewer", command="codex", args=["exec", "--model", "gpt-5.4", "-"], reasoning_effort="high", ) with patch("subprocess.run", side_effect=_fake_run): invoke_agent(agent, "prompt", "review", quiet=True) self.assertEqual( captured["cmd"][:3], ["codex", "-c", 'model_reasoning_effort="high"'], ) def test_invoke_agent_classifies_auth_failures(self) -> None: def _fake_run(cmd, **kwargs): class _Result: returncode = 1 stdout = "" stderr = "Not logged in · Please run /login" return _Result() agent = AgentConfig( name="claude-reviewer", command="claude", args=["-p", "--model", "opus"], ) with patch("subprocess.run", side_effect=_fake_run): with self.assertRaises(AgentInvocationError) as ctx: invoke_agent(agent, "prompt", "review", quiet=True) self.assertEqual(ctx.exception.failure_type, "AUTH") self.assertIn("Re-authenticate", ctx.exception.suggested_action) def test_invoke_agent_classifies_usage_limit_failures(self) -> None: def _fake_run(cmd, **kwargs): class _Result: returncode = 1 stdout = "" stderr = "API Error: 429 rate limit exceeded for current quota" return _Result() agent = AgentConfig( name="codex-reviewer", command="codex", args=["exec", "--model", "gpt-5.4", "-"], ) with patch("subprocess.run", side_effect=_fake_run): with self.assertRaises(AgentInvocationError) as ctx: invoke_agent(agent, "prompt", "review", quiet=True) self.assertEqual(ctx.exception.failure_type, "USAGE_LIMIT") self.assertIn("quota", ctx.exception.suggested_action) def test_parallel_batch_saves_successes_before_failure(self) -> None: config = PipelineConfig( agents={ "ok-reviewer": AgentConfig(name="ok-reviewer", command="codex"), "bad-reviewer": AgentConfig(name="bad-reviewer", command="claude"), }, ) steps = [ StepConfig( name="review_ok", agent="ok-reviewer", role="review", prompt_template="default:review-only", output_key="review_ok", parallel=True, ), StepConfig( name="review_bad", agent="bad-reviewer", role="review", prompt_template="default:review-only", output_key="review_bad", parallel=True, ), ] step_outputs: dict[str, str] = {} step_results: dict[str, AgentResult] = {} def _fake_invoke(agent, prompt, step_name, **kwargs): if step_name == "review_ok": return AgentResult( output="VERDICT: PASS", exit_code=0, agent_name=agent.name, step_name=step_name, duration_seconds=1.0, ) raise AgentInvocationError( agent_name=agent.name, step_name=step_name, cmd_preview="claude -p ...", raw_error="API Error: 429 rate limit exceeded for current quota", failure_type="USAGE_LIMIT", suggested_action="Agent CLI hit a quota, billing, or token budget limit. Refill or raise the limit, then rerun.", ) with tempfile.TemporaryDirectory() as tmpdir: with patch("cross_eval.pipeline.invoke_agent", side_effect=_fake_invoke): with self.assertRaises(RuntimeError) as ctx: _execute_parallel_batch( steps, config, input_contents={}, feedback="", iteration=1, max_iterations=3, cwd=Path(tmpdir), timeout=None, dry_run=False, step_outputs=step_outputs, step_results=step_results, run_dir=Path(tmpdir), output_iter=1, ) self.assertIn("Successful outputs were saved for: review_ok", str(ctx.exception)) self.assertEqual(step_outputs["review_ok"], "VERDICT: PASS") self.assertTrue((Path(tmpdir) / "v1" / "review_ok.md").exists()) error_path = Path(tmpdir) / "v1" / "review_bad_error.md" self.assertTrue(error_path.exists()) self.assertIn("Failure Type", error_path.read_text(encoding="utf-8")) self.assertIn("USAGE_LIMIT", error_path.read_text(encoding="utf-8")) def test_detect_repeated_aggregate_warns_on_same_output(self) -> None: steps = [ StepConfig( name="aggregate_review", agent="codex-senior", role="review", prompt_template="default:aggregate-review", output_key="aggregate_review", ), ] history: dict[str, int] = {} first = _detect_repeated_aggregate( steps, {"aggregate_review": "Same issue list"}, history, iteration=1, ) second = _detect_repeated_aggregate( steps, {"aggregate_review": " same issue list "}, history, iteration=2, ) self.assertIsNone(first) self.assertEqual( second, "Repeated aggregate_review detected at iteration 2 (same as iteration 1).", ) def test_report_includes_repeated_aggregate_section(self) -> None: config = PipelineConfig(language="en") result = PipelineResult(repeated_aggregate_warnings=[ "Repeated aggregate_review detected at iteration 4 (same as iteration 3).", ]) report = build_report(config, result) self.assertIn("Repeated Aggregate Findings", report) self.assertIn("same as iteration 3", report) def test_fix_and_plan_presets_default_senior_from_reviewer_family(self) -> None: self.assertEqual( _default_seniors_for_preset( "preset:plan-review", ["codex-reviewer"], BUILTIN_AGENTS, ), ["codex-senior"], ) self.assertEqual( _default_seniors_for_preset( "preset:plan-review", ["claude-reviewer"], BUILTIN_AGENTS, ), ["claude-senior"], ) self.assertEqual( _default_seniors_for_preset( "preset:review-fix", ["codex-reviewer", "claude-reviewer"], BUILTIN_AGENTS, ), ["codex-senior"], ) self.assertEqual( _default_seniors_for_preset( "preset:review-fix", ["claude-reviewer"], BUILTIN_AGENTS, ), ["claude-senior"], ) self.assertEqual( _default_seniors_for_preset( "preset:coding-review-fix", ["codex-reviewer"], BUILTIN_AGENTS, ), ["codex-senior"], ) self.assertEqual( _default_seniors_for_preset( "preset:simple", ["codex-reviewer"], BUILTIN_AGENTS, ), [], ) def test_review_fix_duplicate_reviewers_get_unique_step_keys(self) -> None: phases = _build_review_fix_preset( ["codex-coder"], ["codex-reviewer", "codex-reviewer", "codex-reviewer"], [], ) converge = phases[0] self.assertEqual( [step.name for step in converge.steps[:3]], [ "review_codex_reviewer", "review_codex_reviewer_2", "review_codex_reviewer_3", ], ) self.assertEqual( [step.output_key for step in converge.steps[:3]], [ "review_codex_reviewer", "review_codex_reviewer_2", "review_codex_reviewer_3", ], ) self.assertEqual( [step.name for step in converge.steps[3:]], ["aggregate_review", "coding", "verify"], ) def test_coding_review_fix_starts_with_single_coding_phase(self) -> None: phases = _build_coding_review_fix_preset( ["codex-coder"], ["claude-reviewer", "codex-reviewer"], ["codex-senior"], ) self.assertEqual([phase.name for phase in phases], ["initial_coding", "review_fix"]) self.assertEqual(phases[0].max_iterations, 1) self.assertEqual([step.name for step in phases[0].steps], ["coding"]) self.assertEqual([step.name for step in phases[1].steps[2:]], ["aggregate_review", "coding", "verify"]) def test_apply_phased_iteration_override_updates_only_verdict_phases(self) -> None: config = PipelineConfig( phases=_build_coding_review_fix_preset( ["codex-coder"], ["codex-reviewer"], ["codex-senior"], ), ) _apply_phased_iteration_override(config, 10) self.assertEqual(config.phases[0].name, "initial_coding") self.assertEqual(config.phases[0].max_iterations, 1) self.assertEqual(config.phases[1].name, "review_fix") self.assertEqual(config.phases[1].max_iterations, 10) def test_review_only_duplicate_reviewers_get_unique_step_keys(self) -> None: steps = _build_review_only_preset( ["codex-coder"], ["codex-reviewer", "codex-reviewer"], [], ) self.assertEqual( [step.output_key for step in steps], ["review_codex_reviewer", "review_codex_reviewer_2"], ) def test_plan_review_duplicate_reviewers_get_unique_step_keys(self) -> None: steps = _build_plan_review_preset( ["codex-coder"], ["codex-reviewer", "codex-reviewer"], [], ) self.assertEqual( [step.output_key for step in steps[:2]], ["plan_review_codex_reviewer", "plan_review_codex_reviewer_2"], ) def test_plan_review_builds_review_fix_verify_loop(self) -> None: steps = _build_plan_review_preset( ["codex-coder"], ["claude-reviewer", "codex-reviewer"], ["claude-senior"], ) self.assertEqual( [step.name for step in steps], [ "plan_review_claude_reviewer", "plan_review_codex_reviewer", "aggregate_review", "plan_fix", "verify", ], ) self.assertEqual(steps[2].agent, "claude-senior") self.assertEqual(steps[3].agent, "codex-coder") self.assertEqual(steps[4].agent, "claude-senior") self.assertTrue(steps[4].verdict) self.assertFalse(steps[0].verdict) self.assertFalse(steps[1].verdict) def test_plan_review_single_reviewer_uses_default_loop_steps(self) -> None: steps = _build_plan_review_preset( ["codex-coder"], ["codex-reviewer"], [], ) self.assertEqual( [step.name for step in steps], ["plan_review", "aggregate_review", "plan_fix", "verify"], ) self.assertEqual(steps[1].agent, "codex-reviewer") self.assertEqual(steps[2].prompt_template, "default:plan-fix") self.assertTrue(steps[3].verdict) def test_cross_review_duplicate_coders_get_unique_step_keys(self) -> None: steps = _build_cross_review_preset( ["codex-coder", "codex-coder"], ["codex-reviewer"], [], ) self.assertEqual( [step.output_key for step in steps], [ "code_codex_coder", "code_codex_coder_2", "review_by_codex_coder", "review_by_codex_coder_2", ], ) def test_review_fix_uses_senior_for_aggregate_and_verify(self) -> None: phases = _build_review_fix_preset( ["codex-coder"], ["claude-reviewer", "codex-reviewer"], ["codex-senior"], ) steps = phases[0].steps self.assertEqual(steps[2].name, "aggregate_review") self.assertEqual(steps[2].agent, "codex-senior") self.assertEqual(steps[3].name, "coding") self.assertEqual(steps[4].name, "verify") self.assertEqual(steps[4].agent, "codex-senior") self.assertTrue(steps[4].verdict) def test_review_only_with_senior_adds_aggregate_step(self) -> None: steps = _build_review_only_preset( ["codex-coder"], ["claude-reviewer", "codex-reviewer"], ["claude-senior"], ) self.assertEqual(steps[-1].name, "senior_review") self.assertEqual(steps[-1].agent, "claude-senior") self.assertTrue(steps[-1].verdict) self.assertFalse(steps[0].verdict) self.assertFalse(steps[1].verdict) def test_simple_with_senior_adds_final_aggregate_step(self) -> None: steps = _build_simple_preset( ["codex-coder"], ["codex-reviewer"], ["codex-senior"], ) self.assertEqual( [step.name for step in steps], ["coding", "review", "senior_review"], ) self.assertFalse(steps[1].verdict) self.assertTrue(steps[2].verdict) def test_validate_config_rejects_duplicate_phase_step_names_and_output_keys(self) -> None: config = PipelineConfig( agents={ "codex-reviewer": AgentConfig( name="codex-reviewer", command="codex", ), }, phases=[ PhaseConfig( name="converge", steps=[ StepConfig( name="review_dup", agent="codex-reviewer", role="review", prompt_template="default:review-only", output_key="same_key", verdict=True, ), StepConfig( name="review_dup", agent="codex-reviewer", role="review", prompt_template="default:review-only", output_key="same_key", verdict=True, ), ], ), ], ) errors = validate_config(config) self.assertIn("Phase 'converge' has duplicate step name 'review_dup'", errors) self.assertIn("Phase 'converge' has duplicate output_key 'same_key'", errors) class PromptTemplateTest(unittest.TestCase): """Verify prompt template content after category/assessment refactor.""" def test_review_templates_no_false_positive_category(self) -> None: """False positive should NOT appear as a category in review templates.""" for tmpl, label in [ (REVIEW_TEMPLATE, "REVIEW_TEMPLATE"), (REVIEW_TEMPLATE_KO, "REVIEW_TEMPLATE_KO"), (PLAN_REVIEW_TEMPLATE, "PLAN_REVIEW_TEMPLATE"), (PLAN_REVIEW_TEMPLATE_KO, "PLAN_REVIEW_TEMPLATE_KO"), (REVIEW_ONLY_TEMPLATE, "REVIEW_ONLY_TEMPLATE"), (REVIEW_ONLY_TEMPLATE_KO, "REVIEW_ONLY_TEMPLATE_KO"), ]: with self.subTest(template=label): # Should not contain "False positive" as a category bullet self.assertNotIn( "**False positive**", tmpl, f"{label} still lists False positive as a category", ) # KO variant if label.endswith("_KO"): self.assertNotIn( "**오탐**", tmpl, f"{label} still lists 오탐 as a category", ) def test_review_templates_have_confirmed_dismissed(self) -> None: """Review templates should instruct CONFIRMED / DISMISSED assessment.""" for tmpl, label in [ (REVIEW_TEMPLATE, "REVIEW_TEMPLATE"), (REVIEW_TEMPLATE_KO, "REVIEW_TEMPLATE_KO"), ]: with self.subTest(template=label): self.assertIn("CONFIRMED", tmpl) self.assertIn("DISMISSED", tmpl) def test_coding_templates_ignore_dismissed(self) -> None: """Coding templates should tell coder to ignore DISMISSED items.""" self.assertIn("DISMISSED", CODING_TEMPLATE) self.assertIn("DISMISSED", CODING_TEMPLATE_KO) self.assertIn("DISMISSED", PLAN_FIX_TEMPLATE) self.assertIn("DISMISSED", PLAN_FIX_TEMPLATE_KO) def test_aggregate_templates_dismissed_structure(self) -> None: """Aggregate templates should use [False positive] / [Already fixed] tags.""" self.assertIn("[False positive]", AGGREGATE_REVIEW_TEMPLATE) self.assertIn("[Already fixed]", AGGREGATE_REVIEW_TEMPLATE) self.assertIn("[오탐]", AGGREGATE_REVIEW_TEMPLATE_KO) self.assertIn("[수정 완료]", AGGREGATE_REVIEW_TEMPLATE_KO) self.assertIn("{candidate_outputs}", AGGREGATE_REVIEW_TEMPLATE) self.assertIn("{reviews_bundle}", AGGREGATE_REVIEW_TEMPLATE) self.assertIn("{candidate_outputs}", AGGREGATE_REVIEW_TEMPLATE_KO) self.assertIn("{reviews_bundle}", AGGREGATE_REVIEW_TEMPLATE_KO) class ReviewMetricsParsingTest(unittest.TestCase): """Test review output metrics parsing.""" def test_parse_review_metrics_basic(self) -> None: output = """\ ### Issues Found - [Critical][Over-engineering] Added unnecessary caching layer - [Major][Omission] Missing input validation for user_id - [Major][Omission] Missing error handling for DB calls - [Minor][Omission] No docstring on public API ### Summary - Critical: 1, Major: 2, Minor: 1 - Over-engineering count: 1 - Omission count: 3 - CONFIRMED: 0, DISMISSED: 0 """ m = parse_review_metrics(output) self.assertEqual(m.critical, 1) self.assertEqual(m.major, 2) self.assertEqual(m.minor, 1) self.assertEqual(m.over_engineering, 1) self.assertEqual(m.omission, 3) self.assertEqual(m.confirmed, 0) self.assertEqual(m.dismissed, 0) def test_parse_review_metrics_korean(self) -> None: output = """\ ### 발견된 이슈 - [Critical][과최적화] 불필요한 캐시 레이어 추가 - [Major][누락] user_id 입력 검증 누락 ### 이전 피드백 평가 - CONFIRMED: DB 에러 핸들링 — 여전히 미구현 - DISMISSED (오탐): 타입 힌트 누락 — 기획서에 없는 요구사항 """ m = parse_review_metrics(output) self.assertEqual(m.critical, 1) self.assertEqual(m.major, 1) self.assertEqual(m.over_engineering, 1) self.assertEqual(m.omission, 1) self.assertEqual(m.confirmed, 1) self.assertEqual(m.dismissed, 1) def test_parse_review_metrics_with_assessment(self) -> None: output = """\ ### Previous Feedback Assessment - CONFIRMED: Missing auth check — still not implemented - CONFIRMED: SQL injection risk — still present - DISMISSED (false positive): Unused import — actually used in tests ### Issues Found - [Critical][Omission] Missing auth check - [Critical][Omission] SQL injection risk """ m = parse_review_metrics(output) self.assertEqual(m.confirmed, 2) self.assertEqual(m.dismissed, 1) self.assertEqual(m.critical, 2) self.assertEqual(m.omission, 2) def test_report_includes_metrics_table(self) -> None: config = PipelineConfig( language="en", pipeline=[ StepConfig( name="review", agent="claude-reviewer", role="review", prompt_template="default:review", output_key="review_result", verdict=True, ), ], ) result = PipelineResult( iterations=[ IterationResult( iteration=1, step_outputs={ "review_result": ( "### Issues Found\n" "- [Critical][Omission] Missing auth\n" "- [Major][Over-engineering] Extra abstraction\n" "### Verdict\nVERDICT: FAIL" ), }, verdict="FAIL", ), IterationResult( iteration=2, step_outputs={ "review_result": ( "### Previous Feedback Assessment\n" "- CONFIRMED: Missing auth — still missing\n" "- DISMISSED (false positive): Extra abstraction — needed per plan\n" "### Issues Found\n" "- [Major][Omission] Missing auth\n" "### Verdict\nVERDICT: FAIL" ), }, verdict="FAIL", ), ], final_verdict="FAIL", ) report = build_report(config, result) self.assertIn("Review Metrics", report) # Check table headers self.assertIn("Critical", report) self.assertIn("CONFIRMED", report) self.assertIn("DISMISSED", report) # Check trend section self.assertIn("Metrics Trend", report) self.assertIn("decreasing", report) def test_report_no_metrics_table_without_review_steps(self) -> None: config = PipelineConfig( language="en", pipeline=[ StepConfig( name="coding", agent="claude-coder", role="coding", prompt_template="default:coding", output_key="coding_output", verdict=True, ), ], ) result = PipelineResult( iterations=[ IterationResult( iteration=1, step_outputs={"coding_output": "some code"}, verdict="PASS", ), ], final_verdict="PASS", ) report = build_report(config, result) self.assertNotIn("Review Metrics", report) class EscalateVerdictTest(unittest.TestCase): """Test ESCALATE verdict functionality.""" def test_extract_verdict_escalate(self) -> None: output = "Some review content\n\nVERDICT: ESCALATE\n" result = _extract_verdict(output, r"VERDICT:\s*PASS") self.assertEqual(result, "ESCALATE") def test_extract_verdict_escalate_priority(self) -> None: """ESCALATE should take priority even if PASS pattern also matches.""" output = "VERDICT: PASS\n\nVERDICT: ESCALATE\n" result = _extract_verdict(output, r"VERDICT:\s*PASS") self.assertEqual(result, "ESCALATE") def test_extract_verdict_pass_still_works(self) -> None: output = "All good\n\nVERDICT: PASS\n" result = _extract_verdict(output, r"VERDICT:\s*PASS") self.assertEqual(result, "PASS") def test_extract_verdict_fail_still_works(self) -> None: output = "Issues found\n\nVERDICT: FAIL\n" result = _extract_verdict(output, r"VERDICT:\s*PASS") self.assertEqual(result, "FAIL") def test_extract_senior_tracker(self) -> None: output = ( "Some text\n\n" "## Issue Tracker\n" "| ISS-ID | Severity | Description | Status | Since |\n" "|--------|----------|-------------|--------|-------|\n" "| ISS-001 | Critical | Missing auth | Open | v1 |\n" "| ISS-002 | Major | Bad naming | Fixed | v1 |\n" "\nMore text" ) tracker = _extract_senior_tracker(output) self.assertIn("Issue Tracker", tracker) self.assertIn("ISS-001", tracker) self.assertIn("ISS-002", tracker) def test_extract_senior_tracker_empty(self) -> None: output = "No tracker table here" tracker = _extract_senior_tracker(output) self.assertEqual(tracker, "") def test_auto_escalate_heuristic(self) -> None: prev1 = "Issue in src/auth.py: missing validation" prev2 = "Issue in src/auth.py: validation still missing" current = "Issue in src/auth.py: validation not implemented" # Should detect repeated issue self.assertTrue(_detect_auto_escalate([prev1, prev2], current, threshold=2)) def test_auto_escalate_no_repeat(self) -> None: prev1 = "Issue in src/auth.py: missing validation" current = "Issue in src/database.py: connection pool" self.assertFalse(_detect_auto_escalate([prev1], current, threshold=2)) def test_auto_escalate_different_issues_same_file(self) -> None: """Same file path but different issues should NOT trigger escalation.""" prev1 = "Issue in src/utils.py: missing validation on input" prev2 = "Issue in src/utils.py: unused import at top of file" current = "Issue in src/utils.py: error handling not implemented" # All mention src/utils.py, but the issue keywords differ across # iterations, so this should NOT escalate. self.assertFalse(_detect_auto_escalate([prev1, prev2], current, threshold=2)) def test_report_escalate_verdict(self) -> None: config = PipelineConfig(language="en") result = PipelineResult( final_verdict="ESCALATE", escalated_issues=["Requirements are ambiguous — need stakeholder input"], ) report = build_report(config, result) self.assertIn("ESCALATE", report) self.assertIn("Human review required", report) self.assertIn("ambiguous", report) def test_report_escalate_verdict_ko(self) -> None: config = PipelineConfig(language="ko") result = PipelineResult( final_verdict="ESCALATE", escalated_issues=["요구사항이 모호함"], ) report = build_report(config, result) self.assertIn("ESCALATE", report) self.assertIn("사람의 확인이 필요합니다", report) def test_exit_code_escalate(self) -> None: from cross_eval.cli import main mock_result = PipelineResult( final_verdict="ESCALATE", escalated_issues=["Needs human review"], ) with patch("cross_eval.config.load_config") as mock_load, \ patch("cross_eval.config.validate_config", return_value=[]), \ patch("cross_eval.pipeline.run_pipeline", return_value=mock_result), \ patch("cross_eval.report.print_escalation_report"): mock_config = PipelineConfig( pipeline=[ StepConfig( name="review", agent="claude-reviewer", role="review", prompt_template="default:review", output_key="review_result", verdict=True, ), ], agents=dict(BUILTIN_AGENTS), coders=["claude-coder"], reviewers=["claude-reviewer"], inputs={"plan": Path("/tmp/plan.md")}, language="en", max_iterations=3, preset_name="simple", ) mock_load.return_value = mock_config with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w") as f: f.write("inputs:\n plan: /tmp/plan.md\n") f.flush() exit_code = main(["run", "-c", f.name]) self.assertEqual(exit_code, 2) def test_senior_prompt_includes_escalate(self) -> None: self.assertIn("ESCALATE", _SENIOR_SYSTEM_PROMPT) self.assertIn("ambiguous", _SENIOR_SYSTEM_PROMPT.lower()) def test_aggregate_template_has_tracker(self) -> None: self.assertIn("{previous_senior_tracker}", AGGREGATE_REVIEW_TEMPLATE) self.assertIn("Issue Tracker", AGGREGATE_REVIEW_TEMPLATE) self.assertIn("VERDICT: ESCALATE", AGGREGATE_REVIEW_TEMPLATE) def test_report_includes_issue_tracker_summary(self) -> None: config = PipelineConfig( language="en", pipeline=[ StepConfig( name="review", agent="claude-reviewer", role="review", prompt_template="default:review", output_key="review_result", verdict=True, ), ], ) result = PipelineResult( iterations=[ IterationResult( iteration=1, step_outputs={ "review_result": ( "### Issues Found\n" "- ISS-001 [Critical][Omission] Missing auth check\n" "- ISS-002 [Major][Omission] No input validation\n" "### Verdict\nVERDICT: FAIL" ), }, verdict="FAIL", ), ], final_verdict="FAIL", ) report = build_report(config, result) self.assertIn("Issue Tracker Summary", report) self.assertIn("ISS-001", report) self.assertIn("ISS-002", report) def test_report_includes_senior_tracker_table(self) -> None: config = PipelineConfig( language="en", pipeline=[ StepConfig( name="senior_review", agent="claude-senior", role="review", prompt_template="default:aggregate-review", output_key="senior_review_result", verdict=True, ), ], ) result = PipelineResult( iterations=[ IterationResult( iteration=1, step_outputs={ "senior_review_result": ( "### Confirmed Issues\n- Missing auth\n\n" "## Issue Tracker\n" "| ISS-ID | Severity | Description | Status | Since |\n" "|--------|----------|-------------|--------|-------|\n" "| ISS-001 | Critical | Missing auth check | Open | v1 |\n" "| ISS-002 | Major | No validation | Fixed | v1 |\n" "\n### Verdict\nVERDICT: FAIL" ), }, verdict="FAIL", ), ], final_verdict="FAIL", ) report = build_report(config, result) self.assertIn("Issue Tracker Summary", report) self.assertIn("ISS-001", report) self.assertIn("Fixed", report) def test_aggregate_template_ko_has_tracker(self) -> None: self.assertIn("{previous_senior_tracker}", AGGREGATE_REVIEW_TEMPLATE_KO) self.assertIn("이슈 트래커", AGGREGATE_REVIEW_TEMPLATE_KO) self.assertIn("VERDICT: ESCALATE", AGGREGATE_REVIEW_TEMPLATE_KO) class FixPresetBehaviorTest(unittest.TestCase): def _write_fix_config(self, root: Path, *, max_iterations: int = 7) -> Path: (root / "plan.md").write_text("# plan\n", encoding="utf-8") (root / "checklist.md").write_text("# checklist\n", encoding="utf-8") config_path = root / "config.yaml" config_path.write_text( ( "inputs:\n" " plan: plan.md\n" " checklist: checklist.md\n" "coders: [claude-coder]\n" "reviewers: [claude-reviewer]\n" "pipeline: preset:review-fix\n" f"max_iterations: {max_iterations}\n" "language: en\n" ), encoding="utf-8", ) return config_path def test_load_config_syncs_phased_iterations_and_enables_agentic(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: config = load_config(self._write_fix_config(Path(tmpdir), max_iterations=7)) self.assertEqual(config.preset_name, "review-fix") self.assertEqual(config.phases[0].max_iterations, 7) self.assertTrue(config.agents["claude-coder"].agentic) self.assertNotIn("-p", config.agents["claude-coder"].args) def test_run_config_max_iter_updates_existing_phased_pipeline(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: config_path = self._write_fix_config(Path(tmpdir), max_iterations=7) captured: dict[str, object] = {} def _fake_run_pipeline(config, **kwargs): captured["phase_max"] = config.phases[0].max_iterations captured["agentic"] = config.agents[config.coders[0]].agentic return PipelineResult( iterations=[], final_verdict="PASS", run_dir=Path(tmpdir) / "output", ) with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline): exit_code = main([ "run", "--config", str(config_path), "--max-iter", "9", "--dry-run", ]) self.assertEqual(exit_code, 0) self.assertEqual(captured["phase_max"], 9) self.assertTrue(captured["agentic"]) def test_run_preset_review_fix_auto_enables_agentic_without_flag(self) -> None: captured: dict[str, object] = {} def _fake_run_pipeline(config, **kwargs): captured["preset"] = config.preset_name captured["agentic"] = config.agents[config.coders[0]].agentic captured["phase_max"] = config.phases[0].max_iterations return PipelineResult( iterations=[], final_verdict="PASS", run_dir=Path(".cross-eval/output"), ) with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline): exit_code = main(["run", "--preset", "review-fix", "--dry-run"]) self.assertEqual(exit_code, 0) self.assertEqual(captured["preset"], "review-fix") self.assertTrue(captured["agentic"]) self.assertEqual(captured["phase_max"], 3) def test_run_preset_plan_review_auto_enables_agentic_without_flag(self) -> None: captured: dict[str, object] = {} def _fake_run_pipeline(config, **kwargs): captured["preset"] = config.preset_name captured["agentic"] = config.agents[config.coders[0]].agentic captured["seniors"] = list(config.seniors) captured["steps"] = [step.name for step in config.pipeline] captured["max_iter"] = config.max_iterations return PipelineResult( iterations=[], final_verdict="PASS", run_dir=Path(".cross-eval/output"), ) with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline): exit_code = main(["run", "--preset", "plan-review", "--dry-run"]) self.assertEqual(exit_code, 0) self.assertEqual(captured["preset"], "plan-review") self.assertTrue(captured["agentic"]) self.assertEqual(captured["seniors"], ["claude-senior"]) self.assertEqual( captured["steps"], ["plan_review", "aggregate_review", "plan_fix", "verify"], ) self.assertEqual(captured["max_iter"], 3) def test_run_senior_model_override_applies_only_to_seniors(self) -> None: captured: dict[str, list[str]] = {} def _fake_run_pipeline(config, **kwargs): captured["coder_args"] = list(config.agents[config.coders[0]].args) captured["reviewer_args"] = list(config.agents[config.reviewers[0]].args) captured["senior_args"] = list(config.agents[config.seniors[0]].args) return PipelineResult( iterations=[], final_verdict="PASS", run_dir=Path(".cross-eval/output"), ) with patch("cross_eval.pipeline.run_pipeline", side_effect=_fake_run_pipeline): exit_code = main([ "run", "--preset", "review-fix", "--coder", "claude", "--reviewer", "claude", "--senior", "claude", "--senior-model", "sonnet", "--dry-run", ]) self.assertEqual(exit_code, 0) self.assertIn("opus", captured["coder_args"]) self.assertIn("opus", captured["reviewer_args"]) self.assertIn("sonnet", captured["senior_args"]) class OutputDirectoryResolutionTest(unittest.TestCase): def test_load_config_resolves_output_dir_from_project_root(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: root = Path(tmpdir) ce_dir = root / ".cross-eval" ce_dir.mkdir() (ce_dir / "plan.md").write_text("# plan\n", encoding="utf-8") config_path = ce_dir / "config.yaml" config_path.write_text( ( "inputs:\n" " plan: plan.md\n" "coders: [claude-coder]\n" "reviewers: [claude-reviewer]\n" "pipeline: preset:simple\n" "output_dir: .cross-eval/output\n" ), encoding="utf-8", ) config = load_config(config_path) self.assertEqual(config.output_dir.resolve(), (root / ".cross-eval" / "output").resolve()) if __name__ == "__main__": unittest.main()