feat: ESCALATE verdict, issue tracker, onboarding commands

Add 3-verdict system (PASS/FAIL/ESCALATE) with priority handling across
simple and phased pipelines. Senior reviewers can now escalate issues
requiring human intervention, immediately breaking the review loop.

- ESCALATE verdict extraction with highest priority over PASS/FAIL
- Issue Tracker tables (ISS-NNN) carried across iterations
- Auto-escalate heuristic using (file, keyword) composite fingerprints
- Report restructuring: executive view first (verdict → tracker → metrics)
- Onboarding: `doctor`, `demo`, `init --guided` commands
- Exit codes: PASS=0, FAIL=1, ESCALATE=2
- 87 tests passing (54 config + 25 onboarding + 8 integration)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
chungyeong
2026-03-13 18:19:05 +09:00
parent ee4f1a07ef
commit 204e071b74
15 changed files with 3032 additions and 156 deletions

View File

@@ -39,6 +39,26 @@ _CODEX_ARGS = [
"-",
]
# Flags shared by every builtin Claude CLI agent invocation.
_CLAUDE_BASE_ARGS = [
    "-p",
    "--setting-sources",
    "user",
    "--disable-slash-commands",
    "--model",
    "opus",
]

# Coder agents: shared flags plus the permission-bypass options.
_CLAUDE_CODER_ARGS = [
    *_CLAUDE_BASE_ARGS,
    "--dangerously-skip-permissions",
    "--permission-mode",
    "bypassPermissions",
]

# Reviewer and senior agents: shared flags plus the "plan" permission mode.
_CLAUDE_REVIEW_ARGS = [
    *_CLAUDE_BASE_ARGS,
    "--permission-mode",
    "plan",
]
_CODER_SYSTEM_PROMPT = (
"You are a senior software engineer implementing code changes.\n"
"Rules:\n"
@@ -81,29 +101,37 @@ _SENIOR_SYSTEM_PROMPT = (
"4. Be skeptical of false positives, but do not lower the bar on real requirement "
"gaps.\n"
"5. When issues remain, produce a concise prioritized action list the coder can act on.\n"
"6. Do NOT invent new requirements beyond the plan and checklist.\n"
"7. End with VERDICT: PASS or VERDICT: FAIL."
"6. Maintain an Issue Tracker table across iterations to track issue status.\n"
"7. Do NOT invent new requirements beyond the plan and checklist.\n"
"8. End with one of three verdicts:\n"
" - VERDICT: PASS — all requirements met, no issues remain.\n"
" - VERDICT: FAIL — issues found that the coder can fix.\n"
" - VERDICT: ESCALATE — issues that require human intervention. Use ESCALATE when:\n"
" * Requirements are ambiguous and need clarification from stakeholders\n"
" * Architecture decisions are needed that go beyond the plan scope\n"
" * External dependency issues block progress\n"
" * The coder has failed to resolve the same issue 2+ times"
)
BUILTIN_AGENTS: dict[str, AgentConfig] = {
"claude-coder": AgentConfig(
name="claude-coder",
command="claude",
args=["-p", "--model", "opus", "--permission-mode", "auto"],
args=list(_CLAUDE_CODER_ARGS),
system_prompt=_CODER_SYSTEM_PROMPT,
reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["coder"],
),
"claude-reviewer": AgentConfig(
name="claude-reviewer",
command="claude",
args=["-p", "--model", "opus", "--permission-mode", "auto"],
args=list(_CLAUDE_REVIEW_ARGS),
system_prompt=_REVIEWER_SYSTEM_PROMPT,
reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["reviewer"],
),
"claude-senior": AgentConfig(
name="claude-senior",
command="claude",
args=["-p", "--model", "opus", "--permission-mode", "auto"],
args=list(_CLAUDE_REVIEW_ARGS),
system_prompt=_SENIOR_SYSTEM_PROMPT,
reasoning_effort=DEFAULT_ROLE_REASONING_EFFORTS["senior"],
),
@@ -136,6 +164,11 @@ _AGENT_ALIASES: dict[str, str] = {
"codex": "codex",
}
# Step-role spellings accepted in pipeline configs, keyed to the canonical
# role name. Every entry currently maps to itself.
_ROLE_ALIASES: dict[str, str] = {
    canonical_role: canonical_role for canonical_role in ("coding", "review")
}
def resolve_agent_shorthand(name: str, role: str) -> str:
"""Resolve shorthand agent name to full builtin name.
@@ -150,6 +183,16 @@ def resolve_agent_shorthand(name: str, role: str) -> str:
return name
def normalize_step_role(role: str) -> str:
    """Return the canonical name for a step role.

    A role listed in ``_ROLE_ALIASES`` is replaced by its mapped value;
    any other role is returned unchanged.
    """
    if role in _ROLE_ALIASES:
        return _ROLE_ALIASES[role]
    return role
def normalize_prompt_template(template_ref: str) -> str:
    """Return the canonical form of a prompt template reference.

    No template aliases are applied here, so the reference is passed
    through unchanged.
    """
    canonical_ref = template_ref
    return canonical_ref
# ---------------------------------------------------------------------------
# Role inference (backward compatibility)
# ---------------------------------------------------------------------------
@@ -233,7 +276,7 @@ def _default_seniors_for_preset(
"""Infer a default senior agent for presets that benefit from adjudication."""
if not (
isinstance(pipeline_raw, str)
and pipeline_raw == "preset:review-fix"
and pipeline_raw in {"preset:review-fix", "preset:coding-review-fix"}
and reviewers
):
return []
@@ -465,7 +508,7 @@ def _resolve_pipeline(
"""Resolve pipeline from preset string or explicit step list.
Returns (steps, phases) tuple. Only one will be non-empty.
- Simple/cross-review/review-only → steps populated, phases empty.
- Simple/cross-review/plan-review/review-only → steps populated, phases empty.
- Phased presets (review-fix) → steps empty, phases populated.
"""
# Preset: "preset:simple" or "preset:review-fix"
@@ -485,11 +528,15 @@ def _resolve_pipeline(
if isinstance(pipeline_raw, list):
steps = []
for step_data in pipeline_raw:
raw_role = step_data.get("role", "coding")
normalized_role = normalize_step_role(raw_role)
steps.append(StepConfig(
name=step_data["name"],
agent=step_data["agent"],
role=step_data.get("role", "generate"),
prompt_template=step_data.get("prompt_template", f"default:{step_data.get('role', 'generate')}"),
role=normalized_role,
prompt_template=normalize_prompt_template(
step_data.get("prompt_template", f"default:{normalized_role}")
),
output_key=step_data["output_key"],
verdict=step_data.get("verdict", False),
verdict_pattern=step_data.get("verdict_pattern", r"VERDICT:\s*PASS"),
@@ -524,10 +571,6 @@ def validate_config(config: PipelineConfig) -> list[str]:
errors,
scope=f"Phase '{phase.name}'",
)
if not any(s.verdict for s in phase.steps):
errors.append(
f"Phase '{phase.name}' must have at least one step with verdict: true"
)
# Validate verdict patterns
for step in phase.steps:
if step.verdict: