# dev-puppeteer/my-deepagent/scripts/verify_v04/run_q.py

"""Q-benchmark — my-deepagent (DeepSeek + Haiku) vs Claude Code sub-agent.

Workflow:
  1. `run_q.py --collect-ab`  →  each Q-task asks my-deepagent twice
     (once with DeepSeek, once with Haiku), saves response to disk.
  2. The orchestrator (main session) calls the `Agent` tool 6 times to
     get C responses, saves to `responses/Q{N}/C_subagent.md`.
  3. `run_q.py --judge`  →  loads A/B/C for every task, hands them to a
     Sonnet judge (via OpenRouter), writes per-task JSON + final markdown.

Task list (6 — most comparable to a generic chat agent):
  Q1  Python stdin wordcount CLI (code generation)
  Q2  Off-by-one bug fix (debugging)
  Q3  Summarize this repo in 5 lines (read_file / tools)
  Q4  FastAPI /healthz endpoint plan (plan-mode-style)
  Q5  5-turn conversation context retention
  Q6  Haiku-poet SKILL.md compliance (skill routing)
"""

from __future__ import annotations

import argparse
import asyncio
import json
import sys
import uuid
from pathlib import Path
from typing import Any

sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from my_deepagent.cli.interactive import _invoke_and_stream  # noqa: E402
from my_deepagent.config import load_config  # noqa: E402
from my_deepagent.governance import bootstrap_user_dirs, record_consent  # noqa: E402
from my_deepagent.persistence.checkpointer import get_checkpointer_ctx  # noqa: E402
from my_deepagent.persistence.db import Database  # noqa: E402
from my_deepagent.user_dirs import (  # noqa: E402
    ensure_user_dirs_initialized,
    load_combined_personas,
)
from verify_v04._common import last_assistant_text, mk_session, record, repo_root  # noqa: E402

# Output locations, both under <repo>/scripts/verify_v04/:
#   responses/  — one sub-directory per task holding the A/B/C answer files
#   judges/     — one JSON verdict per task
_RESPONSES, _JUDGES = (
    repo_root() / "scripts" / "verify_v04" / leaf for leaf in ("responses", "judges")
)
# Created at import time so later writes never hit a missing directory.
_RESPONSES.mkdir(exist_ok=True, parents=True)
_JUDGES.mkdir(exist_ok=True, parents=True)


# ---------------------------------------------------------------------------
# Task definitions — kept short so a 1-page judge eval is feasible.
# ---------------------------------------------------------------------------


# Benchmark task table, keyed by task id ("Q1".."Q6").  Each entry holds:
#   "title"  — short human-readable label (not read by the code in this file)
#   "prompt" — a single string for kind "single", or a list of per-turn
#              strings for kind "multi-turn"
#   "kind"   — "single" routes to _run_single, anything else to _run_multi in
#              collect_a_b; judge_one uses it to decide how to render the prompt
TASKS: dict[str, dict[str, Any]] = {
    # Code generation: a small stdin word/line/char counter.
    "Q1": {
        "title": "Python stdin wordcount CLI",
        "prompt": (
            "Write a single Python file `wordcount.py` that:\n"
            "  1. Reads from stdin\n"
            "  2. Supports flags `-w` (word count), `-l` (line count), `-c` (char count)\n"
            "  3. Prints one number per requested flag, space-separated.\n"
            "Return ONLY the code in a single ```python fenced block.  No prose."
        ),
        "kind": "single",
    },
    # Debugging.  NOTE(review): for empty `text`, `text.split()` is `[]`, so
    # `words[0]` raises IndexError rather than returning a wrong count; the
    # title/prompt wording ("off-by-one", "wrong count") is looser than that.
    "Q2": {
        "title": "Off-by-one bug fix",
        "prompt": (
            "The following Python function returns the wrong count when "
            "`text` is empty:\n\n"
            "```python\n"
            "def first_word_length(text: str) -> int:\n"
            "    words = text.split()\n"
            "    return len(words[0])\n"
            "```\n\n"
            "Fix it so an empty string returns 0.  Reply with ONLY the fixed function "
            "in a fenced code block."
        ),
        "kind": "single",
    },
    # Repository summary constrained to exactly five short bullet lines.
    "Q3": {
        "title": "Summarize this repo in 5 lines",
        "prompt": (
            "Summarize the `my-deepagent` Python project (this repo) in EXACTLY 5 "
            "markdown bullet lines.  Each line ≤ 80 chars.  Focus: purpose, "
            "architecture layers, key features.  No prose around it.  Use README.md "
            "and the package layout under src/my_deepagent/ as your source — just "
            "your best summary."
        ),
        "kind": "single",
    },
    # Planning only (no code), with a fixed three-section markdown layout.
    "Q4": {
        "title": "FastAPI /healthz plan",
        "prompt": (
            "We have a FastAPI app under `src/my_deepagent/api/app.py`.  Produce a "
            "PLAN (no code) for adding a `GET /healthz` endpoint that returns "
            "`{\"status\": \"ok\", \"db\": <bool>}` where `db` is a quick `SELECT 1` "
            "ping.  Format: markdown with `## Context`, `## Phases`, `## Verification` "
            "sections.  Each Phases bullet ≤ 15 words."
        ),
        "kind": "single",
    },
    # Five Korean turns sent in order within one session: reply-in-Korean
    # instruction, name and job, favourite colour, today's lunch, then a
    # request to list the facts given so far.  Only the reply to the last turn
    # is kept as the deliverable (see _run_multi).
    "Q5": {
        "title": "5-turn context retention",
        "prompt": [
            "한국어로만 응답해. 짧게.",
            "내 이름은 도라고, 직업은 데이터 분석가야. 짧게 인사해.",
            "내가 좋아하는 색은 청록이야. 한 줄 코멘트.",
            "오늘 점심으로 라멘 먹었어. 한 줄 코멘트.",
            "지금까지 내가 알려준 사실 3개를 한 줄씩, 번호 매겨 정리해줘.",
        ],
        "kind": "multi-turn",
    },
    # Instruction following: the prompt embeds a SKILL.md-style header (after
    # a Korean sentence telling the model to follow it strictly) and then asks
    # for a three-line Korean haiku.
    "Q6": {
        "title": "Haiku-poet SKILL.md compliance",
        "prompt": (
            "당신은 다음 SKILL.md 명령을 엄격하게 따라야 합니다:\n\n"
            "---\n"
            "name: korean-haiku-poet\n"
            "description: Reply ONLY as a 3-line Korean haiku.  No prose, no preamble.\n"
            "---\n\n"
            "Each response must be exactly 3 lines, all in Korean.  Total under "
            "40 characters.  No explanation, no English, no extra newlines.\n\n"
            "Now: write a haiku about cherry blossoms."
        ),
        "kind": "single",
    },
}


# ---------------------------------------------------------------------------
# Run my-deepagent twice per task — DeepSeek (A) + Haiku (B)
# ---------------------------------------------------------------------------


async def _run_single(sess, prompt: str) -> str:
    """Run one prompt through the session's agent and return the reply text.

    The agent comes from ``sess.build_agent_if_needed()``, the turn is driven
    by ``_invoke_and_stream``, and the answer is then read back with
    ``last_assistant_text`` for this session id.
    """
    await _invoke_and_stream(sess.build_agent_if_needed(), prompt, sess)
    reply = await last_assistant_text(sess.db, sess.session_id)
    return reply


async def _run_multi(sess, prompts: list[str]) -> str:
    """Play ``prompts`` as consecutive turns of one session.

    The agent is re-requested from the session before every turn.  Only the
    assistant text available after the final turn is returned; replies to the
    earlier turns are not collected here.
    """
    for turn_text in prompts:
        await _invoke_and_stream(sess.build_agent_if_needed(), turn_text, sess)
    final_reply = await last_assistant_text(sess.db, sess.session_id)
    return final_reply


async def collect_a_b(db, config, personas, saver) -> None:
    """Collect the A (DeepSeek) and B (Haiku) responses for every Q-task.

    For each task in ``TASKS`` and each model variant, a fresh session is
    created with ``mk_session``, switched to that model, and run in single- or
    multi-turn mode depending on ``task["kind"]``.  The reply is written to
    ``responses/<qid>/<letter>_<model>.md``.

    An existing response file is skipped so the command can be re-run cheaply.
    A file that only records a failure (its text starts with ``[ERROR]``) is
    not treated as collected: it is retried, so a transient API error is not
    cached forever and later judged as if it were the model's answer.  If the
    retry fails too, the error marker is written again.

    Args:
        db, config, personas, saver: forwarded unchanged to ``mk_session``.
    """
    error_marker = "[ERROR]"
    variants = (
        ("A", "openrouter:deepseek/deepseek-chat"),
        ("B", "openrouter:anthropic/claude-haiku-4-5"),
    )
    for qid, task in TASKS.items():
        out_dir = _RESPONSES / qid
        out_dir.mkdir(parents=True, exist_ok=True)
        for letter, model_id in variants:
            # File names use only the last path segment of the model id;
            # judge_one looks the files up under exactly these names.
            short_name = model_id.split("/")[-1]
            target = out_dir / f"{letter}_{short_name}.md"
            if target.exists():
                previous = target.read_text(encoding="utf-8")
                if not previous.startswith(error_marker):
                    print(f"  · {qid} {letter} already collected → skip ({target.name})")
                    continue
                print(f"  · {qid} {letter} previous attempt failed → retry ({target.name})")
            sess = await mk_session(db, config, personas, saver, uuid.uuid4())
            sess.set_model(model_id)
            try:
                if task["kind"] == "single":
                    reply = await _run_single(sess, task["prompt"])
                else:
                    reply = await _run_multi(sess, task["prompt"])
            except Exception as e:
                # Best effort: one failing model/task must not stop the sweep.
                reply = f"{error_marker} {type(e).__name__}: {e}"
            target.write_text(reply, encoding="utf-8")
            print(f"  · {qid} {letter} ({short_name}): {len(reply)}c → {target.name}")


# ---------------------------------------------------------------------------
# Judge — feed (task, A, B, C) into Sonnet and parse a JSON verdict.
# ---------------------------------------------------------------------------


# Prompt template for the judge model, filled with str.format() in judge_one.
# Placeholders: {qid}, {task_prompt}, {a}, {b}, {c}.  Doubled braces ({{ }})
# are literal braces in the rendered text (the JSON skeleton at the end).
# The Korean text tells the judge it is an impartial comparer of coding
# assistants, lists five criteria scored 1-10 (accuracy, completeness,
# code_quality, clarity, efficiency), and demands a JSON-only reply.
# The skeleton shows "claude_code_equivalent" as a quoted string, so the
# verdict may come back as the string "true"/"false" rather than a JSON bool.
# NOTE(review): C is labelled "anonymized" while A and B are labelled with
# their model names — confirm that exposing those names to the judge is
# intended.
_JUDGE_PROMPT = """당신은 코딩 어시스턴트 비교 평가관입니다.  주관 없이, 결과물 자체로만 평가합니다.

# Task ({qid})
{task_prompt}

# Responses

## A (my-deepagent + DeepSeek-chat)
{a}

## B (my-deepagent + Anthropic Haiku 4.5)
{b}

## C (Claude Code sub-agent, anonymized)
{c}

# 평가 기준 (각 1-10)
1. accuracy — 작업을 정확히 수행했는가
2. completeness — 필요한 부분을 빠짐없이 다뤘는가
3. code_quality — 코드/마크다운 품질 (실행성·관용성·구조)
4. clarity — 설명·주석·구조의 명료함
5. efficiency — 불필요한 장황함 없는 간결함

# 출력 (반드시 JSON only, 다른 텍스트 없음)
{{
  "A": {{"accuracy": <int>, "completeness": <int>, "code_quality": <int>, "clarity": <int>, "efficiency": <int>, "rationale": "<short>"}},
  "B": {{...}},
  "C": {{...}},
  "ranking": ["best", "mid", "worst"],
  "claude_code_equivalent": "<true if A or B reaches >=90% of C's total, else false>"
}}
"""


def _extract_json_payload(raw: str) -> str:
    """Return the JSON-object portion of a judge reply.

    Removes a leading Markdown code fence (and the closing fence only when the
    last line really is one), then narrows the text to the span between the
    first ``{`` and the last ``}`` so surrounding chatter does not break
    parsing.  The result is not validated; the caller parses it.
    """
    text = raw.strip()
    if text.startswith("```"):
        lines = text.split("\n")[1:]  # drop the opening ``` / ```json line
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        text = "\n".join(lines).strip()
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start : end + 1]
    return text


async def judge_one(qid: str, task: dict[str, Any]) -> dict[str, Any] | None:
    """Ask the Sonnet judge to score the A/B/C responses of one task.

    Reads ``A_deepseek-chat.md``, ``B_claude-haiku-4-5.md`` and
    ``C_subagent.md`` from ``responses/<qid>/``, renders ``_JUDGE_PROMPT``,
    calls the judge model through OpenRouter and parses the reply as JSON.  On
    success the verdict is also written to ``judges/<qid>.json``.

    Args:
        qid: Task id, used for the directory name and in log lines.
        task: Task definition; ``task["kind"]`` and ``task["prompt"]`` are read.

    Returns:
        The parsed verdict dict, or ``None`` (after printing the reason) when a
        response file is missing, the LLM call fails, or the reply is not a
        JSON object.
    """
    out_dir = _RESPONSES / qid
    a_path = out_dir / "A_deepseek-chat.md"
    b_path = out_dir / "B_claude-haiku-4-5.md"
    c_path = out_dir / "C_subagent.md"
    if not (a_path.exists() and b_path.exists() and c_path.exists()):
        print(f"  · {qid}: missing one of A/B/C — skip")
        return None
    a = a_path.read_text(encoding="utf-8")
    b = b_path.read_text(encoding="utf-8")
    c = c_path.read_text(encoding="utf-8")
    if task["kind"] == "single":
        prompt_text = task["prompt"]
    else:
        # Multi-turn tasks are shown to the judge as a numbered turn list.
        prompt_text = "\n".join(f"turn {i+1}: {p}" for i, p in enumerate(task["prompt"]))
    prompt = _JUDGE_PROMPT.format(qid=qid, task_prompt=prompt_text, a=a, b=b, c=c)

    # Imported lazily so --collect-ab runs do not need the judge dependencies.
    from langchain_openai import ChatOpenAI

    from my_deepagent.secrets import resolve_openrouter_api_key

    cfg = load_config()
    llm = ChatOpenAI(
        model="anthropic/claude-sonnet-4-6",
        api_key=resolve_openrouter_api_key(cfg),
        base_url=cfg.openrouter_base_url,
        max_tokens=1500,
        temperature=0.0,
    )
    try:
        result = await llm.ainvoke([{"role": "user", "content": prompt}])
    except Exception as e:
        # Best effort: a failed judge call skips this task only.
        print(f"  · {qid}: judge LLM failed: {type(e).__name__}: {e}")
        return None

    content = result.content
    if isinstance(content, list):
        # Content may arrive as a list of blocks; join their text parts.
        content = "".join(
            str(part.get("text", part)) if isinstance(part, dict) else str(part)
            for part in content
        )
    text = _extract_json_payload(str(content))
    try:
        parsed = json.loads(text)
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        print(f"  · {qid}: judge JSON parse failed ({e}); raw[:300]={text[:300]!r}")
        return None
    if not isinstance(parsed, dict):
        # Callers use .get() on the verdict, so anything but an object is unusable.
        print(f"  · {qid}: judge returned {type(parsed).__name__}, expected a JSON object — skip")
        return None
    out = _JUDGES / f"{qid}.json"
    out.write_text(json.dumps(parsed, ensure_ascii=False, indent=2), encoding="utf-8")
    return parsed


def _score_total(scores: Any) -> int:
    """Sum the five criterion scores of one judged response.

    A missing or non-numeric criterion counts as 0, and a value that is not a
    dict yields 0, so one malformed verdict cannot abort the remaining tasks.
    """
    if not isinstance(scores, dict):
        return 0
    total = 0
    for key in ("accuracy", "completeness", "code_quality", "clarity", "efficiency"):
        try:
            total += int(scores.get(key, 0))
        except (TypeError, ValueError):
            continue  # malformed score contributes nothing
    return total


async def run_judge(db, config) -> None:
    """Judge every task in ``TASKS`` and record one result line per task.

    Tasks for which ``judge_one`` returns ``None`` are skipped.  For the rest,
    the A, B and C score totals and their ratios to C are reported through
    ``record``; the pass flag is the judge's own ``claude_code_equivalent``
    verdict, accepted as a JSON boolean or as the string "true" in any case.

    Args:
        db, config: Not used by this function; kept so the call site in
            ``main`` stays unchanged.
    """
    print("[Q judge] starting (Sonnet via OpenRouter)")
    for qid, task in TASKS.items():
        parsed = await judge_one(qid, task)
        if parsed is None:
            continue
        total_a = _score_total(parsed.get("A"))
        total_b = _score_total(parsed.get("B"))
        total_c = _score_total(parsed.get("C"))
        # Guard against a zero C total (missing or malformed C scores).
        pct_a = (total_a / total_c * 100) if total_c else 0
        pct_b = (total_b / total_c * 100) if total_c else 0
        equiv = parsed.get("claude_code_equivalent", "false")
        passed = equiv is True or str(equiv).strip().lower() == "true"
        record(
            qid,
            passed,
            f"A={total_a} B={total_b} C={total_c} "
            f"A/C={pct_a:.0f}% B/C={pct_b:.0f}% verdict={equiv}",
        )


# ---------------------------------------------------------------------------
# Driver
# ---------------------------------------------------------------------------


async def main(args: argparse.Namespace) -> int:
    """Run the requested benchmark stages and return the process exit code.

    Loads the config, calls the consent / user-directory setup helpers, opens
    the database and loads the personas.  Then runs the collection stage
    (``args.collect_ab``) and/or the judge stage (``args.judge``), in that
    order.

    The database is disposed in a ``finally`` block so it is released even
    when a stage raises.

    Args:
        args: Parsed CLI namespace with boolean ``collect_ab`` and ``judge``.

    Returns:
        Always 0; failures surface as exceptions.
    """
    cfg = load_config()
    record_consent(cfg.data_dir)
    bootstrap_user_dirs(cfg)
    ensure_user_dirs_initialized(cfg)
    db = Database(cfg.database_url)
    try:
        await db.init_schema()
        personas = load_combined_personas(cfg, repo_root() / "docs" / "schemas" / "personas")

        if args.collect_ab:
            print("[Q collect-ab] my-deepagent × {DeepSeek, Haiku} × 6 tasks")
            # The checkpointer is only needed while agent sessions are running.
            async with get_checkpointer_ctx(cfg.database_url) as saver:
                await collect_a_b(db, cfg, personas, saver)

        if args.judge:
            await run_judge(db, cfg)
    finally:
        await db.dispose()
    return 0


if __name__ == "__main__":
    # Script entry point: two independent stage flags, at least one required.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--collect-ab",
        help="run my-deepagent for A and B",
        action="store_true",
    )
    parser.add_argument(
        "--judge",
        help="invoke Sonnet judge over A/B/C",
        action="store_true",
    )
    args = parser.parse_args()
    if not args.collect_ab and not args.judge:
        parser.error("nothing to do — use --collect-ab and/or --judge")
    exit_code = asyncio.run(main(args))
    sys.exit(exit_code)