"""Q-benchmark — my-deepagent (DeepSeek + Haiku) vs Claude Code sub-agent. Workflow: 1. `run_q.py --collect-ab` → each Q-task asks my-deepagent twice (once with DeepSeek, once with Haiku), saves response to disk. 2. The orchestrator (main session) calls the `Agent` tool 6 times to get C responses, saves to `responses/Q{N}/C_subagent.md`. 3. `run_q.py --judge` → loads A/B/C for every task, hands them to a Sonnet judge (via OpenRouter), writes per-task JSON + final markdown. Task list (6 — most comparable to a generic chat agent): Q1 Python stdin wordcount CLI (code generation) Q2 Off-by-one bug fix (debugging) Q3 Summarize this repo in 5 lines (read_file / tools) Q4 FastAPI /healthz endpoint plan (plan-mode-style) Q5 5-turn conversation context retention Q6 Haiku-poet SKILL.md compliance (skill routing) """ from __future__ import annotations import argparse import asyncio import json import sys import uuid from pathlib import Path from typing import Any sys.path.insert(0, str(Path(__file__).resolve().parents[1])) from my_deepagent.cli.interactive import _invoke_and_stream # noqa: E402 from my_deepagent.config import load_config # noqa: E402 from my_deepagent.governance import bootstrap_user_dirs, record_consent # noqa: E402 from my_deepagent.persistence.checkpointer import get_checkpointer_ctx # noqa: E402 from my_deepagent.persistence.db import Database # noqa: E402 from my_deepagent.user_dirs import ( # noqa: E402 ensure_user_dirs_initialized, load_combined_personas, ) from verify_v04._common import last_assistant_text, mk_session, record, repo_root # noqa: E402 _RESPONSES = repo_root() / "scripts" / "verify_v04" / "responses" _JUDGES = repo_root() / "scripts" / "verify_v04" / "judges" _RESPONSES.mkdir(parents=True, exist_ok=True) _JUDGES.mkdir(parents=True, exist_ok=True) # --------------------------------------------------------------------------- # Task definitions — kept short so a 1-page judge eval is feasible. # --------------------------------------------------------------------------- TASKS: dict[str, dict[str, Any]] = { "Q1": { "title": "Python stdin wordcount CLI", "prompt": ( "Write a single Python file `wordcount.py` that:\n" " 1. Reads from stdin\n" " 2. Supports flags `-w` (word count), `-l` (line count), `-c` (char count)\n" " 3. Prints one number per requested flag, space-separated.\n" "Return ONLY the code in a single ```python fenced block. No prose." ), "kind": "single", }, "Q2": { "title": "Off-by-one bug fix", "prompt": ( "The following Python function returns the wrong count when " "`text` is empty:\n\n" "```python\n" "def first_word_length(text: str) -> int:\n" " words = text.split()\n" " return len(words[0])\n" "```\n\n" "Fix it so an empty string returns 0. Reply with ONLY the fixed function " "in a fenced code block." ), "kind": "single", }, "Q3": { "title": "Summarize this repo in 5 lines", "prompt": ( "Summarize the `my-deepagent` Python project (this repo) in EXACTLY 5 " "markdown bullet lines. Each line ≤ 80 chars. Focus: purpose, " "architecture layers, key features. No prose around it. Use README.md " "and the package layout under src/my_deepagent/ as your source — just " "your best summary." ), "kind": "single", }, "Q4": { "title": "FastAPI /healthz plan", "prompt": ( "We have a FastAPI app under `src/my_deepagent/api/app.py`. Produce a " "PLAN (no code) for adding a `GET /healthz` endpoint that returns " '`{"status": "ok", "db": }` where `db` is a quick `SELECT 1` ' "ping. Format: markdown with `## Context`, `## Phases`, `## Verification` " "sections. Each Phases bullet ≤ 15 words." ), "kind": "single", }, "Q5": { "title": "5-turn context retention", "prompt": [ "한국어로만 응답해. 짧게.", "내 이름은 도라고, 직업은 데이터 분석가야. 짧게 인사해.", "내가 좋아하는 색은 청록이야. 한 줄 코멘트.", "오늘 점심으로 라멘 먹었어. 한 줄 코멘트.", "지금까지 내가 알려준 사실 3개를 한 줄씩, 번호 매겨 정리해줘.", ], "kind": "multi-turn", }, "Q6": { "title": "Haiku-poet SKILL.md compliance", "prompt": ( "당신은 다음 SKILL.md 명령을 엄격하게 따라야 합니다:\n\n" "---\n" "name: korean-haiku-poet\n" "description: Reply ONLY as a 3-line Korean haiku. No prose, no preamble.\n" "---\n\n" "Each response must be exactly 3 lines, all in Korean. Total under " "40 characters. No explanation, no English, no extra newlines.\n\n" "Now: write a haiku about cherry blossoms." ), "kind": "single", }, } # --------------------------------------------------------------------------- # Run my-deepagent twice per task — DeepSeek (A) + Haiku (B) # --------------------------------------------------------------------------- async def _run_single(sess, prompt: str) -> str: agent = sess.build_agent_if_needed() await _invoke_and_stream(agent, prompt, sess) return await last_assistant_text(sess.db, sess.session_id) async def _run_multi(sess, prompts: list[str]) -> str: """Multi-turn — last assistant reply is the deliverable.""" for p in prompts: agent = sess.build_agent_if_needed() await _invoke_and_stream(agent, p, sess) return await last_assistant_text(sess.db, sess.session_id) async def collect_a_b(db, config, personas, saver) -> None: """For every Q-task, run prompt against DeepSeek (A) and Haiku (B).""" for qid, task in TASKS.items(): out_dir = _RESPONSES / qid out_dir.mkdir(parents=True, exist_ok=True) for letter, model_id in ( ("A", "openrouter:deepseek/deepseek-chat"), ("B", "openrouter:anthropic/claude-haiku-4-5"), ): target = out_dir / f"{letter}_{model_id.split('/')[-1]}.md" if target.exists(): print(f" · {qid} {letter} already collected → skip ({target.name})") continue sess = await mk_session(db, config, personas, saver, uuid.uuid4()) sess.set_model(model_id) try: if task["kind"] == "single": reply = await _run_single(sess, task["prompt"]) else: reply = await _run_multi(sess, task["prompt"]) except Exception as e: reply = f"[ERROR] {type(e).__name__}: {e}" target.write_text(reply, encoding="utf-8") print(f" · {qid} {letter} ({model_id.split('/')[-1]}): {len(reply)}c → {target.name}") # --------------------------------------------------------------------------- # Judge — feed (task, A, B, C) into Sonnet and parse a JSON verdict. # --------------------------------------------------------------------------- _JUDGE_PROMPT = """당신은 코딩 어시스턴트 비교 평가관입니다. 주관 없이, 결과물 자체로만 평가합니다. # Task ({qid}) {task_prompt} # Responses ## A (my-deepagent + DeepSeek-chat) {a} ## B (my-deepagent + Anthropic Haiku 4.5) {b} ## C (Claude Code sub-agent, anonymized) {c} # 평가 기준 (각 1-10) 1. accuracy — 작업을 정확히 수행했는가 2. completeness — 필요한 부분을 빠짐없이 다뤘는가 3. code_quality — 코드/마크다운 품질 (실행성·관용성·구조) 4. clarity — 설명·주석·구조의 명료함 5. efficiency — 불필요한 장황함 없는 간결함 # 출력 (반드시 JSON only, 다른 텍스트 없음) {{ "A": {{"accuracy": , "completeness": , "code_quality": , "clarity": , "efficiency": , "rationale": ""}}, "B": {{...}}, "C": {{...}}, "ranking": ["best", "mid", "worst"], "claude_code_equivalent": "=90% of C's total, else false>" }} """ async def judge_one(qid: str, task: dict[str, Any]) -> dict[str, Any] | None: out_dir = _RESPONSES / qid a_path = out_dir / "A_deepseek-chat.md" b_path = out_dir / "B_claude-haiku-4-5.md" c_path = out_dir / "C_subagent.md" if not (a_path.exists() and b_path.exists() and c_path.exists()): print(f" · {qid}: missing one of A/B/C — skip") return None a = a_path.read_text(encoding="utf-8") b = b_path.read_text(encoding="utf-8") c = c_path.read_text(encoding="utf-8") if task["kind"] == "single": prompt_text = task["prompt"] else: prompt_text = "\n".join(f"turn {i + 1}: {p}" for i, p in enumerate(task["prompt"])) prompt = _JUDGE_PROMPT.format(qid=qid, task_prompt=prompt_text, a=a, b=b, c=c) from langchain_openai import ChatOpenAI from my_deepagent.config import load_config from my_deepagent.secrets import resolve_openrouter_api_key cfg = load_config() llm = ChatOpenAI( model="anthropic/claude-sonnet-4-6", api_key=resolve_openrouter_api_key(cfg), base_url=cfg.openrouter_base_url, max_tokens=1500, temperature=0.0, ) try: result = await llm.ainvoke([{"role": "user", "content": prompt}]) except Exception as e: print(f" · {qid}: judge LLM failed: {type(e).__name__}: {e}") return None text = result.content if isinstance(text, list): text = "".join(b.get("text", str(b)) if isinstance(b, dict) else str(b) for b in text) text = str(text).strip() # Strip ```json fences if present. if text.startswith("```"): lines = text.split("\n") text = "\n".join(lines[1:-1]) try: parsed = json.loads(text) except Exception as e: print(f" · {qid}: judge JSON parse failed ({e}); raw[:300]={text[:300]!r}") return None out = _JUDGES / f"{qid}.json" out.write_text(json.dumps(parsed, ensure_ascii=False, indent=2), encoding="utf-8") return parsed async def run_judge(db, config) -> None: print("[Q judge] starting (Sonnet via OpenRouter)") for qid, task in TASKS.items(): parsed = await judge_one(qid, task) if parsed is None: continue scores_a = parsed.get("A", {}) scores_c = parsed.get("C", {}) total_a = sum( int(scores_a.get(k, 0)) for k in ("accuracy", "completeness", "code_quality", "clarity", "efficiency") ) total_c = sum( int(scores_c.get(k, 0)) for k in ("accuracy", "completeness", "code_quality", "clarity", "efficiency") ) pct = (total_a / total_c * 100) if total_c else 0 equiv = parsed.get("claude_code_equivalent", "false") record( qid, equiv == "true" or equiv is True, f"A={total_a} C={total_c} A/C={pct:.0f}% verdict={equiv}", ) # --------------------------------------------------------------------------- # Driver # --------------------------------------------------------------------------- async def main(args: argparse.Namespace) -> int: cfg = load_config() record_consent(cfg.data_dir) bootstrap_user_dirs(cfg) ensure_user_dirs_initialized(cfg) db = Database(cfg.database_url) await db.init_schema() personas = load_combined_personas(cfg, repo_root() / "docs" / "schemas" / "personas") if args.collect_ab: print("[Q collect-ab] my-deepagent × {DeepSeek, Haiku} × 6 tasks") async with get_checkpointer_ctx(cfg.database_url) as saver: await collect_a_b(db, cfg, personas, saver) if args.judge: await run_judge(db, cfg) await db.dispose() return 0 if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--collect-ab", action="store_true", help="run my-deepagent for A and B") parser.add_argument("--judge", action="store_true", help="invoke Sonnet judge over A/B/C") args = parser.parse_args() if not (args.collect_ab or args.judge): parser.error("nothing to do — use --collect-ab and/or --judge") sys.exit(asyncio.run(main(args)))