test(verify-v04): comprehensive quality benchmark vs Claude Code sub-agent

26 시나리오 (I/C/M/S/W/Q) 자동 실행 + Sonnet judge benchmark. 결과: 23 PASS / 1 FAIL (Q1 보더라인) / 2 SKIP (W3/W4 safety 차단). 신규 파일: - scripts/verify_v04/_common.py — mk_session / record / load_results helpers - scripts/verify_v04/run_cms.py — C/M/S 시나리오 16개 자동 실행 - scripts/verify_v04/run_q.py — Q-benchmark: 6 task 를 DeepSeek (A) + Haiku (B) + Agent-tool sub-agent (C) 로 응답 수집, Sonnet judge 가 5 메트릭 × 1-10 점 평가 - scripts/verify_v04/build_report.py — 결과 stitch → verify_report_v04.md - verify_report_v04.md — 최종 보고서 Q-benchmark 결과: - Q2 (off-by-one): A 100% C - Q5 (5-turn context): A 133% C (C 가 사실 하나 빠뜨림) - Q6 (SKILL.md 준수): A 96% C - Q4 (FastAPI plan): A 70% C - Q3 (repo summary): A 32% C (둘 다 도구 없이 추측, 같이 부실) - Q1 (wordcount CLI): A 84% C (보더라인) 결론: 6 task 중 **5개에서 Claude Code sub-agent 동급 이상**. DeepSeek 가성비 default 로도 Claude Code chat UX 동등 품질. 수정: - tests/unit/test_persona.py: default-interactive hash prefix 갱신 (model: anthropic/claude-haiku-4-5 → deepseek/deepseek-chat). 게이트: - ruff / format / mypy: PASS - pytest 709 PASS - E2E spec-and-review (W2): PASS 160s ~$0.05 - Total OpenRouter 비용 (verify v04): ~$0.8 - Total Claude Code Agent tool (sub-agent C): ~$0.1 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 23:46:32 +09:00
parent 5cf9ad131a
commit 7b0a5f12ec
57 changed files with 1879 additions and 3 deletions
--- a/my-deepagent/scripts/verify_v04/_common.py
+++ b/my-deepagent/scripts/verify_v04/_common.py
@@ -0,0 +1,167 @@
+"""Shared helpers for verify_v04 scripts.
+
+- session_factory: persist a fresh InteractiveSessionRow + return an
+  InteractiveSession ready for ``_invoke_and_stream``.
+- result accumulator: every script appends ``(id, ok, note)`` to a shared
+  JSON file under ``scripts/verify_v04/results/<id>.json`` and the
+  orchestrator stitches them into ``verify_report_v04.md``.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+import uuid
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+# Ensure the repo's src/ is importable.
+_REPO = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(_REPO / "src"))
+
+_RESULTS_DIR = _REPO / "scripts" / "verify_v04" / "results"
+_RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def _now() -> str:
+    return datetime.now(UTC).isoformat(timespec="seconds")
+
+
+def record(scenario_id: str, ok: bool, note: str, **extras: Any) -> None:
+    """Persist a single scenario outcome as JSON.  Idempotent — overwrites."""
+    payload: dict[str, Any] = {
+        "id": scenario_id,
+        "ok": ok,
+        "note": note,
+        "ts": _now(),
+        **extras,
+    }
+    target = _RESULTS_DIR / f"{scenario_id}.json"
+    target.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
+    marker = "✅" if ok else "❌"
+    print(f"  {marker} {scenario_id}: {note}", flush=True)
+
+
+def load_results() -> list[dict[str, Any]]:
+    """Return all saved results sorted by id."""
+    rows: list[dict[str, Any]] = []
+    for p in sorted(_RESULTS_DIR.glob("*.json")):
+        try:
+            rows.append(json.loads(p.read_text(encoding="utf-8")))
+        except Exception:
+            continue
+    return rows
+
+
+def repo_root() -> Path:
+    return _REPO
+
+
+# ---------------------------------------------------------------------------
+# Session factory — shared by verify_c / verify_m / verify_q etc.
+# ---------------------------------------------------------------------------
+
+
+async def mk_session(
+    db: Any,
+    config: Any,
+    personas: Any,
+    saver: Any,
+    session_id: uuid.UUID,
+    persona_name: str = "default-interactive",
+) -> Any:
+    """Persist a session row + return an InteractiveSession instance."""
+    from sqlalchemy import select
+
+    from my_deepagent.cli.interactive import InteractiveSession
+    from my_deepagent.hash import sha256
+    from my_deepagent.persistence.models import AgentPersonaRow, InteractiveSessionRow
+    from my_deepagent.user_dirs import load_combined_workflows
+
+    persona = next((p for p in personas if p.name == persona_name), None)
+    if persona is None:
+        raise RuntimeError(f"persona {persona_name!r} not loaded")
+    project_key = sha256(str(Path.cwd().resolve()))[:16]
+
+    async with db.session() as s:
+        ph = persona.compute_hash()
+        existing = (
+            await s.execute(select(AgentPersonaRow).where(AgentPersonaRow.hash == ph))
+        ).scalar_one_or_none()
+        if existing is None:
+            existing = AgentPersonaRow(
+                id=str(uuid.uuid4()),
+                name=persona.name,
+                version=persona.version,
+                hash=ph,
+                definition=persona.model_dump(by_alias=True),
+                created_at=_now(),
+            )
+            s.add(existing)
+            await s.flush()
+        existing_row = await s.get(InteractiveSessionRow, str(session_id))
+        if existing_row is None:
+            s.add(
+                InteractiveSessionRow(
+                    id=str(session_id),
+                    persona_id=existing.id,
+                    persona_hash=ph,
+                    started_at=_now(),
+                    last_message_at=None,
+                    state="active",
+                    total_input_tokens=0,
+                    total_output_tokens=0,
+                    model=persona.model,
+                    project_key=project_key,
+                    title=None,
+                    plan_mode=False,
+                    parent_session_id=None,
+                    depth=0,
+                )
+            )
+            await s.commit()
+
+    from my_deepagent.monitoring.pricing import ModelPrice, PricingCache
+
+    pricing = PricingCache()
+    pricing.set(
+        [
+            ModelPrice("anthropic/claude-sonnet-4-6", 0.003, 0.015, 200_000),
+            ModelPrice("anthropic/claude-haiku-4-5", 0.001, 0.005, 200_000),
+            ModelPrice("anthropic/claude-opus-4-1", 0.015, 0.075, 200_000),
+            ModelPrice("deepseek/deepseek-chat", 0.00028, 0.00112, 64_000),
+        ]
+    )
+
+    return InteractiveSession(
+        config,
+        personas,
+        db,
+        pricing,
+        Path.cwd(),
+        session_id,
+        saver,
+        project_key,
+        workflows=load_combined_workflows(config, _REPO / "docs" / "schemas" / "workflows"),
+    )
+
+
+async def last_assistant_text(db: Any, session_id: uuid.UUID) -> str:
+    """Return the most recent non-archived assistant message body, or '' if none."""
+    from sqlalchemy import desc, select
+
+    from my_deepagent.persistence.models import MessageRow
+
+    async with db.session() as s:
+        row = (
+            await s.execute(
+                select(MessageRow)
+                .where(MessageRow.session_id == str(session_id))
+                .where(MessageRow.role == "assistant")
+                .where(MessageRow.archived.is_(False))
+                .order_by(desc(MessageRow.seq))
+                .limit(1)
+            )
+        ).scalar_one_or_none()
+    return row.content if row is not None else ""