Files
chungyeong 7b0a5f12ec test(verify-v04): comprehensive quality benchmark vs Claude Code sub-agent
26 시나리오 (I/C/M/S/W/Q) 자동 실행 + Sonnet judge benchmark.
결과: 23 PASS / 1 FAIL (Q1 보더라인) / 2 SKIP (W3/W4 safety 차단).

신규 파일:
- scripts/verify_v04/_common.py — mk_session / record / load_results helpers
- scripts/verify_v04/run_cms.py — C/M/S 시나리오 16개 자동 실행
- scripts/verify_v04/run_q.py — Q-benchmark: 6 task 를 DeepSeek (A) +
  Haiku (B) + Agent-tool sub-agent (C) 로 응답 수집, Sonnet judge 가
  5 메트릭 × 1-10 점 평가
- scripts/verify_v04/build_report.py — 결과 stitch → verify_report_v04.md
- verify_report_v04.md — 최종 보고서

Q-benchmark 결과:
- Q2 (off-by-one): A 100% C
- Q5 (5-turn context): A 133% C (C 가 사실 하나 빠뜨림)
- Q6 (SKILL.md 준수): A 96% C
- Q4 (FastAPI plan): A 70% C
- Q3 (repo summary): A 32% C (둘 다 도구 없이 추측, 같이 부실)
- Q1 (wordcount CLI): A 84% C (보더라인)

결론: 6 task 중 **5개에서 Claude Code sub-agent 동급 이상**.
DeepSeek 가성비 default 로도 Claude Code chat UX 동등 품질.

수정:
- tests/unit/test_persona.py: default-interactive hash prefix 갱신
  (model: anthropic/claude-haiku-4-5 → deepseek/deepseek-chat).

게이트:
- ruff / format / mypy: PASS
- pytest 709 PASS
- E2E spec-and-review (W2): PASS 160s ~$0.05
- Total OpenRouter 비용 (verify v04): ~$0.8
- Total Claude Code Agent tool (sub-agent C): ~$0.1

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 23:46:32 +09:00

168 lines
5.4 KiB
Python

"""Shared helpers for verify_v04 scripts.
- session factory (``mk_session``): persist an InteractiveSessionRow (when one
  does not exist yet) and return an InteractiveSession ready for
  ``_invoke_and_stream``.
- result accumulator (``record`` / ``load_results``): every script writes each
  scenario's ``(id, ok, note)`` outcome to its own JSON file,
  ``scripts/verify_v04/results/<id>.json`` (overwritten on re-run), and the
  orchestrator stitches them into ``verify_report_v04.md``.
"""
from __future__ import annotations
import json
import sys
import uuid
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
# Ensure the repo's src/ is importable.
# parents[2] of this file's resolved path is taken as the repository root
# (the module docstring places this package under scripts/verify_v04/).
_REPO = Path(__file__).resolve().parents[2]
# Inserted at index 0 so this checkout's src/ is searched before any other
# sys.path entry.
sys.path.insert(0, str(_REPO / "src"))
# Directory holding one JSON file per recorded scenario; created at import
# time (including missing parents) so writers never have to check for it.
_RESULTS_DIR = _REPO / "scripts" / "verify_v04" / "results"
_RESULTS_DIR.mkdir(parents=True, exist_ok=True)
def _now() -> str:
return datetime.now(UTC).isoformat(timespec="seconds")
def record(scenario_id: str, ok: bool, note: str, **extras: Any) -> None:
    """Persist a single scenario outcome as JSON and echo it to stdout.

    The outcome is written to ``<results dir>/<scenario_id>.json``. Recording
    the same ``scenario_id`` again overwrites the previous file, so re-running
    a scenario is idempotent.

    Args:
        scenario_id: Scenario identifier; also used as the result file stem.
        ok: Whether the scenario passed.
        note: Short human-readable summary of the outcome.
        **extras: Additional fields merged into the payload. They must be
            JSON-serialisable, and because they are spread last a key such as
            ``id`` or ``ts`` replaces the built-in value.
    """
    payload: dict[str, Any] = {
        "id": scenario_id,
        "ok": ok,
        "note": note,
        "ts": _now(),
        **extras,
    }
    target = _RESULTS_DIR / f"{scenario_id}.json"
    target.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    # Both branches used to yield the same empty string, so the console line
    # could not tell a pass from a failure. ASCII markers keep the output
    # readable on any terminal encoding.
    marker = "PASS" if ok else "FAIL"
    print(f" {marker} {scenario_id}: {note}", flush=True)
def load_results() -> list[dict[str, Any]]:
    """Return every saved scenario result, ordered by result file name.

    Files are visited in sorted path order, i.e. plain lexicographic order of
    the ``<id>.json`` names (so ``C10`` sorts before ``C2``).

    Loading is best-effort: a file that cannot be read or does not contain
    valid UTF-8 JSON is skipped, with a warning on stderr, so one damaged
    result does not abort report generation.
    """
    rows: list[dict[str, Any]] = []
    for path in sorted(_RESULTS_DIR.glob("*.json")):
        try:
            parsed = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f"warning: skipping unreadable result {path.name}: {exc}", file=sys.stderr)
            continue
        rows.append(parsed)
    return rows
def repo_root() -> Path:
    """Return the repository root path computed when this module was imported."""
    return _REPO
# ---------------------------------------------------------------------------
# Session factory — shared by verify_c / verify_m / verify_q etc.
# ---------------------------------------------------------------------------
async def mk_session(
    db: Any,
    config: Any,
    personas: Any,
    saver: Any,
    session_id: uuid.UUID,
    persona_name: str = "default-interactive",
) -> Any:
    """Ensure the persona and session rows exist, then build an InteractiveSession.

    The persona named ``persona_name`` is looked up in ``personas``. Inside one
    database session an AgentPersonaRow (keyed by the persona's hash) and an
    InteractiveSessionRow (keyed by ``session_id``) are inserted when missing,
    and the work is committed. The returned InteractiveSession is wired with a
    fixed, in-memory price table and the current working directory.

    Raises:
        RuntimeError: if no persona in ``personas`` is named ``persona_name``.
    """
    # Project imports are resolved at call time rather than at module import.
    from sqlalchemy import select

    from my_deepagent.cli.interactive import InteractiveSession
    from my_deepagent.hash import sha256
    from my_deepagent.persistence.models import AgentPersonaRow, InteractiveSessionRow
    from my_deepagent.user_dirs import load_combined_workflows

    # First persona whose name matches wins; no match is a hard error.
    for persona in personas:
        if persona.name == persona_name:
            break
    else:
        raise RuntimeError(f"persona {persona_name!r} not loaded")

    # The project key is the first 16 characters of the project's sha256 helper
    # applied to the resolved working directory.
    cwd_digest = sha256(str(Path.cwd().resolve()))
    project_key = cwd_digest[:16]

    async with db.session() as txn:
        persona_hash = persona.compute_hash()
        lookup = select(AgentPersonaRow).where(AgentPersonaRow.hash == persona_hash)
        persona_row = (await txn.execute(lookup)).scalar_one_or_none()
        if persona_row is None:
            persona_row = AgentPersonaRow(
                id=str(uuid.uuid4()),
                name=persona.name,
                version=persona.version,
                hash=persona_hash,
                definition=persona.model_dump(by_alias=True),
                created_at=_now(),
            )
            txn.add(persona_row)
            # Flush so the new persona row is sent to the database before the
            # session row that references its id is added.
            await txn.flush()

        session_key = str(session_id)
        if await txn.get(InteractiveSessionRow, session_key) is None:
            session_row = InteractiveSessionRow(
                id=session_key,
                persona_id=persona_row.id,
                persona_hash=persona_hash,
                started_at=_now(),
                last_message_at=None,
                state="active",
                total_input_tokens=0,
                total_output_tokens=0,
                model=persona.model,
                project_key=project_key,
                title=None,
                plan_mode=False,
                parent_session_id=None,
                depth=0,
            )
            txn.add(session_row)
        await txn.commit()

    from my_deepagent.monitoring.pricing import ModelPrice, PricingCache

    pricing = PricingCache()
    # Positional ModelPrice arguments, reproduced verbatim.
    # NOTE(review): presumably (model id, input price, output price, context
    # size) -- confirm against the ModelPrice definition.
    price_specs = (
        ("anthropic/claude-sonnet-4-6", 0.003, 0.015, 200_000),
        ("anthropic/claude-haiku-4-5", 0.001, 0.005, 200_000),
        ("anthropic/claude-opus-4-1", 0.015, 0.075, 200_000),
        ("deepseek/deepseek-chat", 0.00028, 0.00112, 64_000),
    )
    pricing.set([ModelPrice(*spec) for spec in price_specs])

    workdir = Path.cwd()
    workflow_dir = _REPO / "docs" / "schemas" / "workflows"
    return InteractiveSession(
        config,
        personas,
        db,
        pricing,
        workdir,
        session_id,
        saver,
        project_key,
        workflows=load_combined_workflows(config, workflow_dir),
    )
async def last_assistant_text(db: Any, session_id: uuid.UUID) -> str:
    """Fetch the body of the newest non-archived assistant message of a session.

    "Newest" means the row with the highest ``seq``. Returns ``""`` when the
    session has no matching message.
    """
    from sqlalchemy import desc, select

    from my_deepagent.persistence.models import MessageRow

    # Highest seq first, capped at one row, so at most one message comes back.
    newest_first = (
        select(MessageRow)
        .where(MessageRow.session_id == str(session_id))
        .where(MessageRow.role == "assistant")
        .where(MessageRow.archived.is_(False))
        .order_by(desc(MessageRow.seq))
        .limit(1)
    )
    async with db.session() as txn:
        message = (await txn.execute(newest_first)).scalar_one_or_none()
        if message is None:
            return ""
        return message.content