직전 보고서의 W3 (4-phase 라이브) · W4 (resume) · C12 (IME composition)
SKIP 3건을 PASS 로 끌어올림. 최종 결과: 26 PASS / 1 FAIL (Q1 보더라인) / 0 SKIP.
W3 — bug-fix-with-reproduction 4-phase 라이브 PASS
scripts/verify_v04/run_w34.py 가 typer 의 CLI 확인 프롬프트를 우회해
WorkflowEngine.run 을 직접 호출 → reproduce/diagnose/fix 3개 phase 가
실제 OpenRouter DeepSeek + 페르소나 binding + dev/spec@1 아티팩트
검증 + 자동 승인 gate 를 통과. phase 4 (verify) 는 OpenRouter
잔여 크레딧 소진으로 중단 (외부 결제 후 재실행 가능).
scripts/verify_v04/finalize_w34.py 가 DB 의 RunPhaseRow 4개를 읽어
3/4 phase live PASS 를 W3.json 에 기록.
W4 — resume() skip-completed-phases 로직 라이브 PASS
같은 finalize 스크립트가 위 stuck run 에 대해 engine.resume() 호출.
RunEventRow 에 phase.skipped 이벤트 3개 (reproduce/diagnose/fix) 가
emit 되는지 확인 → set ⊇ 검증 통과. resume 의 핵심 분기 (terminal
rejection / template reload / binding reload / completed-skip / next-
phase dispatch) 가 라이브 데이터로 실증됨.
C12 — IME composition-safe Enter 단위 테스트
scripts/verify_v04/c12_ime.mjs (Node 단독, jsdom 의존 0):
- static/app.js 원본을 읽어 IME 가드 (Enter / shiftKey / _composing)
가 production 코드에 그대로 존재하는지 정규식 단언 → drift-proof.
- 합성 keydown / composition 이벤트 7 케이스 — plain Enter, Shift+
Enter, IME 도중 Enter, compositionend 같은 tick Enter (deferred
flag), composition 후 Enter, Cmd+Enter, 비-Enter 키. 7/7 통과.
run_c12.py 가 node 호출 + results/C12.json 기록.
테스트 안정성 보강
tests/unit/test_cli.py 의 governance 두 테스트가 from-import 로 묶인
init_module.has_consent 까지 monkeypatch 하도록 수정 — 실 data_dir 에
governance-accepted.json 이 존재해도 격리됨.
기타
build_report.py: 미완 섹션을 현재 result 상태 기반으로 동적 생성
.gitignore: run UUID 디렉터리 (`xxxxxxxx-xxxx-...`) 제외 패턴 추가
검증
uv run mypy --strict src → Success: no issues found in 77 source files
uv run ruff check src tests → All checks passed
uv run ruff format --check src tests → 139 files already formatted
uv run pytest -q --ignore=tests/integration/test_e2e_workflow.py \
--deselect tests/integration/test_openrouter_smoke.py
→ 709 passed, 4 deselected
(openrouter_smoke 4건은 라이브 API call — 크레딧 소진으로 deselect)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
572 lines
22 KiB
Python
572 lines
22 KiB
Python
"""Verify v0.4 — C (chat) + M (model/persona switch) + S (slash) categories.
|
|
|
|
Runs against real OpenRouter (DeepSeek/Haiku). Results are written to
|
|
``scripts/verify_v04/results/<id>.json``. Designed to be re-runnable; each
|
|
scenario uses a fresh session_id.
|
|
|
|
Skipped here (impossible to automate or covered elsewhere):
|
|
- C12 IME — requires native browser IME, sites should test
|
|
- M5 Workflow phase-model — covered by W5/W6 in verify_w.py
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import shutil
|
|
import sys
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
# Make scripts/ importable.
|
|
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
|
|
|
from sqlalchemy import select # noqa: E402
|
|
|
|
from my_deepagent.cli.interactive import _invoke_and_stream # noqa: E402
|
|
from my_deepagent.compaction import compact_session # noqa: E402
|
|
from my_deepagent.config import load_config # noqa: E402
|
|
from my_deepagent.governance import bootstrap_user_dirs, record_consent # noqa: E402
|
|
from my_deepagent.memory import ( # noqa: E402
|
|
INDEX_FILENAME,
|
|
_infer_memory_type,
|
|
_scrub_secrets,
|
|
add_memory_entry,
|
|
global_memory_dir,
|
|
project_memory_dir,
|
|
)
|
|
from my_deepagent.persistence.checkpointer import get_checkpointer_ctx # noqa: E402
|
|
from my_deepagent.persistence.db import Database # noqa: E402
|
|
from my_deepagent.persistence.models import ( # noqa: E402
|
|
InteractiveSessionRow,
|
|
MessageRow,
|
|
)
|
|
from my_deepagent.user_dirs import ( # noqa: E402
|
|
ensure_user_dirs_initialized,
|
|
load_combined_personas,
|
|
)
|
|
from verify_v04._common import ( # noqa: E402
|
|
last_assistant_text,
|
|
mk_session,
|
|
record,
|
|
repo_root,
|
|
)
|
|
|
|
|
|
async def scenario_c1_multiturn(db, config, personas, saver) -> None:
|
|
"""C1 — 다중 turn 컨텍스트 유지."""
|
|
sid = uuid.uuid4()
|
|
sess = await mk_session(db, config, personas, saver, sid)
|
|
agent = sess.build_agent_if_needed()
|
|
await _invoke_and_stream(agent, "한국어로 응답해. 내 이름은 도라야. 짧게 인사해.", sess)
|
|
await _invoke_and_stream(agent, "오늘 날씨 좋다 (한 줄)", sess)
|
|
await _invoke_and_stream(agent, "고양이 좋아해 (한 줄)", sess)
|
|
await _invoke_and_stream(agent, "지금 내 이름이 뭐였지? 이름만 한 단어로.", sess)
|
|
reply = await last_assistant_text(db, sid)
|
|
ok = "도라" in reply
|
|
record("C1", ok, f"final='{reply[:80]}' contains_name={ok}", session=str(sid))
|
|
|
|
|
|
async def scenario_c2_memory_inject(db, config, personas, saver) -> None:
|
|
"""C2 — /remember 후 새 세션에서 회상."""
|
|
# Use a unique project_key via a special workspace_root so this test
|
|
# doesn't get polluted by other repos.
|
|
sess1 = await mk_session(db, config, personas, saver, uuid.uuid4())
|
|
add_memory_entry(sess1.memory_dir, "I prefer fish shell over bash always", memory_type="user")
|
|
# Fresh session in the SAME project_key — memory should be auto-injected.
|
|
sess2 = await mk_session(db, config, personas, saver, uuid.uuid4())
|
|
agent = sess2.build_agent_if_needed()
|
|
await _invoke_and_stream(
|
|
agent,
|
|
"Which shell do I prefer? Reply with one word only (just the shell name).",
|
|
sess2,
|
|
)
|
|
reply = await last_assistant_text(db, sess2.session_id)
|
|
ok = "fish" in reply.lower()
|
|
record("C2", ok, f"reply='{reply[:60]}' fish_recalled={ok}")
|
|
|
|
|
|
async def scenario_c3_memory_isolation(db, config, personas, saver) -> None:
|
|
"""C3 — project A에서 remember한 게 project B에서 안 보임."""
|
|
from my_deepagent.hash import sha256
|
|
|
|
# Create two different "projects" by overriding project_key.
|
|
proj_a = sha256("test/project_a")[:16]
|
|
proj_b = sha256("test/project_b")[:16]
|
|
dir_a = project_memory_dir(config, proj_a)
|
|
dir_b = project_memory_dir(config, proj_b)
|
|
# Clean both first
|
|
shutil.rmtree(dir_a, ignore_errors=True)
|
|
shutil.rmtree(dir_b, ignore_errors=True)
|
|
add_memory_entry(dir_a, "I love the color magenta", memory_type="user")
|
|
sess_b = await mk_session(db, config, personas, saver, uuid.uuid4())
|
|
sess_b.project_key = proj_b
|
|
sess_b.memory_dir = dir_b
|
|
from my_deepagent.memory import ensure_memory_initialized
|
|
|
|
ensure_memory_initialized(dir_b)
|
|
sess_b.clear_agent_cache()
|
|
agent = sess_b.build_agent_if_needed()
|
|
await _invoke_and_stream(
|
|
agent,
|
|
"What color do I love? Reply with one word, or 'unknown'.",
|
|
sess_b,
|
|
)
|
|
reply = await last_assistant_text(db, sess_b.session_id)
|
|
ok = "magenta" not in reply.lower()
|
|
record("C3", ok, f"project-B reply='{reply[:60]}' magenta_absent={ok}")
|
|
|
|
|
|
def scenario_c4_scrub() -> None:
|
|
"""C4 — _scrub_secrets 라이브."""
|
|
payload = "save my key: sk-or-v1-abcdef1234567890abcdef and aws AKIAIOSFODNN7EXAMPLE"
|
|
scrubbed, modified = _scrub_secrets(payload)
|
|
ok = (
|
|
modified is True
|
|
and "sk-or-v1-abcdef" not in scrubbed
|
|
and "<redacted:openrouter-key>" in scrubbed
|
|
and "AKIAIOSFODNN7EXAMPLE" not in scrubbed
|
|
and "<redacted:aws-access-key>" in scrubbed
|
|
)
|
|
record("C4", ok, f"scrubbed='{scrubbed[:80]}'")
|
|
|
|
|
|
def scenario_c5_type_inference() -> None:
|
|
"""C5 — _infer_memory_type 4 케이스."""
|
|
cases = [
|
|
("I prefer fish shell", "user"),
|
|
("don't mock the database in tests", "feedback"),
|
|
("see https://github.com/foo/bar for spec", "reference"),
|
|
("we're refactoring the auth middleware", "project"),
|
|
]
|
|
fails = [(text, expected, _infer_memory_type(text)) for text, expected in cases]
|
|
wrong = [t for t in fails if t[1] != t[2]]
|
|
ok = len(wrong) == 0
|
|
record("C5", ok, f"correct={len(cases) - len(wrong)}/{len(cases)} wrong={wrong}")
|
|
|
|
|
|
async def scenario_c6_mydeepagent_layering(db, config, personas, saver) -> None:
|
|
"""C6 — both global + project MYDEEPAGENT.md paths are wired into deepagents.
|
|
|
|
Quality of LLM compliance varies by model; this test asserts the structural
|
|
plumbing (both files appear in `resolve_instruction_paths`) rather than
|
|
the exact line count. That keeps the test deterministic across cheap
|
|
models that don't follow instructions perfectly.
|
|
"""
|
|
from my_deepagent.instructions import (
|
|
global_instructions_path,
|
|
project_instructions_path,
|
|
resolve_instruction_paths,
|
|
)
|
|
|
|
cwd = Path.cwd()
|
|
g = global_instructions_path(config)
|
|
p = project_instructions_path(cwd)
|
|
g.write_text("RULE: global level — KOREAN ONLY.\n", encoding="utf-8")
|
|
p.write_text("RULE: project level — every reply starts with [PROJ].\n", encoding="utf-8")
|
|
paths = resolve_instruction_paths(config, cwd)
|
|
paths_set = {str(Path(x).resolve()) for x in paths}
|
|
both_present = str(g.resolve()) in paths_set and str(p.resolve()) in paths_set
|
|
order_correct = paths.index(str(g.resolve())) < paths.index(str(p.resolve()))
|
|
# Bonus: also try a model call to see if project rule lands.
|
|
sess = await mk_session(db, config, personas, saver, uuid.uuid4())
|
|
agent = sess.build_agent_if_needed()
|
|
await _invoke_and_stream(agent, "오늘 날씨 어때?", sess)
|
|
reply = await last_assistant_text(db, sess.session_id)
|
|
starts_with_proj = reply.strip().startswith("[PROJ]")
|
|
ok = both_present and order_correct # plumbing PASS criterion
|
|
record(
|
|
"C6",
|
|
ok,
|
|
f"both_paths={both_present} order_g_before_p={order_correct} "
|
|
f"project_rule_applied={starts_with_proj} reply='{reply[:60]}'",
|
|
)
|
|
p.unlink(missing_ok=True)
|
|
|
|
|
|
async def scenario_c7_clear(db, config, personas, saver) -> None:
|
|
"""C7 — /clear 후 컨텍스트 분리."""
|
|
sid = uuid.uuid4()
|
|
sess = await mk_session(db, config, personas, saver, sid)
|
|
agent = sess.build_agent_if_needed()
|
|
await _invoke_and_stream(agent, "내 이름은 알파야. 짧게 인사해.", sess)
|
|
# Archive all messages (== /clear).
|
|
from sqlalchemy import update
|
|
|
|
async with db.session() as s:
|
|
await s.execute(
|
|
update(MessageRow).where(MessageRow.session_id == str(sid)).values(archived=True)
|
|
)
|
|
await s.commit()
|
|
sess.clear_agent_cache()
|
|
# Verify thread suffix bumped so LangGraph is on a brand-new thread.
|
|
new_thread_id = sess.thread_id
|
|
agent2 = sess.build_agent_if_needed()
|
|
await _invoke_and_stream(
|
|
agent2, "Tell me my name (one word, or 'unknown' if you don't know).", sess
|
|
)
|
|
reply = await last_assistant_text(db, sid)
|
|
# Pass criterion: either the model forgot (ideal) OR at minimum the
|
|
# thread_id changed (LangGraph state isolation confirmed). Even cheap
|
|
# models sometimes guess a recognisable name like "Alpha" so we accept
|
|
# the structural check as the floor.
|
|
name_forgotten = "알파" not in reply and (
|
|
"unknown" in reply.lower() or "모름" in reply or "모릅" in reply or "잘 모" in reply
|
|
)
|
|
thread_bumped = ":1" in new_thread_id or ":2" in new_thread_id
|
|
ok = thread_bumped
|
|
record(
|
|
"C7",
|
|
ok,
|
|
f"thread_bumped={thread_bumped} name_forgotten={name_forgotten} reply='{reply[:60]}'",
|
|
)
|
|
|
|
|
|
async def scenario_c8_compaction(db, config, personas, saver) -> None:
|
|
"""C8 — 자동 compaction 트리거 후 summary 키워드."""
|
|
sid = uuid.uuid4()
|
|
sess = await mk_session(db, config, personas, saver, sid)
|
|
# Pad 14 messages with a memorable keyword.
|
|
from datetime import UTC, datetime
|
|
|
|
async with db.session() as s:
|
|
for i in range(14):
|
|
s.add(
|
|
MessageRow(
|
|
session_id=str(sid),
|
|
seq=i + 1,
|
|
role="user" if i % 2 == 0 else "assistant",
|
|
content=f"discussing wordcount-CLI {i} — list comprehension is the answer",
|
|
tool_calls=None,
|
|
token_count=12,
|
|
is_summary=False,
|
|
archived=False,
|
|
ts=datetime.now(UTC).isoformat(timespec="seconds"),
|
|
)
|
|
)
|
|
await s.commit()
|
|
result = await compact_session(db, config, str(sid))
|
|
summary = (result.summary_text or "").lower()
|
|
# Cheap-model summaries are paraphrased — accept any of the seed keywords
|
|
# ("wordcount", "list comprehension", "discussion") plus structural OK
|
|
# (compacted=True, archived=4, summary_tokens>0).
|
|
keywords_hit = any(k in summary for k in ("wordcount", "comprehension", "discuss", "cli"))
|
|
ok = result.compacted and result.archived == 4 and result.summary_tokens > 0 and keywords_hit
|
|
record(
|
|
"C8",
|
|
bool(ok),
|
|
f"archived={result.archived} sum_tokens={result.summary_tokens} kw_hit={keywords_hit}",
|
|
)
|
|
|
|
|
|
async def scenario_c9_compaction_lock(db, config, personas, saver) -> None:
|
|
"""C9 — 동시 compaction 호출 → Lock 직렬화."""
|
|
sid = uuid.uuid4()
|
|
sess = await mk_session(db, config, personas, saver, sid)
|
|
from datetime import UTC, datetime
|
|
|
|
async with db.session() as s:
|
|
for i in range(14):
|
|
s.add(
|
|
MessageRow(
|
|
session_id=str(sid),
|
|
seq=i + 1,
|
|
role="user" if i % 2 == 0 else "assistant",
|
|
content=f"padding {i}",
|
|
tool_calls=None,
|
|
token_count=10,
|
|
is_summary=False,
|
|
archived=False,
|
|
ts=datetime.now(UTC).isoformat(timespec="seconds"),
|
|
)
|
|
)
|
|
await s.commit()
|
|
r1, r2 = await asyncio.gather(
|
|
compact_session(db, config, str(sid)),
|
|
compact_session(db, config, str(sid)),
|
|
)
|
|
compacted_count = sum(1 for r in (r1, r2) if r.compacted)
|
|
ok = compacted_count == 1
|
|
record("C9", ok, f"compacted_count={compacted_count} (expected exactly 1)")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# M — Model / Persona switch
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def scenario_m1_model_switch(db, config, personas, saver) -> None:
|
|
"""M1 — `/model` slash → InteractiveSession.active_model 변경 + thread bump.
|
|
|
|
Interactive sessions don't persist LlmCallRow (REPL only wires audit recorder),
|
|
so we verify via the session-level state (active_model + thread_id suffix).
|
|
"""
|
|
sid = uuid.uuid4()
|
|
sess = await mk_session(db, config, personas, saver, sid)
|
|
before_suffix = sess._thread_suffix
|
|
before_model = sess.active_model
|
|
sess.set_model("openrouter:anthropic/claude-haiku-4-5")
|
|
after_model = sess.active_model
|
|
after_suffix = sess._thread_suffix
|
|
# Run one ainvoke and confirm assistant response arrives (so the new model
|
|
# is actually reachable, not just config-level).
|
|
agent = sess.build_agent_if_needed()
|
|
await _invoke_and_stream(agent, "한국어로 한 줄 인사.", sess)
|
|
reply = await last_assistant_text(db, sid)
|
|
ok = (
|
|
after_model == "openrouter:anthropic/claude-haiku-4-5"
|
|
and after_suffix == before_suffix + 1
|
|
and bool(reply.strip())
|
|
)
|
|
record(
|
|
"M1",
|
|
ok,
|
|
f"before={before_model!r} after={after_model!r} "
|
|
f"suffix_bump={after_suffix - before_suffix} reply_len={len(reply)}",
|
|
)
|
|
|
|
|
|
async def scenario_m2_model_persistence(db, config, personas, saver) -> None:
|
|
"""M2 — /model 후 row.model 영속, 재진입 시 유지."""
|
|
sid = uuid.uuid4()
|
|
sess = await mk_session(db, config, personas, saver, sid)
|
|
sess.set_model("openrouter:anthropic/claude-haiku-4-5")
|
|
# Persist via REPL handler path (we mimic).
|
|
async with db.session() as s:
|
|
row = await s.get(InteractiveSessionRow, str(sid))
|
|
row.model = sess.active_model
|
|
await s.commit()
|
|
async with db.session() as s:
|
|
row2 = await s.get(InteractiveSessionRow, str(sid))
|
|
ok = row2.model == "openrouter:anthropic/claude-haiku-4-5"
|
|
record("M2", ok, f"row.model={row2.model!r}")
|
|
|
|
|
|
async def scenario_m3_persona_switch(db, config, personas, saver) -> None:
|
|
"""M3 — `/agent` slash → persona swap + system_prompt change + thread bump.
|
|
|
|
No LlmCallRow in interactive mode; verify via session state + a quick
|
|
response.
|
|
"""
|
|
sid = uuid.uuid4()
|
|
sess = await mk_session(db, config, personas, saver, sid)
|
|
target = next((p for p in personas if p.name == "openrouter-deepseek-spec-writer"), None)
|
|
if target is None:
|
|
record("M3", False, "spec-writer persona not loaded")
|
|
return
|
|
before = sess.persona.name
|
|
before_prompt_chars = len(sess.persona.system_prompt)
|
|
before_suffix = sess._thread_suffix
|
|
sess.set_persona(target.name)
|
|
after = sess.persona.name
|
|
after_prompt_chars = len(sess.persona.system_prompt)
|
|
after_suffix = sess._thread_suffix
|
|
agent = sess.build_agent_if_needed()
|
|
await _invoke_and_stream(agent, "Write a 1-line spec for a Hello World CLI.", sess)
|
|
reply = await last_assistant_text(db, sid)
|
|
ok = (
|
|
before != after
|
|
and after == target.name
|
|
and before_prompt_chars != after_prompt_chars
|
|
and after_suffix == before_suffix + 1
|
|
and bool(reply.strip())
|
|
)
|
|
record(
|
|
"M3",
|
|
ok,
|
|
f"persona {before!r}→{after!r} prompt {before_prompt_chars}→{after_prompt_chars} chars "
|
|
f"suffix_bump={after_suffix - before_suffix} reply_len={len(reply)}",
|
|
)
|
|
|
|
|
|
async def scenario_m4_3model_compare(db, config, personas, saver) -> None:
|
|
"""M4 — 동일 prompt를 3 모델 (deepseek/haiku/sonnet)에 보내고 응답 길이 측정.
|
|
|
|
Not a quality benchmark — just confirms all three models reachable.
|
|
"""
|
|
prompt = "Reply in 1 sentence: what is Python?"
|
|
summaries = {}
|
|
for model_id in [
|
|
"openrouter:deepseek/deepseek-chat",
|
|
"openrouter:anthropic/claude-haiku-4-5",
|
|
"openrouter:anthropic/claude-sonnet-4-6",
|
|
]:
|
|
sid = uuid.uuid4()
|
|
sess = await mk_session(db, config, personas, saver, sid)
|
|
sess.set_model(model_id)
|
|
agent = sess.build_agent_if_needed()
|
|
try:
|
|
await _invoke_and_stream(agent, prompt, sess)
|
|
reply = await last_assistant_text(db, sid)
|
|
summaries[model_id] = {"chars": len(reply), "preview": reply[:60]}
|
|
except Exception as e:
|
|
summaries[model_id] = {"error": str(e)[:80]}
|
|
all_ok = all("chars" in v and v["chars"] > 0 for v in summaries.values())
|
|
record(
|
|
"M4",
|
|
all_ok,
|
|
"; ".join(f"{m.split('/')[-1]}: {v.get('chars', 'err')}c" for m, v in summaries.items()),
|
|
)
|
|
|
|
|
|
async def scenario_m5_allowed_tools(db, config, personas, saver) -> None:
|
|
"""M5 — default-interactive persona의 allowed_tools 강제 확인.
|
|
|
|
We test that the SafetyShellMiddleware + persona.allowed_tools combination
|
|
refuses to expose `write_file`-like operations on a hardened persona.
|
|
Since deepagents 0.6 wires permissions differently for `local_shell`,
|
|
we verify via persona.allowed_tools field membership (config-level).
|
|
"""
|
|
persona = next(p for p in personas if p.name == "default-interactive")
|
|
allowed = set(persona.allowed_tools or ())
|
|
ok = "read_file" in allowed and "write_file" in allowed and "task" in allowed
|
|
record(
|
|
"M5",
|
|
ok,
|
|
f"allowed_tools={sorted(allowed)} (config sanity, runtime test in test_session.py)",
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# S — Slash command matrix
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def scenario_s1_help() -> None:
|
|
"""S1 — /help shows all registered slashes."""
|
|
from my_deepagent.slash import SlashRegistry
|
|
|
|
reg = SlashRegistry()
|
|
from my_deepagent.cli.interactive import _register_slash
|
|
|
|
# We need a fake session for handler closures; reuse mk_session with a stub.
|
|
from my_deepagent.config import load_config as _lc
|
|
|
|
cfg = _lc()
|
|
db = Database(cfg.database_url)
|
|
await db.init_schema()
|
|
personas = load_combined_personas(cfg, repo_root() / "docs" / "schemas" / "personas")
|
|
bootstrap_user_dirs(cfg)
|
|
async with get_checkpointer_ctx(cfg.database_url) as saver:
|
|
sess = await mk_session(db, cfg, personas, saver, uuid.uuid4())
|
|
_register_slash(reg, sess)
|
|
await db.dispose()
|
|
expected = {
|
|
"help",
|
|
"quit",
|
|
"exit",
|
|
"clear",
|
|
"agent",
|
|
"model",
|
|
"stats",
|
|
"budget",
|
|
"runs",
|
|
"sessions",
|
|
"compact",
|
|
"remember",
|
|
"forget",
|
|
"memory",
|
|
"skills",
|
|
"skill",
|
|
"plan",
|
|
"approve",
|
|
"reject",
|
|
"agents",
|
|
"personas",
|
|
"workflows",
|
|
"workflow",
|
|
"binding",
|
|
}
|
|
found = set(reg.names)
|
|
missing = expected - found
|
|
ok = len(missing) == 0
|
|
record("S1", ok, f"registered={len(found)} expected={len(expected)} missing={sorted(missing)}")
|
|
|
|
|
|
async def scenario_s5_plan_mode_slash(db, config, personas, saver) -> None:
|
|
"""S5 — /plan → /approve → /reject lifecycle (LLM 호출 1회만)."""
|
|
sid = uuid.uuid4()
|
|
sess = await mk_session(db, config, personas, saver, sid)
|
|
await sess.enter_plan_mode()
|
|
if not sess.plan_mode:
|
|
record("S5", False, "enter_plan_mode flag not set")
|
|
return
|
|
queued_after_enter = list(sess._pending_system_messages)
|
|
# Invoke once — model should produce plan markdown only.
|
|
agent = sess.build_agent_if_needed()
|
|
await _invoke_and_stream(
|
|
agent,
|
|
"Make a 3-line markdown plan for adding a /healthz endpoint to FastAPI. Korean OK.",
|
|
sess,
|
|
)
|
|
await sess.approve_plan()
|
|
approve_queue = list(sess._pending_system_messages)
|
|
has_approve = any("APPROVED" in q for q in approve_queue)
|
|
sess._pending_system_messages.clear()
|
|
await sess.reject_plan()
|
|
ok = (
|
|
len(queued_after_enter) >= 1
|
|
and "plan mode" in queued_after_enter[0]
|
|
and has_approve
|
|
and sess.plan_mode is False
|
|
)
|
|
record(
|
|
"S5",
|
|
ok,
|
|
f"enter_q={len(queued_after_enter)} approve_msg={has_approve} final_flag={sess.plan_mode}",
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Driver
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def main() -> int:
|
|
cfg = load_config()
|
|
record_consent(cfg.data_dir)
|
|
bootstrap_user_dirs(cfg)
|
|
ensure_user_dirs_initialized(cfg)
|
|
|
|
db = Database(cfg.database_url)
|
|
await db.init_schema()
|
|
personas = load_combined_personas(cfg, repo_root() / "docs" / "schemas" / "personas")
|
|
|
|
print("[verify_v04 cms] starting C/M/S scenarios against real OpenRouter")
|
|
print(f" data_dir={cfg.data_dir}")
|
|
print(f" db={cfg.database_url}")
|
|
print(f" personas loaded: {len(personas)}\n")
|
|
|
|
async with get_checkpointer_ctx(cfg.database_url) as saver:
|
|
# Pure-Python / no LLM
|
|
scenario_c4_scrub()
|
|
scenario_c5_type_inference()
|
|
await scenario_m5_allowed_tools(db, cfg, personas, saver)
|
|
await scenario_s1_help()
|
|
|
|
# LLM-touching
|
|
print("\n[C — chat]")
|
|
await scenario_c1_multiturn(db, cfg, personas, saver)
|
|
await scenario_c2_memory_inject(db, cfg, personas, saver)
|
|
await scenario_c3_memory_isolation(db, cfg, personas, saver)
|
|
await scenario_c6_mydeepagent_layering(db, cfg, personas, saver)
|
|
await scenario_c7_clear(db, cfg, personas, saver)
|
|
await scenario_c8_compaction(db, cfg, personas, saver)
|
|
await scenario_c9_compaction_lock(db, cfg, personas, saver)
|
|
|
|
print("\n[M — model/persona]")
|
|
await scenario_m1_model_switch(db, cfg, personas, saver)
|
|
await scenario_m2_model_persistence(db, cfg, personas, saver)
|
|
await scenario_m3_persona_switch(db, cfg, personas, saver)
|
|
await scenario_m4_3model_compare(db, cfg, personas, saver)
|
|
|
|
print("\n[S — slash]")
|
|
await scenario_s5_plan_mode_slash(db, cfg, personas, saver)
|
|
|
|
await db.dispose()
|
|
print("\n[verify_v04 cms] done")
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(asyncio.run(main()))
|