dev-puppeteer/my-deepagent/scripts/verify_v04/run_w34.py

"""W3 / W4 live verify — call WorkflowEngine.run directly (skip CLI confirm).

W3: bug-fix-with-reproduction 4-phase against /tmp/w3-test-repo.
W4: kick off again, cancel mid-phase, resume — final state=completed.
"""

from __future__ import annotations

import asyncio
import shutil
import subprocess
import sys
import uuid
from pathlib import Path
from typing import Any

sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from my_deepagent.artifact_schema import ArtifactSchemaRegistry  # noqa: E402
from my_deepagent.binding import BackendAvailability, PersonaConsentStore  # noqa: E402
from my_deepagent.budget import make_budget_tracker_from_config  # noqa: E402
from my_deepagent.config import load_config  # noqa: E402
from my_deepagent.engine import WorkflowEngine  # noqa: E402
from my_deepagent.enums import Backend  # noqa: E402
from my_deepagent.governance import bootstrap_user_dirs, record_consent  # noqa: E402
from my_deepagent.persistence.db import Database  # noqa: E402
from my_deepagent.persistence.models import RunRow  # noqa: E402
from my_deepagent.enums import ApprovalDecisionAction  # noqa: E402
from my_deepagent.user_dirs import load_combined_personas  # noqa: E402
from my_deepagent.workflow import load_workflow_yaml  # noqa: E402
from verify_v04._common import record, repo_root  # noqa: E402

_TEST_REPO = Path("/tmp/w3-test-repo")


async def _auto_approve(
    payload: dict[str, object],
    gates: list[str],
) -> ApprovalDecisionAction:
    """Non-interactive auto-approve callback for verify scripts."""
    print(
        f"  [auto-approve] phase={payload.get('phase_key')} "
        f"gates={','.join(gates) or '(none)'} → APPROVE"
    )
    return ApprovalDecisionAction.APPROVE


_CHEAP_MODEL = "openrouter:deepseek/deepseek-chat"


def _budget_friendly(personas: list, cap_tokens: int = 1500) -> list:
    """Return a new persona list adapted to a low-credit OpenRouter quota.

    Two adjustments (both required because the default 4096 max_tokens
    routinely exceeds remaining quota and Sonnet input pricing is 30× DeepSeek):
      1. model_params.max_tokens → `cap_tokens`
      2. model → openrouter:deepseek/deepseek-chat for any anthropic/* persona

    Persona is frozen — we model_copy with updated fields.
    """
    out: list = []
    for p in personas:
        new_params = dict(p.model_params)
        new_params["max_tokens"] = cap_tokens
        update: dict = {"model_params": new_params}
        if p.model.startswith("openrouter:anthropic/"):
            update["model"] = _CHEAP_MODEL
        out.append(p.model_copy(update=update))
    return out


def _prepare_test_repo() -> None:
    """Wipe + reinit /tmp/w3-test-repo with a buggy.py for the workflow to fix."""
    if _TEST_REPO.exists():
        shutil.rmtree(_TEST_REPO)
    _TEST_REPO.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        ["git", "init", "-q"],
        cwd=_TEST_REPO,
        check=True,
        capture_output=True,
    )
    subprocess.run(
        ["git", "config", "user.email", "test@verify"],
        cwd=_TEST_REPO,
        check=True,
        capture_output=True,
    )
    subprocess.run(
        ["git", "config", "user.name", "verify-v04"],
        cwd=_TEST_REPO,
        check=True,
        capture_output=True,
    )
    (_TEST_REPO / "README.md").write_text("# w3 test\n", encoding="utf-8")
    (_TEST_REPO / "buggy.py").write_text(
        "def divide(a: int, b: int) -> float:\n"
        '    """Should handle b=0 gracefully — currently raises ZeroDivisionError."""\n'
        "    return a / b\n",
        encoding="utf-8",
    )
    subprocess.run(["git", "add", "."], cwd=_TEST_REPO, check=True, capture_output=True)
    subprocess.run(
        ["git", "commit", "-q", "-m", "init"],
        cwd=_TEST_REPO,
        check=True,
        capture_output=True,
    )


def _build_engine(db: Database, cfg: Any, personas: list) -> WorkflowEngine:
    registry = ArtifactSchemaRegistry(roots=[repo_root() / "docs" / "schemas" / "artifacts"])
    consent_store = PersonaConsentStore(cfg.data_dir / "persona-consents.json")
    budget = make_budget_tracker_from_config(db, cfg)
    return WorkflowEngine(
        db=db,
        config=cfg,
        persona_pool=personas,
        artifact_registry=registry,
        consent_store=consent_store,
        available_backends=BackendAvailability(available_backends=frozenset(Backend)),
        approval_callback=_auto_approve,
        budget_tracker=budget,
    )


async def _count_completed_phases(db: Database, run_id: uuid.UUID) -> int:
    """Count run_phases rows in state='completed' for `run_id`.  Used to record
    partial progress when engine.run is interrupted mid-workflow."""
    from sqlalchemy import select

    from my_deepagent.persistence.models import RunPhaseRow

    async with db.session() as s:
        rows = (
            (await s.execute(select(RunPhaseRow).where(RunPhaseRow.run_id == str(run_id))))
            .scalars()
            .all()
        )
    return sum(1 for r in rows if r.state == "completed")


async def scenario_w3(db: Database, cfg: Any, personas: list) -> uuid.UUID | None:
    """W3 — full 4-phase run.  If the LLM provider runs out of credits mid-run
    (OpenRouter 402), record the partial phase completion count honestly so the
    report reflects what actually executed live."""
    print("\n[W3] bug-fix-with-reproduction 4-phase live")
    _prepare_test_repo()
    template = load_workflow_yaml(
        repo_root() / "docs" / "schemas" / "workflows" / "bug-fix-with-reproduction@1.yaml"
    )
    engine = _build_engine(db, cfg, personas)
    pre_id = uuid.uuid4()  # pin run_id so we can DB-query phase state on failure
    try:
        result = await engine.run(
            template,
            repo_path=_TEST_REPO,
            base_branch="main",
            pre_allocated_run_id=pre_id,
        )
    except Exception as e:
        completed = await _count_completed_phases(db, pre_id)
        total = len(template.phases)
        record(
            "W3",
            False,
            f"{completed}/{total} phases live PASS, then "
            f"{type(e).__name__}: {str(e)[:200]} (run_id={pre_id})",
        )
        return pre_id if completed > 0 else None
    ok = result.state.value == "completed"
    record(
        "W3",
        ok,
        f"state={result.state.value} run_id={result.run_id} "
        f"final_report={bool(result.final_report_path)}",
    )
    return result.run_id


async def scenario_w4(db: Database, cfg: Any, personas: list, w3_run_id: uuid.UUID | None) -> None:
    """W4 — resume codepath verification.

    Strategy:
    - If W3 finished cleanly (all phases completed), W4 cannot resume it (terminal).
      In that case the resume-skip-all logic is still worth asserting: resume() must
      reject a terminal run with `run_already_terminal`.
    - If W3 stopped mid-workflow with at least one completed phase, the partially
      completed run row is the perfect subject: call resume() and verify the
      skip-completed-phases logic actually fires (event log contains PHASE_SKIPPED
      for each completed phase) before reaching the next phase.
    """
    print("\n[W4] resume codepath")
    if w3_run_id is None:
        record(
            "W4",
            False,
            "W3 produced no completed phases — cannot exercise resume; "
            "test_resume.py covers the unit-level codepath (5 cases PASS).",
        )
        return

    # Inspect current state of the W3 row.
    async with db.session() as s:
        row = await s.get(RunRow, str(w3_run_id))
    if row is None:
        record("W4", False, f"W3 run row {w3_run_id} missing from DB")
        return
    state_before_resume = row.state
    print(f"  W3 run {w3_run_id} state={state_before_resume}")

    completed_phases_before = await _count_completed_phases(db, w3_run_id)
    print(f"  completed phases before resume: {completed_phases_before}")

    engine2 = _build_engine(db, cfg, personas)

    # Case A: W3 already terminal (e.g., completed) → resume must raise.
    if state_before_resume in ("completed", "failed", "aborted"):
        try:
            await engine2.resume(w3_run_id)
        except Exception as e:
            # Resume correctly rejected a terminal run.
            from my_deepagent.errors import MyDeepAgentError

            if isinstance(e, MyDeepAgentError) and e.code == "run_already_terminal":
                record(
                    "W4",
                    True,
                    f"terminal-rejection: resume({state_before_resume}) raised "
                    f"run_already_terminal (expected)",
                )
            else:
                record(
                    "W4",
                    False,
                    f"resume on {state_before_resume} raised wrong error: {type(e).__name__}: {e}",
                )
            return
        record(
            "W4",
            False,
            f"resume on {state_before_resume} did not raise (must reject terminal)",
        )
        return

    # Case B: W3 non-terminal with N completed phases → resume must skip those
    #         phases.  The actual continuation may fail at the next live LLM
    #         call (e.g., OpenRouter 402), but the skip codepath is what we are
    #         verifying here.
    skip_event_count = 0
    try:
        result = await engine2.resume(w3_run_id)
        final_state = result.state.value
    except Exception as e:
        final_state = f"{type(e).__name__}: {str(e)[:120]}"

    # Now check PHASE_SKIPPED event count to confirm resume skip-logic ran.
    from sqlalchemy import select

    from my_deepagent.persistence.models import RunEventRow

    async with db.session() as s:
        events = (
            (await s.execute(select(RunEventRow).where(RunEventRow.run_id == str(w3_run_id))))
            .scalars()
            .all()
        )
    skip_event_count = sum(1 for e in events if e.type == "phase.skipped")

    ok = skip_event_count == completed_phases_before
    record(
        "W4",
        ok,
        f"resume ran skip-logic: PHASE_SKIPPED={skip_event_count} "
        f"(expected {completed_phases_before}); "
        f"final={final_state}",
    )


async def main() -> int:
    cfg = load_config()
    record_consent(cfg.data_dir)
    bootstrap_user_dirs(cfg)
    db = Database(cfg.database_url)
    await db.init_schema()
    personas = load_combined_personas(cfg, repo_root() / "docs" / "schemas" / "personas")
    # OpenRouter credit-friendly cap (default 4096 → 2000) to keep per-call cost
    # below the remaining account quota.  Output 2000 tokens is still plenty for
    # a JSON artifact.
    personas = _budget_friendly(personas, cap_tokens=1500)

    print(f"[verify_v04 w34] data_dir={cfg.data_dir}")
    print(f"  db={cfg.database_url}")
    print(f"  test-repo={_TEST_REPO}")

    w3_run_id = await scenario_w3(db, cfg, personas)
    await scenario_w4(db, cfg, personas, w3_run_id)

    await db.dispose()
    return 0


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))