feat(my-deepagent): v0.2 PR #2b — mydeepagent runs resume <id> real implementation

Closes the v0.1.0 KNOWN LIMIT where resume was an exit-2 stub. Builds on v0.2 PR #2a's LangGraph wiring + the existing DB phase-state machine + sweep_orphan_runs — no Temporal (per DR-3).

Highlights

- `WorkflowEngine.resume(run_id)` (new async method):
  - Loads RunRow, rejects terminal states with MyDeepAgentError("run_already_terminal").
  - Reloads worktree_root from `RunRow.worktree_root`, template via `_reload_template` (WorkflowTemplateRow lookup + model_validate), and bindings via `_reload_bindings` (run_bindings ⨝ agent_personas).
  - **Does NOT call `bind_personas` again** — locks in the original binding so consent / persona-pool changes since the original run don't silently shift role assignment.
- `_execute_run` (extracted shared phase loop): `run()` and `resume()` both dispatch through it. Skips already-completed phases (emits `phase.skipped` event) and re-executes the rest.
- 4 new private helpers on WorkflowEngine: `_get_run_or_raise`, `_reload_template`, `_reload_bindings`, `_get_completed_phase_keys`.
- `RunEventType.RUN_RESUMED` and `PHASE_SKIPPED` are now actually emitted (the enum members existed already).
- `cli/runs.py _runs_resume_async`: stub → real impl. Validates the run exists + non-terminal, loads seed personas + artifact schemas from `docs/schemas/`, constructs WorkflowEngine with an "abort-on-new-approval" callback (resume should not silently re-prompt the user — original gates already passed; a new gate means the workflow has changed). Calls engine.resume(UUID(id)), prints final state + report. Catches MyDeepAgentError and exits 1 with red error.

Tests

- `tests/integration/test_resume.py` (new, 5 scenarios):
  1. 2-phase mock workflow: phase 1 succeeds, phase 2 fails first time, row flipped back to executing → resume → phase 2 completes. Asserts `phase.skipped` event for phase 1, `run.resumed` event, and exactly 1 mock invocation for phase 2 on resume.
  2. Terminal run → `MyDeepAgentError(code="run_already_terminal")`.
  3. Unknown run id → `MyDeepAgentError(code="run_not_found")`.
  4. RunBindingRow rows missing → `MyDeepAgentError(code="run_metadata_missing")`.
  5. Corrupt `workflow_templates.definition` → `MyDeepAgentError(code="template_load_failed")`.
- Mock pattern matches existing test_engine.py: patch `my_deepagent.engine.build_agent` to return a fake agent that writes the expected artifact and drives the watcher middleware.

Gates

- ruff check + ruff format --check + mypy --strict: PASS (103 source files)
- pytest non-E2E: 587 PASS (12.69 s) — +5 from new resume tests
- pytest E2E real OpenRouter on Postgres: PASS 78.52 s (baseline 71–122 s; within DR-3 acceptance threshold ≤+20%)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 22:07:24 +09:00
parent 50aacd3382
commit 501292a5cd
4 changed files with 804 additions and 16 deletions
--- a/my-deepagent/src/my_deepagent/engine.py
+++ b/my-deepagent/src/my_deepagent/engine.py
@@ -4,6 +4,7 @@ from __future__ import annotations

 import asyncio
 import json
+import logging
 import signal
 from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager, suppress
@@ -58,6 +59,8 @@ ApprovalCallback = Any  # Callable[[dict, list[str]], Awaitable[ApprovalDecision

 _DEFAULT_PHASE_TIMEOUT_SECONDS = 300  # 5 minutes

+# Child logger ("<module>.resume") used while rehydrating run state for resume,
+# e.g. to warn about persona rows whose stored definition fails validation.
+_LOG_CORRUPT_PERSONA = logging.getLogger(__name__ + ".resume")
+

@dataclass(frozen=True)
 class RunResult:
@@ -165,6 +168,11 @@ class WorkflowEngine:
        requirements_md: str = "",
        override: BindingOverride | None = None,
    ) -> RunResult:
+        """Start a brand-new run. Allocates a new `run_id`, binds personas, persists
+        skeleton metadata, and dispatches to the shared `_execute_run` phase loop.
+
+        For resuming an existing non-terminal run, use :meth:`resume` instead.
+        """
        run_id = uuid4()
        worktree_root = self._config.workspace_root / str(run_id)
        worktree_root.mkdir(parents=True, exist_ok=True)
@@ -186,6 +194,59 @@ class WorkflowEngine:

        await self._append_event(run_id, None, RunEventType.RUN_CREATED, {})
        await self._append_event(run_id, None, RunEventType.RUN_STARTED, {})
+
+        return await self._execute_run(run_id, template, worktree_root, bindings)
+
+    async def resume(self, run_id: UUID) -> RunResult:
+        """Resume a non-terminal run from its first non-completed phase.
+
+        Reloads worktree_root, template, and bindings from the DB — does **not**
+        re-run `bind_personas`, so consent/pool changes since the original run
+        do not silently shift the binding. Phases whose `RunPhaseRow.state` is
+        already ``completed`` are skipped; the rest re-execute and (when a
+        LangGraph saver is wired) replay deepagents from the last checkpoint
+        for that phase's thread_id.
+
+        Every phase that still has to run must have a reloaded binding for its
+        role. `_reload_bindings` leaves out bindings whose persona row is missing
+        or corrupt, so a partially rehydrated dict is rejected here instead of
+        surfacing later as a bare ``KeyError`` inside the phase loop.
+
+        Args:
+            run_id: Identifier of the run to resume.
+
+        Returns:
+            The `RunResult` produced by the shared `_execute_run` phase loop.
+
+        Raises:
+            MyDeepAgentError: if the run is missing, terminal, or its bindings
+                / template metadata cannot be reloaded.
+        """
+        run_row = await self._get_run_or_raise(run_id)
+        terminal_states = {
+            RunState.COMPLETED.value,
+            RunState.FAILED.value,
+            RunState.ABORTED.value,
+        }
+        if run_row.state in terminal_states:
+            raise MyDeepAgentError.human_required(
+                "run_already_terminal",
+                message=(
+                    f"run {run_id} is already {run_row.state}; start a fresh run "
+                    f"with `mydeepagent run`"
+                ),
+            )
+
+        worktree_root = Path(run_row.worktree_root)
+        template = await self._reload_template(run_row.template_id)
+        bindings = await self._reload_bindings(run_id)
+        if not bindings:
+            raise MyDeepAgentError.human_required(
+                "run_metadata_missing",
+                message=(
+                    f"run {run_id} has no binding rows; cannot resume — start a fresh run instead"
+                ),
+            )
+
+        # Only phases that will actually execute need a binding: completed
+        # phases are skipped by `_execute_run` before any binding lookup.
+        completed_keys = await self._get_completed_phase_keys(run_id)
+        unbound_roles = sorted(
+            {
+                phase_def.role
+                for phase_def in template.phases
+                if phase_def.key not in completed_keys and phase_def.role not in bindings
+            }
+        )
+        if unbound_roles:
+            raise MyDeepAgentError.human_required(
+                "run_metadata_missing",
+                message=(
+                    f"run {run_id} is missing bindings for role(s) "
+                    f"{', '.join(unbound_roles)}; cannot resume — start a fresh run instead"
+                ),
+            )
+
+        await self._append_event(run_id, None, RunEventType.RUN_RESUMED, {})
+        return await self._execute_run(run_id, template, worktree_root, bindings)
+
+    async def _execute_run(
+        self,
+        run_id: UUID,
+        template: WorkflowTemplate,
+        worktree_root: Path,
+        bindings: dict[str, Binding],
+    ) -> RunResult:
+        """Shared phase loop used by both `run` (new) and `resume`."""
        await self._set_run_state(run_id, RunState.EXECUTING)

        # Open the LangGraph AsyncPostgresSaver once per run; all phases share it.
@@ -195,8 +256,17 @@ class WorkflowEngine:
        # checkpointer=None and runs without resume support.
        async with self._maybe_open_saver() as saver:
            self._saver = saver
+            completed_keys = await self._get_completed_phase_keys(run_id)
            try:
                for phase_def in template.phases:
+                    if phase_def.key in completed_keys:
+                        await self._append_event(
+                            run_id,
+                            None,
+                            RunEventType.PHASE_SKIPPED,
+                            {"phase_key": phase_def.key, "reason": "already_completed"},
+                        )
+                        continue
                    role_binding = bindings[phase_def.role]
                    await self._run_phase(run_id, worktree_root, template, phase_def, role_binding)
                await self._set_run_state(run_id, RunState.COMPLETED)
@@ -933,6 +1003,90 @@ class WorkflowEngine:
            except Exception:
                await s.rollback()

+    # ------------------------------------------------------------------
+    # Resume helpers (used by `resume` to rehydrate state from DB)
+    # ------------------------------------------------------------------
+
+    async def _get_run_or_raise(self, run_id: UUID) -> RunRow:
+        """Fetch the `RunRow` for `run_id`, or raise `run_not_found`.
+
+        Raises:
+            MyDeepAgentError: (human-required, code ``run_not_found``) when no
+                row exists for the given id.
+        """
+        async with self._db.session() as session:
+            found = await session.get(RunRow, str(run_id))
+        if found is not None:
+            return found
+        raise MyDeepAgentError.human_required(
+            "run_not_found",
+            message=f"run {run_id} not found in DB",
+        )
+
+    async def _reload_template(self, template_id: str) -> WorkflowTemplate:
+        """Load the stored workflow template row and validate its definition.
+
+        Raises:
+            MyDeepAgentError: (fatal, code ``template_load_failed``) when the
+                row does not exist or its `definition` fails validation.
+        """
+        async with self._db.session() as session:
+            template_row = await session.get(WorkflowTemplateRow, template_id)
+        if template_row is None:
+            not_found_msg = f"workflow_templates row {template_id} not found"
+            raise MyDeepAgentError.fatal("template_load_failed", message=not_found_msg)
+        try:
+            parsed = WorkflowTemplate.model_validate(template_row.definition)
+        except Exception as exc:
+            # Any validation failure is reported under the same error code as
+            # a missing row, with the original exception chained as the cause.
+            malformed_msg = (
+                f"workflow_templates.definition for {template_id} is malformed: {exc}"
+            )
+            raise MyDeepAgentError.fatal("template_load_failed", message=malformed_msg) from exc
+        return parsed
+
+    async def _reload_bindings(self, run_id: UUID) -> dict[str, Binding]:
+        """Rebuild the `{role_id: Binding}` dict from `run_bindings` + `agent_personas`.
+
+        We do NOT re-run `bind_personas` here on purpose: consent / pool state
+        could have shifted since the original run.
+
+        A binding whose persona row is missing, or whose persona definition
+        fails validation, is logged and left out, so the result can be partial.
+        Callers must check that the roles they need are present; `resume`
+        reports a completely empty result as `run_metadata_missing`.
+
+        Args:
+            run_id: Run whose persisted bindings should be rehydrated.
+
+        Returns:
+            Mapping of role id to the rebuilt `Binding`; empty when the run has
+            no usable binding rows.
+        """
+        from .binding import Binding as _Binding  # local import to avoid cycle hint
+
+        async with self._db.session() as s:
+            binding_rows = (
+                (await s.execute(select(RunBindingRow).where(RunBindingRow.run_id == str(run_id))))
+                .scalars()
+                .all()
+            )
+            persona_rows: dict[str, AgentPersonaRow] = {}
+            for br in binding_rows:
+                pr = await s.get(AgentPersonaRow, br.persona_id)
+                if pr is not None:
+                    persona_rows[br.persona_id] = pr
+
+        out: dict[str, Binding] = {}
+        for br in binding_rows:
+            pr = persona_rows.get(br.persona_id)
+            if pr is None:
+                # A dangling persona_id is reported rather than dropped silently,
+                # so a partial result can be diagnosed from the logs.
+                _LOG_CORRUPT_PERSONA.warning(
+                    "persona row %s for role %s not found during resume of run %s",
+                    br.persona_id,
+                    br.role_id,
+                    run_id,
+                )
+                continue
+            try:
+                persona = Persona.model_validate(pr.definition)
+            except Exception as e:
+                # Corrupt persona JSON: skip this binding only. Other roles may
+                # still rehydrate, so the returned dict can be non-empty yet
+                # incomplete.
+                _LOG_CORRUPT_PERSONA.warning("corrupt persona row %s during resume: %s", pr.id, e)
+                continue
+            out[br.role_id] = _Binding(
+                role_id=br.role_id, persona=persona, binding_hash=br.binding_hash
+            )
+        return out
+
+    async def _get_completed_phase_keys(self, run_id: UUID) -> set[str]:
+        """Collect the phase keys of this run whose stored state is `completed`."""
+        completed_stmt = (
+            select(RunPhaseRow.phase_key)
+            .where(RunPhaseRow.run_id == str(run_id))
+            .where(RunPhaseRow.state == RunPhaseState.COMPLETED.value)
+        )
+        async with self._db.session() as session:
+            result = await session.execute(completed_stmt)
+            phase_keys = result.scalars().all()
+        return set(phase_keys)
+

 # ------------------------------------------------------------------
 # Module-level helpers