feat(my-deepagent): v0.2 PR #2b — mydeepagent runs resume <id> real implementation
Closes the v0.1.0 KNOWN LIMIT where resume was an exit-2 stub. Builds on
v0.2 PR #2a's LangGraph wiring + the existing DB phase-state machine +
sweep_orphan_runs — no Temporal (per DR-3).
Highlights
- `WorkflowEngine.resume(run_id)` (new async method):
- Loads RunRow, rejects terminal states with
MyDeepAgentError("run_already_terminal").
- Reloads worktree_root from `RunRow.worktree_root`, template via
`_reload_template` (WorkflowTemplateRow primary-key lookup + model_validate), and
bindings via `_reload_bindings` (run_bindings rows matched to agent_personas by per-row lookup).
- **Does NOT call `bind_personas` again** — locks in the original
binding so consent / persona-pool changes since the original run
don't silently shift role assignment.
- `_execute_run` (extracted shared phase loop): `run()` and `resume()`
both dispatch through it. Skips already-completed phases (emits
`phase.skipped` event) and re-executes the rest.
- 4 new private helpers on WorkflowEngine: `_get_run_or_raise`,
`_reload_template`, `_reload_bindings`, `_get_completed_phase_keys`.
- `RunEventType.RUN_RESUMED` and `PHASE_SKIPPED` are now actually
emitted (the enum members existed already).
- `cli/runs.py _runs_resume_async`: stub → real impl. Validates the run
exists + non-terminal, loads seed personas + artifact schemas from
`docs/schemas/`, constructs WorkflowEngine with an
"abort-on-new-approval" callback (resume should not silently re-prompt
the user — original gates already passed; a new gate means the
workflow has changed). Calls engine.resume(UUID(id)), prints final
state + report. Catches MyDeepAgentError and exits 1 with red error.
Tests
- `tests/integration/test_resume.py` (new, 5 scenarios):
1. 2-phase mock workflow: phase 1 succeeds, phase 2 fails first time,
row flipped back to executing → resume → phase 2 completes.
Asserts `phase.skipped` event for phase 1, `run.resumed` event,
and exactly 1 mock invocation for phase 2 on resume.
2. Terminal run → `MyDeepAgentError(code="run_already_terminal")`.
3. Unknown run id → `MyDeepAgentError(code="run_not_found")`.
4. No `RunBindingRow` rows exist for the run → `MyDeepAgentError(code="run_metadata_missing")`.
5. Corrupt `workflow_templates.definition` →
`MyDeepAgentError(code="template_load_failed")`.
Mock pattern matches existing test_engine.py: patch
`my_deepagent.engine.build_agent` to return a fake agent that writes
the expected artifact and drives the watcher middleware.
Gates
- ruff check + ruff format --check + mypy --strict: PASS (103 source files)
- pytest non-E2E: 587 PASS (12.69 s) — +5 from new resume tests
- pytest E2E real OpenRouter on Postgres: PASS 78.52 s (baseline 71–122 s;
within DR-3 acceptance threshold ≤+20%)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,7 @@ from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import signal
|
||||
from collections.abc import AsyncIterator
|
||||
from contextlib import asynccontextmanager, suppress
|
||||
@@ -58,6 +59,8 @@ ApprovalCallback = Any # Callable[[dict, list[str]], Awaitable[ApprovalDecision
|
||||
|
||||
_DEFAULT_PHASE_TIMEOUT_SECONDS = 300 # 5 minutes
|
||||
|
||||
_LOG_CORRUPT_PERSONA = logging.getLogger(__name__ + ".resume")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RunResult:
|
||||
@@ -165,6 +168,11 @@ class WorkflowEngine:
|
||||
requirements_md: str = "",
|
||||
override: BindingOverride | None = None,
|
||||
) -> RunResult:
|
||||
"""Start a brand-new run. Allocates a new `run_id`, binds personas, persists
|
||||
skeleton metadata, and dispatches to the shared `_execute_run` phase loop.
|
||||
|
||||
For resuming an existing non-terminal run, use :meth:`resume` instead.
|
||||
"""
|
||||
run_id = uuid4()
|
||||
worktree_root = self._config.workspace_root / str(run_id)
|
||||
worktree_root.mkdir(parents=True, exist_ok=True)
|
||||
@@ -186,6 +194,59 @@ class WorkflowEngine:
|
||||
|
||||
await self._append_event(run_id, None, RunEventType.RUN_CREATED, {})
|
||||
await self._append_event(run_id, None, RunEventType.RUN_STARTED, {})
|
||||
|
||||
return await self._execute_run(run_id, template, worktree_root, bindings)
|
||||
|
||||
async def resume(self, run_id: UUID) -> RunResult:
    """Resume a non-terminal run from its first non-completed phase.

    Reloads worktree_root, template, and bindings from the DB — does **not**
    re-run `bind_personas`, so consent/pool changes since the original run
    do not silently shift the binding. Phases whose `RunPhaseRow.state` is
    already ``completed`` are skipped; the rest re-execute and (when a
    LangGraph saver is wired) replay deepagents from the last checkpoint
    for that phase's thread_id.

    Args:
        run_id: Identifier of the run to resume.

    Returns:
        The `RunResult` produced by the shared `_execute_run` phase loop.

    Raises:
        MyDeepAgentError: if the run is missing (`run_not_found`), already
            terminal (`run_already_terminal`), has no usable binding for a
            phase that still has to run (`run_metadata_missing`), or its
            template cannot be reloaded (`template_load_failed`).
    """
    run_row = await self._get_run_or_raise(run_id)
    terminal_states = {
        RunState.COMPLETED.value,
        RunState.FAILED.value,
        RunState.ABORTED.value,
    }
    if run_row.state in terminal_states:
        raise MyDeepAgentError.human_required(
            "run_already_terminal",
            message=(
                f"run {run_id} is already {run_row.state}; start a fresh run "
                "with `mydeepagent run`"
            ),
        )

    worktree_root = Path(run_row.worktree_root)
    template = await self._reload_template(run_row.template_id)
    bindings = await self._reload_bindings(run_id)
    if not bindings:
        raise MyDeepAgentError.human_required(
            "run_metadata_missing",
            message=(
                f"run {run_id} has no binding rows; cannot resume — start a fresh run instead"
            ),
        )

    # `_reload_bindings` drops individual rows whose persona is missing or
    # corrupt, so the dict can be non-empty yet incomplete. The phase loop
    # indexes `bindings[phase_def.role]` directly; catch the gap here so the
    # caller gets a structured error instead of a bare KeyError raised after
    # the run was already marked as resumed/executing.
    completed_keys = await self._get_completed_phase_keys(run_id)
    missing_roles = sorted(
        {
            phase_def.role
            for phase_def in template.phases
            if phase_def.key not in completed_keys and phase_def.role not in bindings
        }
    )
    if missing_roles:
        raise MyDeepAgentError.human_required(
            "run_metadata_missing",
            message=(
                f"run {run_id} has no usable binding for role(s) "
                f"{', '.join(missing_roles)}; cannot resume — start a fresh run instead"
            ),
        )

    await self._append_event(run_id, None, RunEventType.RUN_RESUMED, {})
    return await self._execute_run(run_id, template, worktree_root, bindings)
|
||||
|
||||
async def _execute_run(
|
||||
self,
|
||||
run_id: UUID,
|
||||
template: WorkflowTemplate,
|
||||
worktree_root: Path,
|
||||
bindings: dict[str, Binding],
|
||||
) -> RunResult:
|
||||
"""Shared phase loop used by both `run` (new) and `resume`."""
|
||||
await self._set_run_state(run_id, RunState.EXECUTING)
|
||||
|
||||
# Open the LangGraph AsyncPostgresSaver once per run; all phases share it.
|
||||
@@ -195,8 +256,17 @@ class WorkflowEngine:
|
||||
# checkpointer=None and runs without resume support.
|
||||
async with self._maybe_open_saver() as saver:
|
||||
self._saver = saver
|
||||
completed_keys = await self._get_completed_phase_keys(run_id)
|
||||
try:
|
||||
for phase_def in template.phases:
|
||||
if phase_def.key in completed_keys:
|
||||
await self._append_event(
|
||||
run_id,
|
||||
None,
|
||||
RunEventType.PHASE_SKIPPED,
|
||||
{"phase_key": phase_def.key, "reason": "already_completed"},
|
||||
)
|
||||
continue
|
||||
role_binding = bindings[phase_def.role]
|
||||
await self._run_phase(run_id, worktree_root, template, phase_def, role_binding)
|
||||
await self._set_run_state(run_id, RunState.COMPLETED)
|
||||
@@ -933,6 +1003,90 @@ class WorkflowEngine:
|
||||
except Exception:
|
||||
await s.rollback()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Resume helpers (used by `resume` to rehydrate state from DB)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _get_run_or_raise(self, run_id: UUID) -> RunRow:
    """Fetch the `RunRow` for *run_id* by primary key.

    Raises:
        MyDeepAgentError: `run_not_found` (human-required) when no row exists.
    """
    async with self._db.session() as session:
        found = await session.get(RunRow, str(run_id))
        if found is not None:
            return found
        raise MyDeepAgentError.human_required(
            "run_not_found",
            message=f"run {run_id} not found in DB",
        )
|
||||
|
||||
async def _reload_template(self, template_id: str) -> WorkflowTemplate:
    """Load the stored workflow template row and validate it into a `WorkflowTemplate`.

    Raises:
        MyDeepAgentError: `template_load_failed` (fatal) when the row does not
            exist or its `definition` fails `WorkflowTemplate.model_validate`.
    """
    async with self._db.session() as session:
        template_row = await session.get(WorkflowTemplateRow, template_id)
        if template_row is None:
            not_found_msg = f"workflow_templates row {template_id} not found"
            raise MyDeepAgentError.fatal("template_load_failed", message=not_found_msg)
        try:
            parsed = WorkflowTemplate.model_validate(template_row.definition)
        except Exception as exc:
            # Any validation failure is reported under the same error code,
            # with the underlying exception chained for debugging.
            malformed_msg = (
                f"workflow_templates.definition for {template_id} is malformed: {exc}"
            )
            raise MyDeepAgentError.fatal(
                "template_load_failed", message=malformed_msg
            ) from exc
        else:
            return parsed
|
||||
|
||||
async def _reload_bindings(self, run_id: UUID) -> dict[str, Binding]:
    """Rebuild the `{role_id: Binding}` dict from `run_bindings` + `agent_personas`.

    Binding rows whose persona row is missing, or whose persona definition
    fails validation, are logged and left out of the result. The result may
    therefore be empty (no usable rows at all — caller raises
    `run_metadata_missing`) or partial (some roles absent). We do NOT re-run
    `bind_personas` here on purpose: consent / pool state could have shifted
    since the original run.

    Args:
        run_id: Run whose persisted bindings should be reloaded.

    Returns:
        Mapping of role id to the reconstructed `Binding`; roles whose persona
        could not be loaded are omitted.
    """
    from .binding import Binding as _Binding  # local import to avoid cycle hint

    async with self._db.session() as s:
        binding_rows = (
            (await s.execute(select(RunBindingRow).where(RunBindingRow.run_id == str(run_id))))
            .scalars()
            .all()
        )
        persona_rows: dict[str, AgentPersonaRow] = {}
        for br in binding_rows:
            pr = await s.get(AgentPersonaRow, br.persona_id)
            if pr is not None:
                persona_rows[br.persona_id] = pr

    out: dict[str, Binding] = {}
    for br in binding_rows:
        pr = persona_rows.get(br.persona_id)
        if pr is None:
            # Previously dropped silently; log it so a partial bindings dict
            # can be traced back to the persona row that disappeared.
            _LOG_CORRUPT_PERSONA.warning(
                "persona row %s for role %s not found during resume; binding skipped",
                br.persona_id,
                br.role_id,
            )
            continue
        try:
            persona = Persona.model_validate(pr.definition)
        except Exception as e:
            # Corrupt persona JSON: skip this binding only. The role is then
            # absent from the returned dict (which is empty only if every
            # binding was skipped).
            _LOG_CORRUPT_PERSONA.warning("corrupt persona row %s during resume: %s", pr.id, e)
            continue
        out[br.role_id] = _Binding(
            role_id=br.role_id, persona=persona, binding_hash=br.binding_hash
        )
    return out
|
||||
|
||||
async def _get_completed_phase_keys(self, run_id: UUID) -> set[str]:
    """Collect the phase keys of this run whose phase row is in the `completed` state."""
    completed_stmt = (
        select(RunPhaseRow.phase_key)
        .where(RunPhaseRow.run_id == str(run_id))
        .where(RunPhaseRow.state == RunPhaseState.COMPLETED.value)
    )
    async with self._db.session() as session:
        result = await session.execute(completed_stmt)
        phase_keys = result.scalars().all()
    return set(phase_keys)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Module-level helpers
|
||||
|
||||
Reference in New Issue
Block a user