feat: add tui recovery state machine
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
export * from "./adapter.js";
|
||||
export * from "./fake.js";
|
||||
export * from "./manager.js";
|
||||
export * from "./recovery.js";
|
||||
export * from "./transcript.js";
|
||||
export * from "./tmux.js";
|
||||
|
||||
@@ -19,6 +19,7 @@ import type {
|
||||
TranscriptBaseline,
|
||||
TranscriptChunk,
|
||||
} from "./adapter.js";
|
||||
import { assertSessionTransition, retryRecoverable } from "./recovery.js";
|
||||
import { captureAndPersistTranscript } from "./transcript.js";
|
||||
|
||||
type Database = DbClient["db"];
|
||||
@@ -298,6 +299,12 @@ export class SessionManager implements SessionRuntime {
|
||||
if (this.db === undefined || !["CREATED", "BOOTSTRAPPING"].includes(session.state)) {
|
||||
return;
|
||||
}
|
||||
if (session.state === "CREATED") {
|
||||
assertSessionTransition("CREATED", "BOOTSTRAPPING");
|
||||
assertSessionTransition("BOOTSTRAPPING", "READY");
|
||||
} else {
|
||||
assertSessionTransition(session.state, "READY");
|
||||
}
|
||||
|
||||
const eventRepository = new RunEventRepository(this.db);
|
||||
const sessionUpdate: {
|
||||
@@ -345,6 +352,7 @@ export class SessionManager implements SessionRuntime {
|
||||
backend: string;
|
||||
cwd: string;
|
||||
recoveryAttempts: number;
|
||||
state: string;
|
||||
},
|
||||
error: unknown,
|
||||
): Promise<void> {
|
||||
@@ -357,6 +365,7 @@ export class SessionManager implements SessionRuntime {
|
||||
const gateKey = "session_recovery_required";
|
||||
const approvalIdempotencyKey = `${session.runId}:${gateKey}:${session.id}:${recoveryAttempts}`;
|
||||
const pauseCause = `session_recovery_failed:${session.id}:${recoveryAttempts}`;
|
||||
assertSessionTransition(session.state, "FAILED_NEEDS_HUMAN");
|
||||
await this.db.transaction(async (tx) => {
|
||||
await tx.execute(sql`SELECT 1 FROM ${runs} WHERE ${runs.id} = ${session.runId} FOR UPDATE`);
|
||||
const [run] = await tx
|
||||
@@ -431,18 +440,7 @@ export class SessionManager implements SessionRuntime {
|
||||
}
|
||||
|
||||
/**
 * Resumes the session identified by `handle` through the adapter, retrying
 * under the shared "resume" recovery budget.
 *
 * The retry loop is delegated to `retryRecoverable` so the attempt limit and
 * the rule that only recoverable `DevflowError`s are retried are defined in a
 * single place instead of being duplicated here. Each attempt issues a fresh
 * `adapter.resume(handle)` call wrapped in `this.track`.
 *
 * @param handle - Handle of the session to resume.
 * @returns The handle produced by the first successful resume attempt.
 * @throws The first non-recoverable error, or the last recoverable error once
 *   the "resume" budget is exhausted.
 */
private async resumeWithRetry(handle: SessionHandle): Promise<SessionHandle> {
  return retryRecoverable("resume", () => this.track(this.adapter.resume(handle)));
}
|
||||
|
||||
private async loadTranscriptBaseline(
|
||||
|
||||
103
packages/session/src/recovery.test.ts
Normal file
103
packages/session/src/recovery.test.ts
Normal file
@@ -0,0 +1,103 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { DevflowError } from "@devflow/core";
|
||||
|
||||
import {
|
||||
SessionRecoveryBudget,
|
||||
assertSessionStateAssignment,
|
||||
assertSessionTransition,
|
||||
isSessionHung,
|
||||
retryRecoverable,
|
||||
} from "./recovery.js";
|
||||
|
||||
// Specification of the recovery policy helpers: legal state transitions,
// no-op assignments, retry budgets, and the hung-session threshold.
describe("session recovery policy", () => {
  it("allows only locked session state-machine transitions", () => {
    // Pairs that the state machine must accept, checked in order.
    const permitted: ReadonlyArray<readonly [string, string]> = [
      ["CREATED", "BOOTSTRAPPING"],
      ["BOOTSTRAPPING", "READY"],
      ["READY", "BUSY"],
      ["BUSY", "READY"],
      ["BUSY", "ARTIFACT_TIMEOUT"],
      ["ARTIFACT_TIMEOUT", "RESUMING"],
      ["RESUMING", "REBOOTSTRAPPED"],
      ["REBOOTSTRAPPED", "READY"],
    ];
    for (const [from, to] of permitted) {
      expect(() => assertSessionTransition(from, to)).not.toThrow();
    }

    // Pairs that must be rejected: a skipped step, a self-loop, and leaving
    // the FAILED_NEEDS_HUMAN state.
    const rejected: ReadonlyArray<readonly [string, string]> = [
      ["READY", "REBOOTSTRAPPED"],
      ["CRASHED", "CRASHED"],
      ["FAILED_NEEDS_HUMAN", "READY"],
    ];
    for (const [from, to] of rejected) {
      expect(() => assertSessionTransition(from, to)).toThrow(
        /Invalid session state transition/,
      );
    }
  });

  it("allows no-op state assignment without treating it as a transition", () => {
    const accepted: ReadonlyArray<readonly [string, string]> = [
      ["READY", "READY"],
      ["BUSY", "READY"],
    ];
    for (const [from, to] of accepted) {
      expect(() => assertSessionStateAssignment(from, to)).not.toThrow();
    }

    const assignFromFailed = (): void => assertSessionStateAssignment("FAILED_NEEDS_HUMAN", "READY");
    expect(assignFromFailed).toThrow(/Invalid session state assignment/);
  });

  it("retries recoverable errors for one initial prompt send plus two retries", async () => {
    let attempts = 0;

    // Fails with a recoverable error until the final budgeted attempt.
    const sendOnce = async (): Promise<string> => {
      attempts += 1;
      if (attempts >= SessionRecoveryBudget.sendPrompt.physicalAttempts) {
        return "sent";
      }
      throw new DevflowError("temporary prompt failure", {
        class: "recoverable",
        code: "prompt_send_transient",
      });
    };

    const result = await retryRecoverable("sendPrompt", sendOnce);

    expect(result).toBe("sent");
    expect(attempts).toBe(3);
  });

  it("throws the final recoverable error after the retry budget is exhausted", async () => {
    let attempts = 0;

    // Never succeeds, so the whole budget is consumed.
    const alwaysUnresponsive = async (): Promise<never> => {
      attempts += 1;
      throw new DevflowError("pane briefly unresponsive", {
        class: "recoverable",
        code: "pane_briefly_unresponsive",
      });
    };

    const outcome = retryRecoverable("rebootstrap", alwaysUnresponsive);
    await expect(outcome).rejects.toMatchObject({
      class: "recoverable",
      code: "pane_briefly_unresponsive",
    });

    expect(attempts).toBe(SessionRecoveryBudget.rebootstrap.physicalAttempts);
  });

  it("does not retry human-required or fatal errors", async () => {
    let attempts = 0;

    const needsOperator = async (): Promise<never> => {
      attempts += 1;
      throw new DevflowError("operator action required", {
        class: "human_required",
        code: "session_recovery_required",
      });
    };

    const outcome = retryRecoverable("resume", needsOperator);
    await expect(outcome).rejects.toMatchObject({
      class: "human_required",
      code: "session_recovery_required",
    });

    expect(attempts).toBe(1);
  });

  it("uses the locked hung-session timeout boundary", () => {
    const now = new Date("2026-05-13T10:20:00.000Z");
    // Exactly 20 minutes before `now`, and one second short of that.
    const atBoundary = new Date("2026-05-13T10:00:00.000Z");
    const insideBoundary = new Date("2026-05-13T10:00:01.000Z");

    expect(isSessionHung(atBoundary, now)).toBe(true);
    expect(isSessionHung(insideBoundary, now)).toBe(false);
    expect(isSessionHung(undefined, now)).toBe(false);
  });
});
|
||||
127
packages/session/src/recovery.ts
Normal file
127
packages/session/src/recovery.ts
Normal file
@@ -0,0 +1,127 @@
|
||||
import { DevflowError, SessionState, type SessionState as SessionStateName } from "@devflow/core";
|
||||
|
||||
/**
 * Retry budgets for session recovery operations, plus the hung-session
 * threshold.
 *
 * Every operation entry carries `retries` and `physicalAttempts`; in each entry
 * below `physicalAttempts` is `retries + 1` (the initial try plus the retries).
 * The outer object and every nested entry are frozen.
 */
export const SessionRecoveryBudget = Object.freeze({
  sendPrompt: Object.freeze({
    retries: 2,
    physicalAttempts: 3,
  }),
  resume: Object.freeze({
    retries: 2,
    physicalAttempts: 3,
  }),
  rebootstrap: Object.freeze({
    retries: 1,
    physicalAttempts: 2,
  }),
  artifactRepair: Object.freeze({
    retries: 1,
    physicalAttempts: 2,
  }),
  // 20 minutes, expressed in milliseconds.
  maxHungMs: 1000 * 60 * 20,
});
|
||||
|
||||
/** Names of the recovery operations that have a retry budget. */
export type SessionRetryOperation =
  | "sendPrompt"
  | "resume"
  | "rebootstrap"
  | "artifactRepair";
|
||||
|
||||
/**
 * Adjacency table of the session state machine: each key is a source state and
 * its set holds every state that may directly follow it.
 *
 * Every state except `FAILED_NEEDS_HUMAN` lists `FAILED_NEEDS_HUMAN` as a
 * successor; `FAILED_NEEDS_HUMAN` itself has no successors. No state lists
 * itself, so self-transitions are not part of this table.
 */
const allowedSessionTransitions: ReadonlyMap<SessionStateName, ReadonlySet<SessionStateName>> =
  new Map<SessionStateName, ReadonlySet<SessionStateName>>([
    ["CREATED", new Set<SessionStateName>(["BOOTSTRAPPING", "FAILED_NEEDS_HUMAN"])],
    ["BOOTSTRAPPING", new Set<SessionStateName>(["READY", "FAILED_NEEDS_HUMAN"])],
    ["READY", new Set<SessionStateName>(["BUSY", "FAILED_NEEDS_HUMAN"])],
    [
      "BUSY",
      new Set<SessionStateName>([
        "READY",
        "WAITING_FOR_APPROVAL",
        "ARTIFACT_TIMEOUT",
        "HUNG",
        "CRASHED",
        "FAILED_NEEDS_HUMAN",
      ]),
    ],
    ["WAITING_FOR_APPROVAL", new Set<SessionStateName>(["READY", "FAILED_NEEDS_HUMAN"])],
    // The three failure-detection states all recover through RESUMING.
    ["ARTIFACT_TIMEOUT", new Set<SessionStateName>(["RESUMING", "FAILED_NEEDS_HUMAN"])],
    ["HUNG", new Set<SessionStateName>(["RESUMING", "FAILED_NEEDS_HUMAN"])],
    ["CRASHED", new Set<SessionStateName>(["RESUMING", "FAILED_NEEDS_HUMAN"])],
    ["RESUMING", new Set<SessionStateName>(["READY", "REBOOTSTRAPPED", "FAILED_NEEDS_HUMAN"])],
    ["REBOOTSTRAPPED", new Set<SessionStateName>(["READY", "FAILED_NEEDS_HUMAN"])],
    ["FAILED_NEEDS_HUMAN", new Set<SessionStateName>()],
  ]);
|
||||
|
||||
/**
 * Reports whether a session may move directly from `from` to `to`.
 *
 * Both names are validated with `SessionState.safeParse`; if either one fails
 * to parse the answer is `false`. Otherwise the result is whether the
 * transition table lists `to` among the successors of `from`.
 *
 * @param from - Name of the current session state.
 * @param to - Name of the proposed next session state.
 * @returns `true` only for a transition listed in the table.
 */
export function isAllowedSessionTransition(from: string, to: string): boolean {
  const source = SessionState.safeParse(from);
  const target = SessionState.safeParse(to);

  if (source.success && target.success) {
    const successors = allowedSessionTransitions.get(source.data);
    // A source state without a table entry permits nothing.
    return successors === undefined ? false : successors.has(target.data);
  }

  return false;
}
|
||||
|
||||
/**
 * Reports whether the state `to` may be written over the state `from`.
 *
 * Unlike a transition check, writing a valid state over itself is accepted.
 * Any other pair is accepted only if `isAllowedSessionTransition` accepts it.
 * Names that fail `SessionState.safeParse` are always rejected.
 *
 * @param from - Name of the currently stored session state.
 * @param to - Name of the session state about to be stored.
 * @returns `true` for a same-state write or an allowed transition.
 */
export function isAllowedSessionStateAssignment(from: string, to: string): boolean {
  const current = SessionState.safeParse(from);
  const next = SessionState.safeParse(to);

  if (!(current.success && next.success)) {
    return false;
  }

  return current.data === next.data || isAllowedSessionTransition(current.data, next.data);
}
|
||||
|
||||
/**
 * Throws unless `from` -> `to` is an allowed session state transition.
 *
 * @param from - Name of the current session state.
 * @param to - Name of the proposed next session state.
 * @throws DevflowError with class `fatal` and code `internal_state_corruption`
 *   when `isAllowedSessionTransition` rejects the pair; the `recoveryHint`
 *   records the attempted pair as `from->to`.
 */
export function assertSessionTransition(from: string, to: string): void {
  if (!isAllowedSessionTransition(from, to)) {
    throw new DevflowError("Invalid session state transition", {
      class: "fatal",
      code: "internal_state_corruption",
      recoveryHint: `${from}->${to}`,
    });
  }
}
|
||||
|
||||
/**
 * Throws unless the state `to` may be written over the state `from`.
 *
 * @param from - Name of the currently stored session state.
 * @param to - Name of the session state about to be stored.
 * @throws DevflowError with class `fatal` and code `internal_state_corruption`
 *   when `isAllowedSessionStateAssignment` rejects the pair; the
 *   `recoveryHint` records the attempted pair as `from->to`.
 */
export function assertSessionStateAssignment(from: string, to: string): void {
  if (!isAllowedSessionStateAssignment(from, to)) {
    throw new DevflowError("Invalid session state assignment", {
      class: "fatal",
      code: "internal_state_corruption",
      recoveryHint: `${from}->${to}`,
    });
  }
}
|
||||
|
||||
/**
 * Runs `run` until it succeeds or the physical-attempt budget of `operation`
 * is used up.
 *
 * `run` receives the 1-based number of the current attempt. Attempts follow
 * each other immediately; this function inserts no delay between them.
 *
 * @param operation - Which recovery budget to apply.
 * @param run - The action to attempt.
 * @returns The value resolved by the first successful attempt.
 * @throws Immediately rethrows any error that is not a `DevflowError` with
 *   class `recoverable`. Once the budget is exhausted, throws the error from
 *   the last attempt.
 */
export async function retryRecoverable<T>(
  operation: SessionRetryOperation,
  run: (physicalAttempt: number) => Promise<T>,
): Promise<T> {
  const budget = recoveryPhysicalAttempts(operation);
  let failure: unknown;
  let physicalAttempt = 0;

  while (physicalAttempt < budget) {
    physicalAttempt += 1;
    try {
      return await run(physicalAttempt);
    } catch (error) {
      const retryable = error instanceof DevflowError && error.class === "recoverable";
      if (!retryable) {
        throw error;
      }
      // Remember the recoverable failure in case this was the last attempt.
      failure = error;
    }
  }

  throw failure;
}
|
||||
|
||||
/**
 * Reports whether a session has produced no output for at least `maxHungMs`.
 *
 * @param lastOutputAt - Time of the most recent output, or `undefined` when
 *   none has been recorded; an `undefined` value is never considered hung.
 * @param now - The reference time to measure against.
 * @param maxHungMs - Threshold in milliseconds; defaults to
 *   `SessionRecoveryBudget.maxHungMs`.
 * @returns `true` when `now - lastOutputAt` is greater than or equal to the
 *   threshold (the boundary itself counts as hung).
 */
export function isSessionHung(
  lastOutputAt: Date | undefined,
  now: Date,
  maxHungMs = SessionRecoveryBudget.maxHungMs,
): boolean {
  if (lastOutputAt !== undefined) {
    const silentForMs = now.getTime() - lastOutputAt.getTime();
    return silentForMs >= maxHungMs;
  }

  return false;
}
|
||||
|
||||
/**
 * Looks up how many physical attempts `SessionRecoveryBudget` grants to the
 * given operation.
 *
 * @param operation - The recovery operation being budgeted.
 * @returns The `physicalAttempts` value of the matching budget entry.
 */
function recoveryPhysicalAttempts(operation: SessionRetryOperation): number {
  const { sendPrompt, resume, rebootstrap, artifactRepair } = SessionRecoveryBudget;

  switch (operation) {
    case "sendPrompt":
      return sendPrompt.physicalAttempts;
    case "resume":
      return resume.physicalAttempts;
    case "rebootstrap":
      return rebootstrap.physicalAttempts;
    case "artifactRepair":
      return artifactRepair.physicalAttempts;
  }
}
|
||||
Reference in New Issue
Block a user