feat: add tui recovery state machine

This commit is contained in:
chungyeong
2026-05-14 00:14:27 +09:00
parent ef4c56e6b0
commit e5020a59f0
15 changed files with 1414 additions and 97 deletions

View File

@@ -1,5 +1,6 @@
export * from "./adapter.js";
export * from "./fake.js";
export * from "./manager.js";
export * from "./recovery.js";
export * from "./transcript.js";
export * from "./tmux.js";

View File

@@ -19,6 +19,7 @@ import type {
TranscriptBaseline,
TranscriptChunk,
} from "./adapter.js";
import { assertSessionTransition, retryRecoverable } from "./recovery.js";
import { captureAndPersistTranscript } from "./transcript.js";
type Database = DbClient["db"];
@@ -298,6 +299,12 @@ export class SessionManager implements SessionRuntime {
if (this.db === undefined || !["CREATED", "BOOTSTRAPPING"].includes(session.state)) {
return;
}
if (session.state === "CREATED") {
assertSessionTransition("CREATED", "BOOTSTRAPPING");
assertSessionTransition("BOOTSTRAPPING", "READY");
} else {
assertSessionTransition(session.state, "READY");
}
const eventRepository = new RunEventRepository(this.db);
const sessionUpdate: {
@@ -345,6 +352,7 @@ export class SessionManager implements SessionRuntime {
backend: string;
cwd: string;
recoveryAttempts: number;
state: string;
},
error: unknown,
): Promise<void> {
@@ -357,6 +365,7 @@ export class SessionManager implements SessionRuntime {
const gateKey = "session_recovery_required";
const approvalIdempotencyKey = `${session.runId}:${gateKey}:${session.id}:${recoveryAttempts}`;
const pauseCause = `session_recovery_failed:${session.id}:${recoveryAttempts}`;
assertSessionTransition(session.state, "FAILED_NEEDS_HUMAN");
await this.db.transaction(async (tx) => {
await tx.execute(sql`SELECT 1 FROM ${runs} WHERE ${runs.id} = ${session.runId} FOR UPDATE`);
const [run] = await tx
@@ -431,18 +440,7 @@ export class SessionManager implements SessionRuntime {
}
/**
 * Resumes a session through the adapter, retrying recoverable failures.
 *
 * Retry policy is delegated to `retryRecoverable("resume", ...)`, so the
 * number of physical attempts comes from the shared "resume" budget instead
 * of a loop bound hard-coded here. Each attempt issues a fresh
 * `adapter.resume(handle)` call wrapped in `this.track(...)`.
 *
 * @param handle - Handle of the session to resume.
 * @returns The handle produced by the first successful resume attempt.
 * @throws The first error that is not a `DevflowError` of class
 *   "recoverable", or the last recoverable error once the budget is spent.
 */
private async resumeWithRetry(handle: SessionHandle): Promise<SessionHandle> {
  return retryRecoverable("resume", () => this.track(this.adapter.resume(handle)));
}
private async loadTranscriptBaseline(

View File

@@ -0,0 +1,103 @@
import { describe, expect, it } from "vitest";
import { DevflowError } from "@devflow/core";
import {
SessionRecoveryBudget,
assertSessionStateAssignment,
assertSessionTransition,
isSessionHung,
retryRecoverable,
} from "./recovery.js";
// Specification tests for the session recovery policy: the state-transition
// table, the per-operation retry budgets, and the hung-session time boundary.
describe("session recovery policy", () => {
  it("allows only locked session state-machine transitions", () => {
    // [from, to] pairs that the transition table must accept.
    const legalTransitions: ReadonlyArray<readonly [string, string]> = [
      ["CREATED", "BOOTSTRAPPING"],
      ["BOOTSTRAPPING", "READY"],
      ["READY", "BUSY"],
      ["BUSY", "READY"],
      ["BUSY", "ARTIFACT_TIMEOUT"],
      ["ARTIFACT_TIMEOUT", "RESUMING"],
      ["RESUMING", "REBOOTSTRAPPED"],
      ["REBOOTSTRAPPED", "READY"],
    ];
    for (const [from, to] of legalTransitions) {
      expect(() => assertSessionTransition(from, to)).not.toThrow();
    }

    // Pairs that must be rejected: a skipped step, a self-transition, and an
    // exit from the terminal state.
    const illegalTransitions: ReadonlyArray<readonly [string, string]> = [
      ["READY", "REBOOTSTRAPPED"],
      ["CRASHED", "CRASHED"],
      ["FAILED_NEEDS_HUMAN", "READY"],
    ];
    for (const [from, to] of illegalTransitions) {
      expect(() => assertSessionTransition(from, to)).toThrow(/Invalid session state transition/);
    }
  });

  it("allows no-op state assignment without treating it as a transition", () => {
    const assigning = (from: string, to: string) => () => assertSessionStateAssignment(from, to);

    expect(assigning("READY", "READY")).not.toThrow();
    expect(assigning("BUSY", "READY")).not.toThrow();
    expect(assigning("FAILED_NEEDS_HUMAN", "READY")).toThrow(/Invalid session state assignment/);
  });

  it("retries recoverable errors for one initial prompt send plus two retries", async () => {
    const budget = SessionRecoveryBudget.sendPrompt.physicalAttempts;
    let calls = 0;

    // Fail on every call until the last one the budget allows.
    const outcome = await retryRecoverable("sendPrompt", async () => {
      calls += 1;
      if (calls >= budget) {
        return "sent";
      }
      throw new DevflowError("temporary prompt failure", {
        class: "recoverable",
        code: "prompt_send_transient",
      });
    });

    expect(outcome).toBe("sent");
    expect(calls).toBe(3);
  });

  it("throws the final recoverable error after the retry budget is exhausted", async () => {
    let calls = 0;
    const alwaysFailing = async (): Promise<never> => {
      calls += 1;
      throw new DevflowError("pane briefly unresponsive", {
        class: "recoverable",
        code: "pane_briefly_unresponsive",
      });
    };

    await expect(retryRecoverable("rebootstrap", alwaysFailing)).rejects.toMatchObject({
      class: "recoverable",
      code: "pane_briefly_unresponsive",
    });
    expect(calls).toBe(SessionRecoveryBudget.rebootstrap.physicalAttempts);
  });

  it("does not retry human-required or fatal errors", async () => {
    let calls = 0;
    const needsOperator = async (): Promise<never> => {
      calls += 1;
      throw new DevflowError("operator action required", {
        class: "human_required",
        code: "session_recovery_required",
      });
    };

    await expect(retryRecoverable("resume", needsOperator)).rejects.toMatchObject({
      class: "human_required",
      code: "session_recovery_required",
    });
    // A single call proves the error was rethrown without any retry.
    expect(calls).toBe(1);
  });

  it("uses the locked hung-session timeout boundary", () => {
    const now = new Date("2026-05-13T10:20:00.000Z");
    const exactlyAtLimit = new Date("2026-05-13T10:00:00.000Z");
    const oneSecondInside = new Date("2026-05-13T10:00:01.000Z");

    expect(isSessionHung(exactlyAtLimit, now)).toBe(true);
    expect(isSessionHung(oneSecondInside, now)).toBe(false);
    expect(isSessionHung(undefined, now)).toBe(false);
  });
});

View File

@@ -0,0 +1,127 @@
import { DevflowError, SessionState, type SessionState as SessionStateName } from "@devflow/core";
/**
 * Builds one frozen retry-budget entry.
 *
 * @param retries - Number of retries allowed after the first attempt.
 * @returns A frozen object whose `physicalAttempts` is `retries + 1`
 *   (the first attempt plus every retry).
 */
function frozenRetryBudget(
  retries: number,
): Readonly<{ retries: number; physicalAttempts: number }> {
  return Object.freeze({ retries, physicalAttempts: retries + 1 });
}

/**
 * Frozen limits for session recovery.
 *
 * Each operation entry carries its retry count and the matching total number
 * of physical attempts. `maxHungMs` is the default threshold, in
 * milliseconds, used by `isSessionHung`.
 */
export const SessionRecoveryBudget = Object.freeze({
  sendPrompt: frozenRetryBudget(2),
  resume: frozenRetryBudget(2),
  rebootstrap: frozenRetryBudget(1),
  artifactRepair: frozenRetryBudget(1),
  maxHungMs: 1000 * 60 * 20, // 20 minutes
});

/** Operations that have a retry budget in `SessionRecoveryBudget`. */
export type SessionRetryOperation =
  | "sendPrompt"
  | "resume"
  | "rebootstrap"
  | "artifactRepair";
/** Collects the states reachable from a single source state into a set. */
const transitionTargets = (...targets: SessionStateName[]): ReadonlySet<SessionStateName> =>
  new Set<SessionStateName>(targets);

/**
 * Session state machine: maps each state to the states it may move to.
 *
 * Every state except FAILED_NEEDS_HUMAN may move to FAILED_NEEDS_HUMAN;
 * FAILED_NEEDS_HUMAN itself has no outgoing transitions. Self-transitions
 * are not listed for any state.
 */
const allowedSessionTransitions: ReadonlyMap<
  SessionStateName,
  ReadonlySet<SessionStateName>
> = new Map<SessionStateName, ReadonlySet<SessionStateName>>([
  ["CREATED", transitionTargets("BOOTSTRAPPING", "FAILED_NEEDS_HUMAN")],
  ["BOOTSTRAPPING", transitionTargets("READY", "FAILED_NEEDS_HUMAN")],
  ["READY", transitionTargets("BUSY", "FAILED_NEEDS_HUMAN")],
  [
    "BUSY",
    transitionTargets(
      "READY",
      "WAITING_FOR_APPROVAL",
      "ARTIFACT_TIMEOUT",
      "HUNG",
      "CRASHED",
      "FAILED_NEEDS_HUMAN",
    ),
  ],
  ["WAITING_FOR_APPROVAL", transitionTargets("READY", "FAILED_NEEDS_HUMAN")],
  ["ARTIFACT_TIMEOUT", transitionTargets("RESUMING", "FAILED_NEEDS_HUMAN")],
  ["HUNG", transitionTargets("RESUMING", "FAILED_NEEDS_HUMAN")],
  ["CRASHED", transitionTargets("RESUMING", "FAILED_NEEDS_HUMAN")],
  ["RESUMING", transitionTargets("READY", "REBOOTSTRAPPED", "FAILED_NEEDS_HUMAN")],
  ["REBOOTSTRAPPED", transitionTargets("READY", "FAILED_NEEDS_HUMAN")],
  ["FAILED_NEEDS_HUMAN", transitionTargets()],
]);
/**
 * Reports whether moving a session from `from` to `to` is a listed
 * transition in the session state machine.
 *
 * @param from - Current state name; need not be a valid state.
 * @param to - Requested state name; need not be a valid state.
 * @returns `true` only when both names parse as session states and the
 *   transition table lists `to` for `from`. Unparseable names and identical
 *   states yield `false`.
 */
export function isAllowedSessionTransition(from: string, to: string): boolean {
  const source = SessionState.safeParse(from);
  if (!source.success) {
    return false;
  }
  const target = SessionState.safeParse(to);
  if (!target.success) {
    return false;
  }
  const reachable = allowedSessionTransitions.get(source.data);
  return reachable !== undefined && reachable.has(target.data);
}
/**
 * Reports whether a session whose state is `from` may be assigned `to`.
 *
 * Unlike a transition check, assigning a state to itself is accepted as a
 * no-op. Any other pair must be a listed transition.
 *
 * @param from - Current state name; need not be a valid state.
 * @param to - State name being assigned; need not be a valid state.
 * @returns `false` when either name fails to parse as a session state;
 *   otherwise `true` for identical states or an allowed transition.
 */
export function isAllowedSessionStateAssignment(from: string, to: string): boolean {
  const current = SessionState.safeParse(from);
  const next = SessionState.safeParse(to);
  if (current.success && next.success) {
    return current.data === next.data || isAllowedSessionTransition(current.data, next.data);
  }
  return false;
}
/**
 * Throws unless `from` -> `to` is an allowed session state transition.
 *
 * @param from - Current state name.
 * @param to - Requested state name.
 * @throws DevflowError with class "fatal" and code
 *   "internal_state_corruption"; `recoveryHint` carries the rejected pair
 *   formatted as `from->to`.
 */
export function assertSessionTransition(from: string, to: string): void {
  if (!isAllowedSessionTransition(from, to)) {
    const rejectedPair = `${from}->${to}`;
    throw new DevflowError("Invalid session state transition", {
      class: "fatal",
      code: "internal_state_corruption",
      recoveryHint: rejectedPair,
    });
  }
}
/**
 * Throws unless a session in state `from` may be assigned state `to`.
 *
 * Assigning the same valid state again is accepted; see
 * `isAllowedSessionStateAssignment` for the exact rule.
 *
 * @param from - Current state name.
 * @param to - State name being assigned.
 * @throws DevflowError with class "fatal" and code
 *   "internal_state_corruption"; `recoveryHint` carries the rejected pair
 *   formatted as `from->to`.
 */
export function assertSessionStateAssignment(from: string, to: string): void {
  if (!isAllowedSessionStateAssignment(from, to)) {
    const rejectedPair = `${from}->${to}`;
    throw new DevflowError("Invalid session state assignment", {
      class: "fatal",
      code: "internal_state_corruption",
      recoveryHint: rejectedPair,
    });
  }
}
/**
 * Runs `run` until it succeeds, retrying only recoverable failures and only
 * within the attempt budget configured for `operation`.
 *
 * @param operation - Selects which budget bounds the number of attempts.
 * @param run - The work to perform; receives the 1-based attempt number.
 * @returns The value resolved by the first successful attempt.
 * @throws Immediately rethrows any error that is not a `DevflowError` of
 *   class "recoverable". Once the budget is spent, throws the recoverable
 *   error raised by the final attempt.
 */
export async function retryRecoverable<T>(
  operation: SessionRetryOperation,
  run: (physicalAttempt: number) => Promise<T>,
): Promise<T> {
  const attemptBudget = recoveryPhysicalAttempts(operation);
  let finalFailure: unknown;
  let attemptNumber = 0;
  while (attemptNumber < attemptBudget) {
    attemptNumber += 1;
    try {
      return await run(attemptNumber);
    } catch (caught) {
      const isRetryable = caught instanceof DevflowError && caught.class === "recoverable";
      if (!isRetryable) {
        throw caught;
      }
      // Remember the failure so it can be surfaced if no attempt succeeds.
      finalFailure = caught;
    }
  }
  throw finalFailure;
}
/**
 * Reports whether a session has produced no output for at least `maxHungMs`.
 *
 * @param lastOutputAt - Time of the most recent output, or `undefined` when
 *   no output time is known.
 * @param now - The instant to measure against.
 * @param maxHungMs - Threshold in milliseconds; defaults to
 *   `SessionRecoveryBudget.maxHungMs`.
 * @returns `false` when `lastOutputAt` is `undefined`; otherwise `true` when
 *   the elapsed time is greater than or equal to `maxHungMs` (the boundary
 *   itself counts as hung).
 */
export function isSessionHung(
  lastOutputAt: Date | undefined,
  now: Date,
  maxHungMs = SessionRecoveryBudget.maxHungMs,
): boolean {
  return lastOutputAt !== undefined && now.getTime() - lastOutputAt.getTime() >= maxHungMs;
}
/**
 * Looks up how many physical attempts (first try plus retries) the recovery
 * budget grants to `operation`.
 *
 * @param operation - The retryable operation to look up.
 * @returns The `physicalAttempts` value of that operation's budget entry.
 */
function recoveryPhysicalAttempts(operation: SessionRetryOperation): number {
  // Every SessionRetryOperation is also a key of SessionRecoveryBudget, so a
  // direct indexed lookup selects the same entry a per-case switch would.
  return SessionRecoveryBudget[operation].physicalAttempts;
}