feat: add tui recovery state machine

This commit is contained in:
chungyeong
2026-05-14 00:14:27 +09:00
parent ef4c56e6b0
commit e5020a59f0
15 changed files with 1414 additions and 97 deletions

View File

@@ -4,3 +4,4 @@ LOG_LEVEL=info
TEMPORAL_ADDRESS=localhost:7233 TEMPORAL_ADDRESS=localhost:7233
DEVFLOW_POSTGRES_PORT=55432 DEVFLOW_POSTGRES_PORT=55432
DEVFLOW_BACKENDS_JSON=[{"id":"fake","enabled":true}] DEVFLOW_BACKENDS_JSON=[{"id":"fake","enabled":true}]
SESSION_MAX_HUNG_MS=1200000

View File

@@ -28,6 +28,7 @@ export interface StartM4ApiOptions {
sessionManager?: SessionManager; sessionManager?: SessionManager;
runEngine?: RunEngine; runEngine?: RunEngine;
maxConcurrentRuns?: number; maxConcurrentRuns?: number;
sessionMaxHungMs?: number;
} }
export interface StartM4ApiResult { export interface StartM4ApiResult {
@@ -50,6 +51,7 @@ export interface StartTemporalApiOptions {
availableBackends?: readonly BackendConfig[]; availableBackends?: readonly BackendConfig[];
maxConcurrentRuns?: number; maxConcurrentRuns?: number;
workspaceRoot?: string; workspaceRoot?: string;
sessionMaxHungMs?: number;
} }
export interface StartTemporalApiResult { export interface StartTemporalApiResult {
@@ -69,6 +71,7 @@ export async function startM4Api(options: StartM4ApiOptions = {}): Promise<Start
const config = ownedClient || options.workspaceRoot === undefined ? getConfig() : undefined; const config = ownedClient || options.workspaceRoot === undefined ? getConfig() : undefined;
const dbClient = const dbClient =
options.dbClient ?? createDbClient(config?.DATABASE_URL ?? getConfig().DATABASE_URL); options.dbClient ?? createDbClient(config?.DATABASE_URL ?? getConfig().DATABASE_URL);
const sessionMaxHungMs = options.sessionMaxHungMs ?? config?.SESSION_MAX_HUNG_MS;
const sessionManager = const sessionManager =
options.sessionManager ?? options.sessionManager ??
new SessionManager({ new SessionManager({
@@ -90,6 +93,7 @@ export async function startM4Api(options: StartM4ApiOptions = {}): Promise<Start
...(options.maxConcurrentRuns === undefined ...(options.maxConcurrentRuns === undefined
? {} ? {}
: { maxConcurrentRuns: options.maxConcurrentRuns }), : { maxConcurrentRuns: options.maxConcurrentRuns }),
...(sessionMaxHungMs === undefined ? {} : { recovery: { maxHungMs: sessionMaxHungMs } }),
}); });
try { try {
@@ -158,6 +162,7 @@ export async function startTemporalApi(
const replayValidationBackends = options.availableBackends ?? config?.backends; const replayValidationBackends = options.availableBackends ?? config?.backends;
const replayValidationMaxConcurrentRuns = const replayValidationMaxConcurrentRuns =
options.maxConcurrentRuns ?? config?.MAX_CONCURRENT_RUNS; options.maxConcurrentRuns ?? config?.MAX_CONCURRENT_RUNS;
const replayValidationSessionMaxHungMs = options.sessionMaxHungMs ?? config?.SESSION_MAX_HUNG_MS;
const replayValidationEngine = new DbRunEngine({ const replayValidationEngine = new DbRunEngine({
db: dbClient.db, db: dbClient.db,
sessions: dbOnlySessionRuntime(), sessions: dbOnlySessionRuntime(),
@@ -168,6 +173,9 @@ export async function startTemporalApi(
...(replayValidationMaxConcurrentRuns === undefined ...(replayValidationMaxConcurrentRuns === undefined
? {} ? {}
: { maxConcurrentRuns: replayValidationMaxConcurrentRuns }), : { maxConcurrentRuns: replayValidationMaxConcurrentRuns }),
...(replayValidationSessionMaxHungMs === undefined
? {}
: { recovery: { maxHungMs: replayValidationSessionMaxHungMs } }),
}); });
const engine = new TemporalRunEngine({ const engine = new TemporalRunEngine({
client: temporalClient, client: temporalClient,

View File

@@ -110,6 +110,7 @@ describe("startWorker", () => {
TEMPORAL_ADDRESS: "localhost:7233", TEMPORAL_ADDRESS: "localhost:7233",
WORKSPACE_ROOT: worktreeRoot, WORKSPACE_ROOT: worktreeRoot,
MAX_CONCURRENT_RUNS: 4, MAX_CONCURRENT_RUNS: 4,
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
backends: [{ id: "fake", enabled: true }], backends: [{ id: "fake", enabled: true }],
}, },
dbClient: client, dbClient: client,
@@ -148,6 +149,7 @@ describe("startWorker", () => {
TEMPORAL_ADDRESS: "localhost:7233", TEMPORAL_ADDRESS: "localhost:7233",
WORKSPACE_ROOT: realpathSync(mkdtempSync(join(tmpdir(), "devflow-worker-workspace-"))), WORKSPACE_ROOT: realpathSync(mkdtempSync(join(tmpdir(), "devflow-worker-workspace-"))),
MAX_CONCURRENT_RUNS: 4, MAX_CONCURRENT_RUNS: 4,
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
backends: [{ id: "fake", enabled: true }], backends: [{ id: "fake", enabled: true }],
}, },
dbClient: client, dbClient: client,
@@ -165,6 +167,7 @@ describe("startWorker", () => {
TEMPORAL_ADDRESS: "localhost:7233", TEMPORAL_ADDRESS: "localhost:7233",
WORKSPACE_ROOT: realpathSync(mkdtempSync(join(tmpdir(), "devflow-worker-workspace-"))), WORKSPACE_ROOT: realpathSync(mkdtempSync(join(tmpdir(), "devflow-worker-workspace-"))),
MAX_CONCURRENT_RUNS: 4, MAX_CONCURRENT_RUNS: 4,
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
backends: [{ id: "fake", enabled: true }], backends: [{ id: "fake", enabled: true }],
}, },
dbClient: client, dbClient: client,
@@ -191,6 +194,7 @@ describe("startWorker", () => {
TEMPORAL_ADDRESS: "localhost:7233", TEMPORAL_ADDRESS: "localhost:7233",
WORKSPACE_ROOT: workspaceRoot, WORKSPACE_ROOT: workspaceRoot,
MAX_CONCURRENT_RUNS: 4, MAX_CONCURRENT_RUNS: 4,
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
backends: [{ id: "fake", enabled: true }], backends: [{ id: "fake", enabled: true }],
}, },
dbClient: client, dbClient: client,
@@ -211,6 +215,7 @@ describe("startWorker", () => {
TEMPORAL_ADDRESS: "localhost:7233", TEMPORAL_ADDRESS: "localhost:7233",
WORKSPACE_ROOT: workspaceRoot, WORKSPACE_ROOT: workspaceRoot,
MAX_CONCURRENT_RUNS: 4, MAX_CONCURRENT_RUNS: 4,
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
backends: [{ id: "fake", enabled: true }], backends: [{ id: "fake", enabled: true }],
}, },
dbClient: client, dbClient: client,
@@ -233,6 +238,7 @@ describe("startWorker", () => {
TEMPORAL_ADDRESS: "localhost:7233", TEMPORAL_ADDRESS: "localhost:7233",
WORKSPACE_ROOT: workspaceRoot, WORKSPACE_ROOT: workspaceRoot,
MAX_CONCURRENT_RUNS: 4, MAX_CONCURRENT_RUNS: 4,
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
backends: [{ id: "fake", enabled: true }], backends: [{ id: "fake", enabled: true }],
}, },
dbClient: client, dbClient: client,
@@ -251,6 +257,7 @@ describe("startWorker", () => {
TEMPORAL_ADDRESS: "localhost:7233", TEMPORAL_ADDRESS: "localhost:7233",
WORKSPACE_ROOT: workspaceRoot, WORKSPACE_ROOT: workspaceRoot,
MAX_CONCURRENT_RUNS: 4, MAX_CONCURRENT_RUNS: 4,
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
backends: [{ id: "fake", enabled: true }], backends: [{ id: "fake", enabled: true }],
}, },
dbClient: client, dbClient: client,

View File

@@ -51,6 +51,7 @@ export async function startWorker(options: StartWorkerOptions = {}) {
workspaceRoot: config.WORKSPACE_ROOT, workspaceRoot: config.WORKSPACE_ROOT,
availableBackends: config.backends, availableBackends: config.backends,
maxConcurrentRuns: config.MAX_CONCURRENT_RUNS, maxConcurrentRuns: config.MAX_CONCURRENT_RUNS,
recovery: { maxHungMs: config.SESSION_MAX_HUNG_MS },
}), }),
connection: connection as NativeConnection, connection: connection as NativeConnection,
namespace: "devflow", namespace: "devflow",

View File

@@ -50,6 +50,26 @@ describe("config loader", () => {
}); });
expect(config.backends).toContainEqual({ id: "fake", enabled: true }); expect(config.backends).toContainEqual({ id: "fake", enabled: true });
expect(config.SESSION_MAX_HUNG_MS).toBe(20 * 60 * 1000);
});
// Verifies that SESSION_MAX_HUNG_MS supplied as the string "2500" in the env source
// is exposed on the loaded config as the number 2500.
it("loads configurable session hung timeout", () => {
// Throwaway root directory with a `workspace` child that is passed as WORKSPACE_ROOT.
const root = mkdtempSync(join(tmpdir(), "devflow-config-"));
const workspace = join(root, "workspace");
mkdirSync(workspace);
const config = loadConfigFromSources({
cwd: root,
env: {
DATABASE_URL: "postgres://devflow:devflow@localhost:5432/devflow",
WORKSPACE_ROOT: workspace,
LOG_LEVEL: "info",
TEMPORAL_ADDRESS: "localhost:7233",
// String on purpose: the assertion below expects a numeric value back.
SESSION_MAX_HUNG_MS: "2500",
},
});
expect(config.SESSION_MAX_HUNG_MS).toBe(2500);
}); });
it("resolves backend binaries from PATH during config load", () => { it("resolves backend binaries from PATH during config load", () => {

View File

@@ -22,6 +22,11 @@ const RawConfigSchema = z.object({
LOG_LEVEL: LogLevel, LOG_LEVEL: LogLevel,
TEMPORAL_ADDRESS: z.string().min(1), TEMPORAL_ADDRESS: z.string().min(1),
MAX_CONCURRENT_RUNS: z.coerce.number().int().positive().default(4), MAX_CONCURRENT_RUNS: z.coerce.number().int().positive().default(4),
SESSION_MAX_HUNG_MS: z.coerce
.number()
.int()
.positive()
.default(20 * 60 * 1000),
backends: z.array(BackendConfig).default([{ id: "fake", enabled: true }]), backends: z.array(BackendConfig).default([{ id: "fake", enabled: true }]),
}); });

View File

@@ -32,6 +32,7 @@ import {
} from "@devflow/db"; } from "@devflow/db";
import { import {
FakeSessionAdapter, FakeSessionAdapter,
type ProbeResult,
type SessionAdapter, type SessionAdapter,
type SessionHandle, type SessionHandle,
SessionManager, SessionManager,
@@ -161,6 +162,16 @@ class CaptureFailsAfterDisposeFakeAdapter extends FakeSessionAdapter {
} }
} }
/**
 * Fake adapter that backdates `lastOutputAt` on healthy probe results.
 *
 * When the base probe reports the session as alive with an active pane, the result is
 * returned with `lastOutputAt` set to 5 ms before the current time. Any other probe
 * result is passed through untouched.
 */
class RecentlyHungProbeFakeAdapter extends FakeSessionAdapter {
  /**
   * Probes through the base adapter and overrides `lastOutputAt` for live, active panes.
   *
   * @param handle Session handle forwarded to the base `probe`.
   * @returns The base probe result, with `lastOutputAt` replaced when alive and active.
   */
  override async probe(handle: SessionHandle): Promise<ProbeResult> {
    const probed = await super.probe(handle);
    if (probed.alive && probed.paneActive) {
      // Only a responsive pane gets the synthetic "last output 5 ms ago" timestamp.
      return { ...probed, lastOutputAt: new Date(Date.now() - 5) };
    }
    return probed;
  }
}
class TerminalHandleRecordingRuntime implements SessionRuntime { class TerminalHandleRecordingRuntime implements SessionRuntime {
readonly adapter = new FakeSessionAdapter({ writeDelayMs: 0 }); readonly adapter = new FakeSessionAdapter({ writeDelayMs: 0 });
readonly captureHandles: SessionHandle[] = []; readonly captureHandles: SessionHandle[] = [];
@@ -480,6 +491,43 @@ describe("DbRunEngine", () => {
}); });
}); });
// Verifies that the `recovery.maxHungMs` option given to DbRunEngine reaches phase
// recovery: with a 1 ms threshold and an adapter whose probe reports last output 5 ms
// ago, the recorded `artifact.timeout` event must carry sessionState "HUNG".
it("passes configured session hung timeout into phase recovery", async () => {
client = createDbClient(databaseUrl);
await seedDevelopmentRegistry(client.db);
const workspaceRoot = realpathSync(mkdtempSync(join(tmpdir(), "devflow-engine-workspace-")));
const repoPath = createGitRepo();
// Registered so the suite's cleanup can remove them (cleanup hook is outside this block).
tempRoots.push(workspaceRoot, repoPath);
const engine = new DbRunEngine({
db: client.db,
sessions: sessionRuntime(client.db, new RecentlyHungProbeFakeAdapter({ writeDelayMs: 0 })),
maxConcurrentRuns: 100,
// 1 ms threshold: smaller than the 5 ms gap the fake adapter reports.
recovery: { maxHungMs: 1 },
workspaceRoot,
// Short artifact wait so the "timeout" scenario trips quickly.
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
});
const { runId } = await engine.startRun({
requirementsMd: "Classify timeout recovery using the configured hung threshold.",
repoPath,
baseBranch: "main",
scenarios: {
// First attempt of the spec phase times out; the repair attempt uses the "ok" scenario.
spec: { scenario: "timeout", repairScenario: "ok" },
},
});
runIds.push(runId);
const status = await engine.getStatus(runId);
// The run is expected to reach the spec approval gate.
expect(status.run.state).toBe("awaiting_approval");
expect(pendingApproval(status, "spec_approved")).toBeDefined();
// Fetch one timeout event for this run and check the session state it recorded.
const [timeoutEvent] = await client.db
.select({ payload: runEvents.payload })
.from(runEvents)
.where(and(eq(runEvents.runId, runId), eq(runEvents.type, "artifact.timeout")))
.limit(1);
expect(timeoutEvent?.payload).toMatchObject({ sessionState: "HUNG" });
});
it("validates a prepared run replay without accepting changed start inputs", async () => { it("validates a prepared run replay without accepting changed start inputs", async () => {
client = createDbClient(databaseUrl); client = createDbClient(databaseUrl);
await seedDevelopmentRegistry(client.db); await seedDevelopmentRegistry(client.db);

View File

@@ -96,6 +96,9 @@ export interface DbRunEngineOptions {
workspaceRoot: string; workspaceRoot: string;
availableBackends?: readonly BackendConfig[]; availableBackends?: readonly BackendConfig[];
maxConcurrentRuns?: number; maxConcurrentRuns?: number;
recovery?: {
maxHungMs?: number;
};
wait?: { wait?: {
timeoutMs?: number; timeoutMs?: number;
pollIntervalMs?: number; pollIntervalMs?: number;
@@ -179,6 +182,7 @@ export class DbRunEngine implements RunEngine {
private readonly workspaceRoot: string; private readonly workspaceRoot: string;
private readonly availableBackends: readonly BackendConfig[]; private readonly availableBackends: readonly BackendConfig[];
private readonly maxConcurrentRuns: number; private readonly maxConcurrentRuns: number;
private readonly recovery: DbRunEngineOptions["recovery"];
private readonly wait: DbRunEngineOptions["wait"]; private readonly wait: DbRunEngineOptions["wait"];
constructor(options: DbRunEngineOptions) { constructor(options: DbRunEngineOptions) {
@@ -189,6 +193,7 @@ export class DbRunEngine implements RunEngine {
{ id: "fake", enabled: true, binaryPath: undefined }, { id: "fake", enabled: true, binaryPath: undefined },
]; ];
this.maxConcurrentRuns = options.maxConcurrentRuns ?? 4; this.maxConcurrentRuns = options.maxConcurrentRuns ?? 4;
this.recovery = options.recovery;
this.wait = options.wait; this.wait = options.wait;
} }
@@ -911,6 +916,7 @@ export class DbRunEngine implements RunEngine {
context.input.requirementsMd, context.input.requirementsMd,
scenarioForPhase(context.input.extra, phaseRow.phaseKey), scenarioForPhase(context.input.extra, phaseRow.phaseKey),
), ),
...(this.recovery === undefined ? {} : { recovery: this.recovery }),
...(wait === undefined ? {} : { wait }), ...(wait === undefined ? {} : { wait }),
terminalRun: false, terminalRun: false,
...(workflowApprovalGateKey === undefined ...(workflowApprovalGateKey === undefined

View File

@@ -55,6 +55,19 @@ class RebootstrapFailsOnceFakeAdapter extends FakeSessionAdapter {
} }
} }
/**
 * Fake adapter whose `rebootstrap` never succeeds.
 *
 * Every call is counted in `rebootstrapAttempts` and then rejected with a
 * `DevflowError` of class "recoverable" and code "pane_briefly_unresponsive".
 */
class RebootstrapAlwaysRecoverableFakeAdapter extends FakeSessionAdapter {
  /** Number of times `rebootstrap` has been invoked. */
  rebootstrapAttempts = 0;

  /**
   * Records the attempt and always rejects.
   *
   * @param _handle Unused session handle.
   * @throws DevflowError with class "recoverable" on every call.
   */
  override async rebootstrap(_handle: SessionHandle): Promise<SessionHandle> {
    this.rebootstrapAttempts++;
    const failure = new DevflowError("rebootstrap retry budget exhausted", {
      class: "recoverable",
      code: "pane_briefly_unresponsive",
      recoveryHint: "pane did not become responsive after rebootstrap",
    });
    throw failure;
  }
}
class RebootstrapWritesStaleArtifactFakeAdapter extends FakeSessionAdapter { class RebootstrapWritesStaleArtifactFakeAdapter extends FakeSessionAdapter {
private expectedArtifactPath: string | undefined; private expectedArtifactPath: string | undefined;
@@ -98,6 +111,15 @@ class RebootstrapHumanRequiredFakeAdapter extends FakeSessionAdapter {
} }
} }
/**
 * Fake adapter that counts `rebootstrap` calls and otherwise defers to the base
 * implementation.
 */
class RebootstrapCountingFakeAdapter extends FakeSessionAdapter {
  /** Number of times `rebootstrap` has been invoked. */
  rebootstrapAttempts = 0;

  /**
   * Increments the counter, then delegates to the base `rebootstrap`.
   *
   * @param handle Session handle forwarded to the base implementation.
   * @returns Whatever the base `rebootstrap` resolves to.
   */
  override async rebootstrap(handle: SessionHandle): Promise<SessionHandle> {
    this.rebootstrapAttempts++;
    const delegated = super.rebootstrap(handle);
    return delegated;
  }
}
class ProbeRecoverableFakeAdapter extends FakeSessionAdapter { class ProbeRecoverableFakeAdapter extends FakeSessionAdapter {
override async probe(_handle: SessionHandle): Promise<ProbeResult> { override async probe(_handle: SessionHandle): Promise<ProbeResult> {
throw new DevflowError("recoverable probe failure", { throw new DevflowError("recoverable probe failure", {
@@ -113,6 +135,18 @@ class ProbeUnknownFailureFakeAdapter extends FakeSessionAdapter {
} }
} }
/**
 * Fake adapter whose first `probe` call throws a plain `Error`; every later call
 * delegates to the base implementation.
 */
class ProbeFailsOnceThenRecoversFakeAdapter extends FakeSessionAdapter {
  /** Number of `probe` calls seen so far. */
  private probeAttempts = 0;

  /**
   * Fails on the first invocation only.
   *
   * @param handle Session handle forwarded to the base `probe` from the second call on.
   * @throws Error on the first call.
   */
  override async probe(handle: SessionHandle): Promise<ProbeResult> {
    const isFirstProbe = this.probeAttempts === 0;
    this.probeAttempts++;
    if (isFirstProbe) {
      throw new Error("first probe failed unexpectedly");
    }
    return super.probe(handle);
  }
}
class TmuxLivenessOnlyProbeFakeAdapter extends FakeSessionAdapter { class TmuxLivenessOnlyProbeFakeAdapter extends FakeSessionAdapter {
rebootstrapAttempts = 0; rebootstrapAttempts = 0;
@@ -126,6 +160,19 @@ class TmuxLivenessOnlyProbeFakeAdapter extends FakeSessionAdapter {
} }
} }
/**
 * Fake adapter that always reports a dead pane from `probe` and counts how often
 * `rebootstrap` is requested.
 */
class DeadPaneProbeFakeAdapter extends FakeSessionAdapter {
  /** Number of times `rebootstrap` has been invoked. */
  rebootstrapAttempts = 0;

  /**
   * Reports the session as not alive with an inactive pane, without consulting the base
   * adapter.
   *
   * @param _handle Unused session handle.
   */
  override async probe(_handle: SessionHandle): Promise<ProbeResult> {
    const deadPane: ProbeResult = {
      alive: false,
      paneActive: false,
      hint: "pane exited after prompt",
    };
    return deadPane;
  }

  /**
   * Increments the counter, then delegates to the base `rebootstrap`.
   *
   * @param handle Session handle forwarded to the base implementation.
   */
  override async rebootstrap(handle: SessionHandle): Promise<SessionHandle> {
    this.rebootstrapAttempts++;
    return super.rebootstrap(handle);
  }
}
class BreakArtifactParentFakeAdapter extends FakeSessionAdapter { class BreakArtifactParentFakeAdapter extends FakeSessionAdapter {
override async sendPrompt( override async sendPrompt(
handle: SessionHandle, handle: SessionHandle,
@@ -1685,6 +1732,72 @@ describe("runSingleFakePhase", () => {
.from(approvalRequests) .from(approvalRequests)
.where(eq(approvalRequests.runId, runId)); .where(eq(approvalRequests.runId, runId));
expect(approval).toEqual({ gateKey: "prompt_send_exhausted", state: "pending" }); expect(approval).toEqual({ gateKey: "prompt_send_exhausted", state: "pending" });
const [session] = await db
.select({ state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, sessionId));
expect(session).toEqual({ state: "FAILED_NEEDS_HUMAN" });
const events = await db
.select({ type: runEvents.type })
.from(runEvents)
.where(eq(runEvents.runId, runId))
.orderBy(runEvents.seq);
expect(events.map((event) => event.type)).not.toContain("session.crashed");
});
// Starts from a session row already persisted as CRASHED and runs the phase with an
// adapter whose resume keeps failing. Expects the phase to reject with
// `prompt_send_exhausted`, the session to end as FAILED_NEEDS_HUMAN with its
// recoveryAttempts unchanged, and no `session.crashed` event to be written.
it("does not emit a duplicate crash event when a crashed session resume exhausts retries", async () => {
const { db, phaseId, runId } = await createRunAndPhase();
const worktreeRoot = realpathSync(
mkdtempSync(join(tmpdir(), "devflow-fake-phase-crashed-resume-fails-")),
);
tempRoots.push(worktreeRoot);
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
const sessionId = randomUUID();
// Seed the pre-existing crashed session the phase run is expected to pick up.
await db.insert(tuiSessions).values({
id: sessionId,
runId,
roleId: "implementer",
backend: "fake",
cwd: worktreeRoot,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
recoveryAttempts: 1,
state: "CRASHED",
});
// ResumeFailsFakeAdapter is defined outside this block; presumably its resume always
// fails and it counts attempts in `resumeAttempts` — confirm against its definition.
const adapter = new ResumeFailsFakeAdapter();
await expect(
runSingleFakePhase({
adapter,
db,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
instructions: "Scenario: ok\nCrashed session resume still fails.",
phaseId,
phaseKey: "implement",
roleId: "implementer",
runId,
worktreeRoot,
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 500 },
uuidFactory: () => "00000000-0000-4000-8000-000000000045",
}),
).rejects.toMatchObject({ code: "prompt_send_exhausted" });
expect(adapter.resumeAttempts).toBe(3);
// recoveryAttempts must still be the seeded value of 1 after the failed resumes.
const [session] = await db
.select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, sessionId));
expect(session).toEqual({ recoveryAttempts: 1, state: "FAILED_NEEDS_HUMAN" });
// The session was already CRASHED, so no new crash event should appear for this run.
const events = await db
.select({ type: runEvents.type })
.from(runEvents)
.where(eq(runEvents.runId, runId))
.orderBy(runEvents.seq);
expect(events.map((event) => event.type)).not.toContain("session.crashed");
}); });
it("resumes a running phase when the crash happened before session creation", async () => { it("resumes a running phase when the crash happened before session creation", async () => {
@@ -2358,10 +2471,20 @@ describe("runSingleFakePhase", () => {
await expectRunPaused(db, runId); await expectRunPaused(db, runId);
const [approval] = await db const [approval] = await db
.select({ gateKey: approvalRequests.gateKey, state: approvalRequests.state }) .select({
gateKey: approvalRequests.gateKey,
payload: approvalRequests.payload,
state: approvalRequests.state,
})
.from(approvalRequests) .from(approvalRequests)
.where(eq(approvalRequests.runId, runId)); .where(eq(approvalRequests.runId, runId));
expect(approval).toEqual({ gateKey: "artifact_invalid_after_repair", state: "pending" }); expect(approval).toEqual({
gateKey: "artifact_invalid_after_repair",
payload: expect.objectContaining({
recoveryHint: expect.stringContaining(expectedArtifactPath),
}),
state: "pending",
});
}); });
it("revalidates an artifact file when validating replay has no artifact row yet", async () => { it("revalidates an artifact file when validating replay has no artifact row yet", async () => {
@@ -3068,6 +3191,51 @@ describe("runSingleFakePhase", () => {
await expectRunCompleted(db, runId); await expectRunCompleted(db, runId);
}); });
// Runs a phase whose first attempt times out and whose adapter rejects every
// rebootstrap with a recoverable error. Expects the phase to reject with
// `artifact_timeout_exhausted` after two rebootstrap attempts, the run to be paused,
// and the approval request payload to carry the adapter error's recoveryHint.
it("preserves rebootstrap exhaustion recovery hints in the human gate", async () => {
const { db, phaseId, runId } = await createRunAndPhase();
const worktreeRoot = realpathSync(
mkdtempSync(join(tmpdir(), "devflow-fake-phase-rebootstrap-exhausted-")),
);
tempRoots.push(worktreeRoot);
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
const sessionId = randomUUID();
// Adapter whose rebootstrap always throws a recoverable DevflowError.
const adapter = new RebootstrapAlwaysRecoverableFakeAdapter({
sessionIdFactory: () => sessionId,
writeDelayMs: 0,
});
await expect(
runSingleFakePhase({
adapter,
db,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
instructions: "Scenario: timeout\nRepair-Scenario: ok\nRebootstrap never recovers.",
phaseId,
phaseKey: "implement",
roleId: "implementer",
runId,
worktreeRoot,
// 10 ms artifact wait so the timeout scenario is reached quickly.
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
uuidFactory: () => "00000000-0000-4000-8000-000000000041",
}),
).rejects.toMatchObject({ code: "artifact_timeout_exhausted" });
expect(adapter.rebootstrapAttempts).toBe(2);
await expectRunPaused(db, runId);
const [approval] = await db
.select({ gateKey: approvalRequests.gateKey, payload: approvalRequests.payload })
.from(approvalRequests)
.where(eq(approvalRequests.runId, runId));
// The hint string must match the one thrown by the adapter's rebootstrap.
expect(approval).toEqual({
gateKey: "artifact_timeout_exhausted",
payload: expect.objectContaining({
recoveryHint: "pane did not become responsive after rebootstrap",
}),
});
});
it("fails the run instead of gating when timeout recovery rebootstrap is fatal", async () => { it("fails the run instead of gating when timeout recovery rebootstrap is fatal", async () => {
const { db, phaseId, runId } = await createRunAndPhase(); const { db, phaseId, runId } = await createRunAndPhase();
const worktreeRoot = realpathSync( const worktreeRoot = realpathSync(
@@ -3178,10 +3346,18 @@ describe("runSingleFakePhase", () => {
await expectRunPaused(db, runId); await expectRunPaused(db, runId);
const [approval] = await db const [approval] = await db
.select({ gateKey: approvalRequests.gateKey, state: approvalRequests.state }) .select({
gateKey: approvalRequests.gateKey,
payload: approvalRequests.payload,
state: approvalRequests.state,
})
.from(approvalRequests) .from(approvalRequests)
.where(eq(approvalRequests.runId, runId)); .where(eq(approvalRequests.runId, runId));
expect(approval).toEqual({ gateKey: "backend_unavailable", state: "pending" }); expect(approval).toEqual({
gateKey: "backend_unavailable",
payload: expect.objectContaining({ recoveryHint: "human rebootstrap failure" }),
state: "pending",
});
}); });
it("requests a human gate when timeout recovery probe fails recoverably", async () => { it("requests a human gate when timeout recovery probe fails recoverably", async () => {
@@ -3232,6 +3408,13 @@ describe("runSingleFakePhase", () => {
.from(approvalRequests) .from(approvalRequests)
.where(eq(approvalRequests.runId, runId)); .where(eq(approvalRequests.runId, runId));
expect(approval).toEqual({ gateKey: "artifact_timeout_exhausted", state: "pending" }); expect(approval).toEqual({ gateKey: "artifact_timeout_exhausted", state: "pending" });
const events = await db
.select({ type: runEvents.type })
.from(runEvents)
.where(eq(runEvents.runId, runId))
.orderBy(runEvents.seq);
expect(events.map((event) => event.type)).not.toContain("session.crashed");
}); });
it("does not rebootstrap when tmux probe only proves pane liveness", async () => { it("does not rebootstrap when tmux probe only proves pane liveness", async () => {
@@ -3274,6 +3457,508 @@ describe("runSingleFakePhase", () => {
expect(approval).toEqual({ gateKey: "artifact_timeout_exhausted", state: "pending" }); expect(approval).toEqual({ gateKey: "artifact_timeout_exhausted", state: "pending" });
}); });
// Runs a timeout scenario with an adapter whose probe always reports a dead pane.
// Expects exactly one rebootstrap, a valid artifact, a completed run, the session back
// in READY, crash/recovery/validation events present, and no approval request.
it("recovers a post-prompt crashed session through rebootstrap", async () => {
const { db, phaseId, runId } = await createRunAndPhase();
const worktreeRoot = realpathSync(
mkdtempSync(join(tmpdir(), "devflow-fake-phase-dead-pane-timeout-")),
);
tempRoots.push(worktreeRoot);
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
const sessionId = randomUUID();
// Probe returns { alive: false, paneActive: false }; rebootstrap is counted and delegated.
const adapter = new DeadPaneProbeFakeAdapter({
sessionIdFactory: () => sessionId,
writeDelayMs: 0,
});
const result = await runSingleFakePhase({
adapter,
db,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
instructions: "Scenario: timeout\nRepair-Scenario: ok\nPane exits after accepting prompt.",
phaseId,
phaseKey: "implement",
roleId: "implementer",
runId,
worktreeRoot,
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
uuidFactory: () => "00000000-0000-4000-8000-000000000047",
});
expect(result.artifactValid).toBe(true);
expect(adapter.rebootstrapAttempts).toBe(1);
await expectRunCompleted(db, runId);
const [session] = await db
.select({ state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, sessionId));
expect(session).toEqual({ state: "READY" });
const events = await db
.select({ type: runEvents.type })
.from(runEvents)
.where(eq(runEvents.runId, runId))
.orderBy(runEvents.seq);
// arrayContaining: the three event types must be present; relative order is not asserted.
expect(events.map((event) => event.type)).toEqual(
expect.arrayContaining(["session.crashed", "session.recovered", "artifact.validated"]),
);
// Recovery succeeded automatically, so no human gate should have been requested.
expect(events.map((event) => event.type)).not.toContain("approval.requested");
});
// Parameterised over the persisted session states ARTIFACT_TIMEOUT and HUNG.
// Replays a phase whose session row is already in that state while the adapter's probe
// reports a dead pane. Expects one rebootstrap, a valid artifact, `session.recovered`
// and `artifact.validated` events, and no approval request.
it.each(["ARTIFACT_TIMEOUT", "HUNG"] as const)(
"recovers a dead pane when replay starts from %s",
async (sessionState) => {
// NOTE(review): arguments presumably mean run state, phase state and attempt count —
// confirm against createRunAndPhase, which is defined outside this block.
const { db, phaseId, runId } = await createRunAndPhase("executing", "awaiting_artifact", 1);
const worktreeRoot = realpathSync(
mkdtempSync(
join(tmpdir(), `devflow-fake-phase-dead-${sessionState.toLowerCase()}-replay-`),
),
);
tempRoots.push(worktreeRoot);
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
const sessionId = randomUUID();
const instructions = `Scenario: ok\nReplay dead pane from ${sessionState}.`;
// Stored below as the session's lastPromptHash; presumably mirrors the key the phase
// runner derives for attempt 1 so the prompt counts as already sent — confirm.
const dedupKey = hash({
attempt: 1,
expectedArtifact: expectedArtifactPath,
expectedSchema: "dev/spec@1",
instructions,
phaseKey: "implement",
roleId: "implementer",
runId,
});
// Record that attempt 1 of the phase had already started before the replay.
await new RunEventRepository(db).append({
idempotencyKey: `phase.started:${phaseId}:1`,
phaseId,
payload: { attempt: 1, phaseKey: "implement" },
runId,
type: "phase.started",
});
// Persist the session in the state under test, with the prompt hash already recorded.
await db.insert(tuiSessions).values({
id: sessionId,
runId,
roleId: "implementer",
backend: "fake",
cwd: worktreeRoot,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
lastPromptHash: dedupKey,
lastPromptAt: new Date("2026-05-13T00:00:00.000Z"),
state: sessionState,
});
const adapter = new DeadPaneProbeFakeAdapter({
sessionIdFactory: () => sessionId,
writeDelayMs: 0,
});
// Start the fake session up front with the same identity as the persisted row.
await adapter.start({
sessionId,
runId,
roleId: "implementer",
backend: "fake",
cwd: worktreeRoot,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
});
const result = await runSingleFakePhase({
adapter,
db,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
instructions,
phaseId,
phaseKey: "implement",
roleId: "implementer",
runId,
worktreeRoot,
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 500 },
});
expect(result.artifactValid).toBe(true);
expect(adapter.rebootstrapAttempts).toBe(1);
const events = await db
.select({ type: runEvents.type })
.from(runEvents)
.where(eq(runEvents.runId, runId))
.orderBy(runEvents.seq);
expect(events.map((event) => event.type)).toContain("session.recovered");
expect(events.map((event) => event.type)).toContain("artifact.validated");
expect(events.map((event) => event.type)).not.toContain("approval.requested");
},
);
// Replays a phase whose session row is already ARTIFACT_TIMEOUT while a schema-shaped
// artifact file already sits at the expected path. Expects the phase to reject with
// `artifact_timeout_exhausted`, the run to be paused, the session to become
// FAILED_NEEDS_HUMAN, and no `session.crashed` event to be written.
it("gates awaiting-artifact replay when the persisted session is already timed out", async () => {
const { db, phaseId, runId } = await createRunAndPhase("executing", "awaiting_artifact", 1);
const worktreeRoot = realpathSync(
mkdtempSync(join(tmpdir(), "devflow-fake-phase-timeout-replay-")),
);
tempRoots.push(worktreeRoot);
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
// Pre-write an artifact before the replay; the assertions below require that its
// presence does not let the phase succeed.
mkdirSync(dirname(expectedArtifactPath), { recursive: true });
writeFileSync(
expectedArtifactPath,
JSON.stringify({
summary: "Late artifact",
requirements: [{ id: "REQ-1", description: "Do not accept stale timeout output" }],
acceptanceCriteria: ["Gate instead of skipping recovery"],
risks: [],
}),
);
const sessionId = randomUUID();
const instructions = "Scenario: ok\nReplay a timed-out phase.";
// Stored below as lastPromptHash; presumably matches the runner's dedup key for
// attempt 1 — confirm against the phase runner.
const dedupKey = hash({
attempt: 1,
expectedArtifact: expectedArtifactPath,
expectedSchema: "dev/spec@1",
instructions,
phaseKey: "implement",
roleId: "implementer",
runId,
});
await new RunEventRepository(db).append({
idempotencyKey: `phase.started:${phaseId}:1`,
phaseId,
payload: { attempt: 1, phaseKey: "implement" },
runId,
type: "phase.started",
});
await db.insert(tuiSessions).values({
id: sessionId,
runId,
roleId: "implementer",
backend: "fake",
cwd: worktreeRoot,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
lastPromptHash: dedupKey,
lastPromptAt: new Date("2026-05-13T00:00:00.000Z"),
state: "ARTIFACT_TIMEOUT",
});
// Unlike the dead-pane replay tests, the adapter is not started before the phase runs.
await expect(
runSingleFakePhase({
adapter: new FakeSessionAdapter({ sessionIdFactory: () => sessionId, writeDelayMs: 0 }),
db,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
instructions,
phaseId,
phaseKey: "implement",
roleId: "implementer",
runId,
worktreeRoot,
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
uuidFactory: () => "00000000-0000-4000-8000-000000000042",
}),
).rejects.toMatchObject({ code: "artifact_timeout_exhausted" });
await expectRunPaused(db, runId);
const [session] = await db
.select({ state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, sessionId));
expect(session).toEqual({ state: "FAILED_NEEDS_HUMAN" });
const events = await db
.select({ type: runEvents.type })
.from(runEvents)
.where(eq(runEvents.runId, runId))
.orderBy(runEvents.seq);
expect(events.map((event) => event.type)).not.toContain("session.crashed");
});
// Replays a phase whose session row is persisted as RESUMING while an artifact file
// already exists at the expected path. Expects one rebootstrap, exactly one valid
// artifact row, the phase completed with attempts === 2, and both `session.recovered`
// and `artifact.validated` events.
it("continues timeout recovery replay when the session was already resuming", async () => {
const { db, phaseId, runId } = await createRunAndPhase("executing", "awaiting_artifact", 1);
const worktreeRoot = realpathSync(
mkdtempSync(join(tmpdir(), "devflow-fake-phase-resuming-timeout-replay-")),
);
tempRoots.push(worktreeRoot);
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
// Pre-existing artifact written before the replay starts.
mkdirSync(dirname(expectedArtifactPath), { recursive: true });
writeFileSync(
expectedArtifactPath,
JSON.stringify({
summary: "Late artifact while resuming",
requirements: [{ id: "REQ-1", description: "Do not accept stale resuming output" }],
acceptanceCriteria: ["Continue recovery and request fresh repair output"],
risks: [],
}),
);
const sessionId = randomUUID();
const instructions = "Scenario: ok\nReplay an already-resuming timeout recovery.";
// Stored below as lastPromptHash; presumably matches the runner's dedup key for
// attempt 1 — confirm against the phase runner.
const dedupKey = hash({
attempt: 1,
expectedArtifact: expectedArtifactPath,
expectedSchema: "dev/spec@1",
instructions,
phaseKey: "implement",
roleId: "implementer",
runId,
});
await new RunEventRepository(db).append({
idempotencyKey: `phase.started:${phaseId}:1`,
phaseId,
payload: { attempt: 1, phaseKey: "implement" },
runId,
type: "phase.started",
});
// Persist the session mid-recovery (RESUMING) with the prompt hash already recorded.
await db.insert(tuiSessions).values({
id: sessionId,
runId,
roleId: "implementer",
backend: "fake",
cwd: worktreeRoot,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
lastPromptHash: dedupKey,
lastPromptAt: new Date("2026-05-13T00:00:00.000Z"),
state: "RESUMING",
});
// Counts rebootstrap calls and otherwise behaves like the base fake adapter.
const adapter = new RebootstrapCountingFakeAdapter({
sessionIdFactory: () => sessionId,
writeDelayMs: 0,
});
// Start the fake session up front with the same identity as the persisted row.
await adapter.start({
sessionId,
runId,
roleId: "implementer",
backend: "fake",
cwd: worktreeRoot,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
});
const result = await runSingleFakePhase({
adapter,
db,
expectedArtifactPath,
expectedSchema: "dev/spec@1",
instructions,
phaseId,
phaseKey: "implement",
roleId: "implementer",
runId,
worktreeRoot,
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 500 },
uuidFactory: () => "00000000-0000-4000-8000-000000000046",
});
expect(result.artifactValid).toBe(true);
expect(adapter.rebootstrapAttempts).toBe(1);
// Only one artifact row may exist for the run, and it must be valid.
const artifactRows = await db
.select({ id: artifacts.id, valid: artifacts.valid })
.from(artifacts)
.where(eq(artifacts.runId, runId));
expect(artifactRows).toHaveLength(1);
expect(artifactRows[0]?.valid).toBe(true);
// The phase was seeded with 1 attempt (third createRunAndPhase argument, presumably);
// after recovery it must report 2.
const [phase] = await db
.select({ attempts: runPhases.attempts, state: runPhases.state })
.from(runPhases)
.where(eq(runPhases.id, phaseId));
expect(phase).toEqual({ attempts: 2, state: "completed" });
const events = await db
.select({ type: runEvents.type })
.from(runEvents)
.where(eq(runEvents.runId, runId))
.orderBy(runEvents.seq);
expect(events.map((event) => event.type)).toContain("session.recovered");
expect(events.map((event) => event.type)).toContain("artifact.validated");
});
// Replay matrix: each row is [persisted session state, rebootstrap calls expected during replay].
// Every case seeds a session whose lastPromptHash mirrors the attempt-1 prompt and leaves an
// artifact file on disk before the phase is replayed.
it.each([
  ["HUNG", 1],
  ["CRASHED", 1],
  ["REBOOTSTRAPPED", 0],
] as const)(
  "continues timeout recovery replay when the session is %s",
  async (seededState, expectedRebootstraps) => {
    const { db, phaseId, runId } = await createRunAndPhase("executing", "awaiting_artifact", 1);

    // Scratch worktree, registered in tempRoots alongside the other fixtures.
    const tempPrefix = `devflow-fake-phase-${seededState.toLowerCase()}-timeout-replay-`;
    const worktreeRoot = realpathSync(mkdtempSync(join(tmpdir(), tempPrefix)));
    tempRoots.push(worktreeRoot);

    // Artifact already present on disk when the replay begins.
    const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
    mkdirSync(dirname(expectedArtifactPath), { recursive: true });
    const lateArtifact = {
      summary: `Late artifact while ${seededState}`,
      requirements: [{ id: "REQ-1", description: "Do not accept stale replay output" }],
      acceptanceCriteria: ["Continue recovery and require fresh repair output"],
      risks: [],
    };
    writeFileSync(expectedArtifactPath, JSON.stringify(lateArtifact));

    const sessionId = randomUUID();
    const instructions = `Scenario: ok\nReplay ${seededState} timeout recovery.`;
    // Key order is kept exactly as in the other replay tests in case hashing is order-sensitive.
    const dedupKey = hash({
      attempt: 1,
      expectedArtifact: expectedArtifactPath,
      expectedSchema: "dev/spec@1",
      instructions,
      phaseKey: "implement",
      roleId: "implementer",
      runId,
    });

    const eventLog = new RunEventRepository(db);
    await eventLog.append({
      idempotencyKey: `phase.started:${phaseId}:1`,
      phaseId,
      payload: { attempt: 1, phaseKey: "implement" },
      runId,
      type: "phase.started",
    });

    // Persist the session in the state under test.
    await db.insert(tuiSessions).values({
      id: sessionId,
      runId,
      roleId: "implementer",
      backend: "fake",
      cwd: worktreeRoot,
      expectedArtifactPath,
      expectedSchema: "dev/spec@1",
      lastPromptHash: dedupKey,
      lastPromptAt: new Date("2026-05-13T00:00:00.000Z"),
      state: seededState,
    });

    const adapter = new RebootstrapCountingFakeAdapter({
      sessionIdFactory: () => sessionId,
      writeDelayMs: 0,
    });
    await adapter.start({
      sessionId,
      runId,
      roleId: "implementer",
      backend: "fake",
      cwd: worktreeRoot,
      expectedArtifactPath,
      expectedSchema: "dev/spec@1",
    });

    const result = await runSingleFakePhase({
      adapter,
      db,
      expectedArtifactPath,
      expectedSchema: "dev/spec@1",
      instructions,
      phaseId,
      phaseKey: "implement",
      roleId: "implementer",
      runId,
      worktreeRoot,
      wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 500 },
    });

    expect(result.artifactValid).toBe(true);
    expect(adapter.rebootstrapAttempts).toBe(expectedRebootstraps);

    const [phase] = await db
      .select({ attempts: runPhases.attempts, state: runPhases.state })
      .from(runPhases)
      .where(eq(runPhases.id, phaseId));
    expect(phase).toEqual({ attempts: 2, state: "completed" });

    const events = await db
      .select({ type: runEvents.type })
      .from(runEvents)
      .where(eq(runEvents.runId, runId))
      .orderBy(runEvents.seq);
    const eventTypes = events.map((event) => event.type);
    expect(eventTypes).toContain("session.recovered");
    expect(eventTypes).toContain("artifact.validated");
    expect(eventTypes).not.toContain("session.crashed");
  },
);
// Seeds a READY session that already has a session.recovered event for this phase, plus an
// artifact file on disk, then replays the phase with a 10 ms artifact wait.
// Expected: the call rejects with prompt_send_exhausted, no artifact row is stored, and the
// phase ends failed on attempt 2.
it("does not accept a stale timeout artifact after session recovery replay", async () => {
  const { db, phaseId, runId } = await createRunAndPhase("executing", "awaiting_artifact", 1);

  const worktreeRoot = realpathSync(
    mkdtempSync(join(tmpdir(), "devflow-fake-phase-recovered-timeout-replay-")),
  );
  tempRoots.push(worktreeRoot);

  // Artifact already present on disk when the replay begins.
  const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
  mkdirSync(dirname(expectedArtifactPath), { recursive: true });
  const lateArtifact = {
    summary: "Late artifact after recovery",
    requirements: [{ id: "REQ-1", description: "Do not complete from stale output" }],
    acceptanceCriteria: ["Require fresh repair output"],
    risks: [],
  };
  writeFileSync(expectedArtifactPath, JSON.stringify(lateArtifact));

  const sessionId = randomUUID();
  const instructions = "Scenario: ok\nReplay after recovered timeout.";
  const dedupKey = hash({
    attempt: 1,
    expectedArtifact: expectedArtifactPath,
    expectedSchema: "dev/spec@1",
    instructions,
    phaseKey: "implement",
    roleId: "implementer",
    runId,
  });

  // Event history: the phase started once and the session was already recovered once.
  const eventLog = new RunEventRepository(db);
  await eventLog.append({
    idempotencyKey: `phase.started:${phaseId}:1`,
    phaseId,
    payload: { attempt: 1, phaseKey: "implement" },
    runId,
    type: "phase.started",
  });
  await eventLog.append({
    idempotencyKey: `session.recovered:${sessionId}:1`,
    phaseId,
    payload: { recoveryAttempts: 1, roleId: "implementer", sessionId },
    runId,
    type: "session.recovered",
  });

  await db.insert(tuiSessions).values({
    id: sessionId,
    runId,
    roleId: "implementer",
    backend: "fake",
    cwd: worktreeRoot,
    expectedArtifactPath,
    expectedSchema: "dev/spec@1",
    lastPromptHash: dedupKey,
    lastPromptAt: new Date("2026-05-13T00:00:00.000Z"),
    recoveryAttempts: 1,
    state: "READY",
  });

  const adapter = new FakeSessionAdapter({ sessionIdFactory: () => sessionId, writeDelayMs: 0 });
  const replay = runSingleFakePhase({
    adapter,
    db,
    expectedArtifactPath,
    expectedSchema: "dev/spec@1",
    instructions,
    phaseId,
    phaseKey: "implement",
    roleId: "implementer",
    runId,
    worktreeRoot,
    wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
    uuidFactory: () => "00000000-0000-4000-8000-000000000043",
  });
  await expect(replay).rejects.toMatchObject({ code: "prompt_send_exhausted" });

  // Nothing may have been recorded as an artifact for this run.
  const storedArtifacts = await db
    .select({ id: artifacts.id })
    .from(artifacts)
    .where(eq(artifacts.runId, runId));
  expect(storedArtifacts).toEqual([]);

  const [phase] = await db
    .select({ attempts: runPhases.attempts, state: runPhases.state })
    .from(runPhases)
    .where(eq(runPhases.id, phaseId));
  expect(phase).toEqual({ attempts: 2, state: "failed" });
});
it("fails the run when timeout recovery probe throws an unclassified error", async () => { it("fails the run when timeout recovery probe throws an unclassified error", async () => {
const { db, phaseId, runId } = await createRunAndPhase(); const { db, phaseId, runId } = await createRunAndPhase();
const worktreeRoot = realpathSync( const worktreeRoot = realpathSync(
@@ -3313,6 +3998,47 @@ describe("runSingleFakePhase", () => {
expect(approvals).toEqual([]); expect(approvals).toEqual([]);
}); });
// Uses ProbeFailsOnceThenRecoversFakeAdapter with a timeout scenario and asserts that the phase
// rejects with internal_state_corruption, the run ends failed, and no session.recovered event
// is written.
it("does not swallow an unclassified timeout-classification probe failure when a later probe would recover", async () => {
  const { db, phaseId, runId } = await createRunAndPhase();

  const worktreeRoot = realpathSync(
    mkdtempSync(join(tmpdir(), "devflow-fake-phase-probe-first-unknown-")),
  );
  tempRoots.push(worktreeRoot);
  const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");

  const sessionId = randomUUID();
  const adapter = new ProbeFailsOnceThenRecoversFakeAdapter({
    sessionIdFactory: () => sessionId,
    writeDelayMs: 0,
  });

  const phaseRun = runSingleFakePhase({
    adapter,
    db,
    expectedArtifactPath,
    expectedSchema: "dev/spec@1",
    instructions:
      "Scenario: timeout\nRepair-Scenario: ok\nFirst classification probe fails unexpectedly.",
    phaseId,
    phaseKey: "implement",
    roleId: "implementer",
    runId,
    worktreeRoot,
    wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
    uuidFactory: () => "00000000-0000-4000-8000-000000000038",
  });
  await expect(phaseRun).rejects.toMatchObject({ code: "internal_state_corruption" });

  const [run] = await db.select({ state: runs.state }).from(runs).where(eq(runs.id, runId));
  expect(run).toEqual({ state: "failed" });

  const events = await db
    .select({ type: runEvents.type })
    .from(runEvents)
    .where(eq(runEvents.runId, runId))
    .orderBy(runEvents.seq);
  const eventTypes = events.map((event) => event.type);
  expect(eventTypes).not.toContain("session.recovered");
});
it("does not let a stale artifact produced during timeout recovery satisfy repair validation", async () => { it("does not let a stale artifact produced during timeout recovery satisfy repair validation", async () => {
const { db, phaseId, runId } = await createRunAndPhase(); const { db, phaseId, runId } = await createRunAndPhase();
const worktreeRoot = realpathSync( const worktreeRoot = realpathSync(

View File

@@ -28,7 +28,12 @@ import {
SessionManager, SessionManager,
type SessionRuntime, type SessionRuntime,
type TranscriptChunkSink, type TranscriptChunkSink,
assertSessionStateAssignment,
assertSessionTransition,
captureAndPersistTranscript, captureAndPersistTranscript,
isAllowedSessionTransition,
isSessionHung,
retryRecoverable,
} from "@devflow/session"; } from "@devflow/session";
import { and, desc, eq, inArray, sql } from "drizzle-orm"; import { and, desc, eq, inArray, sql } from "drizzle-orm";
@@ -44,6 +49,10 @@ interface ArtifactWaitOptions extends FakePhaseWaitOptions {
ignoreInitialSignature?: string; ignoreInitialSignature?: string;
} }
/** Recovery tuning supplied through the optional `recovery` field of the phase input. */
interface FakePhaseRecoveryOptions {
// Passed as the third argument to `isSessionHung` when a timed-out session is classified.
// NOTE(review): presumably the maximum time without output, in milliseconds — confirm against @devflow/session.
maxHungMs?: number;
}
interface RunSingleFakePhaseBaseInput { interface RunSingleFakePhaseBaseInput {
db: DbClient["db"]; db: DbClient["db"];
runId: string; runId: string;
@@ -55,6 +64,7 @@ interface RunSingleFakePhaseBaseInput {
expectedSchema: string; expectedSchema: string;
instructions: string; instructions: string;
wait?: FakePhaseWaitOptions; wait?: FakePhaseWaitOptions;
recovery?: FakePhaseRecoveryOptions;
uuidFactory?: () => string; uuidFactory?: () => string;
transcriptSink?: TranscriptChunkSink; transcriptSink?: TranscriptChunkSink;
terminalRun?: boolean; terminalRun?: boolean;
@@ -81,7 +91,6 @@ export interface RunSingleFakePhaseResult {
type TransactionDb = Parameters<Parameters<RunSingleFakePhaseInput["db"]["transaction"]>[0]>[0]; type TransactionDb = Parameters<Parameters<RunSingleFakePhaseInput["db"]["transaction"]>[0]>[0];
const sendPromptRetryBudget = 2;
const terminalRunStates = ["completed", "failed", "aborted"] as const; const terminalRunStates = ["completed", "failed", "aborted"] as const;
const phaseMutationRunStates = ["executing", "planning"] as const; const phaseMutationRunStates = ["executing", "planning"] as const;
@@ -298,9 +307,9 @@ export async function runSingleFakePhase(
throw error; throw error;
} }
let recovered: boolean; let recovery: ArtifactTimeoutRecoveryResult;
try { try {
recovered = await recoverFromArtifactTimeout(input, eventRepository, handle.sessionId); recovery = await recoverFromArtifactTimeout(input, eventRepository, handle.sessionId);
} catch (recoveryError) { } catch (recoveryError) {
if (isRunStateChanged(recoveryError)) { if (isRunStateChanged(recoveryError)) {
await captureTranscript(input, handle); await captureTranscript(input, handle);
@@ -333,7 +342,7 @@ export async function runSingleFakePhase(
); );
throw recoveryError; throw recoveryError;
} }
if (!recovered) { if (!recovery.recovered) {
await failPhaseAndRequestGate( await failPhaseAndRequestGate(
input, input,
eventRepository, eventRepository,
@@ -342,9 +351,9 @@ export async function runSingleFakePhase(
"artifact_timeout_exhausted", "artifact_timeout_exhausted",
{ {
expectedArtifactPath: input.expectedArtifactPath, expectedArtifactPath: input.expectedArtifactPath,
recoveryHint: recovery.recoveryHint ?? input.expectedArtifactPath,
}, },
handle.sessionId, handle.sessionId,
{ markSessionCrashed: true },
); );
await captureTranscript(input, handle); await captureTranscript(input, handle);
throw error; throw error;
@@ -358,6 +367,7 @@ export async function runSingleFakePhase(
"artifact_timeout_exhausted", "artifact_timeout_exhausted",
{ {
expectedArtifactPath: input.expectedArtifactPath, expectedArtifactPath: input.expectedArtifactPath,
recoveryHint: input.expectedArtifactPath,
}, },
handle.sessionId, handle.sessionId,
); );
@@ -470,6 +480,7 @@ export async function runSingleFakePhase(
"artifact_timeout_exhausted", "artifact_timeout_exhausted",
{ {
expectedArtifactPath: input.expectedArtifactPath, expectedArtifactPath: input.expectedArtifactPath,
recoveryHint: input.expectedArtifactPath,
}, },
handle.sessionId, handle.sessionId,
); );
@@ -511,6 +522,7 @@ export async function runSingleFakePhase(
{ {
artifactId: outcome.artifact.id, artifactId: outcome.artifact.id,
expectedArtifactPath: input.expectedArtifactPath, expectedArtifactPath: input.expectedArtifactPath,
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
}, },
handle.sessionId, handle.sessionId,
); );
@@ -520,6 +532,7 @@ export async function runSingleFakePhase(
code: "artifact_invalid_after_repair", code: "artifact_invalid_after_repair",
runId: input.runId, runId: input.runId,
phaseId: input.phaseId, phaseId: input.phaseId,
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
}); });
} }
@@ -621,6 +634,7 @@ export async function runSingleFakePhase(
"artifact_timeout_exhausted", "artifact_timeout_exhausted",
{ {
expectedArtifactPath: input.expectedArtifactPath, expectedArtifactPath: input.expectedArtifactPath,
recoveryHint: input.expectedArtifactPath,
}, },
handle.sessionId, handle.sessionId,
); );
@@ -667,6 +681,7 @@ export async function runSingleFakePhase(
{ {
artifactId: outcome.artifact.id, artifactId: outcome.artifact.id,
expectedArtifactPath: input.expectedArtifactPath, expectedArtifactPath: input.expectedArtifactPath,
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
}, },
handle.sessionId, handle.sessionId,
); );
@@ -676,6 +691,7 @@ export async function runSingleFakePhase(
code: "artifact_invalid_after_repair", code: "artifact_invalid_after_repair",
runId: input.runId, runId: input.runId,
phaseId: input.phaseId, phaseId: input.phaseId,
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
}); });
} }
@@ -833,6 +849,18 @@ async function enterInitialPhase(
}) })
.from(tuiSessions) .from(tuiSessions)
.where(and(eq(tuiSessions.runId, input.runId), eq(tuiSessions.roleId, input.roleId))); .where(and(eq(tuiSessions.runId, input.runId), eq(tuiSessions.roleId, input.roleId)));
if (
session !== undefined &&
(isTimeoutRecoverySessionState(session.state) ||
(session.state === "READY" && (await sessionRecoveredEventExists(input, session.id)))) &&
session.lastPromptHash === envelope.dedupKey &&
session.expectedArtifactPath === input.expectedArtifactPath &&
session.expectedSchema === input.expectedSchema
) {
return recoverAwaitingArtifactReplay(input, eventRepository, phase.attempts, session.id, {
repairAttemptUsed: phaseStart.repairAttemptUsed,
});
}
if ( if (
session !== undefined && session !== undefined &&
session.state !== "FAILED_NEEDS_HUMAN" && session.state !== "FAILED_NEEDS_HUMAN" &&
@@ -967,6 +995,112 @@ async function enterInitialPhase(
throw cannotReplayPhase(input, phase.state); throw cannotReplayPhase(input, phase.state);
} }
/**
 * Continues artifact-timeout recovery for a phase that is being replayed while it still awaits
 * its artifact.
 *
 * Flow:
 * 1. Runs `recoverFromArtifactTimeout` for the session.
 *    - Run-state-changed errors are rethrown untouched.
 *    - Errors that warrant a human gate fail the phase with a gate request and throw the
 *      converted gate error.
 *    - Any other error fails both phase and run, then is rethrown.
 * 2. If recovery did not succeed, or the repair attempt was already consumed, requests a gate
 *    with `artifact_timeout_exhausted` and throws a `human_required` DevflowError.
 * 3. Otherwise starts a repair attempt, removes the stale artifact, and returns the phase entry.
 *
 * @param input - Canonical phase input.
 * @param eventRepository - Repository used to append run events.
 * @param attempt - Attempt number recorded on failure paths.
 * @param sessionId - Session being recovered.
 * @param options - `repairAttemptUsed` tells whether the single repair attempt is already spent.
 * @returns Phase entry for the freshly started repair attempt.
 */
async function recoverAwaitingArtifactReplay(
  input: CanonicalRunSingleFakePhaseInput,
  eventRepository: RunEventRepository,
  attempt: number,
  sessionId: string,
  options: { repairAttemptUsed: boolean },
): Promise<PhaseEntry> {
  let outcome: ArtifactTimeoutRecoveryResult;
  try {
    outcome = await recoverFromArtifactTimeout(input, eventRepository, sessionId);
  } catch (recoveryError) {
    // The run moved on underneath us; nothing to record here.
    if (isRunStateChanged(recoveryError)) {
      throw recoveryError;
    }
    if (!shouldCreateHumanGate(recoveryError)) {
      await failPhaseAndRun(input, eventRepository, attempt, "artifact_timeout_recovery_failed");
      throw recoveryError;
    }
    const gateError = toArtifactTimeoutRecoveryGateError(recoveryError);
    await failPhaseAndRequestGate(
      input,
      eventRepository,
      attempt,
      "artifact_timeout_recovery_failed",
      gateError.code,
      {
        errorCode: recoveryError.code,
        expectedArtifactPath: input.expectedArtifactPath,
        recoveryHint: gateError.recoveryHint,
      },
      sessionId,
    );
    throw gateError;
  }

  const canStartRepair = outcome.recovered && !options.repairAttemptUsed;
  if (!canStartRepair) {
    // Fall back to the artifact path when recovery produced no specific hint.
    const recoveryHint = outcome.recoveryHint ?? input.expectedArtifactPath;
    await failPhaseAndRequestGate(
      input,
      eventRepository,
      attempt,
      "artifact_timeout",
      "artifact_timeout_exhausted",
      {
        expectedArtifactPath: input.expectedArtifactPath,
        recoveryHint,
      },
      sessionId,
    );
    throw new DevflowError("Artifact timeout recovery exhausted retry budget", {
      class: "human_required",
      code: "artifact_timeout_exhausted",
      runId: input.runId,
      phaseId: input.phaseId,
      recoveryHint,
    });
  }

  const repairAttempt = await startPhaseAndRecord(input, eventRepository, ["awaiting_artifact"], {
    reason: "artifact_timeout",
    repair: true,
  });
  try {
    await removeStaleArtifact(input);
  } catch (removeError) {
    await failPhaseAndRun(input, eventRepository, repairAttempt, "stale_artifact_remove_failed");
    throw removeError;
  }

  const entry: PhaseEntry = {
    attempt: repairAttempt,
    continueArtifactWait: false,
    continueValidation: false,
    handle: { sessionId },
    repairAttemptUsed: true,
    resumedPrompt: false,
  };
  return entry;
}
/**
 * Tells whether a persisted session state is one of the states checked when deciding to
 * continue timeout recovery on replay.
 *
 * @param state - Raw session state string.
 * @returns `true` for ARTIFACT_TIMEOUT, HUNG, CRASHED, RESUMING and REBOOTSTRAPPED.
 */
function isTimeoutRecoverySessionState(state: string): boolean {
  switch (state) {
    case "ARTIFACT_TIMEOUT":
    case "HUNG":
    case "CRASHED":
    case "RESUMING":
    case "REBOOTSTRAPPED":
      return true;
    default:
      return false;
  }
}
/**
 * Checks whether this phase already has a `session.recovered` event for the given session.
 *
 * Loads the payloads of all `session.recovered` events for the run and phase, then compares
 * each payload's `sessionId` with the requested one.
 *
 * @param input - Canonical phase input supplying the db handle, run id and phase id.
 * @param sessionId - Session to look for.
 * @returns `true` when at least one matching event exists.
 */
async function sessionRecoveredEventExists(
  input: CanonicalRunSingleFakePhaseInput,
  sessionId: string,
): Promise<boolean> {
  const recoveredForPhase = and(
    eq(runEvents.runId, input.runId),
    eq(runEvents.phaseId, input.phaseId),
    eq(runEvents.type, "session.recovered"),
  );
  const rows = await input.db
    .select({ payload: runEvents.payload })
    .from(runEvents)
    .where(recoveredForPhase);
  for (const row of rows) {
    if (payloadSessionId(row.payload) === sessionId) {
      return true;
    }
  }
  return false;
}
/**
 * Reads `sessionId` from an untyped event payload.
 *
 * @param payload - Arbitrary payload value.
 * @returns The `sessionId` when the payload is a non-null object carrying a string under that
 *   key; otherwise `undefined`.
 */
function payloadSessionId(payload: unknown): string | undefined {
  if (typeof payload === "object" && payload !== null) {
    const candidate = (payload as Record<string, unknown>).sessionId;
    if (typeof candidate === "string") {
      return candidate;
    }
  }
  return undefined;
}
function cannotReplayPhase( function cannotReplayPhase(
input: CanonicalRunSingleFakePhaseInput, input: CanonicalRunSingleFakePhaseInput,
phaseState: string, phaseState: string,
@@ -1234,10 +1368,11 @@ async function failPhaseAndRequestGate(
if (sessionId !== undefined && options.markSessionCrashed === true) { if (sessionId !== undefined && options.markSessionCrashed === true) {
const [session] = await tx const [session] = await tx
.select({ recoveryAttempts: tuiSessions.recoveryAttempts }) .select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
.from(tuiSessions) .from(tuiSessions)
.where(eq(tuiSessions.id, sessionId)); .where(eq(tuiSessions.id, sessionId));
const recoveryAttempts = (session?.recoveryAttempts ?? 0) + 1; const recoveryAttempts = (session?.recoveryAttempts ?? 0) + 1;
if (session !== undefined && isAllowedSessionTransition(session.state, "CRASHED")) {
await tx await tx
.update(tuiSessions) .update(tuiSessions)
.set({ state: "CRASHED", recoveryAttempts }) .set({ state: "CRASHED", recoveryAttempts })
@@ -1250,8 +1385,16 @@ async function failPhaseAndRequestGate(
idempotencyKey: `session.crashed:${sessionId}:${recoveryAttempts}`, idempotencyKey: `session.crashed:${sessionId}:${recoveryAttempts}`,
}); });
} }
}
if (sessionId !== undefined) { if (sessionId !== undefined) {
const [session] = await tx
.select({ state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, sessionId));
if (session !== undefined) {
assertSessionTransition(session.state, "FAILED_NEEDS_HUMAN");
}
await tx await tx
.insert(tuiSessions) .insert(tuiSessions)
.values({ .values({
@@ -1425,10 +1568,11 @@ async function completePhaseAndRun(
}); });
const [session] = await tx const [session] = await tx
.select({ recoveryAttempts: tuiSessions.recoveryAttempts }) .select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
.from(tuiSessions) .from(tuiSessions)
.where(eq(tuiSessions.id, sessionId)); .where(eq(tuiSessions.id, sessionId));
const recoveryAttempts = session?.recoveryAttempts ?? 0; const recoveryAttempts = session?.recoveryAttempts ?? 0;
assertSessionStateAssignment(session?.state ?? "BUSY", "READY");
await tx.update(tuiSessions).set({ state: "READY" }).where(eq(tuiSessions.id, sessionId)); await tx.update(tuiSessions).set({ state: "READY" }).where(eq(tuiSessions.id, sessionId));
await eventRepository.appendInTransaction(tx, { await eventRepository.appendInTransaction(tx, {
runId: input.runId, runId: input.runId,
@@ -1484,6 +1628,11 @@ async function requestWorkflowApproval(
.update(runPhases) .update(runPhases)
.set({ state: "awaiting_approval" }) .set({ state: "awaiting_approval" })
.where(and(eq(runPhases.id, input.phaseId), eq(runPhases.runId, input.runId))); .where(and(eq(runPhases.id, input.phaseId), eq(runPhases.runId, input.runId)));
const [session] = await tx
.select({ state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, sessionId));
assertSessionStateAssignment(session?.state ?? "BUSY", "WAITING_FOR_APPROVAL");
await tx await tx
.update(tuiSessions) .update(tuiSessions)
.set({ state: "WAITING_FOR_APPROVAL" }) .set({ state: "WAITING_FOR_APPROVAL" })
@@ -1614,7 +1763,7 @@ async function startSessionAndRecord(
cwd: input.worktreeRoot, cwd: input.worktreeRoot,
expectedArtifactPath: input.expectedArtifactPath, expectedArtifactPath: input.expectedArtifactPath,
expectedSchema: input.expectedSchema, expectedSchema: input.expectedSchema,
state: "BOOTSTRAPPING", state: "CREATED",
}) })
.onConflictDoNothing({ target: tuiSessions.id }); .onConflictDoNothing({ target: tuiSessions.id });
await eventRepository.appendInTransaction(tx, { await eventRepository.appendInTransaction(tx, {
@@ -1624,10 +1773,12 @@ async function startSessionAndRecord(
payload: { sessionId: startedHandle.sessionId, roleId: input.roleId, backend: "fake" }, payload: { sessionId: startedHandle.sessionId, roleId: input.roleId, backend: "fake" },
idempotencyKey: `session.created:${startedHandle.sessionId}`, idempotencyKey: `session.created:${startedHandle.sessionId}`,
}); });
assertSessionTransition("CREATED", "BOOTSTRAPPING");
await tx await tx
.update(tuiSessions) .update(tuiSessions)
.set({ state: "BOOTSTRAPPING" }) .set({ state: "BOOTSTRAPPING" })
.where(eq(tuiSessions.id, startedHandle.sessionId)); .where(eq(tuiSessions.id, startedHandle.sessionId));
assertSessionTransition("BOOTSTRAPPING", "READY");
await tx await tx
.update(tuiSessions) .update(tuiSessions)
.set({ state: "READY" }) .set({ state: "READY" })
@@ -1751,6 +1902,11 @@ async function resumeExistingSessionAndRecord(
} }
await input.db.transaction(async (tx) => { await input.db.transaction(async (tx) => {
await assertRunCanMutatePhaseInTransaction(input, tx); await assertRunCanMutatePhaseInTransaction(input, tx);
const [currentSession] = await tx
.select({ state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, session.id));
assertSessionStateAssignment(currentSession?.state ?? session.state, "READY");
await tx await tx
.update(tuiSessions) .update(tuiSessions)
.set({ .set({
@@ -1873,6 +2029,11 @@ async function sendPromptAndRecord(
: await artifactSignature(input.expectedArtifactPath); : await artifactSignature(input.expectedArtifactPath);
await input.db.transaction(async (tx) => { await input.db.transaction(async (tx) => {
await assertRunCanMutatePhaseInTransaction(input, tx); await assertRunCanMutatePhaseInTransaction(input, tx);
const [currentSession] = await tx
.select({ state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, handle.sessionId));
assertSessionTransition(currentSession?.state ?? "READY", "BUSY");
await tx await tx
.update(tuiSessions) .update(tuiSessions)
.set({ .set({
@@ -1972,19 +2133,7 @@ async function sendPromptWithRetry(
handle: { sessionId: string }, handle: { sessionId: string },
envelope: PromptEnvelope, envelope: PromptEnvelope,
): Promise<{ promptId: string }> { ): Promise<{ promptId: string }> {
let lastError: unknown; return retryRecoverable("sendPrompt", () => sessions.sendPrompt(handle, envelope));
for (let physicalAttempt = 0; physicalAttempt <= sendPromptRetryBudget; physicalAttempt += 1) {
try {
return await sessions.sendPrompt(handle, envelope);
} catch (error) {
lastError = error;
if (!(error instanceof DevflowError) || error.class !== "recoverable") {
throw error;
}
}
}
throw lastError;
} }
interface ArtifactOutcome { interface ArtifactOutcome {
@@ -1994,6 +2143,13 @@ interface ArtifactOutcome {
validation: ReturnType<typeof validateArtifact>; validation: ReturnType<typeof validateArtifact>;
} }
/**
 * Builds the recovery hint attached to invalid-artifact failures.
 *
 * @param input - Phase input providing the expected artifact path.
 * @param outcome - Artifact outcome providing the artifact id.
 * @returns A string of the form `artifact=<id>;path=<expectedArtifactPath>`.
 */
function artifactInvalidRecoveryHint(
  input: CanonicalRunSingleFakePhaseInput,
  outcome: ArtifactOutcome,
): string {
  const artifactId = outcome.artifact.id;
  const artifactPath = input.expectedArtifactPath;
  return [`artifact=${artifactId}`, `path=${artifactPath}`].join(";");
}
interface ArtifactRecord { interface ArtifactRecord {
id: string; id: string;
phaseId: string | null; phaseId: string | null;
@@ -2020,7 +2176,8 @@ async function waitForAndValidateArtifact(
if (!isDevflowErrorWithCode(error, "artifact_timeout_exhausted")) { if (!isDevflowErrorWithCode(error, "artifact_timeout_exhausted")) {
throw error; throw error;
} }
await recordArtifactTimeout(input, eventRepository, attempt, sessionId); const timedOutSessionState = await classifyTimedOutSession(input, sessionId);
await recordArtifactTimeout(input, eventRepository, attempt, sessionId, timedOutSessionState);
throw error; throw error;
} }
@@ -2087,6 +2244,7 @@ async function recordArtifactTimeout(
eventRepository: RunEventRepository, eventRepository: RunEventRepository,
attempt: number, attempt: number,
sessionId: string, sessionId: string,
sessionState: "ARTIFACT_TIMEOUT" | "HUNG" | "CRASHED",
) { ) {
await input.db.transaction(async (tx) => { await input.db.transaction(async (tx) => {
await assertRunCanMutatePhaseInTransaction(input, tx); await assertRunCanMutatePhaseInTransaction(input, tx);
@@ -2098,14 +2256,58 @@ async function recordArtifactTimeout(
path: input.expectedArtifactPath, path: input.expectedArtifactPath,
schemaId: input.expectedSchema, schemaId: input.expectedSchema,
attempt, attempt,
sessionState,
}, },
idempotencyKey: `artifact.timeout:${input.phaseId}:${attempt}:${input.expectedArtifactPath}`, idempotencyKey: `artifact.timeout:${input.phaseId}:${attempt}:${input.expectedArtifactPath}`,
}); });
const [currentSession] = await tx
.select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, sessionId));
const currentState = currentSession?.state ?? "BUSY";
assertSessionStateAssignment(currentState, sessionState);
if (currentState === sessionState) {
return;
}
if (sessionState === "CRASHED") {
const recoveryAttempts = (currentSession?.recoveryAttempts ?? 0) + 1;
await tx await tx
.update(tuiSessions) .update(tuiSessions)
.set({ state: "ARTIFACT_TIMEOUT" }) .set({ recoveryAttempts, state: "CRASHED" })
.where(eq(tuiSessions.id, sessionId)); .where(eq(tuiSessions.id, sessionId));
await eventRepository.appendInTransaction(tx, {
runId: input.runId,
phaseId: input.phaseId,
type: "session.crashed",
payload: { sessionId, roleId: input.roleId, recoveryAttempts },
idempotencyKey: `session.crashed:${sessionId}:${recoveryAttempts}`,
}); });
return;
}
await tx.update(tuiSessions).set({ state: sessionState }).where(eq(tuiSessions.id, sessionId));
});
}
/**
 * Decides which session state to record after an artifact wait timed out.
 *
 * Probes the session once and maps the result:
 * - "CRASHED" when the probe reports the session is not alive or its pane is not active;
 * - "HUNG" when `isSessionHung` returns true for the probe's `lastOutputAt`, the current time
 *   and the configured `recovery.maxHungMs`;
 * - "ARTIFACT_TIMEOUT" otherwise, and also when the probe throws a recoverable DevflowError.
 *
 * @param input - Canonical phase input supplying the session runtime and recovery options.
 * @param sessionId - Session to probe.
 * @returns The state to persist for the timed-out session.
 * @throws Any probe error that is not a DevflowError of class "recoverable".
 */
async function classifyTimedOutSession(
input: CanonicalRunSingleFakePhaseInput,
sessionId: string,
): Promise<"ARTIFACT_TIMEOUT" | "HUNG" | "CRASHED"> {
try {
const probe = await probeWithTypedError(input.sessions, { sessionId });
// Liveness is checked first, so a dead session or inactive pane is never reported as HUNG.
if (!probe.alive || !probe.paneActive) {
return "CRASHED";
}
return isSessionHung(probe.lastOutputAt, new Date(), input.recovery?.maxHungMs)
? "HUNG"
: "ARTIFACT_TIMEOUT";
} catch (error) {
// A transient probe failure should not be promoted to a crash classification,
// but fatal/unclassified probe failures must still fail the run.
if (error instanceof DevflowError && error.class === "recoverable") {
return "ARTIFACT_TIMEOUT";
}
throw error;
}
} }
async function recordArtifactValidation( async function recordArtifactValidation(
@@ -2232,6 +2434,11 @@ async function markSessionIdle(
) { ) {
await input.db.transaction(async (tx) => { await input.db.transaction(async (tx) => {
await assertRunCanMutatePhaseInTransaction(input, tx); await assertRunCanMutatePhaseInTransaction(input, tx);
const [currentSession] = await tx
.select({ state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, sessionId));
assertSessionStateAssignment(currentSession?.state ?? "BUSY", "READY");
await tx.update(tuiSessions).set({ state: "READY" }).where(eq(tuiSessions.id, sessionId)); await tx.update(tuiSessions).set({ state: "READY" }).where(eq(tuiSessions.id, sessionId));
await eventRepository.appendInTransaction(tx, { await eventRepository.appendInTransaction(tx, {
runId: input.runId, runId: input.runId,
@@ -2266,26 +2473,36 @@ async function recoverFromArtifactTimeout(
input: CanonicalRunSingleFakePhaseInput, input: CanonicalRunSingleFakePhaseInput,
eventRepository: RunEventRepository, eventRepository: RunEventRepository,
sessionId: string, sessionId: string,
): Promise<boolean> { ): Promise<ArtifactTimeoutRecoveryResult> {
const probe = await probeWithTypedError(input.sessions, { sessionId }); const currentState = await sessionState(input, sessionId);
if (!probe.alive || !probe.paneActive || isBackendReadinessUnknown(probe)) { if (currentState === "READY") {
return false; return { recovered: true };
} }
if (!["CRASHED", "RESUMING", "REBOOTSTRAPPED"].includes(currentState ?? "")) {
const probe = await probeWithTypedError(input.sessions, { sessionId });
if (isBackendReadinessUnknown(probe)) {
return {
recovered: false,
recoveryHint: recoveryHintForProbe(probe),
};
}
}
if (currentState !== "REBOOTSTRAPPED") {
await setSessionStateIfRunActive(input, sessionId, "RESUMING"); await setSessionStateIfRunActive(input, sessionId, "RESUMING");
const rebootstrapOk = await rebootstrapWithRetry(input.sessions, { sessionId }); await rebootstrapWithRetry(input.sessions, { sessionId });
if (!rebootstrapOk) {
return false;
}
await setSessionStateIfRunActive(input, sessionId, "REBOOTSTRAPPED"); await setSessionStateIfRunActive(input, sessionId, "REBOOTSTRAPPED");
}
await input.db.transaction(async (tx) => { await input.db.transaction(async (tx) => {
await assertRunCanMutatePhaseInTransaction(input, tx); await assertRunCanMutatePhaseInTransaction(input, tx);
const [session] = await tx const [session] = await tx
.select({ recoveryAttempts: tuiSessions.recoveryAttempts }) .select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
.from(tuiSessions) .from(tuiSessions)
.where(eq(tuiSessions.id, sessionId)); .where(eq(tuiSessions.id, sessionId));
const recoveryAttempts = (session?.recoveryAttempts ?? 0) + 1; const recoveryAttempts = (session?.recoveryAttempts ?? 0) + 1;
assertSessionTransition(session?.state ?? "REBOOTSTRAPPED", "READY");
await tx await tx
.update(tuiSessions) .update(tuiSessions)
.set({ state: "READY", recoveryAttempts }) .set({ state: "READY", recoveryAttempts })
@@ -2298,7 +2515,30 @@ async function recoverFromArtifactTimeout(
idempotencyKey: `session.recovered:${sessionId}:${recoveryAttempts}`, idempotencyKey: `session.recovered:${sessionId}:${recoveryAttempts}`,
}); });
}); });
return true; return { recovered: true };
}
/** Result returned by `recoverFromArtifactTimeout`. */
interface ArtifactTimeoutRecoveryResult {
// Whether recovery succeeded; callers request a human gate when this is false.
recovered: boolean;
// Optional hint attached to a failed recovery; callers fall back to the expected artifact path when it is absent.
recoveryHint?: string;
}
/**
 * Derives a recovery hint from a probe result.
 *
 * @param probe - Probe result to describe.
 * @returns The probe's own `hint` when it is defined and non-empty; otherwise a summary of the
 *   form `probe_alive=<alive>;pane_active=<paneActive>`.
 */
function recoveryHintForProbe(probe: ProbeResult): string {
  const hint = probe.hint;
  if (hint === undefined || !(hint.length > 0)) {
    return `probe_alive=${probe.alive};pane_active=${probe.paneActive}`;
  }
  return hint;
}
/**
 * Reads the currently persisted state of a session.
 *
 * @param input - Canonical phase input supplying the db handle.
 * @param sessionId - Id of the session row to look up.
 * @returns The stored state, or `undefined` when no row matches the id.
 */
async function sessionState(
input: CanonicalRunSingleFakePhaseInput,
sessionId: string,
): Promise<string | undefined> {
// Only the first matching row is used; the lookup is by session id.
const [session] = await input.db
.select({ state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, sessionId));
return session?.state;
} }
function isBackendReadinessUnknown(probe: ProbeResult): boolean { function isBackendReadinessUnknown(probe: ProbeResult): boolean {
@@ -2312,6 +2552,13 @@ async function setSessionStateIfRunActive(
) { ) {
await input.db.transaction(async (tx) => { await input.db.transaction(async (tx) => {
await assertRunCanMutatePhaseInTransaction(input, tx); await assertRunCanMutatePhaseInTransaction(input, tx);
const [session] = await tx
.select({ state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, sessionId));
if (session !== undefined) {
assertSessionStateAssignment(session.state, state);
}
await tx.update(tuiSessions).set({ state }).where(eq(tuiSessions.id, sessionId)); await tx.update(tuiSessions).set({ state }).where(eq(tuiSessions.id, sessionId));
}); });
} }
@@ -2337,11 +2584,11 @@ async function probeWithTypedError(
async function rebootstrapWithRetry( async function rebootstrapWithRetry(
sessions: SessionRuntime, sessions: SessionRuntime,
handle: { sessionId: string }, handle: { sessionId: string },
): Promise<boolean> { ): Promise<void> {
for (let attemptsRemaining = 2; attemptsRemaining > 0; attemptsRemaining -= 1) {
try { try {
await retryRecoverable("rebootstrap", async () => {
await sessions.rebootstrap(handle); await sessions.rebootstrap(handle);
return true; });
} catch (error) { } catch (error) {
if (!(error instanceof DevflowError)) { if (!(error instanceof DevflowError)) {
throw new DevflowError("Unclassified rebootstrap failure", { throw new DevflowError("Unclassified rebootstrap failure", {
@@ -2353,30 +2600,15 @@ async function rebootstrapWithRetry(
if (error.class !== "recoverable") { if (error.class !== "recoverable") {
throw error; throw error;
} }
// Retry budget is intentionally one rebootstrap retry after the first failure. throw error;
} }
} }
return false;
}
async function resumeWithRetry( async function resumeWithRetry(
sessions: SessionRuntime, sessions: SessionRuntime,
handle: { sessionId: string }, handle: { sessionId: string },
): Promise<SessionHandle> { ): Promise<SessionHandle> {
let lastError: unknown; return retryRecoverable("resume", () => sessions.resume(handle));
for (let physicalAttempt = 0; physicalAttempt <= 2; physicalAttempt += 1) {
try {
return await sessions.resume(handle);
} catch (error) {
lastError = error;
if (!(error instanceof DevflowError) || error.class !== "recoverable") {
throw error;
}
}
}
throw lastError;
} }
async function markSessionFailedNeedsHuman( async function markSessionFailedNeedsHuman(
@@ -2384,6 +2616,13 @@ async function markSessionFailedNeedsHuman(
eventRepository: RunEventRepository, eventRepository: RunEventRepository,
sessionId: string, sessionId: string,
) { ) {
const [existingSession] = await input.db
.select({ state: tuiSessions.state })
.from(tuiSessions)
.where(eq(tuiSessions.id, sessionId));
if (existingSession !== undefined) {
assertSessionStateAssignment(existingSession.state, "FAILED_NEEDS_HUMAN");
}
await input.db await input.db
.insert(tuiSessions) .insert(tuiSessions)
.values({ .values({
@@ -2416,12 +2655,15 @@ async function markAllSessionsFailedInTransaction(
runId: string, runId: string,
): Promise<string[]> { ): Promise<string[]> {
const sessions = await tx const sessions = await tx
.select({ id: tuiSessions.id, roleId: tuiSessions.roleId }) .select({ id: tuiSessions.id, roleId: tuiSessions.roleId, state: tuiSessions.state })
.from(tuiSessions) .from(tuiSessions)
.where(eq(tuiSessions.runId, runId)); .where(eq(tuiSessions.runId, runId));
if (sessions.length === 0) { if (sessions.length === 0) {
return []; return [];
} }
for (const session of sessions) {
assertSessionStateAssignment(session.state, "FAILED_NEEDS_HUMAN");
}
await tx await tx
.update(tuiSessions) .update(tuiSessions)
@@ -2684,7 +2926,7 @@ function shouldCreateHumanGate(error: unknown): error is DevflowError {
function toHumanRequiredRecoveryError(error: DevflowError): DevflowError { function toHumanRequiredRecoveryError(error: DevflowError): DevflowError {
if (error.class === "human_required") { if (error.class === "human_required") {
return error; return ensureRecoveryHint(error);
} }
const options: ConstructorParameters<typeof DevflowError>[1] = { const options: ConstructorParameters<typeof DevflowError>[1] = {
@@ -2708,7 +2950,7 @@ function toHumanRequiredRecoveryError(error: DevflowError): DevflowError {
function toArtifactTimeoutRecoveryGateError(error: DevflowError): DevflowError { function toArtifactTimeoutRecoveryGateError(error: DevflowError): DevflowError {
if (error.class === "human_required") { if (error.class === "human_required") {
return error; return ensureRecoveryHint(error);
} }
const options: ConstructorParameters<typeof DevflowError>[1] = { const options: ConstructorParameters<typeof DevflowError>[1] = {
@@ -2727,6 +2969,26 @@ function toArtifactTimeoutRecoveryGateError(error: DevflowError): DevflowError {
return new DevflowError("Artifact timeout recovery exhausted retry budget", options); return new DevflowError("Artifact timeout recovery exhausted retry budget", options);
} }
/**
 * Guarantees that the returned error carries a non-empty `recoveryHint`.
 *
 * @param error - Error to inspect.
 * @returns The same instance when it already has a non-empty hint; otherwise a
 *   new `DevflowError` with the same message, class, code and cause, whose
 *   `recoveryHint` is the original message. `runId` and `phaseId` are copied
 *   only when they are defined on the original.
 */
function ensureRecoveryHint(error: DevflowError): DevflowError {
  const existingHint = error.recoveryHint;
  if (existingHint !== undefined && existingHint.length > 0) {
    return error;
  }

  const options: ConstructorParameters<typeof DevflowError>[1] = {
    class: error.class,
    code: error.code,
    // With no dedicated hint available, the message itself becomes the hint.
    recoveryHint: error.message,
    cause: error.cause,
    ...(error.runId === undefined ? {} : { runId: error.runId }),
    ...(error.phaseId === undefined ? {} : { phaseId: error.phaseId }),
  };
  return new DevflowError(error.message, options);
}
async function removeStaleArtifact(input: CanonicalRunSingleFakePhaseInput): Promise<void> { async function removeStaleArtifact(input: CanonicalRunSingleFakePhaseInput): Promise<void> {
try { try {
await unlink(input.expectedArtifactPath); await unlink(input.expectedArtifactPath);

View File

@@ -1,5 +1,6 @@
export * from "./adapter.js"; export * from "./adapter.js";
export * from "./fake.js"; export * from "./fake.js";
export * from "./manager.js"; export * from "./manager.js";
export * from "./recovery.js";
export * from "./transcript.js"; export * from "./transcript.js";
export * from "./tmux.js"; export * from "./tmux.js";

View File

@@ -19,6 +19,7 @@ import type {
TranscriptBaseline, TranscriptBaseline,
TranscriptChunk, TranscriptChunk,
} from "./adapter.js"; } from "./adapter.js";
import { assertSessionTransition, retryRecoverable } from "./recovery.js";
import { captureAndPersistTranscript } from "./transcript.js"; import { captureAndPersistTranscript } from "./transcript.js";
type Database = DbClient["db"]; type Database = DbClient["db"];
@@ -298,6 +299,12 @@ export class SessionManager implements SessionRuntime {
if (this.db === undefined || !["CREATED", "BOOTSTRAPPING"].includes(session.state)) { if (this.db === undefined || !["CREATED", "BOOTSTRAPPING"].includes(session.state)) {
return; return;
} }
if (session.state === "CREATED") {
assertSessionTransition("CREATED", "BOOTSTRAPPING");
assertSessionTransition("BOOTSTRAPPING", "READY");
} else {
assertSessionTransition(session.state, "READY");
}
const eventRepository = new RunEventRepository(this.db); const eventRepository = new RunEventRepository(this.db);
const sessionUpdate: { const sessionUpdate: {
@@ -345,6 +352,7 @@ export class SessionManager implements SessionRuntime {
backend: string; backend: string;
cwd: string; cwd: string;
recoveryAttempts: number; recoveryAttempts: number;
state: string;
}, },
error: unknown, error: unknown,
): Promise<void> { ): Promise<void> {
@@ -357,6 +365,7 @@ export class SessionManager implements SessionRuntime {
const gateKey = "session_recovery_required"; const gateKey = "session_recovery_required";
const approvalIdempotencyKey = `${session.runId}:${gateKey}:${session.id}:${recoveryAttempts}`; const approvalIdempotencyKey = `${session.runId}:${gateKey}:${session.id}:${recoveryAttempts}`;
const pauseCause = `session_recovery_failed:${session.id}:${recoveryAttempts}`; const pauseCause = `session_recovery_failed:${session.id}:${recoveryAttempts}`;
assertSessionTransition(session.state, "FAILED_NEEDS_HUMAN");
await this.db.transaction(async (tx) => { await this.db.transaction(async (tx) => {
await tx.execute(sql`SELECT 1 FROM ${runs} WHERE ${runs.id} = ${session.runId} FOR UPDATE`); await tx.execute(sql`SELECT 1 FROM ${runs} WHERE ${runs.id} = ${session.runId} FOR UPDATE`);
const [run] = await tx const [run] = await tx
@@ -431,18 +440,7 @@ export class SessionManager implements SessionRuntime {
} }
private async resumeWithRetry(handle: SessionHandle): Promise<SessionHandle> { private async resumeWithRetry(handle: SessionHandle): Promise<SessionHandle> {
let lastError: unknown; return retryRecoverable("resume", () => this.track(this.adapter.resume(handle)));
for (let attempt = 0; attempt <= 2; attempt += 1) {
try {
return await this.track(this.adapter.resume(handle));
} catch (error) {
lastError = error;
if (!(error instanceof DevflowError) || error.class !== "recoverable") {
throw error;
}
}
}
throw lastError;
} }
private async loadTranscriptBaseline( private async loadTranscriptBaseline(

View File

@@ -0,0 +1,103 @@
import { describe, expect, it } from "vitest";
import { DevflowError } from "@devflow/core";
import {
SessionRecoveryBudget,
assertSessionStateAssignment,
assertSessionTransition,
isSessionHung,
retryRecoverable,
} from "./recovery.js";
// Unit tests for the session recovery helpers exported by ./recovery.js:
// transition/assignment guards, the bounded retry helper, and the hung check.
describe("session recovery policy", () => {
// Walks one allowed path through the state machine, then checks that three
// disallowed moves (including a self-transition) are rejected.
it("allows only locked session state-machine transitions", () => {
expect(() => assertSessionTransition("CREATED", "BOOTSTRAPPING")).not.toThrow();
expect(() => assertSessionTransition("BOOTSTRAPPING", "READY")).not.toThrow();
expect(() => assertSessionTransition("READY", "BUSY")).not.toThrow();
expect(() => assertSessionTransition("BUSY", "READY")).not.toThrow();
expect(() => assertSessionTransition("BUSY", "ARTIFACT_TIMEOUT")).not.toThrow();
expect(() => assertSessionTransition("ARTIFACT_TIMEOUT", "RESUMING")).not.toThrow();
expect(() => assertSessionTransition("RESUMING", "REBOOTSTRAPPED")).not.toThrow();
expect(() => assertSessionTransition("REBOOTSTRAPPED", "READY")).not.toThrow();
expect(() => assertSessionTransition("READY", "REBOOTSTRAPPED")).toThrow(
/Invalid session state transition/,
);
expect(() => assertSessionTransition("CRASHED", "CRASHED")).toThrow(
/Invalid session state transition/,
);
expect(() => assertSessionTransition("FAILED_NEEDS_HUMAN", "READY")).toThrow(
/Invalid session state transition/,
);
});
// Assignment differs from transition: re-assigning the current state is
// accepted, while a change of state must still be an allowed transition.
it("allows no-op state assignment without treating it as a transition", () => {
expect(() => assertSessionStateAssignment("READY", "READY")).not.toThrow();
expect(() => assertSessionStateAssignment("BUSY", "READY")).not.toThrow();
expect(() => assertSessionStateAssignment("FAILED_NEEDS_HUMAN", "READY")).toThrow(
/Invalid session state assignment/,
);
});
// The callback fails with a recoverable error until the last physical attempt
// in the sendPrompt budget, so the helper must call it exactly three times.
it("retries recoverable errors for one initial prompt send plus two retries", async () => {
let attempts = 0;
const result = await retryRecoverable("sendPrompt", async () => {
attempts += 1;
if (attempts < SessionRecoveryBudget.sendPrompt.physicalAttempts) {
throw new DevflowError("temporary prompt failure", {
class: "recoverable",
code: "prompt_send_transient",
});
}
return "sent";
});
expect(result).toBe("sent");
expect(attempts).toBe(3);
});
// A callback that always fails recoverably is invoked once per physical
// attempt in the rebootstrap budget, after which the last error is rethrown.
it("throws the final recoverable error after the retry budget is exhausted", async () => {
let attempts = 0;
await expect(
retryRecoverable("rebootstrap", async () => {
attempts += 1;
throw new DevflowError("pane briefly unresponsive", {
class: "recoverable",
code: "pane_briefly_unresponsive",
});
}),
).rejects.toMatchObject({
class: "recoverable",
code: "pane_briefly_unresponsive",
});
expect(attempts).toBe(SessionRecoveryBudget.rebootstrap.physicalAttempts);
});
// A non-recoverable error class must propagate after a single call.
it("does not retry human-required or fatal errors", async () => {
let attempts = 0;
await expect(
retryRecoverable("resume", async () => {
attempts += 1;
throw new DevflowError("operator action required", {
class: "human_required",
code: "session_recovery_required",
});
}),
).rejects.toMatchObject({
class: "human_required",
code: "session_recovery_required",
});
expect(attempts).toBe(1);
});
// With the default threshold: exactly 20 minutes of silence counts as hung,
// one second less does not, and a missing timestamp is never hung.
it("uses the locked hung-session timeout boundary", () => {
const now = new Date("2026-05-13T10:20:00.000Z");
expect(isSessionHung(new Date("2026-05-13T10:00:00.000Z"), now)).toBe(true);
expect(isSessionHung(new Date("2026-05-13T10:00:01.000Z"), now)).toBe(false);
expect(isSessionHung(undefined, now)).toBe(false);
});
});

View File

@@ -0,0 +1,127 @@
import { DevflowError, SessionState, type SessionState as SessionStateName } from "@devflow/core";
/**
 * Frozen retry and timeout limits for session recovery.
 *
 * Each per-operation entry records the number of `retries` and the total
 * `physicalAttempts`; in every entry below the latter is `retries + 1`.
 * `maxHungMs` (20 minutes, in milliseconds) is the default threshold used by
 * `isSessionHung`.
 */
export const SessionRecoveryBudget = Object.freeze({
  sendPrompt: Object.freeze({
    retries: 2,
    physicalAttempts: 3,
  }),
  resume: Object.freeze({
    retries: 2,
    physicalAttempts: 3,
  }),
  rebootstrap: Object.freeze({
    retries: 1,
    physicalAttempts: 2,
  }),
  artifactRepair: Object.freeze({
    retries: 1,
    physicalAttempts: 2,
  }),
  // 20 minutes expressed in milliseconds.
  maxHungMs: 20 * 60 * 1000,
});
/** Names of the operations that have a retry budget; each one is a case handled by `recoveryPhysicalAttempts`. */
export type SessionRetryOperation = "sendPrompt" | "resume" | "rebootstrap" | "artifactRepair";
/**
 * Session state machine: maps each source state to the set of states it may
 * move to. Consulted by `isAllowedSessionTransition`. Self-transitions are not
 * listed for any state.
 */
const allowedSessionTransitions: ReadonlyMap<
SessionStateName,
ReadonlySet<SessionStateName>
> = new Map([
["CREATED", new Set(["BOOTSTRAPPING", "FAILED_NEEDS_HUMAN"])],
["BOOTSTRAPPING", new Set(["READY", "FAILED_NEEDS_HUMAN"])],
["READY", new Set(["BUSY", "FAILED_NEEDS_HUMAN"])],
[
// BUSY has the most exits: back to READY, to an approval wait, or to one of
// the three failure-detection states.
"BUSY",
new Set([
"READY",
"WAITING_FOR_APPROVAL",
"ARTIFACT_TIMEOUT",
"HUNG",
"CRASHED",
"FAILED_NEEDS_HUMAN",
]),
],
["WAITING_FOR_APPROVAL", new Set(["READY", "FAILED_NEEDS_HUMAN"])],
// The three failure-detection states may only proceed to RESUMING (or give up).
["ARTIFACT_TIMEOUT", new Set(["RESUMING", "FAILED_NEEDS_HUMAN"])],
["HUNG", new Set(["RESUMING", "FAILED_NEEDS_HUMAN"])],
["CRASHED", new Set(["RESUMING", "FAILED_NEEDS_HUMAN"])],
["RESUMING", new Set(["READY", "REBOOTSTRAPPED", "FAILED_NEEDS_HUMAN"])],
["REBOOTSTRAPPED", new Set(["READY", "FAILED_NEEDS_HUMAN"])],
// Empty target set: no transition out of FAILED_NEEDS_HUMAN is allowed.
["FAILED_NEEDS_HUMAN", new Set()],
]);
/**
 * Reports whether moving a session from `from` to `to` is a permitted
 * state-machine transition.
 *
 * @param from - Current state name.
 * @param to - Requested state name.
 * @returns `true` only when both names parse as `SessionState` values and the
 *   transition table lists `to` as a target of `from`; `false` otherwise.
 */
export function isAllowedSessionTransition(from: string, to: string): boolean {
  const source = SessionState.safeParse(from);
  const target = SessionState.safeParse(to);
  if (source.success && target.success) {
    const reachable = allowedSessionTransitions.get(source.data);
    return reachable === undefined ? false : reachable.has(target.data);
  }
  return false;
}
/**
 * Reports whether a session currently in `from` may be assigned state `to`.
 *
 * Unlike a transition, an assignment also accepts re-assigning the current
 * state (a no-op write).
 *
 * @param from - Current state name.
 * @param to - State name about to be written.
 * @returns `false` when either name does not parse as a `SessionState`;
 *   `true` when both are the same state or the move is an allowed transition.
 */
export function isAllowedSessionStateAssignment(from: string, to: string): boolean {
  const current = SessionState.safeParse(from);
  const next = SessionState.safeParse(to);
  if (!(current.success && next.success)) {
    return false;
  }
  return current.data === next.data || isAllowedSessionTransition(current.data, next.data);
}
/**
 * Throws unless `from` -> `to` is a permitted session state transition.
 *
 * @param from - Current state name.
 * @param to - Requested state name.
 * @throws DevflowError with class `fatal` and code `internal_state_corruption`
 *   when `isAllowedSessionTransition` rejects the pair; the attempted move is
 *   recorded in `recoveryHint` as `from->to`.
 */
export function assertSessionTransition(from: string, to: string): void {
  if (!isAllowedSessionTransition(from, to)) {
    throw new DevflowError("Invalid session state transition", {
      class: "fatal",
      code: "internal_state_corruption",
      recoveryHint: `${from}->${to}`,
    });
  }
}
/**
 * Throws unless a session in state `from` may be assigned state `to`.
 *
 * @param from - Current state name.
 * @param to - State name about to be written.
 * @throws DevflowError with class `fatal` and code `internal_state_corruption`
 *   when `isAllowedSessionStateAssignment` rejects the pair; the attempted
 *   assignment is recorded in `recoveryHint` as `from->to`.
 */
export function assertSessionStateAssignment(from: string, to: string): void {
  if (!isAllowedSessionStateAssignment(from, to)) {
    throw new DevflowError("Invalid session state assignment", {
      class: "fatal",
      code: "internal_state_corruption",
      recoveryHint: `${from}->${to}`,
    });
  }
}
/**
 * Runs `run` until it succeeds, retrying only on recoverable `DevflowError`s
 * and only within the physical-attempt budget of `operation`.
 *
 * @param operation - Which retry budget to apply.
 * @param run - Work to perform; receives the 1-based physical attempt number.
 * @returns The value produced by the first successful call of `run`.
 * @throws Immediately rethrows any error that is not a `DevflowError` with
 *   class `recoverable`; once the budget is used up, rethrows the last
 *   recoverable error.
 */
export async function retryRecoverable<T>(
  operation: SessionRetryOperation,
  run: (physicalAttempt: number) => Promise<T>,
): Promise<T> {
  const attemptBudget = recoveryPhysicalAttempts(operation);
  let lastError: unknown;
  let physicalAttempt = 1;
  while (physicalAttempt <= attemptBudget) {
    try {
      return await run(physicalAttempt);
    } catch (error) {
      const isRecoverable = error instanceof DevflowError && error.class === "recoverable";
      if (!isRecoverable) {
        throw error;
      }
      lastError = error;
    }
    physicalAttempt += 1;
  }
  throw lastError;
}
/**
 * Decides whether a session has been silent long enough to count as hung.
 *
 * @param lastOutputAt - Time of the session's last output, if any.
 * @param now - Current time to compare against.
 * @param maxHungMs - Silence threshold in milliseconds; defaults to
 *   `SessionRecoveryBudget.maxHungMs`.
 * @returns `false` when `lastOutputAt` is undefined; otherwise `true` when the
 *   elapsed time is greater than or equal to `maxHungMs` (the boundary itself
 *   counts as hung).
 */
export function isSessionHung(
  lastOutputAt: Date | undefined,
  now: Date,
  maxHungMs = SessionRecoveryBudget.maxHungMs,
): boolean {
  return lastOutputAt !== undefined && now.getTime() - lastOutputAt.getTime() >= maxHungMs;
}
/**
 * Looks up how many physical attempts the retry budget grants `operation`.
 *
 * @param operation - Operation whose budget is requested.
 * @returns The `physicalAttempts` value from `SessionRecoveryBudget`.
 * @throws DevflowError with class `fatal` when `operation` is not one of the
 *   known operations. The type system normally rules this out, but an untyped
 *   caller would otherwise get `undefined` back, which makes
 *   `retryRecoverable` skip its loop and throw `undefined`.
 */
function recoveryPhysicalAttempts(operation: SessionRetryOperation): number {
  switch (operation) {
    case "sendPrompt":
      return SessionRecoveryBudget.sendPrompt.physicalAttempts;
    case "resume":
      return SessionRecoveryBudget.resume.physicalAttempts;
    case "rebootstrap":
      return SessionRecoveryBudget.rebootstrap.physicalAttempts;
    case "artifactRepair":
      return SessionRecoveryBudget.artifactRepair.physicalAttempts;
    default: {
      // Compile-time exhaustiveness check: adding a member to
      // SessionRetryOperation without a case above becomes a type error here.
      const unknownOperation: never = operation;
      throw new DevflowError("Unknown session retry operation", {
        class: "fatal",
        code: "internal_state_corruption",
        recoveryHint: `operation=${String(unknownOperation)}`,
      });
    }
  }
}

View File

@@ -14,6 +14,9 @@ export interface DevflowActivityDependencies {
workspaceRoot: string; workspaceRoot: string;
availableBackends?: readonly BackendConfig[]; availableBackends?: readonly BackendConfig[];
maxConcurrentRuns?: number; maxConcurrentRuns?: number;
recovery?: {
maxHungMs?: number;
};
wait?: { wait?: {
timeoutMs?: number; timeoutMs?: number;
pollIntervalMs?: number; pollIntervalMs?: number;
@@ -50,6 +53,7 @@ export function createDevflowActivities(
...(dependencies.maxConcurrentRuns === undefined ...(dependencies.maxConcurrentRuns === undefined
? {} ? {}
: { maxConcurrentRuns: dependencies.maxConcurrentRuns }), : { maxConcurrentRuns: dependencies.maxConcurrentRuns }),
...(dependencies.recovery === undefined ? {} : { recovery: dependencies.recovery }),
...(activityWait === undefined ? {} : { wait: activityWait }), ...(activityWait === undefined ? {} : { wait: activityWait }),
}); });
}; };