feat: add tui recovery state machine
This commit is contained in:
@@ -28,7 +28,12 @@ import {
|
||||
SessionManager,
|
||||
type SessionRuntime,
|
||||
type TranscriptChunkSink,
|
||||
assertSessionStateAssignment,
|
||||
assertSessionTransition,
|
||||
captureAndPersistTranscript,
|
||||
isAllowedSessionTransition,
|
||||
isSessionHung,
|
||||
retryRecoverable,
|
||||
} from "@devflow/session";
|
||||
import { and, desc, eq, inArray, sql } from "drizzle-orm";
|
||||
|
||||
@@ -44,6 +49,10 @@ interface ArtifactWaitOptions extends FakePhaseWaitOptions {
|
||||
ignoreInitialSignature?: string;
|
||||
}
|
||||
|
||||
/**
 * Optional tuning for how a timed-out session is classified before recovery.
 */
interface FakePhaseRecoveryOptions {
  /**
   * Forwarded as the third argument to `isSessionHung` when a timed-out session
   * is classified. Presumably the output-silence threshold in milliseconds —
   * confirm against `@devflow/session`.
   */
  maxHungMs?: number;
}
|
||||
|
||||
interface RunSingleFakePhaseBaseInput {
|
||||
db: DbClient["db"];
|
||||
runId: string;
|
||||
@@ -55,6 +64,7 @@ interface RunSingleFakePhaseBaseInput {
|
||||
expectedSchema: string;
|
||||
instructions: string;
|
||||
wait?: FakePhaseWaitOptions;
|
||||
recovery?: FakePhaseRecoveryOptions;
|
||||
uuidFactory?: () => string;
|
||||
transcriptSink?: TranscriptChunkSink;
|
||||
terminalRun?: boolean;
|
||||
@@ -81,7 +91,6 @@ export interface RunSingleFakePhaseResult {
|
||||
|
||||
type TransactionDb = Parameters<Parameters<RunSingleFakePhaseInput["db"]["transaction"]>[0]>[0];
|
||||
|
||||
const sendPromptRetryBudget = 2;
|
||||
const terminalRunStates = ["completed", "failed", "aborted"] as const;
|
||||
const phaseMutationRunStates = ["executing", "planning"] as const;
|
||||
|
||||
@@ -298,9 +307,9 @@ export async function runSingleFakePhase(
|
||||
throw error;
|
||||
}
|
||||
|
||||
let recovered: boolean;
|
||||
let recovery: ArtifactTimeoutRecoveryResult;
|
||||
try {
|
||||
recovered = await recoverFromArtifactTimeout(input, eventRepository, handle.sessionId);
|
||||
recovery = await recoverFromArtifactTimeout(input, eventRepository, handle.sessionId);
|
||||
} catch (recoveryError) {
|
||||
if (isRunStateChanged(recoveryError)) {
|
||||
await captureTranscript(input, handle);
|
||||
@@ -333,7 +342,7 @@ export async function runSingleFakePhase(
|
||||
);
|
||||
throw recoveryError;
|
||||
}
|
||||
if (!recovered) {
|
||||
if (!recovery.recovered) {
|
||||
await failPhaseAndRequestGate(
|
||||
input,
|
||||
eventRepository,
|
||||
@@ -342,9 +351,9 @@ export async function runSingleFakePhase(
|
||||
"artifact_timeout_exhausted",
|
||||
{
|
||||
expectedArtifactPath: input.expectedArtifactPath,
|
||||
recoveryHint: recovery.recoveryHint ?? input.expectedArtifactPath,
|
||||
},
|
||||
handle.sessionId,
|
||||
{ markSessionCrashed: true },
|
||||
);
|
||||
await captureTranscript(input, handle);
|
||||
throw error;
|
||||
@@ -358,6 +367,7 @@ export async function runSingleFakePhase(
|
||||
"artifact_timeout_exhausted",
|
||||
{
|
||||
expectedArtifactPath: input.expectedArtifactPath,
|
||||
recoveryHint: input.expectedArtifactPath,
|
||||
},
|
||||
handle.sessionId,
|
||||
);
|
||||
@@ -470,6 +480,7 @@ export async function runSingleFakePhase(
|
||||
"artifact_timeout_exhausted",
|
||||
{
|
||||
expectedArtifactPath: input.expectedArtifactPath,
|
||||
recoveryHint: input.expectedArtifactPath,
|
||||
},
|
||||
handle.sessionId,
|
||||
);
|
||||
@@ -511,6 +522,7 @@ export async function runSingleFakePhase(
|
||||
{
|
||||
artifactId: outcome.artifact.id,
|
||||
expectedArtifactPath: input.expectedArtifactPath,
|
||||
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
|
||||
},
|
||||
handle.sessionId,
|
||||
);
|
||||
@@ -520,6 +532,7 @@ export async function runSingleFakePhase(
|
||||
code: "artifact_invalid_after_repair",
|
||||
runId: input.runId,
|
||||
phaseId: input.phaseId,
|
||||
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -621,6 +634,7 @@ export async function runSingleFakePhase(
|
||||
"artifact_timeout_exhausted",
|
||||
{
|
||||
expectedArtifactPath: input.expectedArtifactPath,
|
||||
recoveryHint: input.expectedArtifactPath,
|
||||
},
|
||||
handle.sessionId,
|
||||
);
|
||||
@@ -667,6 +681,7 @@ export async function runSingleFakePhase(
|
||||
{
|
||||
artifactId: outcome.artifact.id,
|
||||
expectedArtifactPath: input.expectedArtifactPath,
|
||||
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
|
||||
},
|
||||
handle.sessionId,
|
||||
);
|
||||
@@ -676,6 +691,7 @@ export async function runSingleFakePhase(
|
||||
code: "artifact_invalid_after_repair",
|
||||
runId: input.runId,
|
||||
phaseId: input.phaseId,
|
||||
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -833,6 +849,18 @@ async function enterInitialPhase(
|
||||
})
|
||||
.from(tuiSessions)
|
||||
.where(and(eq(tuiSessions.runId, input.runId), eq(tuiSessions.roleId, input.roleId)));
|
||||
if (
|
||||
session !== undefined &&
|
||||
(isTimeoutRecoverySessionState(session.state) ||
|
||||
(session.state === "READY" && (await sessionRecoveredEventExists(input, session.id)))) &&
|
||||
session.lastPromptHash === envelope.dedupKey &&
|
||||
session.expectedArtifactPath === input.expectedArtifactPath &&
|
||||
session.expectedSchema === input.expectedSchema
|
||||
) {
|
||||
return recoverAwaitingArtifactReplay(input, eventRepository, phase.attempts, session.id, {
|
||||
repairAttemptUsed: phaseStart.repairAttemptUsed,
|
||||
});
|
||||
}
|
||||
if (
|
||||
session !== undefined &&
|
||||
session.state !== "FAILED_NEEDS_HUMAN" &&
|
||||
@@ -967,6 +995,112 @@ async function enterInitialPhase(
|
||||
throw cannotReplayPhase(input, phase.state);
|
||||
}
|
||||
|
||||
/**
 * Replays a phase whose session was left in a timeout-recovery state.
 *
 * Runs `recoverFromArtifactTimeout` for the session and then either opens a
 * repair attempt or fails the phase:
 *
 * - Recovery throws a run-state-changed error: rethrown untouched.
 * - Recovery throws an error that warrants a human gate: the phase is failed
 *   with `artifact_timeout_recovery_failed`, a gate is requested, and the
 *   gate-shaped error is thrown.
 * - Recovery throws anything else: phase and run are failed and the original
 *   error is rethrown.
 * - Recovery reports `recovered: false`, or the repair attempt was already
 *   used: a gate is requested and an `artifact_timeout_exhausted`
 *   human-required error is thrown.
 *
 * @param input - Canonical phase input (run, phase, expected artifact).
 * @param eventRepository - Repository used to append run events.
 * @param attempt - Attempt number recorded on failure events.
 * @param sessionId - Session being recovered.
 * @param options - `repairAttemptUsed` marks the single repair attempt as spent.
 * @returns A phase entry for the fresh repair attempt, with no resumed prompt
 *   and no continued wait/validation.
 */
async function recoverAwaitingArtifactReplay(
  input: CanonicalRunSingleFakePhaseInput,
  eventRepository: RunEventRepository,
  attempt: number,
  sessionId: string,
  options: { repairAttemptUsed: boolean },
): Promise<PhaseEntry> {
  let recovery: ArtifactTimeoutRecoveryResult;
  try {
    recovery = await recoverFromArtifactTimeout(input, eventRepository, sessionId);
  } catch (recoveryError) {
    // The run moved on underneath us; leave its state alone.
    if (isRunStateChanged(recoveryError)) {
      throw recoveryError;
    }
    if (shouldCreateHumanGate(recoveryError)) {
      const gateError = toArtifactTimeoutRecoveryGateError(recoveryError);
      await failPhaseAndRequestGate(
        input,
        eventRepository,
        attempt,
        "artifact_timeout_recovery_failed",
        gateError.code,
        {
          errorCode: recoveryError.code,
          expectedArtifactPath: input.expectedArtifactPath,
          recoveryHint: gateError.recoveryHint,
        },
        sessionId,
      );
      throw gateError;
    }
    await failPhaseAndRun(input, eventRepository, attempt, "artifact_timeout_recovery_failed");
    throw recoveryError;
  }

  if (!recovery.recovered || options.repairAttemptUsed) {
    // Fall back to the artifact path when recovery produced no specific hint.
    const recoveryHint = recovery.recoveryHint ?? input.expectedArtifactPath;
    await failPhaseAndRequestGate(
      input,
      eventRepository,
      attempt,
      "artifact_timeout",
      "artifact_timeout_exhausted",
      {
        expectedArtifactPath: input.expectedArtifactPath,
        recoveryHint,
      },
      sessionId,
    );
    throw new DevflowError("Artifact timeout recovery exhausted retry budget", {
      class: "human_required",
      code: "artifact_timeout_exhausted",
      runId: input.runId,
      phaseId: input.phaseId,
      recoveryHint,
    });
  }

  const repairAttempt = await startPhaseAndRecord(input, eventRepository, ["awaiting_artifact"], {
    reason: "artifact_timeout",
    repair: true,
  });
  try {
    await removeStaleArtifact(input);
  } catch (error) {
    await failPhaseAndRun(input, eventRepository, repairAttempt, "stale_artifact_remove_failed");
    throw error;
  }
  return {
    attempt: repairAttempt,
    continueArtifactWait: false,
    continueValidation: false,
    handle: { sessionId },
    repairAttemptUsed: true,
    resumedPrompt: false,
  };
}
|
||||
|
||||
/** Session states from which an artifact-timeout recovery replay may be attempted. */
const timeoutRecoverySessionStates: ReadonlySet<string> = new Set([
  "ARTIFACT_TIMEOUT",
  "HUNG",
  "CRASHED",
  "RESUMING",
  "REBOOTSTRAPPED",
]);

/**
 * Reports whether `state` is one of the timeout-recovery session states.
 *
 * The comparison is exact and case-sensitive.
 *
 * @param state - Raw session state string.
 * @returns `true` for `ARTIFACT_TIMEOUT`, `HUNG`, `CRASHED`, `RESUMING`, or
 *   `REBOOTSTRAPPED`; `false` otherwise.
 */
function isTimeoutRecoverySessionState(state: string): boolean {
  return timeoutRecoverySessionStates.has(state);
}
|
||||
|
||||
/**
 * Checks whether a `session.recovered` event was recorded for this session
 * within the current run and phase.
 *
 * The query selects all `session.recovered` events for the run/phase; the
 * session match is done in memory against each payload's `sessionId`.
 *
 * @param input - Supplies the database handle plus the run and phase ids.
 * @param sessionId - Session to look for in the event payloads.
 * @returns `true` if at least one matching event carries this session id.
 */
async function sessionRecoveredEventExists(
  input: CanonicalRunSingleFakePhaseInput,
  sessionId: string,
): Promise<boolean> {
  const events = await input.db
    .select({ payload: runEvents.payload })
    .from(runEvents)
    .where(
      and(
        eq(runEvents.runId, input.runId),
        eq(runEvents.phaseId, input.phaseId),
        eq(runEvents.type, "session.recovered"),
      ),
    );
  return events.some((event) => payloadSessionId(event.payload) === sessionId);
}
|
||||
|
||||
/**
 * Extracts a string `sessionId` from an untyped event payload.
 *
 * @param payload - Arbitrary payload value.
 * @returns The `sessionId` property when `payload` is a non-null object and
 *   the property is a string; `undefined` otherwise.
 */
function payloadSessionId(payload: unknown): string | undefined {
  if (payload === null || typeof payload !== "object") {
    return undefined;
  }
  // The guard above proves `payload` is an object; the property is still
  // checked for type before it is returned.
  const sessionId = (payload as Record<string, unknown>).sessionId;
  return typeof sessionId === "string" ? sessionId : undefined;
}
|
||||
|
||||
function cannotReplayPhase(
|
||||
input: CanonicalRunSingleFakePhaseInput,
|
||||
phaseState: string,
|
||||
@@ -1234,24 +1368,33 @@ async function failPhaseAndRequestGate(
|
||||
|
||||
if (sessionId !== undefined && options.markSessionCrashed === true) {
|
||||
const [session] = await tx
|
||||
.select({ recoveryAttempts: tuiSessions.recoveryAttempts })
|
||||
.select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
const recoveryAttempts = (session?.recoveryAttempts ?? 0) + 1;
|
||||
await tx
|
||||
.update(tuiSessions)
|
||||
.set({ state: "CRASHED", recoveryAttempts })
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
await eventRepository.appendInTransaction(tx, {
|
||||
runId: input.runId,
|
||||
phaseId: input.phaseId,
|
||||
type: "session.crashed",
|
||||
payload: { sessionId, roleId: input.roleId, recoveryAttempts },
|
||||
idempotencyKey: `session.crashed:${sessionId}:${recoveryAttempts}`,
|
||||
});
|
||||
if (session !== undefined && isAllowedSessionTransition(session.state, "CRASHED")) {
|
||||
await tx
|
||||
.update(tuiSessions)
|
||||
.set({ state: "CRASHED", recoveryAttempts })
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
await eventRepository.appendInTransaction(tx, {
|
||||
runId: input.runId,
|
||||
phaseId: input.phaseId,
|
||||
type: "session.crashed",
|
||||
payload: { sessionId, roleId: input.roleId, recoveryAttempts },
|
||||
idempotencyKey: `session.crashed:${sessionId}:${recoveryAttempts}`,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if (sessionId !== undefined) {
|
||||
const [session] = await tx
|
||||
.select({ state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
if (session !== undefined) {
|
||||
assertSessionTransition(session.state, "FAILED_NEEDS_HUMAN");
|
||||
}
|
||||
await tx
|
||||
.insert(tuiSessions)
|
||||
.values({
|
||||
@@ -1425,10 +1568,11 @@ async function completePhaseAndRun(
|
||||
});
|
||||
|
||||
const [session] = await tx
|
||||
.select({ recoveryAttempts: tuiSessions.recoveryAttempts })
|
||||
.select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
const recoveryAttempts = session?.recoveryAttempts ?? 0;
|
||||
assertSessionStateAssignment(session?.state ?? "BUSY", "READY");
|
||||
await tx.update(tuiSessions).set({ state: "READY" }).where(eq(tuiSessions.id, sessionId));
|
||||
await eventRepository.appendInTransaction(tx, {
|
||||
runId: input.runId,
|
||||
@@ -1484,6 +1628,11 @@ async function requestWorkflowApproval(
|
||||
.update(runPhases)
|
||||
.set({ state: "awaiting_approval" })
|
||||
.where(and(eq(runPhases.id, input.phaseId), eq(runPhases.runId, input.runId)));
|
||||
const [session] = await tx
|
||||
.select({ state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
assertSessionStateAssignment(session?.state ?? "BUSY", "WAITING_FOR_APPROVAL");
|
||||
await tx
|
||||
.update(tuiSessions)
|
||||
.set({ state: "WAITING_FOR_APPROVAL" })
|
||||
@@ -1614,7 +1763,7 @@ async function startSessionAndRecord(
|
||||
cwd: input.worktreeRoot,
|
||||
expectedArtifactPath: input.expectedArtifactPath,
|
||||
expectedSchema: input.expectedSchema,
|
||||
state: "BOOTSTRAPPING",
|
||||
state: "CREATED",
|
||||
})
|
||||
.onConflictDoNothing({ target: tuiSessions.id });
|
||||
await eventRepository.appendInTransaction(tx, {
|
||||
@@ -1624,10 +1773,12 @@ async function startSessionAndRecord(
|
||||
payload: { sessionId: startedHandle.sessionId, roleId: input.roleId, backend: "fake" },
|
||||
idempotencyKey: `session.created:${startedHandle.sessionId}`,
|
||||
});
|
||||
assertSessionTransition("CREATED", "BOOTSTRAPPING");
|
||||
await tx
|
||||
.update(tuiSessions)
|
||||
.set({ state: "BOOTSTRAPPING" })
|
||||
.where(eq(tuiSessions.id, startedHandle.sessionId));
|
||||
assertSessionTransition("BOOTSTRAPPING", "READY");
|
||||
await tx
|
||||
.update(tuiSessions)
|
||||
.set({ state: "READY" })
|
||||
@@ -1751,6 +1902,11 @@ async function resumeExistingSessionAndRecord(
|
||||
}
|
||||
await input.db.transaction(async (tx) => {
|
||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||
const [currentSession] = await tx
|
||||
.select({ state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.id, session.id));
|
||||
assertSessionStateAssignment(currentSession?.state ?? session.state, "READY");
|
||||
await tx
|
||||
.update(tuiSessions)
|
||||
.set({
|
||||
@@ -1873,6 +2029,11 @@ async function sendPromptAndRecord(
|
||||
: await artifactSignature(input.expectedArtifactPath);
|
||||
await input.db.transaction(async (tx) => {
|
||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||
const [currentSession] = await tx
|
||||
.select({ state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.id, handle.sessionId));
|
||||
assertSessionTransition(currentSession?.state ?? "READY", "BUSY");
|
||||
await tx
|
||||
.update(tuiSessions)
|
||||
.set({
|
||||
@@ -1972,19 +2133,7 @@ async function sendPromptWithRetry(
|
||||
handle: { sessionId: string },
|
||||
envelope: PromptEnvelope,
|
||||
): Promise<{ promptId: string }> {
|
||||
let lastError: unknown;
|
||||
for (let physicalAttempt = 0; physicalAttempt <= sendPromptRetryBudget; physicalAttempt += 1) {
|
||||
try {
|
||||
return await sessions.sendPrompt(handle, envelope);
|
||||
} catch (error) {
|
||||
lastError = error;
|
||||
if (!(error instanceof DevflowError) || error.class !== "recoverable") {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
throw lastError;
|
||||
return retryRecoverable("sendPrompt", () => sessions.sendPrompt(handle, envelope));
|
||||
}
|
||||
|
||||
interface ArtifactOutcome {
|
||||
@@ -1994,6 +2143,13 @@ interface ArtifactOutcome {
|
||||
validation: ReturnType<typeof validateArtifact>;
|
||||
}
|
||||
|
||||
/**
 * Builds the recovery hint reported when an artifact is still invalid.
 *
 * @param input - Supplies the expected artifact path.
 * @param outcome - Supplies the id of the artifact that failed validation.
 * @returns A string of the form `artifact=<id>;path=<expectedArtifactPath>`.
 */
function artifactInvalidRecoveryHint(
  input: CanonicalRunSingleFakePhaseInput,
  outcome: ArtifactOutcome,
): string {
  return `artifact=${outcome.artifact.id};path=${input.expectedArtifactPath}`;
}
|
||||
|
||||
interface ArtifactRecord {
|
||||
id: string;
|
||||
phaseId: string | null;
|
||||
@@ -2020,7 +2176,8 @@ async function waitForAndValidateArtifact(
|
||||
if (!isDevflowErrorWithCode(error, "artifact_timeout_exhausted")) {
|
||||
throw error;
|
||||
}
|
||||
await recordArtifactTimeout(input, eventRepository, attempt, sessionId);
|
||||
const timedOutSessionState = await classifyTimedOutSession(input, sessionId);
|
||||
await recordArtifactTimeout(input, eventRepository, attempt, sessionId, timedOutSessionState);
|
||||
throw error;
|
||||
}
|
||||
|
||||
@@ -2087,6 +2244,7 @@ async function recordArtifactTimeout(
|
||||
eventRepository: RunEventRepository,
|
||||
attempt: number,
|
||||
sessionId: string,
|
||||
sessionState: "ARTIFACT_TIMEOUT" | "HUNG" | "CRASHED",
|
||||
) {
|
||||
await input.db.transaction(async (tx) => {
|
||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||
@@ -2098,16 +2256,60 @@ async function recordArtifactTimeout(
|
||||
path: input.expectedArtifactPath,
|
||||
schemaId: input.expectedSchema,
|
||||
attempt,
|
||||
sessionState,
|
||||
},
|
||||
idempotencyKey: `artifact.timeout:${input.phaseId}:${attempt}:${input.expectedArtifactPath}`,
|
||||
});
|
||||
await tx
|
||||
.update(tuiSessions)
|
||||
.set({ state: "ARTIFACT_TIMEOUT" })
|
||||
const [currentSession] = await tx
|
||||
.select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
const currentState = currentSession?.state ?? "BUSY";
|
||||
assertSessionStateAssignment(currentState, sessionState);
|
||||
if (currentState === sessionState) {
|
||||
return;
|
||||
}
|
||||
if (sessionState === "CRASHED") {
|
||||
const recoveryAttempts = (currentSession?.recoveryAttempts ?? 0) + 1;
|
||||
await tx
|
||||
.update(tuiSessions)
|
||||
.set({ recoveryAttempts, state: "CRASHED" })
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
await eventRepository.appendInTransaction(tx, {
|
||||
runId: input.runId,
|
||||
phaseId: input.phaseId,
|
||||
type: "session.crashed",
|
||||
payload: { sessionId, roleId: input.roleId, recoveryAttempts },
|
||||
idempotencyKey: `session.crashed:${sessionId}:${recoveryAttempts}`,
|
||||
});
|
||||
return;
|
||||
}
|
||||
await tx.update(tuiSessions).set({ state: sessionState }).where(eq(tuiSessions.id, sessionId));
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Probes a session whose artifact wait timed out and picks the state to record.
 *
 * - `CRASHED` when the probe reports the session is not alive or its pane is
 *   not active.
 * - `HUNG` when `isSessionHung` says so for the probe's `lastOutputAt`, the
 *   current time, and the optional `input.recovery.maxHungMs`.
 * - `ARTIFACT_TIMEOUT` otherwise, including when the probe fails with a
 *   `recoverable` DevflowError.
 *
 * @param input - Supplies the session runtime and optional recovery tuning.
 * @param sessionId - Session to probe.
 * @returns The classification to store on the session.
 * @throws Rethrows any probe failure that is not a `recoverable` DevflowError.
 */
async function classifyTimedOutSession(
  input: CanonicalRunSingleFakePhaseInput,
  sessionId: string,
): Promise<"ARTIFACT_TIMEOUT" | "HUNG" | "CRASHED"> {
  try {
    const probe = await probeWithTypedError(input.sessions, { sessionId });
    if (!probe.alive || !probe.paneActive) {
      return "CRASHED";
    }
    return isSessionHung(probe.lastOutputAt, new Date(), input.recovery?.maxHungMs)
      ? "HUNG"
      : "ARTIFACT_TIMEOUT";
  } catch (error) {
    // A transient probe failure should not be promoted to a crash classification,
    // but fatal/unclassified probe failures must still fail the run.
    if (error instanceof DevflowError && error.class === "recoverable") {
      return "ARTIFACT_TIMEOUT";
    }
    throw error;
  }
}
|
||||
|
||||
async function recordArtifactValidation(
|
||||
input: CanonicalRunSingleFakePhaseInput,
|
||||
eventRepository: RunEventRepository,
|
||||
@@ -2232,6 +2434,11 @@ async function markSessionIdle(
|
||||
) {
|
||||
await input.db.transaction(async (tx) => {
|
||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||
const [currentSession] = await tx
|
||||
.select({ state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
assertSessionStateAssignment(currentSession?.state ?? "BUSY", "READY");
|
||||
await tx.update(tuiSessions).set({ state: "READY" }).where(eq(tuiSessions.id, sessionId));
|
||||
await eventRepository.appendInTransaction(tx, {
|
||||
runId: input.runId,
|
||||
@@ -2266,26 +2473,36 @@ async function recoverFromArtifactTimeout(
|
||||
input: CanonicalRunSingleFakePhaseInput,
|
||||
eventRepository: RunEventRepository,
|
||||
sessionId: string,
|
||||
): Promise<boolean> {
|
||||
const probe = await probeWithTypedError(input.sessions, { sessionId });
|
||||
if (!probe.alive || !probe.paneActive || isBackendReadinessUnknown(probe)) {
|
||||
return false;
|
||||
): Promise<ArtifactTimeoutRecoveryResult> {
|
||||
const currentState = await sessionState(input, sessionId);
|
||||
if (currentState === "READY") {
|
||||
return { recovered: true };
|
||||
}
|
||||
await setSessionStateIfRunActive(input, sessionId, "RESUMING");
|
||||
|
||||
const rebootstrapOk = await rebootstrapWithRetry(input.sessions, { sessionId });
|
||||
if (!rebootstrapOk) {
|
||||
return false;
|
||||
if (!["CRASHED", "RESUMING", "REBOOTSTRAPPED"].includes(currentState ?? "")) {
|
||||
const probe = await probeWithTypedError(input.sessions, { sessionId });
|
||||
if (isBackendReadinessUnknown(probe)) {
|
||||
return {
|
||||
recovered: false,
|
||||
recoveryHint: recoveryHintForProbe(probe),
|
||||
};
|
||||
}
|
||||
}
|
||||
if (currentState !== "REBOOTSTRAPPED") {
|
||||
await setSessionStateIfRunActive(input, sessionId, "RESUMING");
|
||||
|
||||
await rebootstrapWithRetry(input.sessions, { sessionId });
|
||||
await setSessionStateIfRunActive(input, sessionId, "REBOOTSTRAPPED");
|
||||
}
|
||||
await setSessionStateIfRunActive(input, sessionId, "REBOOTSTRAPPED");
|
||||
|
||||
await input.db.transaction(async (tx) => {
|
||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||
const [session] = await tx
|
||||
.select({ recoveryAttempts: tuiSessions.recoveryAttempts })
|
||||
.select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
const recoveryAttempts = (session?.recoveryAttempts ?? 0) + 1;
|
||||
assertSessionTransition(session?.state ?? "REBOOTSTRAPPED", "READY");
|
||||
await tx
|
||||
.update(tuiSessions)
|
||||
.set({ state: "READY", recoveryAttempts })
|
||||
@@ -2298,7 +2515,30 @@ async function recoverFromArtifactTimeout(
|
||||
idempotencyKey: `session.recovered:${sessionId}:${recoveryAttempts}`,
|
||||
});
|
||||
});
|
||||
return true;
|
||||
return { recovered: true };
|
||||
}
|
||||
|
||||
/**
 * Outcome of an artifact-timeout recovery attempt.
 */
interface ArtifactTimeoutRecoveryResult {
  /** Whether the session was brought back to a usable state. */
  recovered: boolean;
  /**
   * Optional hint describing why recovery did not succeed. Callers fall back to
   * the expected artifact path when it is absent.
   */
  recoveryHint?: string;
}
|
||||
|
||||
/**
 * Derives a recovery hint from a probe result.
 *
 * @param probe - Result of probing the session.
 * @returns The probe's own `hint` when it is a non-empty string; otherwise a
 *   synthesized `probe_alive=<alive>;pane_active=<paneActive>` summary.
 */
function recoveryHintForProbe(probe: ProbeResult): string {
  if (probe.hint !== undefined && probe.hint.length > 0) {
    return probe.hint;
  }
  return `probe_alive=${probe.alive};pane_active=${probe.paneActive}`;
}
|
||||
|
||||
/**
 * Reads the persisted state of a session.
 *
 * The read goes through `input.db` directly rather than a transaction handle,
 * so it is a point-in-time snapshot.
 *
 * @param input - Supplies the database handle.
 * @param sessionId - Primary key of the session row.
 * @returns The stored state, or `undefined` when no row exists for the id.
 */
async function sessionState(
  input: CanonicalRunSingleFakePhaseInput,
  sessionId: string,
): Promise<string | undefined> {
  const [session] = await input.db
    .select({ state: tuiSessions.state })
    .from(tuiSessions)
    .where(eq(tuiSessions.id, sessionId));
  return session?.state;
}
|
||||
|
||||
function isBackendReadinessUnknown(probe: ProbeResult): boolean {
|
||||
@@ -2312,6 +2552,13 @@ async function setSessionStateIfRunActive(
|
||||
) {
|
||||
await input.db.transaction(async (tx) => {
|
||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||
const [session] = await tx
|
||||
.select({ state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
if (session !== undefined) {
|
||||
assertSessionStateAssignment(session.state, state);
|
||||
}
|
||||
await tx.update(tuiSessions).set({ state }).where(eq(tuiSessions.id, sessionId));
|
||||
});
|
||||
}
|
||||
@@ -2337,46 +2584,31 @@ async function probeWithTypedError(
|
||||
/**
 * Re-bootstraps a session through `retryRecoverable`.
 *
 * Retry policy is owned by `retryRecoverable` (imported from
 * `@devflow/session`); presumably it retries only `recoverable` failures —
 * confirm against that package.
 *
 * @param sessions - Session runtime that performs the rebootstrap.
 * @param handle - Identifies the session to rebootstrap.
 * @throws The DevflowError raised once retries are exhausted or a
 *   non-retryable failure occurs. Any non-DevflowError failure is wrapped in a
 *   `fatal` / `internal_state_corruption` DevflowError with the original as
 *   `cause`.
 */
async function rebootstrapWithRetry(
  sessions: SessionRuntime,
  handle: { sessionId: string },
): Promise<void> {
  try {
    await retryRecoverable("rebootstrap", async () => {
      await sessions.rebootstrap(handle);
    });
  } catch (error) {
    // Every DevflowError propagates unchanged, whatever its class.
    if (error instanceof DevflowError) {
      throw error;
    }
    throw new DevflowError("Unclassified rebootstrap failure", {
      class: "fatal",
      code: "internal_state_corruption",
      cause: error,
    });
  }
}
|
||||
|
||||
/**
 * Resumes a session through `retryRecoverable`.
 *
 * Retry policy is owned by `retryRecoverable` (imported from
 * `@devflow/session`); presumably it retries only `recoverable` failures —
 * confirm against that package.
 *
 * @param sessions - Session runtime that performs the resume.
 * @param handle - Identifies the session to resume.
 * @returns The handle produced by a successful resume.
 */
async function resumeWithRetry(
  sessions: SessionRuntime,
  handle: { sessionId: string },
): Promise<SessionHandle> {
  return retryRecoverable("resume", () => sessions.resume(handle));
}
|
||||
|
||||
async function markSessionFailedNeedsHuman(
|
||||
@@ -2384,6 +2616,13 @@ async function markSessionFailedNeedsHuman(
|
||||
eventRepository: RunEventRepository,
|
||||
sessionId: string,
|
||||
) {
|
||||
const [existingSession] = await input.db
|
||||
.select({ state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.id, sessionId));
|
||||
if (existingSession !== undefined) {
|
||||
assertSessionStateAssignment(existingSession.state, "FAILED_NEEDS_HUMAN");
|
||||
}
|
||||
await input.db
|
||||
.insert(tuiSessions)
|
||||
.values({
|
||||
@@ -2416,12 +2655,15 @@ async function markAllSessionsFailedInTransaction(
|
||||
runId: string,
|
||||
): Promise<string[]> {
|
||||
const sessions = await tx
|
||||
.select({ id: tuiSessions.id, roleId: tuiSessions.roleId })
|
||||
.select({ id: tuiSessions.id, roleId: tuiSessions.roleId, state: tuiSessions.state })
|
||||
.from(tuiSessions)
|
||||
.where(eq(tuiSessions.runId, runId));
|
||||
if (sessions.length === 0) {
|
||||
return [];
|
||||
}
|
||||
for (const session of sessions) {
|
||||
assertSessionStateAssignment(session.state, "FAILED_NEEDS_HUMAN");
|
||||
}
|
||||
|
||||
await tx
|
||||
.update(tuiSessions)
|
||||
@@ -2684,7 +2926,7 @@ function shouldCreateHumanGate(error: unknown): error is DevflowError {
|
||||
|
||||
function toHumanRequiredRecoveryError(error: DevflowError): DevflowError {
|
||||
if (error.class === "human_required") {
|
||||
return error;
|
||||
return ensureRecoveryHint(error);
|
||||
}
|
||||
|
||||
const options: ConstructorParameters<typeof DevflowError>[1] = {
|
||||
@@ -2708,7 +2950,7 @@ function toHumanRequiredRecoveryError(error: DevflowError): DevflowError {
|
||||
|
||||
function toArtifactTimeoutRecoveryGateError(error: DevflowError): DevflowError {
|
||||
if (error.class === "human_required") {
|
||||
return error;
|
||||
return ensureRecoveryHint(error);
|
||||
}
|
||||
|
||||
const options: ConstructorParameters<typeof DevflowError>[1] = {
|
||||
@@ -2727,6 +2969,26 @@ function toArtifactTimeoutRecoveryGateError(error: DevflowError): DevflowError {
|
||||
return new DevflowError("Artifact timeout recovery exhausted retry budget", options);
|
||||
}
|
||||
|
||||
/**
 * Guarantees that a DevflowError carries a non-empty recovery hint.
 *
 * @param error - Error to inspect.
 * @returns `error` itself when it already has a non-empty `recoveryHint`;
 *   otherwise a new DevflowError with the same message, class, code, and cause,
 *   whose `recoveryHint` is the error message. `runId` and `phaseId` are copied
 *   only when defined.
 */
function ensureRecoveryHint(error: DevflowError): DevflowError {
  if (error.recoveryHint !== undefined && error.recoveryHint.length > 0) {
    return error;
  }

  const options: ConstructorParameters<typeof DevflowError>[1] = {
    class: error.class,
    code: error.code,
    recoveryHint: error.message,
    cause: error.cause,
  };
  // Assigned conditionally so an absent id stays absent instead of becoming an
  // explicit `undefined` property.
  if (error.runId !== undefined) {
    options.runId = error.runId;
  }
  if (error.phaseId !== undefined) {
    options.phaseId = error.phaseId;
  }
  return new DevflowError(error.message, options);
}
|
||||
|
||||
async function removeStaleArtifact(input: CanonicalRunSingleFakePhaseInput): Promise<void> {
|
||||
try {
|
||||
await unlink(input.expectedArtifactPath);
|
||||
|
||||
Reference in New Issue
Block a user