feat: add tui recovery state machine
This commit is contained in:
@@ -4,3 +4,4 @@ LOG_LEVEL=info
|
|||||||
TEMPORAL_ADDRESS=localhost:7233
|
TEMPORAL_ADDRESS=localhost:7233
|
||||||
DEVFLOW_POSTGRES_PORT=55432
|
DEVFLOW_POSTGRES_PORT=55432
|
||||||
DEVFLOW_BACKENDS_JSON=[{"id":"fake","enabled":true}]
|
DEVFLOW_BACKENDS_JSON=[{"id":"fake","enabled":true}]
|
||||||
|
SESSION_MAX_HUNG_MS=1200000
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ export interface StartM4ApiOptions {
|
|||||||
sessionManager?: SessionManager;
|
sessionManager?: SessionManager;
|
||||||
runEngine?: RunEngine;
|
runEngine?: RunEngine;
|
||||||
maxConcurrentRuns?: number;
|
maxConcurrentRuns?: number;
|
||||||
|
sessionMaxHungMs?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface StartM4ApiResult {
|
export interface StartM4ApiResult {
|
||||||
@@ -50,6 +51,7 @@ export interface StartTemporalApiOptions {
|
|||||||
availableBackends?: readonly BackendConfig[];
|
availableBackends?: readonly BackendConfig[];
|
||||||
maxConcurrentRuns?: number;
|
maxConcurrentRuns?: number;
|
||||||
workspaceRoot?: string;
|
workspaceRoot?: string;
|
||||||
|
sessionMaxHungMs?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface StartTemporalApiResult {
|
export interface StartTemporalApiResult {
|
||||||
@@ -69,6 +71,7 @@ export async function startM4Api(options: StartM4ApiOptions = {}): Promise<Start
|
|||||||
const config = ownedClient || options.workspaceRoot === undefined ? getConfig() : undefined;
|
const config = ownedClient || options.workspaceRoot === undefined ? getConfig() : undefined;
|
||||||
const dbClient =
|
const dbClient =
|
||||||
options.dbClient ?? createDbClient(config?.DATABASE_URL ?? getConfig().DATABASE_URL);
|
options.dbClient ?? createDbClient(config?.DATABASE_URL ?? getConfig().DATABASE_URL);
|
||||||
|
const sessionMaxHungMs = options.sessionMaxHungMs ?? config?.SESSION_MAX_HUNG_MS;
|
||||||
const sessionManager =
|
const sessionManager =
|
||||||
options.sessionManager ??
|
options.sessionManager ??
|
||||||
new SessionManager({
|
new SessionManager({
|
||||||
@@ -90,6 +93,7 @@ export async function startM4Api(options: StartM4ApiOptions = {}): Promise<Start
|
|||||||
...(options.maxConcurrentRuns === undefined
|
...(options.maxConcurrentRuns === undefined
|
||||||
? {}
|
? {}
|
||||||
: { maxConcurrentRuns: options.maxConcurrentRuns }),
|
: { maxConcurrentRuns: options.maxConcurrentRuns }),
|
||||||
|
...(sessionMaxHungMs === undefined ? {} : { recovery: { maxHungMs: sessionMaxHungMs } }),
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -158,6 +162,7 @@ export async function startTemporalApi(
|
|||||||
const replayValidationBackends = options.availableBackends ?? config?.backends;
|
const replayValidationBackends = options.availableBackends ?? config?.backends;
|
||||||
const replayValidationMaxConcurrentRuns =
|
const replayValidationMaxConcurrentRuns =
|
||||||
options.maxConcurrentRuns ?? config?.MAX_CONCURRENT_RUNS;
|
options.maxConcurrentRuns ?? config?.MAX_CONCURRENT_RUNS;
|
||||||
|
const replayValidationSessionMaxHungMs = options.sessionMaxHungMs ?? config?.SESSION_MAX_HUNG_MS;
|
||||||
const replayValidationEngine = new DbRunEngine({
|
const replayValidationEngine = new DbRunEngine({
|
||||||
db: dbClient.db,
|
db: dbClient.db,
|
||||||
sessions: dbOnlySessionRuntime(),
|
sessions: dbOnlySessionRuntime(),
|
||||||
@@ -168,6 +173,9 @@ export async function startTemporalApi(
|
|||||||
...(replayValidationMaxConcurrentRuns === undefined
|
...(replayValidationMaxConcurrentRuns === undefined
|
||||||
? {}
|
? {}
|
||||||
: { maxConcurrentRuns: replayValidationMaxConcurrentRuns }),
|
: { maxConcurrentRuns: replayValidationMaxConcurrentRuns }),
|
||||||
|
...(replayValidationSessionMaxHungMs === undefined
|
||||||
|
? {}
|
||||||
|
: { recovery: { maxHungMs: replayValidationSessionMaxHungMs } }),
|
||||||
});
|
});
|
||||||
const engine = new TemporalRunEngine({
|
const engine = new TemporalRunEngine({
|
||||||
client: temporalClient,
|
client: temporalClient,
|
||||||
|
|||||||
@@ -110,6 +110,7 @@ describe("startWorker", () => {
|
|||||||
TEMPORAL_ADDRESS: "localhost:7233",
|
TEMPORAL_ADDRESS: "localhost:7233",
|
||||||
WORKSPACE_ROOT: worktreeRoot,
|
WORKSPACE_ROOT: worktreeRoot,
|
||||||
MAX_CONCURRENT_RUNS: 4,
|
MAX_CONCURRENT_RUNS: 4,
|
||||||
|
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
|
||||||
backends: [{ id: "fake", enabled: true }],
|
backends: [{ id: "fake", enabled: true }],
|
||||||
},
|
},
|
||||||
dbClient: client,
|
dbClient: client,
|
||||||
@@ -148,6 +149,7 @@ describe("startWorker", () => {
|
|||||||
TEMPORAL_ADDRESS: "localhost:7233",
|
TEMPORAL_ADDRESS: "localhost:7233",
|
||||||
WORKSPACE_ROOT: realpathSync(mkdtempSync(join(tmpdir(), "devflow-worker-workspace-"))),
|
WORKSPACE_ROOT: realpathSync(mkdtempSync(join(tmpdir(), "devflow-worker-workspace-"))),
|
||||||
MAX_CONCURRENT_RUNS: 4,
|
MAX_CONCURRENT_RUNS: 4,
|
||||||
|
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
|
||||||
backends: [{ id: "fake", enabled: true }],
|
backends: [{ id: "fake", enabled: true }],
|
||||||
},
|
},
|
||||||
dbClient: client,
|
dbClient: client,
|
||||||
@@ -165,6 +167,7 @@ describe("startWorker", () => {
|
|||||||
TEMPORAL_ADDRESS: "localhost:7233",
|
TEMPORAL_ADDRESS: "localhost:7233",
|
||||||
WORKSPACE_ROOT: realpathSync(mkdtempSync(join(tmpdir(), "devflow-worker-workspace-"))),
|
WORKSPACE_ROOT: realpathSync(mkdtempSync(join(tmpdir(), "devflow-worker-workspace-"))),
|
||||||
MAX_CONCURRENT_RUNS: 4,
|
MAX_CONCURRENT_RUNS: 4,
|
||||||
|
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
|
||||||
backends: [{ id: "fake", enabled: true }],
|
backends: [{ id: "fake", enabled: true }],
|
||||||
},
|
},
|
||||||
dbClient: client,
|
dbClient: client,
|
||||||
@@ -191,6 +194,7 @@ describe("startWorker", () => {
|
|||||||
TEMPORAL_ADDRESS: "localhost:7233",
|
TEMPORAL_ADDRESS: "localhost:7233",
|
||||||
WORKSPACE_ROOT: workspaceRoot,
|
WORKSPACE_ROOT: workspaceRoot,
|
||||||
MAX_CONCURRENT_RUNS: 4,
|
MAX_CONCURRENT_RUNS: 4,
|
||||||
|
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
|
||||||
backends: [{ id: "fake", enabled: true }],
|
backends: [{ id: "fake", enabled: true }],
|
||||||
},
|
},
|
||||||
dbClient: client,
|
dbClient: client,
|
||||||
@@ -211,6 +215,7 @@ describe("startWorker", () => {
|
|||||||
TEMPORAL_ADDRESS: "localhost:7233",
|
TEMPORAL_ADDRESS: "localhost:7233",
|
||||||
WORKSPACE_ROOT: workspaceRoot,
|
WORKSPACE_ROOT: workspaceRoot,
|
||||||
MAX_CONCURRENT_RUNS: 4,
|
MAX_CONCURRENT_RUNS: 4,
|
||||||
|
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
|
||||||
backends: [{ id: "fake", enabled: true }],
|
backends: [{ id: "fake", enabled: true }],
|
||||||
},
|
},
|
||||||
dbClient: client,
|
dbClient: client,
|
||||||
@@ -233,6 +238,7 @@ describe("startWorker", () => {
|
|||||||
TEMPORAL_ADDRESS: "localhost:7233",
|
TEMPORAL_ADDRESS: "localhost:7233",
|
||||||
WORKSPACE_ROOT: workspaceRoot,
|
WORKSPACE_ROOT: workspaceRoot,
|
||||||
MAX_CONCURRENT_RUNS: 4,
|
MAX_CONCURRENT_RUNS: 4,
|
||||||
|
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
|
||||||
backends: [{ id: "fake", enabled: true }],
|
backends: [{ id: "fake", enabled: true }],
|
||||||
},
|
},
|
||||||
dbClient: client,
|
dbClient: client,
|
||||||
@@ -251,6 +257,7 @@ describe("startWorker", () => {
|
|||||||
TEMPORAL_ADDRESS: "localhost:7233",
|
TEMPORAL_ADDRESS: "localhost:7233",
|
||||||
WORKSPACE_ROOT: workspaceRoot,
|
WORKSPACE_ROOT: workspaceRoot,
|
||||||
MAX_CONCURRENT_RUNS: 4,
|
MAX_CONCURRENT_RUNS: 4,
|
||||||
|
SESSION_MAX_HUNG_MS: 20 * 60 * 1000,
|
||||||
backends: [{ id: "fake", enabled: true }],
|
backends: [{ id: "fake", enabled: true }],
|
||||||
},
|
},
|
||||||
dbClient: client,
|
dbClient: client,
|
||||||
|
|||||||
@@ -51,6 +51,7 @@ export async function startWorker(options: StartWorkerOptions = {}) {
|
|||||||
workspaceRoot: config.WORKSPACE_ROOT,
|
workspaceRoot: config.WORKSPACE_ROOT,
|
||||||
availableBackends: config.backends,
|
availableBackends: config.backends,
|
||||||
maxConcurrentRuns: config.MAX_CONCURRENT_RUNS,
|
maxConcurrentRuns: config.MAX_CONCURRENT_RUNS,
|
||||||
|
recovery: { maxHungMs: config.SESSION_MAX_HUNG_MS },
|
||||||
}),
|
}),
|
||||||
connection: connection as NativeConnection,
|
connection: connection as NativeConnection,
|
||||||
namespace: "devflow",
|
namespace: "devflow",
|
||||||
|
|||||||
@@ -50,6 +50,26 @@ describe("config loader", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
expect(config.backends).toContainEqual({ id: "fake", enabled: true });
|
expect(config.backends).toContainEqual({ id: "fake", enabled: true });
|
||||||
|
expect(config.SESSION_MAX_HUNG_MS).toBe(20 * 60 * 1000);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("loads configurable session hung timeout", () => {
|
||||||
|
const root = mkdtempSync(join(tmpdir(), "devflow-config-"));
|
||||||
|
const workspace = join(root, "workspace");
|
||||||
|
mkdirSync(workspace);
|
||||||
|
|
||||||
|
const config = loadConfigFromSources({
|
||||||
|
cwd: root,
|
||||||
|
env: {
|
||||||
|
DATABASE_URL: "postgres://devflow:devflow@localhost:5432/devflow",
|
||||||
|
WORKSPACE_ROOT: workspace,
|
||||||
|
LOG_LEVEL: "info",
|
||||||
|
TEMPORAL_ADDRESS: "localhost:7233",
|
||||||
|
SESSION_MAX_HUNG_MS: "2500",
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(config.SESSION_MAX_HUNG_MS).toBe(2500);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("resolves backend binaries from PATH during config load", () => {
|
it("resolves backend binaries from PATH during config load", () => {
|
||||||
|
|||||||
@@ -22,6 +22,11 @@ const RawConfigSchema = z.object({
|
|||||||
LOG_LEVEL: LogLevel,
|
LOG_LEVEL: LogLevel,
|
||||||
TEMPORAL_ADDRESS: z.string().min(1),
|
TEMPORAL_ADDRESS: z.string().min(1),
|
||||||
MAX_CONCURRENT_RUNS: z.coerce.number().int().positive().default(4),
|
MAX_CONCURRENT_RUNS: z.coerce.number().int().positive().default(4),
|
||||||
|
SESSION_MAX_HUNG_MS: z.coerce
|
||||||
|
.number()
|
||||||
|
.int()
|
||||||
|
.positive()
|
||||||
|
.default(20 * 60 * 1000),
|
||||||
backends: z.array(BackendConfig).default([{ id: "fake", enabled: true }]),
|
backends: z.array(BackendConfig).default([{ id: "fake", enabled: true }]),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ import {
|
|||||||
} from "@devflow/db";
|
} from "@devflow/db";
|
||||||
import {
|
import {
|
||||||
FakeSessionAdapter,
|
FakeSessionAdapter,
|
||||||
|
type ProbeResult,
|
||||||
type SessionAdapter,
|
type SessionAdapter,
|
||||||
type SessionHandle,
|
type SessionHandle,
|
||||||
SessionManager,
|
SessionManager,
|
||||||
@@ -161,6 +162,16 @@ class CaptureFailsAfterDisposeFakeAdapter extends FakeSessionAdapter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class RecentlyHungProbeFakeAdapter extends FakeSessionAdapter {
|
||||||
|
override async probe(handle: SessionHandle): Promise<ProbeResult> {
|
||||||
|
const result = await super.probe(handle);
|
||||||
|
if (!result.alive || !result.paneActive) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return { ...result, lastOutputAt: new Date(Date.now() - 5) };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class TerminalHandleRecordingRuntime implements SessionRuntime {
|
class TerminalHandleRecordingRuntime implements SessionRuntime {
|
||||||
readonly adapter = new FakeSessionAdapter({ writeDelayMs: 0 });
|
readonly adapter = new FakeSessionAdapter({ writeDelayMs: 0 });
|
||||||
readonly captureHandles: SessionHandle[] = [];
|
readonly captureHandles: SessionHandle[] = [];
|
||||||
@@ -480,6 +491,43 @@ describe("DbRunEngine", () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("passes configured session hung timeout into phase recovery", async () => {
|
||||||
|
client = createDbClient(databaseUrl);
|
||||||
|
await seedDevelopmentRegistry(client.db);
|
||||||
|
const workspaceRoot = realpathSync(mkdtempSync(join(tmpdir(), "devflow-engine-workspace-")));
|
||||||
|
const repoPath = createGitRepo();
|
||||||
|
tempRoots.push(workspaceRoot, repoPath);
|
||||||
|
const engine = new DbRunEngine({
|
||||||
|
db: client.db,
|
||||||
|
sessions: sessionRuntime(client.db, new RecentlyHungProbeFakeAdapter({ writeDelayMs: 0 })),
|
||||||
|
maxConcurrentRuns: 100,
|
||||||
|
recovery: { maxHungMs: 1 },
|
||||||
|
workspaceRoot,
|
||||||
|
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
|
||||||
|
});
|
||||||
|
|
||||||
|
const { runId } = await engine.startRun({
|
||||||
|
requirementsMd: "Classify timeout recovery using the configured hung threshold.",
|
||||||
|
repoPath,
|
||||||
|
baseBranch: "main",
|
||||||
|
scenarios: {
|
||||||
|
spec: { scenario: "timeout", repairScenario: "ok" },
|
||||||
|
},
|
||||||
|
});
|
||||||
|
runIds.push(runId);
|
||||||
|
|
||||||
|
const status = await engine.getStatus(runId);
|
||||||
|
expect(status.run.state).toBe("awaiting_approval");
|
||||||
|
expect(pendingApproval(status, "spec_approved")).toBeDefined();
|
||||||
|
|
||||||
|
const [timeoutEvent] = await client.db
|
||||||
|
.select({ payload: runEvents.payload })
|
||||||
|
.from(runEvents)
|
||||||
|
.where(and(eq(runEvents.runId, runId), eq(runEvents.type, "artifact.timeout")))
|
||||||
|
.limit(1);
|
||||||
|
expect(timeoutEvent?.payload).toMatchObject({ sessionState: "HUNG" });
|
||||||
|
});
|
||||||
|
|
||||||
it("validates a prepared run replay without accepting changed start inputs", async () => {
|
it("validates a prepared run replay without accepting changed start inputs", async () => {
|
||||||
client = createDbClient(databaseUrl);
|
client = createDbClient(databaseUrl);
|
||||||
await seedDevelopmentRegistry(client.db);
|
await seedDevelopmentRegistry(client.db);
|
||||||
|
|||||||
@@ -96,6 +96,9 @@ export interface DbRunEngineOptions {
|
|||||||
workspaceRoot: string;
|
workspaceRoot: string;
|
||||||
availableBackends?: readonly BackendConfig[];
|
availableBackends?: readonly BackendConfig[];
|
||||||
maxConcurrentRuns?: number;
|
maxConcurrentRuns?: number;
|
||||||
|
recovery?: {
|
||||||
|
maxHungMs?: number;
|
||||||
|
};
|
||||||
wait?: {
|
wait?: {
|
||||||
timeoutMs?: number;
|
timeoutMs?: number;
|
||||||
pollIntervalMs?: number;
|
pollIntervalMs?: number;
|
||||||
@@ -179,6 +182,7 @@ export class DbRunEngine implements RunEngine {
|
|||||||
private readonly workspaceRoot: string;
|
private readonly workspaceRoot: string;
|
||||||
private readonly availableBackends: readonly BackendConfig[];
|
private readonly availableBackends: readonly BackendConfig[];
|
||||||
private readonly maxConcurrentRuns: number;
|
private readonly maxConcurrentRuns: number;
|
||||||
|
private readonly recovery: DbRunEngineOptions["recovery"];
|
||||||
private readonly wait: DbRunEngineOptions["wait"];
|
private readonly wait: DbRunEngineOptions["wait"];
|
||||||
|
|
||||||
constructor(options: DbRunEngineOptions) {
|
constructor(options: DbRunEngineOptions) {
|
||||||
@@ -189,6 +193,7 @@ export class DbRunEngine implements RunEngine {
|
|||||||
{ id: "fake", enabled: true, binaryPath: undefined },
|
{ id: "fake", enabled: true, binaryPath: undefined },
|
||||||
];
|
];
|
||||||
this.maxConcurrentRuns = options.maxConcurrentRuns ?? 4;
|
this.maxConcurrentRuns = options.maxConcurrentRuns ?? 4;
|
||||||
|
this.recovery = options.recovery;
|
||||||
this.wait = options.wait;
|
this.wait = options.wait;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -911,6 +916,7 @@ export class DbRunEngine implements RunEngine {
|
|||||||
context.input.requirementsMd,
|
context.input.requirementsMd,
|
||||||
scenarioForPhase(context.input.extra, phaseRow.phaseKey),
|
scenarioForPhase(context.input.extra, phaseRow.phaseKey),
|
||||||
),
|
),
|
||||||
|
...(this.recovery === undefined ? {} : { recovery: this.recovery }),
|
||||||
...(wait === undefined ? {} : { wait }),
|
...(wait === undefined ? {} : { wait }),
|
||||||
terminalRun: false,
|
terminalRun: false,
|
||||||
...(workflowApprovalGateKey === undefined
|
...(workflowApprovalGateKey === undefined
|
||||||
|
|||||||
@@ -55,6 +55,19 @@ class RebootstrapFailsOnceFakeAdapter extends FakeSessionAdapter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class RebootstrapAlwaysRecoverableFakeAdapter extends FakeSessionAdapter {
|
||||||
|
rebootstrapAttempts = 0;
|
||||||
|
|
||||||
|
override async rebootstrap(_handle: SessionHandle): Promise<SessionHandle> {
|
||||||
|
this.rebootstrapAttempts += 1;
|
||||||
|
throw new DevflowError("rebootstrap retry budget exhausted", {
|
||||||
|
class: "recoverable",
|
||||||
|
code: "pane_briefly_unresponsive",
|
||||||
|
recoveryHint: "pane did not become responsive after rebootstrap",
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class RebootstrapWritesStaleArtifactFakeAdapter extends FakeSessionAdapter {
|
class RebootstrapWritesStaleArtifactFakeAdapter extends FakeSessionAdapter {
|
||||||
private expectedArtifactPath: string | undefined;
|
private expectedArtifactPath: string | undefined;
|
||||||
|
|
||||||
@@ -98,6 +111,15 @@ class RebootstrapHumanRequiredFakeAdapter extends FakeSessionAdapter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class RebootstrapCountingFakeAdapter extends FakeSessionAdapter {
|
||||||
|
rebootstrapAttempts = 0;
|
||||||
|
|
||||||
|
override async rebootstrap(handle: SessionHandle): Promise<SessionHandle> {
|
||||||
|
this.rebootstrapAttempts += 1;
|
||||||
|
return super.rebootstrap(handle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class ProbeRecoverableFakeAdapter extends FakeSessionAdapter {
|
class ProbeRecoverableFakeAdapter extends FakeSessionAdapter {
|
||||||
override async probe(_handle: SessionHandle): Promise<ProbeResult> {
|
override async probe(_handle: SessionHandle): Promise<ProbeResult> {
|
||||||
throw new DevflowError("recoverable probe failure", {
|
throw new DevflowError("recoverable probe failure", {
|
||||||
@@ -113,6 +135,18 @@ class ProbeUnknownFailureFakeAdapter extends FakeSessionAdapter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class ProbeFailsOnceThenRecoversFakeAdapter extends FakeSessionAdapter {
|
||||||
|
private probeAttempts = 0;
|
||||||
|
|
||||||
|
override async probe(handle: SessionHandle): Promise<ProbeResult> {
|
||||||
|
this.probeAttempts += 1;
|
||||||
|
if (this.probeAttempts === 1) {
|
||||||
|
throw new Error("first probe failed unexpectedly");
|
||||||
|
}
|
||||||
|
return super.probe(handle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class TmuxLivenessOnlyProbeFakeAdapter extends FakeSessionAdapter {
|
class TmuxLivenessOnlyProbeFakeAdapter extends FakeSessionAdapter {
|
||||||
rebootstrapAttempts = 0;
|
rebootstrapAttempts = 0;
|
||||||
|
|
||||||
@@ -126,6 +160,19 @@ class TmuxLivenessOnlyProbeFakeAdapter extends FakeSessionAdapter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class DeadPaneProbeFakeAdapter extends FakeSessionAdapter {
|
||||||
|
rebootstrapAttempts = 0;
|
||||||
|
|
||||||
|
override async probe(_handle: SessionHandle): Promise<ProbeResult> {
|
||||||
|
return { alive: false, paneActive: false, hint: "pane exited after prompt" };
|
||||||
|
}
|
||||||
|
|
||||||
|
override async rebootstrap(handle: SessionHandle): Promise<SessionHandle> {
|
||||||
|
this.rebootstrapAttempts += 1;
|
||||||
|
return super.rebootstrap(handle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class BreakArtifactParentFakeAdapter extends FakeSessionAdapter {
|
class BreakArtifactParentFakeAdapter extends FakeSessionAdapter {
|
||||||
override async sendPrompt(
|
override async sendPrompt(
|
||||||
handle: SessionHandle,
|
handle: SessionHandle,
|
||||||
@@ -1685,6 +1732,72 @@ describe("runSingleFakePhase", () => {
|
|||||||
.from(approvalRequests)
|
.from(approvalRequests)
|
||||||
.where(eq(approvalRequests.runId, runId));
|
.where(eq(approvalRequests.runId, runId));
|
||||||
expect(approval).toEqual({ gateKey: "prompt_send_exhausted", state: "pending" });
|
expect(approval).toEqual({ gateKey: "prompt_send_exhausted", state: "pending" });
|
||||||
|
|
||||||
|
const [session] = await db
|
||||||
|
.select({ state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
expect(session).toEqual({ state: "FAILED_NEEDS_HUMAN" });
|
||||||
|
|
||||||
|
const events = await db
|
||||||
|
.select({ type: runEvents.type })
|
||||||
|
.from(runEvents)
|
||||||
|
.where(eq(runEvents.runId, runId))
|
||||||
|
.orderBy(runEvents.seq);
|
||||||
|
expect(events.map((event) => event.type)).not.toContain("session.crashed");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("does not emit a duplicate crash event when a crashed session resume exhausts retries", async () => {
|
||||||
|
const { db, phaseId, runId } = await createRunAndPhase();
|
||||||
|
const worktreeRoot = realpathSync(
|
||||||
|
mkdtempSync(join(tmpdir(), "devflow-fake-phase-crashed-resume-fails-")),
|
||||||
|
);
|
||||||
|
tempRoots.push(worktreeRoot);
|
||||||
|
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
|
||||||
|
const sessionId = randomUUID();
|
||||||
|
await db.insert(tuiSessions).values({
|
||||||
|
id: sessionId,
|
||||||
|
runId,
|
||||||
|
roleId: "implementer",
|
||||||
|
backend: "fake",
|
||||||
|
cwd: worktreeRoot,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
recoveryAttempts: 1,
|
||||||
|
state: "CRASHED",
|
||||||
|
});
|
||||||
|
const adapter = new ResumeFailsFakeAdapter();
|
||||||
|
|
||||||
|
await expect(
|
||||||
|
runSingleFakePhase({
|
||||||
|
adapter,
|
||||||
|
db,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions: "Scenario: ok\nCrashed session resume still fails.",
|
||||||
|
phaseId,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
worktreeRoot,
|
||||||
|
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 500 },
|
||||||
|
uuidFactory: () => "00000000-0000-4000-8000-000000000045",
|
||||||
|
}),
|
||||||
|
).rejects.toMatchObject({ code: "prompt_send_exhausted" });
|
||||||
|
|
||||||
|
expect(adapter.resumeAttempts).toBe(3);
|
||||||
|
const [session] = await db
|
||||||
|
.select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
expect(session).toEqual({ recoveryAttempts: 1, state: "FAILED_NEEDS_HUMAN" });
|
||||||
|
|
||||||
|
const events = await db
|
||||||
|
.select({ type: runEvents.type })
|
||||||
|
.from(runEvents)
|
||||||
|
.where(eq(runEvents.runId, runId))
|
||||||
|
.orderBy(runEvents.seq);
|
||||||
|
expect(events.map((event) => event.type)).not.toContain("session.crashed");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("resumes a running phase when the crash happened before session creation", async () => {
|
it("resumes a running phase when the crash happened before session creation", async () => {
|
||||||
@@ -2358,10 +2471,20 @@ describe("runSingleFakePhase", () => {
|
|||||||
await expectRunPaused(db, runId);
|
await expectRunPaused(db, runId);
|
||||||
|
|
||||||
const [approval] = await db
|
const [approval] = await db
|
||||||
.select({ gateKey: approvalRequests.gateKey, state: approvalRequests.state })
|
.select({
|
||||||
|
gateKey: approvalRequests.gateKey,
|
||||||
|
payload: approvalRequests.payload,
|
||||||
|
state: approvalRequests.state,
|
||||||
|
})
|
||||||
.from(approvalRequests)
|
.from(approvalRequests)
|
||||||
.where(eq(approvalRequests.runId, runId));
|
.where(eq(approvalRequests.runId, runId));
|
||||||
expect(approval).toEqual({ gateKey: "artifact_invalid_after_repair", state: "pending" });
|
expect(approval).toEqual({
|
||||||
|
gateKey: "artifact_invalid_after_repair",
|
||||||
|
payload: expect.objectContaining({
|
||||||
|
recoveryHint: expect.stringContaining(expectedArtifactPath),
|
||||||
|
}),
|
||||||
|
state: "pending",
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
it("revalidates an artifact file when validating replay has no artifact row yet", async () => {
|
it("revalidates an artifact file when validating replay has no artifact row yet", async () => {
|
||||||
@@ -3068,6 +3191,51 @@ describe("runSingleFakePhase", () => {
|
|||||||
await expectRunCompleted(db, runId);
|
await expectRunCompleted(db, runId);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("preserves rebootstrap exhaustion recovery hints in the human gate", async () => {
|
||||||
|
const { db, phaseId, runId } = await createRunAndPhase();
|
||||||
|
const worktreeRoot = realpathSync(
|
||||||
|
mkdtempSync(join(tmpdir(), "devflow-fake-phase-rebootstrap-exhausted-")),
|
||||||
|
);
|
||||||
|
tempRoots.push(worktreeRoot);
|
||||||
|
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
|
||||||
|
const sessionId = randomUUID();
|
||||||
|
const adapter = new RebootstrapAlwaysRecoverableFakeAdapter({
|
||||||
|
sessionIdFactory: () => sessionId,
|
||||||
|
writeDelayMs: 0,
|
||||||
|
});
|
||||||
|
|
||||||
|
await expect(
|
||||||
|
runSingleFakePhase({
|
||||||
|
adapter,
|
||||||
|
db,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions: "Scenario: timeout\nRepair-Scenario: ok\nRebootstrap never recovers.",
|
||||||
|
phaseId,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
worktreeRoot,
|
||||||
|
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
|
||||||
|
uuidFactory: () => "00000000-0000-4000-8000-000000000041",
|
||||||
|
}),
|
||||||
|
).rejects.toMatchObject({ code: "artifact_timeout_exhausted" });
|
||||||
|
|
||||||
|
expect(adapter.rebootstrapAttempts).toBe(2);
|
||||||
|
await expectRunPaused(db, runId);
|
||||||
|
|
||||||
|
const [approval] = await db
|
||||||
|
.select({ gateKey: approvalRequests.gateKey, payload: approvalRequests.payload })
|
||||||
|
.from(approvalRequests)
|
||||||
|
.where(eq(approvalRequests.runId, runId));
|
||||||
|
expect(approval).toEqual({
|
||||||
|
gateKey: "artifact_timeout_exhausted",
|
||||||
|
payload: expect.objectContaining({
|
||||||
|
recoveryHint: "pane did not become responsive after rebootstrap",
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
it("fails the run instead of gating when timeout recovery rebootstrap is fatal", async () => {
|
it("fails the run instead of gating when timeout recovery rebootstrap is fatal", async () => {
|
||||||
const { db, phaseId, runId } = await createRunAndPhase();
|
const { db, phaseId, runId } = await createRunAndPhase();
|
||||||
const worktreeRoot = realpathSync(
|
const worktreeRoot = realpathSync(
|
||||||
@@ -3178,10 +3346,18 @@ describe("runSingleFakePhase", () => {
|
|||||||
await expectRunPaused(db, runId);
|
await expectRunPaused(db, runId);
|
||||||
|
|
||||||
const [approval] = await db
|
const [approval] = await db
|
||||||
.select({ gateKey: approvalRequests.gateKey, state: approvalRequests.state })
|
.select({
|
||||||
|
gateKey: approvalRequests.gateKey,
|
||||||
|
payload: approvalRequests.payload,
|
||||||
|
state: approvalRequests.state,
|
||||||
|
})
|
||||||
.from(approvalRequests)
|
.from(approvalRequests)
|
||||||
.where(eq(approvalRequests.runId, runId));
|
.where(eq(approvalRequests.runId, runId));
|
||||||
expect(approval).toEqual({ gateKey: "backend_unavailable", state: "pending" });
|
expect(approval).toEqual({
|
||||||
|
gateKey: "backend_unavailable",
|
||||||
|
payload: expect.objectContaining({ recoveryHint: "human rebootstrap failure" }),
|
||||||
|
state: "pending",
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
it("requests a human gate when timeout recovery probe fails recoverably", async () => {
|
it("requests a human gate when timeout recovery probe fails recoverably", async () => {
|
||||||
@@ -3232,6 +3408,13 @@ describe("runSingleFakePhase", () => {
|
|||||||
.from(approvalRequests)
|
.from(approvalRequests)
|
||||||
.where(eq(approvalRequests.runId, runId));
|
.where(eq(approvalRequests.runId, runId));
|
||||||
expect(approval).toEqual({ gateKey: "artifact_timeout_exhausted", state: "pending" });
|
expect(approval).toEqual({ gateKey: "artifact_timeout_exhausted", state: "pending" });
|
||||||
|
|
||||||
|
const events = await db
|
||||||
|
.select({ type: runEvents.type })
|
||||||
|
.from(runEvents)
|
||||||
|
.where(eq(runEvents.runId, runId))
|
||||||
|
.orderBy(runEvents.seq);
|
||||||
|
expect(events.map((event) => event.type)).not.toContain("session.crashed");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("does not rebootstrap when tmux probe only proves pane liveness", async () => {
|
it("does not rebootstrap when tmux probe only proves pane liveness", async () => {
|
||||||
@@ -3274,6 +3457,508 @@ describe("runSingleFakePhase", () => {
|
|||||||
expect(approval).toEqual({ gateKey: "artifact_timeout_exhausted", state: "pending" });
|
expect(approval).toEqual({ gateKey: "artifact_timeout_exhausted", state: "pending" });
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("recovers a post-prompt crashed session through rebootstrap", async () => {
|
||||||
|
const { db, phaseId, runId } = await createRunAndPhase();
|
||||||
|
const worktreeRoot = realpathSync(
|
||||||
|
mkdtempSync(join(tmpdir(), "devflow-fake-phase-dead-pane-timeout-")),
|
||||||
|
);
|
||||||
|
tempRoots.push(worktreeRoot);
|
||||||
|
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
|
||||||
|
const sessionId = randomUUID();
|
||||||
|
const adapter = new DeadPaneProbeFakeAdapter({
|
||||||
|
sessionIdFactory: () => sessionId,
|
||||||
|
writeDelayMs: 0,
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await runSingleFakePhase({
|
||||||
|
adapter,
|
||||||
|
db,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions: "Scenario: timeout\nRepair-Scenario: ok\nPane exits after accepting prompt.",
|
||||||
|
phaseId,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
worktreeRoot,
|
||||||
|
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
|
||||||
|
uuidFactory: () => "00000000-0000-4000-8000-000000000047",
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.artifactValid).toBe(true);
|
||||||
|
expect(adapter.rebootstrapAttempts).toBe(1);
|
||||||
|
await expectRunCompleted(db, runId);
|
||||||
|
|
||||||
|
const [session] = await db
|
||||||
|
.select({ state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
expect(session).toEqual({ state: "READY" });
|
||||||
|
|
||||||
|
const events = await db
|
||||||
|
.select({ type: runEvents.type })
|
||||||
|
.from(runEvents)
|
||||||
|
.where(eq(runEvents.runId, runId))
|
||||||
|
.orderBy(runEvents.seq);
|
||||||
|
expect(events.map((event) => event.type)).toEqual(
|
||||||
|
expect.arrayContaining(["session.crashed", "session.recovered", "artifact.validated"]),
|
||||||
|
);
|
||||||
|
expect(events.map((event) => event.type)).not.toContain("approval.requested");
|
||||||
|
});
|
||||||
|
|
||||||
|
it.each(["ARTIFACT_TIMEOUT", "HUNG"] as const)(
|
||||||
|
"recovers a dead pane when replay starts from %s",
|
||||||
|
async (sessionState) => {
|
||||||
|
const { db, phaseId, runId } = await createRunAndPhase("executing", "awaiting_artifact", 1);
|
||||||
|
const worktreeRoot = realpathSync(
|
||||||
|
mkdtempSync(
|
||||||
|
join(tmpdir(), `devflow-fake-phase-dead-${sessionState.toLowerCase()}-replay-`),
|
||||||
|
),
|
||||||
|
);
|
||||||
|
tempRoots.push(worktreeRoot);
|
||||||
|
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
|
||||||
|
const sessionId = randomUUID();
|
||||||
|
const instructions = `Scenario: ok\nReplay dead pane from ${sessionState}.`;
|
||||||
|
const dedupKey = hash({
|
||||||
|
attempt: 1,
|
||||||
|
expectedArtifact: expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
await new RunEventRepository(db).append({
|
||||||
|
idempotencyKey: `phase.started:${phaseId}:1`,
|
||||||
|
phaseId,
|
||||||
|
payload: { attempt: 1, phaseKey: "implement" },
|
||||||
|
runId,
|
||||||
|
type: "phase.started",
|
||||||
|
});
|
||||||
|
await db.insert(tuiSessions).values({
|
||||||
|
id: sessionId,
|
||||||
|
runId,
|
||||||
|
roleId: "implementer",
|
||||||
|
backend: "fake",
|
||||||
|
cwd: worktreeRoot,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
lastPromptHash: dedupKey,
|
||||||
|
lastPromptAt: new Date("2026-05-13T00:00:00.000Z"),
|
||||||
|
state: sessionState,
|
||||||
|
});
|
||||||
|
const adapter = new DeadPaneProbeFakeAdapter({
|
||||||
|
sessionIdFactory: () => sessionId,
|
||||||
|
writeDelayMs: 0,
|
||||||
|
});
|
||||||
|
await adapter.start({
|
||||||
|
sessionId,
|
||||||
|
runId,
|
||||||
|
roleId: "implementer",
|
||||||
|
backend: "fake",
|
||||||
|
cwd: worktreeRoot,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await runSingleFakePhase({
|
||||||
|
adapter,
|
||||||
|
db,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions,
|
||||||
|
phaseId,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
worktreeRoot,
|
||||||
|
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 500 },
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.artifactValid).toBe(true);
|
||||||
|
expect(adapter.rebootstrapAttempts).toBe(1);
|
||||||
|
|
||||||
|
const events = await db
|
||||||
|
.select({ type: runEvents.type })
|
||||||
|
.from(runEvents)
|
||||||
|
.where(eq(runEvents.runId, runId))
|
||||||
|
.orderBy(runEvents.seq);
|
||||||
|
expect(events.map((event) => event.type)).toContain("session.recovered");
|
||||||
|
expect(events.map((event) => event.type)).toContain("artifact.validated");
|
||||||
|
expect(events.map((event) => event.type)).not.toContain("approval.requested");
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
it("gates awaiting-artifact replay when the persisted session is already timed out", async () => {
|
||||||
|
const { db, phaseId, runId } = await createRunAndPhase("executing", "awaiting_artifact", 1);
|
||||||
|
const worktreeRoot = realpathSync(
|
||||||
|
mkdtempSync(join(tmpdir(), "devflow-fake-phase-timeout-replay-")),
|
||||||
|
);
|
||||||
|
tempRoots.push(worktreeRoot);
|
||||||
|
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
|
||||||
|
mkdirSync(dirname(expectedArtifactPath), { recursive: true });
|
||||||
|
writeFileSync(
|
||||||
|
expectedArtifactPath,
|
||||||
|
JSON.stringify({
|
||||||
|
summary: "Late artifact",
|
||||||
|
requirements: [{ id: "REQ-1", description: "Do not accept stale timeout output" }],
|
||||||
|
acceptanceCriteria: ["Gate instead of skipping recovery"],
|
||||||
|
risks: [],
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
const sessionId = randomUUID();
|
||||||
|
const instructions = "Scenario: ok\nReplay a timed-out phase.";
|
||||||
|
const dedupKey = hash({
|
||||||
|
attempt: 1,
|
||||||
|
expectedArtifact: expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
await new RunEventRepository(db).append({
|
||||||
|
idempotencyKey: `phase.started:${phaseId}:1`,
|
||||||
|
phaseId,
|
||||||
|
payload: { attempt: 1, phaseKey: "implement" },
|
||||||
|
runId,
|
||||||
|
type: "phase.started",
|
||||||
|
});
|
||||||
|
await db.insert(tuiSessions).values({
|
||||||
|
id: sessionId,
|
||||||
|
runId,
|
||||||
|
roleId: "implementer",
|
||||||
|
backend: "fake",
|
||||||
|
cwd: worktreeRoot,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
lastPromptHash: dedupKey,
|
||||||
|
lastPromptAt: new Date("2026-05-13T00:00:00.000Z"),
|
||||||
|
state: "ARTIFACT_TIMEOUT",
|
||||||
|
});
|
||||||
|
|
||||||
|
await expect(
|
||||||
|
runSingleFakePhase({
|
||||||
|
adapter: new FakeSessionAdapter({ sessionIdFactory: () => sessionId, writeDelayMs: 0 }),
|
||||||
|
db,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions,
|
||||||
|
phaseId,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
worktreeRoot,
|
||||||
|
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
|
||||||
|
uuidFactory: () => "00000000-0000-4000-8000-000000000042",
|
||||||
|
}),
|
||||||
|
).rejects.toMatchObject({ code: "artifact_timeout_exhausted" });
|
||||||
|
|
||||||
|
await expectRunPaused(db, runId);
|
||||||
|
const [session] = await db
|
||||||
|
.select({ state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
expect(session).toEqual({ state: "FAILED_NEEDS_HUMAN" });
|
||||||
|
|
||||||
|
const events = await db
|
||||||
|
.select({ type: runEvents.type })
|
||||||
|
.from(runEvents)
|
||||||
|
.where(eq(runEvents.runId, runId))
|
||||||
|
.orderBy(runEvents.seq);
|
||||||
|
expect(events.map((event) => event.type)).not.toContain("session.crashed");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("continues timeout recovery replay when the session was already resuming", async () => {
|
||||||
|
const { db, phaseId, runId } = await createRunAndPhase("executing", "awaiting_artifact", 1);
|
||||||
|
const worktreeRoot = realpathSync(
|
||||||
|
mkdtempSync(join(tmpdir(), "devflow-fake-phase-resuming-timeout-replay-")),
|
||||||
|
);
|
||||||
|
tempRoots.push(worktreeRoot);
|
||||||
|
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
|
||||||
|
mkdirSync(dirname(expectedArtifactPath), { recursive: true });
|
||||||
|
writeFileSync(
|
||||||
|
expectedArtifactPath,
|
||||||
|
JSON.stringify({
|
||||||
|
summary: "Late artifact while resuming",
|
||||||
|
requirements: [{ id: "REQ-1", description: "Do not accept stale resuming output" }],
|
||||||
|
acceptanceCriteria: ["Continue recovery and request fresh repair output"],
|
||||||
|
risks: [],
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
const sessionId = randomUUID();
|
||||||
|
const instructions = "Scenario: ok\nReplay an already-resuming timeout recovery.";
|
||||||
|
const dedupKey = hash({
|
||||||
|
attempt: 1,
|
||||||
|
expectedArtifact: expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
await new RunEventRepository(db).append({
|
||||||
|
idempotencyKey: `phase.started:${phaseId}:1`,
|
||||||
|
phaseId,
|
||||||
|
payload: { attempt: 1, phaseKey: "implement" },
|
||||||
|
runId,
|
||||||
|
type: "phase.started",
|
||||||
|
});
|
||||||
|
await db.insert(tuiSessions).values({
|
||||||
|
id: sessionId,
|
||||||
|
runId,
|
||||||
|
roleId: "implementer",
|
||||||
|
backend: "fake",
|
||||||
|
cwd: worktreeRoot,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
lastPromptHash: dedupKey,
|
||||||
|
lastPromptAt: new Date("2026-05-13T00:00:00.000Z"),
|
||||||
|
state: "RESUMING",
|
||||||
|
});
|
||||||
|
const adapter = new RebootstrapCountingFakeAdapter({
|
||||||
|
sessionIdFactory: () => sessionId,
|
||||||
|
writeDelayMs: 0,
|
||||||
|
});
|
||||||
|
await adapter.start({
|
||||||
|
sessionId,
|
||||||
|
runId,
|
||||||
|
roleId: "implementer",
|
||||||
|
backend: "fake",
|
||||||
|
cwd: worktreeRoot,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await runSingleFakePhase({
|
||||||
|
adapter,
|
||||||
|
db,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions,
|
||||||
|
phaseId,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
worktreeRoot,
|
||||||
|
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 500 },
|
||||||
|
uuidFactory: () => "00000000-0000-4000-8000-000000000046",
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.artifactValid).toBe(true);
|
||||||
|
expect(adapter.rebootstrapAttempts).toBe(1);
|
||||||
|
const artifactRows = await db
|
||||||
|
.select({ id: artifacts.id, valid: artifacts.valid })
|
||||||
|
.from(artifacts)
|
||||||
|
.where(eq(artifacts.runId, runId));
|
||||||
|
expect(artifactRows).toHaveLength(1);
|
||||||
|
expect(artifactRows[0]?.valid).toBe(true);
|
||||||
|
|
||||||
|
const [phase] = await db
|
||||||
|
.select({ attempts: runPhases.attempts, state: runPhases.state })
|
||||||
|
.from(runPhases)
|
||||||
|
.where(eq(runPhases.id, phaseId));
|
||||||
|
expect(phase).toEqual({ attempts: 2, state: "completed" });
|
||||||
|
|
||||||
|
const events = await db
|
||||||
|
.select({ type: runEvents.type })
|
||||||
|
.from(runEvents)
|
||||||
|
.where(eq(runEvents.runId, runId))
|
||||||
|
.orderBy(runEvents.seq);
|
||||||
|
expect(events.map((event) => event.type)).toContain("session.recovered");
|
||||||
|
expect(events.map((event) => event.type)).toContain("artifact.validated");
|
||||||
|
});
|
||||||
|
|
||||||
|
it.each([
|
||||||
|
["HUNG", 1],
|
||||||
|
["CRASHED", 1],
|
||||||
|
["REBOOTSTRAPPED", 0],
|
||||||
|
] as const)(
|
||||||
|
"continues timeout recovery replay when the session is %s",
|
||||||
|
async (sessionState, expectedRebootstrapAttempts) => {
|
||||||
|
const { db, phaseId, runId } = await createRunAndPhase("executing", "awaiting_artifact", 1);
|
||||||
|
const worktreeRoot = realpathSync(
|
||||||
|
mkdtempSync(
|
||||||
|
join(tmpdir(), `devflow-fake-phase-${sessionState.toLowerCase()}-timeout-replay-`),
|
||||||
|
),
|
||||||
|
);
|
||||||
|
tempRoots.push(worktreeRoot);
|
||||||
|
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
|
||||||
|
mkdirSync(dirname(expectedArtifactPath), { recursive: true });
|
||||||
|
writeFileSync(
|
||||||
|
expectedArtifactPath,
|
||||||
|
JSON.stringify({
|
||||||
|
summary: `Late artifact while ${sessionState}`,
|
||||||
|
requirements: [{ id: "REQ-1", description: "Do not accept stale replay output" }],
|
||||||
|
acceptanceCriteria: ["Continue recovery and require fresh repair output"],
|
||||||
|
risks: [],
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
const sessionId = randomUUID();
|
||||||
|
const instructions = `Scenario: ok\nReplay ${sessionState} timeout recovery.`;
|
||||||
|
const dedupKey = hash({
|
||||||
|
attempt: 1,
|
||||||
|
expectedArtifact: expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
await new RunEventRepository(db).append({
|
||||||
|
idempotencyKey: `phase.started:${phaseId}:1`,
|
||||||
|
phaseId,
|
||||||
|
payload: { attempt: 1, phaseKey: "implement" },
|
||||||
|
runId,
|
||||||
|
type: "phase.started",
|
||||||
|
});
|
||||||
|
await db.insert(tuiSessions).values({
|
||||||
|
id: sessionId,
|
||||||
|
runId,
|
||||||
|
roleId: "implementer",
|
||||||
|
backend: "fake",
|
||||||
|
cwd: worktreeRoot,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
lastPromptHash: dedupKey,
|
||||||
|
lastPromptAt: new Date("2026-05-13T00:00:00.000Z"),
|
||||||
|
state: sessionState,
|
||||||
|
});
|
||||||
|
const adapter = new RebootstrapCountingFakeAdapter({
|
||||||
|
sessionIdFactory: () => sessionId,
|
||||||
|
writeDelayMs: 0,
|
||||||
|
});
|
||||||
|
await adapter.start({
|
||||||
|
sessionId,
|
||||||
|
runId,
|
||||||
|
roleId: "implementer",
|
||||||
|
backend: "fake",
|
||||||
|
cwd: worktreeRoot,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await runSingleFakePhase({
|
||||||
|
adapter,
|
||||||
|
db,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions,
|
||||||
|
phaseId,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
worktreeRoot,
|
||||||
|
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 500 },
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.artifactValid).toBe(true);
|
||||||
|
expect(adapter.rebootstrapAttempts).toBe(expectedRebootstrapAttempts);
|
||||||
|
|
||||||
|
const [phase] = await db
|
||||||
|
.select({ attempts: runPhases.attempts, state: runPhases.state })
|
||||||
|
.from(runPhases)
|
||||||
|
.where(eq(runPhases.id, phaseId));
|
||||||
|
expect(phase).toEqual({ attempts: 2, state: "completed" });
|
||||||
|
|
||||||
|
const events = await db
|
||||||
|
.select({ type: runEvents.type })
|
||||||
|
.from(runEvents)
|
||||||
|
.where(eq(runEvents.runId, runId))
|
||||||
|
.orderBy(runEvents.seq);
|
||||||
|
expect(events.map((event) => event.type)).toContain("session.recovered");
|
||||||
|
expect(events.map((event) => event.type)).toContain("artifact.validated");
|
||||||
|
expect(events.map((event) => event.type)).not.toContain("session.crashed");
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
it("does not accept a stale timeout artifact after session recovery replay", async () => {
|
||||||
|
const { db, phaseId, runId } = await createRunAndPhase("executing", "awaiting_artifact", 1);
|
||||||
|
const worktreeRoot = realpathSync(
|
||||||
|
mkdtempSync(join(tmpdir(), "devflow-fake-phase-recovered-timeout-replay-")),
|
||||||
|
);
|
||||||
|
tempRoots.push(worktreeRoot);
|
||||||
|
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
|
||||||
|
mkdirSync(dirname(expectedArtifactPath), { recursive: true });
|
||||||
|
writeFileSync(
|
||||||
|
expectedArtifactPath,
|
||||||
|
JSON.stringify({
|
||||||
|
summary: "Late artifact after recovery",
|
||||||
|
requirements: [{ id: "REQ-1", description: "Do not complete from stale output" }],
|
||||||
|
acceptanceCriteria: ["Require fresh repair output"],
|
||||||
|
risks: [],
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
const sessionId = randomUUID();
|
||||||
|
const instructions = "Scenario: ok\nReplay after recovered timeout.";
|
||||||
|
const dedupKey = hash({
|
||||||
|
attempt: 1,
|
||||||
|
expectedArtifact: expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
const repository = new RunEventRepository(db);
|
||||||
|
await repository.append({
|
||||||
|
idempotencyKey: `phase.started:${phaseId}:1`,
|
||||||
|
phaseId,
|
||||||
|
payload: { attempt: 1, phaseKey: "implement" },
|
||||||
|
runId,
|
||||||
|
type: "phase.started",
|
||||||
|
});
|
||||||
|
await repository.append({
|
||||||
|
idempotencyKey: `session.recovered:${sessionId}:1`,
|
||||||
|
phaseId,
|
||||||
|
payload: { recoveryAttempts: 1, roleId: "implementer", sessionId },
|
||||||
|
runId,
|
||||||
|
type: "session.recovered",
|
||||||
|
});
|
||||||
|
await db.insert(tuiSessions).values({
|
||||||
|
id: sessionId,
|
||||||
|
runId,
|
||||||
|
roleId: "implementer",
|
||||||
|
backend: "fake",
|
||||||
|
cwd: worktreeRoot,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
lastPromptHash: dedupKey,
|
||||||
|
lastPromptAt: new Date("2026-05-13T00:00:00.000Z"),
|
||||||
|
recoveryAttempts: 1,
|
||||||
|
state: "READY",
|
||||||
|
});
|
||||||
|
|
||||||
|
await expect(
|
||||||
|
runSingleFakePhase({
|
||||||
|
adapter: new FakeSessionAdapter({ sessionIdFactory: () => sessionId, writeDelayMs: 0 }),
|
||||||
|
db,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions,
|
||||||
|
phaseId,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
worktreeRoot,
|
||||||
|
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
|
||||||
|
uuidFactory: () => "00000000-0000-4000-8000-000000000043",
|
||||||
|
}),
|
||||||
|
).rejects.toMatchObject({ code: "prompt_send_exhausted" });
|
||||||
|
|
||||||
|
const artifactRows = await db
|
||||||
|
.select({ id: artifacts.id })
|
||||||
|
.from(artifacts)
|
||||||
|
.where(eq(artifacts.runId, runId));
|
||||||
|
expect(artifactRows).toEqual([]);
|
||||||
|
|
||||||
|
const [phase] = await db
|
||||||
|
.select({ attempts: runPhases.attempts, state: runPhases.state })
|
||||||
|
.from(runPhases)
|
||||||
|
.where(eq(runPhases.id, phaseId));
|
||||||
|
expect(phase).toEqual({ attempts: 2, state: "failed" });
|
||||||
|
});
|
||||||
|
|
||||||
it("fails the run when timeout recovery probe throws an unclassified error", async () => {
|
it("fails the run when timeout recovery probe throws an unclassified error", async () => {
|
||||||
const { db, phaseId, runId } = await createRunAndPhase();
|
const { db, phaseId, runId } = await createRunAndPhase();
|
||||||
const worktreeRoot = realpathSync(
|
const worktreeRoot = realpathSync(
|
||||||
@@ -3313,6 +3998,47 @@ describe("runSingleFakePhase", () => {
|
|||||||
expect(approvals).toEqual([]);
|
expect(approvals).toEqual([]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("does not swallow an unclassified timeout-classification probe failure when a later probe would recover", async () => {
|
||||||
|
const { db, phaseId, runId } = await createRunAndPhase();
|
||||||
|
const worktreeRoot = realpathSync(
|
||||||
|
mkdtempSync(join(tmpdir(), "devflow-fake-phase-probe-first-unknown-")),
|
||||||
|
);
|
||||||
|
tempRoots.push(worktreeRoot);
|
||||||
|
const expectedArtifactPath = join(worktreeRoot, "artifacts", "spec.json");
|
||||||
|
const sessionId = randomUUID();
|
||||||
|
|
||||||
|
await expect(
|
||||||
|
runSingleFakePhase({
|
||||||
|
adapter: new ProbeFailsOnceThenRecoversFakeAdapter({
|
||||||
|
sessionIdFactory: () => sessionId,
|
||||||
|
writeDelayMs: 0,
|
||||||
|
}),
|
||||||
|
db,
|
||||||
|
expectedArtifactPath,
|
||||||
|
expectedSchema: "dev/spec@1",
|
||||||
|
instructions:
|
||||||
|
"Scenario: timeout\nRepair-Scenario: ok\nFirst classification probe fails unexpectedly.",
|
||||||
|
phaseId,
|
||||||
|
phaseKey: "implement",
|
||||||
|
roleId: "implementer",
|
||||||
|
runId,
|
||||||
|
worktreeRoot,
|
||||||
|
wait: { pollIntervalMs: 1, stableMs: 0, timeoutMs: 10 },
|
||||||
|
uuidFactory: () => "00000000-0000-4000-8000-000000000038",
|
||||||
|
}),
|
||||||
|
).rejects.toMatchObject({ code: "internal_state_corruption" });
|
||||||
|
|
||||||
|
const [run] = await db.select({ state: runs.state }).from(runs).where(eq(runs.id, runId));
|
||||||
|
expect(run).toEqual({ state: "failed" });
|
||||||
|
|
||||||
|
const events = await db
|
||||||
|
.select({ type: runEvents.type })
|
||||||
|
.from(runEvents)
|
||||||
|
.where(eq(runEvents.runId, runId))
|
||||||
|
.orderBy(runEvents.seq);
|
||||||
|
expect(events.map((event) => event.type)).not.toContain("session.recovered");
|
||||||
|
});
|
||||||
|
|
||||||
it("does not let a stale artifact produced during timeout recovery satisfy repair validation", async () => {
|
it("does not let a stale artifact produced during timeout recovery satisfy repair validation", async () => {
|
||||||
const { db, phaseId, runId } = await createRunAndPhase();
|
const { db, phaseId, runId } = await createRunAndPhase();
|
||||||
const worktreeRoot = realpathSync(
|
const worktreeRoot = realpathSync(
|
||||||
|
|||||||
@@ -28,7 +28,12 @@ import {
|
|||||||
SessionManager,
|
SessionManager,
|
||||||
type SessionRuntime,
|
type SessionRuntime,
|
||||||
type TranscriptChunkSink,
|
type TranscriptChunkSink,
|
||||||
|
assertSessionStateAssignment,
|
||||||
|
assertSessionTransition,
|
||||||
captureAndPersistTranscript,
|
captureAndPersistTranscript,
|
||||||
|
isAllowedSessionTransition,
|
||||||
|
isSessionHung,
|
||||||
|
retryRecoverable,
|
||||||
} from "@devflow/session";
|
} from "@devflow/session";
|
||||||
import { and, desc, eq, inArray, sql } from "drizzle-orm";
|
import { and, desc, eq, inArray, sql } from "drizzle-orm";
|
||||||
|
|
||||||
@@ -44,6 +49,10 @@ interface ArtifactWaitOptions extends FakePhaseWaitOptions {
|
|||||||
ignoreInitialSignature?: string;
|
ignoreInitialSignature?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface FakePhaseRecoveryOptions {
|
||||||
|
maxHungMs?: number;
|
||||||
|
}
|
||||||
|
|
||||||
interface RunSingleFakePhaseBaseInput {
|
interface RunSingleFakePhaseBaseInput {
|
||||||
db: DbClient["db"];
|
db: DbClient["db"];
|
||||||
runId: string;
|
runId: string;
|
||||||
@@ -55,6 +64,7 @@ interface RunSingleFakePhaseBaseInput {
|
|||||||
expectedSchema: string;
|
expectedSchema: string;
|
||||||
instructions: string;
|
instructions: string;
|
||||||
wait?: FakePhaseWaitOptions;
|
wait?: FakePhaseWaitOptions;
|
||||||
|
recovery?: FakePhaseRecoveryOptions;
|
||||||
uuidFactory?: () => string;
|
uuidFactory?: () => string;
|
||||||
transcriptSink?: TranscriptChunkSink;
|
transcriptSink?: TranscriptChunkSink;
|
||||||
terminalRun?: boolean;
|
terminalRun?: boolean;
|
||||||
@@ -81,7 +91,6 @@ export interface RunSingleFakePhaseResult {
|
|||||||
|
|
||||||
type TransactionDb = Parameters<Parameters<RunSingleFakePhaseInput["db"]["transaction"]>[0]>[0];
|
type TransactionDb = Parameters<Parameters<RunSingleFakePhaseInput["db"]["transaction"]>[0]>[0];
|
||||||
|
|
||||||
const sendPromptRetryBudget = 2;
|
|
||||||
const terminalRunStates = ["completed", "failed", "aborted"] as const;
|
const terminalRunStates = ["completed", "failed", "aborted"] as const;
|
||||||
const phaseMutationRunStates = ["executing", "planning"] as const;
|
const phaseMutationRunStates = ["executing", "planning"] as const;
|
||||||
|
|
||||||
@@ -298,9 +307,9 @@ export async function runSingleFakePhase(
|
|||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
|
|
||||||
let recovered: boolean;
|
let recovery: ArtifactTimeoutRecoveryResult;
|
||||||
try {
|
try {
|
||||||
recovered = await recoverFromArtifactTimeout(input, eventRepository, handle.sessionId);
|
recovery = await recoverFromArtifactTimeout(input, eventRepository, handle.sessionId);
|
||||||
} catch (recoveryError) {
|
} catch (recoveryError) {
|
||||||
if (isRunStateChanged(recoveryError)) {
|
if (isRunStateChanged(recoveryError)) {
|
||||||
await captureTranscript(input, handle);
|
await captureTranscript(input, handle);
|
||||||
@@ -333,7 +342,7 @@ export async function runSingleFakePhase(
|
|||||||
);
|
);
|
||||||
throw recoveryError;
|
throw recoveryError;
|
||||||
}
|
}
|
||||||
if (!recovered) {
|
if (!recovery.recovered) {
|
||||||
await failPhaseAndRequestGate(
|
await failPhaseAndRequestGate(
|
||||||
input,
|
input,
|
||||||
eventRepository,
|
eventRepository,
|
||||||
@@ -342,9 +351,9 @@ export async function runSingleFakePhase(
|
|||||||
"artifact_timeout_exhausted",
|
"artifact_timeout_exhausted",
|
||||||
{
|
{
|
||||||
expectedArtifactPath: input.expectedArtifactPath,
|
expectedArtifactPath: input.expectedArtifactPath,
|
||||||
|
recoveryHint: recovery.recoveryHint ?? input.expectedArtifactPath,
|
||||||
},
|
},
|
||||||
handle.sessionId,
|
handle.sessionId,
|
||||||
{ markSessionCrashed: true },
|
|
||||||
);
|
);
|
||||||
await captureTranscript(input, handle);
|
await captureTranscript(input, handle);
|
||||||
throw error;
|
throw error;
|
||||||
@@ -358,6 +367,7 @@ export async function runSingleFakePhase(
|
|||||||
"artifact_timeout_exhausted",
|
"artifact_timeout_exhausted",
|
||||||
{
|
{
|
||||||
expectedArtifactPath: input.expectedArtifactPath,
|
expectedArtifactPath: input.expectedArtifactPath,
|
||||||
|
recoveryHint: input.expectedArtifactPath,
|
||||||
},
|
},
|
||||||
handle.sessionId,
|
handle.sessionId,
|
||||||
);
|
);
|
||||||
@@ -470,6 +480,7 @@ export async function runSingleFakePhase(
|
|||||||
"artifact_timeout_exhausted",
|
"artifact_timeout_exhausted",
|
||||||
{
|
{
|
||||||
expectedArtifactPath: input.expectedArtifactPath,
|
expectedArtifactPath: input.expectedArtifactPath,
|
||||||
|
recoveryHint: input.expectedArtifactPath,
|
||||||
},
|
},
|
||||||
handle.sessionId,
|
handle.sessionId,
|
||||||
);
|
);
|
||||||
@@ -511,6 +522,7 @@ export async function runSingleFakePhase(
|
|||||||
{
|
{
|
||||||
artifactId: outcome.artifact.id,
|
artifactId: outcome.artifact.id,
|
||||||
expectedArtifactPath: input.expectedArtifactPath,
|
expectedArtifactPath: input.expectedArtifactPath,
|
||||||
|
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
|
||||||
},
|
},
|
||||||
handle.sessionId,
|
handle.sessionId,
|
||||||
);
|
);
|
||||||
@@ -520,6 +532,7 @@ export async function runSingleFakePhase(
|
|||||||
code: "artifact_invalid_after_repair",
|
code: "artifact_invalid_after_repair",
|
||||||
runId: input.runId,
|
runId: input.runId,
|
||||||
phaseId: input.phaseId,
|
phaseId: input.phaseId,
|
||||||
|
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -621,6 +634,7 @@ export async function runSingleFakePhase(
|
|||||||
"artifact_timeout_exhausted",
|
"artifact_timeout_exhausted",
|
||||||
{
|
{
|
||||||
expectedArtifactPath: input.expectedArtifactPath,
|
expectedArtifactPath: input.expectedArtifactPath,
|
||||||
|
recoveryHint: input.expectedArtifactPath,
|
||||||
},
|
},
|
||||||
handle.sessionId,
|
handle.sessionId,
|
||||||
);
|
);
|
||||||
@@ -667,6 +681,7 @@ export async function runSingleFakePhase(
|
|||||||
{
|
{
|
||||||
artifactId: outcome.artifact.id,
|
artifactId: outcome.artifact.id,
|
||||||
expectedArtifactPath: input.expectedArtifactPath,
|
expectedArtifactPath: input.expectedArtifactPath,
|
||||||
|
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
|
||||||
},
|
},
|
||||||
handle.sessionId,
|
handle.sessionId,
|
||||||
);
|
);
|
||||||
@@ -676,6 +691,7 @@ export async function runSingleFakePhase(
|
|||||||
code: "artifact_invalid_after_repair",
|
code: "artifact_invalid_after_repair",
|
||||||
runId: input.runId,
|
runId: input.runId,
|
||||||
phaseId: input.phaseId,
|
phaseId: input.phaseId,
|
||||||
|
recoveryHint: artifactInvalidRecoveryHint(input, outcome),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -833,6 +849,18 @@ async function enterInitialPhase(
|
|||||||
})
|
})
|
||||||
.from(tuiSessions)
|
.from(tuiSessions)
|
||||||
.where(and(eq(tuiSessions.runId, input.runId), eq(tuiSessions.roleId, input.roleId)));
|
.where(and(eq(tuiSessions.runId, input.runId), eq(tuiSessions.roleId, input.roleId)));
|
||||||
|
if (
|
||||||
|
session !== undefined &&
|
||||||
|
(isTimeoutRecoverySessionState(session.state) ||
|
||||||
|
(session.state === "READY" && (await sessionRecoveredEventExists(input, session.id)))) &&
|
||||||
|
session.lastPromptHash === envelope.dedupKey &&
|
||||||
|
session.expectedArtifactPath === input.expectedArtifactPath &&
|
||||||
|
session.expectedSchema === input.expectedSchema
|
||||||
|
) {
|
||||||
|
return recoverAwaitingArtifactReplay(input, eventRepository, phase.attempts, session.id, {
|
||||||
|
repairAttemptUsed: phaseStart.repairAttemptUsed,
|
||||||
|
});
|
||||||
|
}
|
||||||
if (
|
if (
|
||||||
session !== undefined &&
|
session !== undefined &&
|
||||||
session.state !== "FAILED_NEEDS_HUMAN" &&
|
session.state !== "FAILED_NEEDS_HUMAN" &&
|
||||||
@@ -967,6 +995,112 @@ async function enterInitialPhase(
|
|||||||
throw cannotReplayPhase(input, phase.state);
|
throw cannotReplayPhase(input, phase.state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function recoverAwaitingArtifactReplay(
|
||||||
|
input: CanonicalRunSingleFakePhaseInput,
|
||||||
|
eventRepository: RunEventRepository,
|
||||||
|
attempt: number,
|
||||||
|
sessionId: string,
|
||||||
|
options: { repairAttemptUsed: boolean },
|
||||||
|
): Promise<PhaseEntry> {
|
||||||
|
let recovery: ArtifactTimeoutRecoveryResult;
|
||||||
|
try {
|
||||||
|
recovery = await recoverFromArtifactTimeout(input, eventRepository, sessionId);
|
||||||
|
} catch (recoveryError) {
|
||||||
|
if (isRunStateChanged(recoveryError)) {
|
||||||
|
throw recoveryError;
|
||||||
|
}
|
||||||
|
if (shouldCreateHumanGate(recoveryError)) {
|
||||||
|
const gateError = toArtifactTimeoutRecoveryGateError(recoveryError);
|
||||||
|
await failPhaseAndRequestGate(
|
||||||
|
input,
|
||||||
|
eventRepository,
|
||||||
|
attempt,
|
||||||
|
"artifact_timeout_recovery_failed",
|
||||||
|
gateError.code,
|
||||||
|
{
|
||||||
|
errorCode: recoveryError.code,
|
||||||
|
expectedArtifactPath: input.expectedArtifactPath,
|
||||||
|
recoveryHint: gateError.recoveryHint,
|
||||||
|
},
|
||||||
|
sessionId,
|
||||||
|
);
|
||||||
|
throw gateError;
|
||||||
|
}
|
||||||
|
await failPhaseAndRun(input, eventRepository, attempt, "artifact_timeout_recovery_failed");
|
||||||
|
throw recoveryError;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!recovery.recovered || options.repairAttemptUsed) {
|
||||||
|
await failPhaseAndRequestGate(
|
||||||
|
input,
|
||||||
|
eventRepository,
|
||||||
|
attempt,
|
||||||
|
"artifact_timeout",
|
||||||
|
"artifact_timeout_exhausted",
|
||||||
|
{
|
||||||
|
expectedArtifactPath: input.expectedArtifactPath,
|
||||||
|
recoveryHint: recovery.recoveryHint ?? input.expectedArtifactPath,
|
||||||
|
},
|
||||||
|
sessionId,
|
||||||
|
);
|
||||||
|
throw new DevflowError("Artifact timeout recovery exhausted retry budget", {
|
||||||
|
class: "human_required",
|
||||||
|
code: "artifact_timeout_exhausted",
|
||||||
|
runId: input.runId,
|
||||||
|
phaseId: input.phaseId,
|
||||||
|
recoveryHint: recovery.recoveryHint ?? input.expectedArtifactPath,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
const repairAttempt = await startPhaseAndRecord(input, eventRepository, ["awaiting_artifact"], {
|
||||||
|
reason: "artifact_timeout",
|
||||||
|
repair: true,
|
||||||
|
});
|
||||||
|
try {
|
||||||
|
await removeStaleArtifact(input);
|
||||||
|
} catch (error) {
|
||||||
|
await failPhaseAndRun(input, eventRepository, repairAttempt, "stale_artifact_remove_failed");
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
return {
|
||||||
|
attempt: repairAttempt,
|
||||||
|
continueArtifactWait: false,
|
||||||
|
continueValidation: false,
|
||||||
|
handle: { sessionId },
|
||||||
|
repairAttemptUsed: true,
|
||||||
|
resumedPrompt: false,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function isTimeoutRecoverySessionState(state: string): boolean {
|
||||||
|
return ["ARTIFACT_TIMEOUT", "HUNG", "CRASHED", "RESUMING", "REBOOTSTRAPPED"].includes(state);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function sessionRecoveredEventExists(
|
||||||
|
input: CanonicalRunSingleFakePhaseInput,
|
||||||
|
sessionId: string,
|
||||||
|
): Promise<boolean> {
|
||||||
|
const events = await input.db
|
||||||
|
.select({ payload: runEvents.payload })
|
||||||
|
.from(runEvents)
|
||||||
|
.where(
|
||||||
|
and(
|
||||||
|
eq(runEvents.runId, input.runId),
|
||||||
|
eq(runEvents.phaseId, input.phaseId),
|
||||||
|
eq(runEvents.type, "session.recovered"),
|
||||||
|
),
|
||||||
|
);
|
||||||
|
return events.some((event) => payloadSessionId(event.payload) === sessionId);
|
||||||
|
}
|
||||||
|
|
||||||
|
function payloadSessionId(payload: unknown): string | undefined {
|
||||||
|
if (payload === null || typeof payload !== "object") {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
const sessionId = (payload as Record<string, unknown>).sessionId;
|
||||||
|
return typeof sessionId === "string" ? sessionId : undefined;
|
||||||
|
}
|
||||||
|
|
||||||
function cannotReplayPhase(
|
function cannotReplayPhase(
|
||||||
input: CanonicalRunSingleFakePhaseInput,
|
input: CanonicalRunSingleFakePhaseInput,
|
||||||
phaseState: string,
|
phaseState: string,
|
||||||
@@ -1234,24 +1368,33 @@ async function failPhaseAndRequestGate(
|
|||||||
|
|
||||||
if (sessionId !== undefined && options.markSessionCrashed === true) {
|
if (sessionId !== undefined && options.markSessionCrashed === true) {
|
||||||
const [session] = await tx
|
const [session] = await tx
|
||||||
.select({ recoveryAttempts: tuiSessions.recoveryAttempts })
|
.select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
|
||||||
.from(tuiSessions)
|
.from(tuiSessions)
|
||||||
.where(eq(tuiSessions.id, sessionId));
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
const recoveryAttempts = (session?.recoveryAttempts ?? 0) + 1;
|
const recoveryAttempts = (session?.recoveryAttempts ?? 0) + 1;
|
||||||
await tx
|
if (session !== undefined && isAllowedSessionTransition(session.state, "CRASHED")) {
|
||||||
.update(tuiSessions)
|
await tx
|
||||||
.set({ state: "CRASHED", recoveryAttempts })
|
.update(tuiSessions)
|
||||||
.where(eq(tuiSessions.id, sessionId));
|
.set({ state: "CRASHED", recoveryAttempts })
|
||||||
await eventRepository.appendInTransaction(tx, {
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
runId: input.runId,
|
await eventRepository.appendInTransaction(tx, {
|
||||||
phaseId: input.phaseId,
|
runId: input.runId,
|
||||||
type: "session.crashed",
|
phaseId: input.phaseId,
|
||||||
payload: { sessionId, roleId: input.roleId, recoveryAttempts },
|
type: "session.crashed",
|
||||||
idempotencyKey: `session.crashed:${sessionId}:${recoveryAttempts}`,
|
payload: { sessionId, roleId: input.roleId, recoveryAttempts },
|
||||||
});
|
idempotencyKey: `session.crashed:${sessionId}:${recoveryAttempts}`,
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sessionId !== undefined) {
|
if (sessionId !== undefined) {
|
||||||
|
const [session] = await tx
|
||||||
|
.select({ state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
if (session !== undefined) {
|
||||||
|
assertSessionTransition(session.state, "FAILED_NEEDS_HUMAN");
|
||||||
|
}
|
||||||
await tx
|
await tx
|
||||||
.insert(tuiSessions)
|
.insert(tuiSessions)
|
||||||
.values({
|
.values({
|
||||||
@@ -1425,10 +1568,11 @@ async function completePhaseAndRun(
|
|||||||
});
|
});
|
||||||
|
|
||||||
const [session] = await tx
|
const [session] = await tx
|
||||||
.select({ recoveryAttempts: tuiSessions.recoveryAttempts })
|
.select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
|
||||||
.from(tuiSessions)
|
.from(tuiSessions)
|
||||||
.where(eq(tuiSessions.id, sessionId));
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
const recoveryAttempts = session?.recoveryAttempts ?? 0;
|
const recoveryAttempts = session?.recoveryAttempts ?? 0;
|
||||||
|
assertSessionStateAssignment(session?.state ?? "BUSY", "READY");
|
||||||
await tx.update(tuiSessions).set({ state: "READY" }).where(eq(tuiSessions.id, sessionId));
|
await tx.update(tuiSessions).set({ state: "READY" }).where(eq(tuiSessions.id, sessionId));
|
||||||
await eventRepository.appendInTransaction(tx, {
|
await eventRepository.appendInTransaction(tx, {
|
||||||
runId: input.runId,
|
runId: input.runId,
|
||||||
@@ -1484,6 +1628,11 @@ async function requestWorkflowApproval(
|
|||||||
.update(runPhases)
|
.update(runPhases)
|
||||||
.set({ state: "awaiting_approval" })
|
.set({ state: "awaiting_approval" })
|
||||||
.where(and(eq(runPhases.id, input.phaseId), eq(runPhases.runId, input.runId)));
|
.where(and(eq(runPhases.id, input.phaseId), eq(runPhases.runId, input.runId)));
|
||||||
|
const [session] = await tx
|
||||||
|
.select({ state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
assertSessionStateAssignment(session?.state ?? "BUSY", "WAITING_FOR_APPROVAL");
|
||||||
await tx
|
await tx
|
||||||
.update(tuiSessions)
|
.update(tuiSessions)
|
||||||
.set({ state: "WAITING_FOR_APPROVAL" })
|
.set({ state: "WAITING_FOR_APPROVAL" })
|
||||||
@@ -1614,7 +1763,7 @@ async function startSessionAndRecord(
|
|||||||
cwd: input.worktreeRoot,
|
cwd: input.worktreeRoot,
|
||||||
expectedArtifactPath: input.expectedArtifactPath,
|
expectedArtifactPath: input.expectedArtifactPath,
|
||||||
expectedSchema: input.expectedSchema,
|
expectedSchema: input.expectedSchema,
|
||||||
state: "BOOTSTRAPPING",
|
state: "CREATED",
|
||||||
})
|
})
|
||||||
.onConflictDoNothing({ target: tuiSessions.id });
|
.onConflictDoNothing({ target: tuiSessions.id });
|
||||||
await eventRepository.appendInTransaction(tx, {
|
await eventRepository.appendInTransaction(tx, {
|
||||||
@@ -1624,10 +1773,12 @@ async function startSessionAndRecord(
|
|||||||
payload: { sessionId: startedHandle.sessionId, roleId: input.roleId, backend: "fake" },
|
payload: { sessionId: startedHandle.sessionId, roleId: input.roleId, backend: "fake" },
|
||||||
idempotencyKey: `session.created:${startedHandle.sessionId}`,
|
idempotencyKey: `session.created:${startedHandle.sessionId}`,
|
||||||
});
|
});
|
||||||
|
assertSessionTransition("CREATED", "BOOTSTRAPPING");
|
||||||
await tx
|
await tx
|
||||||
.update(tuiSessions)
|
.update(tuiSessions)
|
||||||
.set({ state: "BOOTSTRAPPING" })
|
.set({ state: "BOOTSTRAPPING" })
|
||||||
.where(eq(tuiSessions.id, startedHandle.sessionId));
|
.where(eq(tuiSessions.id, startedHandle.sessionId));
|
||||||
|
assertSessionTransition("BOOTSTRAPPING", "READY");
|
||||||
await tx
|
await tx
|
||||||
.update(tuiSessions)
|
.update(tuiSessions)
|
||||||
.set({ state: "READY" })
|
.set({ state: "READY" })
|
||||||
@@ -1751,6 +1902,11 @@ async function resumeExistingSessionAndRecord(
|
|||||||
}
|
}
|
||||||
await input.db.transaction(async (tx) => {
|
await input.db.transaction(async (tx) => {
|
||||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||||
|
const [currentSession] = await tx
|
||||||
|
.select({ state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, session.id));
|
||||||
|
assertSessionStateAssignment(currentSession?.state ?? session.state, "READY");
|
||||||
await tx
|
await tx
|
||||||
.update(tuiSessions)
|
.update(tuiSessions)
|
||||||
.set({
|
.set({
|
||||||
@@ -1873,6 +2029,11 @@ async function sendPromptAndRecord(
|
|||||||
: await artifactSignature(input.expectedArtifactPath);
|
: await artifactSignature(input.expectedArtifactPath);
|
||||||
await input.db.transaction(async (tx) => {
|
await input.db.transaction(async (tx) => {
|
||||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||||
|
const [currentSession] = await tx
|
||||||
|
.select({ state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, handle.sessionId));
|
||||||
|
assertSessionTransition(currentSession?.state ?? "READY", "BUSY");
|
||||||
await tx
|
await tx
|
||||||
.update(tuiSessions)
|
.update(tuiSessions)
|
||||||
.set({
|
.set({
|
||||||
@@ -1972,19 +2133,7 @@ async function sendPromptWithRetry(
|
|||||||
handle: { sessionId: string },
|
handle: { sessionId: string },
|
||||||
envelope: PromptEnvelope,
|
envelope: PromptEnvelope,
|
||||||
): Promise<{ promptId: string }> {
|
): Promise<{ promptId: string }> {
|
||||||
let lastError: unknown;
|
return retryRecoverable("sendPrompt", () => sessions.sendPrompt(handle, envelope));
|
||||||
for (let physicalAttempt = 0; physicalAttempt <= sendPromptRetryBudget; physicalAttempt += 1) {
|
|
||||||
try {
|
|
||||||
return await sessions.sendPrompt(handle, envelope);
|
|
||||||
} catch (error) {
|
|
||||||
lastError = error;
|
|
||||||
if (!(error instanceof DevflowError) || error.class !== "recoverable") {
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
throw lastError;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
interface ArtifactOutcome {
|
interface ArtifactOutcome {
|
||||||
@@ -1994,6 +2143,13 @@ interface ArtifactOutcome {
|
|||||||
validation: ReturnType<typeof validateArtifact>;
|
validation: ReturnType<typeof validateArtifact>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function artifactInvalidRecoveryHint(
|
||||||
|
input: CanonicalRunSingleFakePhaseInput,
|
||||||
|
outcome: ArtifactOutcome,
|
||||||
|
): string {
|
||||||
|
return `artifact=${outcome.artifact.id};path=${input.expectedArtifactPath}`;
|
||||||
|
}
|
||||||
|
|
||||||
interface ArtifactRecord {
|
interface ArtifactRecord {
|
||||||
id: string;
|
id: string;
|
||||||
phaseId: string | null;
|
phaseId: string | null;
|
||||||
@@ -2020,7 +2176,8 @@ async function waitForAndValidateArtifact(
|
|||||||
if (!isDevflowErrorWithCode(error, "artifact_timeout_exhausted")) {
|
if (!isDevflowErrorWithCode(error, "artifact_timeout_exhausted")) {
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
await recordArtifactTimeout(input, eventRepository, attempt, sessionId);
|
const timedOutSessionState = await classifyTimedOutSession(input, sessionId);
|
||||||
|
await recordArtifactTimeout(input, eventRepository, attempt, sessionId, timedOutSessionState);
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2087,6 +2244,7 @@ async function recordArtifactTimeout(
|
|||||||
eventRepository: RunEventRepository,
|
eventRepository: RunEventRepository,
|
||||||
attempt: number,
|
attempt: number,
|
||||||
sessionId: string,
|
sessionId: string,
|
||||||
|
sessionState: "ARTIFACT_TIMEOUT" | "HUNG" | "CRASHED",
|
||||||
) {
|
) {
|
||||||
await input.db.transaction(async (tx) => {
|
await input.db.transaction(async (tx) => {
|
||||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||||
@@ -2098,16 +2256,60 @@ async function recordArtifactTimeout(
|
|||||||
path: input.expectedArtifactPath,
|
path: input.expectedArtifactPath,
|
||||||
schemaId: input.expectedSchema,
|
schemaId: input.expectedSchema,
|
||||||
attempt,
|
attempt,
|
||||||
|
sessionState,
|
||||||
},
|
},
|
||||||
idempotencyKey: `artifact.timeout:${input.phaseId}:${attempt}:${input.expectedArtifactPath}`,
|
idempotencyKey: `artifact.timeout:${input.phaseId}:${attempt}:${input.expectedArtifactPath}`,
|
||||||
});
|
});
|
||||||
await tx
|
const [currentSession] = await tx
|
||||||
.update(tuiSessions)
|
.select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
|
||||||
.set({ state: "ARTIFACT_TIMEOUT" })
|
.from(tuiSessions)
|
||||||
.where(eq(tuiSessions.id, sessionId));
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
const currentState = currentSession?.state ?? "BUSY";
|
||||||
|
assertSessionStateAssignment(currentState, sessionState);
|
||||||
|
if (currentState === sessionState) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (sessionState === "CRASHED") {
|
||||||
|
const recoveryAttempts = (currentSession?.recoveryAttempts ?? 0) + 1;
|
||||||
|
await tx
|
||||||
|
.update(tuiSessions)
|
||||||
|
.set({ recoveryAttempts, state: "CRASHED" })
|
||||||
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
await eventRepository.appendInTransaction(tx, {
|
||||||
|
runId: input.runId,
|
||||||
|
phaseId: input.phaseId,
|
||||||
|
type: "session.crashed",
|
||||||
|
payload: { sessionId, roleId: input.roleId, recoveryAttempts },
|
||||||
|
idempotencyKey: `session.crashed:${sessionId}:${recoveryAttempts}`,
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
await tx.update(tuiSessions).set({ state: sessionState }).where(eq(tuiSessions.id, sessionId));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function classifyTimedOutSession(
|
||||||
|
input: CanonicalRunSingleFakePhaseInput,
|
||||||
|
sessionId: string,
|
||||||
|
): Promise<"ARTIFACT_TIMEOUT" | "HUNG" | "CRASHED"> {
|
||||||
|
try {
|
||||||
|
const probe = await probeWithTypedError(input.sessions, { sessionId });
|
||||||
|
if (!probe.alive || !probe.paneActive) {
|
||||||
|
return "CRASHED";
|
||||||
|
}
|
||||||
|
return isSessionHung(probe.lastOutputAt, new Date(), input.recovery?.maxHungMs)
|
||||||
|
? "HUNG"
|
||||||
|
: "ARTIFACT_TIMEOUT";
|
||||||
|
} catch (error) {
|
||||||
|
// A transient probe failure should not be promoted to a crash classification,
|
||||||
|
// but fatal/unclassified probe failures must still fail the run.
|
||||||
|
if (error instanceof DevflowError && error.class === "recoverable") {
|
||||||
|
return "ARTIFACT_TIMEOUT";
|
||||||
|
}
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async function recordArtifactValidation(
|
async function recordArtifactValidation(
|
||||||
input: CanonicalRunSingleFakePhaseInput,
|
input: CanonicalRunSingleFakePhaseInput,
|
||||||
eventRepository: RunEventRepository,
|
eventRepository: RunEventRepository,
|
||||||
@@ -2232,6 +2434,11 @@ async function markSessionIdle(
|
|||||||
) {
|
) {
|
||||||
await input.db.transaction(async (tx) => {
|
await input.db.transaction(async (tx) => {
|
||||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||||
|
const [currentSession] = await tx
|
||||||
|
.select({ state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
assertSessionStateAssignment(currentSession?.state ?? "BUSY", "READY");
|
||||||
await tx.update(tuiSessions).set({ state: "READY" }).where(eq(tuiSessions.id, sessionId));
|
await tx.update(tuiSessions).set({ state: "READY" }).where(eq(tuiSessions.id, sessionId));
|
||||||
await eventRepository.appendInTransaction(tx, {
|
await eventRepository.appendInTransaction(tx, {
|
||||||
runId: input.runId,
|
runId: input.runId,
|
||||||
@@ -2266,26 +2473,36 @@ async function recoverFromArtifactTimeout(
|
|||||||
input: CanonicalRunSingleFakePhaseInput,
|
input: CanonicalRunSingleFakePhaseInput,
|
||||||
eventRepository: RunEventRepository,
|
eventRepository: RunEventRepository,
|
||||||
sessionId: string,
|
sessionId: string,
|
||||||
): Promise<boolean> {
|
): Promise<ArtifactTimeoutRecoveryResult> {
|
||||||
const probe = await probeWithTypedError(input.sessions, { sessionId });
|
const currentState = await sessionState(input, sessionId);
|
||||||
if (!probe.alive || !probe.paneActive || isBackendReadinessUnknown(probe)) {
|
if (currentState === "READY") {
|
||||||
return false;
|
return { recovered: true };
|
||||||
}
|
}
|
||||||
await setSessionStateIfRunActive(input, sessionId, "RESUMING");
|
|
||||||
|
|
||||||
const rebootstrapOk = await rebootstrapWithRetry(input.sessions, { sessionId });
|
if (!["CRASHED", "RESUMING", "REBOOTSTRAPPED"].includes(currentState ?? "")) {
|
||||||
if (!rebootstrapOk) {
|
const probe = await probeWithTypedError(input.sessions, { sessionId });
|
||||||
return false;
|
if (isBackendReadinessUnknown(probe)) {
|
||||||
|
return {
|
||||||
|
recovered: false,
|
||||||
|
recoveryHint: recoveryHintForProbe(probe),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (currentState !== "REBOOTSTRAPPED") {
|
||||||
|
await setSessionStateIfRunActive(input, sessionId, "RESUMING");
|
||||||
|
|
||||||
|
await rebootstrapWithRetry(input.sessions, { sessionId });
|
||||||
|
await setSessionStateIfRunActive(input, sessionId, "REBOOTSTRAPPED");
|
||||||
}
|
}
|
||||||
await setSessionStateIfRunActive(input, sessionId, "REBOOTSTRAPPED");
|
|
||||||
|
|
||||||
await input.db.transaction(async (tx) => {
|
await input.db.transaction(async (tx) => {
|
||||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||||
const [session] = await tx
|
const [session] = await tx
|
||||||
.select({ recoveryAttempts: tuiSessions.recoveryAttempts })
|
.select({ recoveryAttempts: tuiSessions.recoveryAttempts, state: tuiSessions.state })
|
||||||
.from(tuiSessions)
|
.from(tuiSessions)
|
||||||
.where(eq(tuiSessions.id, sessionId));
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
const recoveryAttempts = (session?.recoveryAttempts ?? 0) + 1;
|
const recoveryAttempts = (session?.recoveryAttempts ?? 0) + 1;
|
||||||
|
assertSessionTransition(session?.state ?? "REBOOTSTRAPPED", "READY");
|
||||||
await tx
|
await tx
|
||||||
.update(tuiSessions)
|
.update(tuiSessions)
|
||||||
.set({ state: "READY", recoveryAttempts })
|
.set({ state: "READY", recoveryAttempts })
|
||||||
@@ -2298,7 +2515,30 @@ async function recoverFromArtifactTimeout(
|
|||||||
idempotencyKey: `session.recovered:${sessionId}:${recoveryAttempts}`,
|
idempotencyKey: `session.recovered:${sessionId}:${recoveryAttempts}`,
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
return true;
|
return { recovered: true };
|
||||||
|
}
|
||||||
|
|
||||||
|
interface ArtifactTimeoutRecoveryResult {
|
||||||
|
recovered: boolean;
|
||||||
|
recoveryHint?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
function recoveryHintForProbe(probe: ProbeResult): string {
|
||||||
|
if (probe.hint !== undefined && probe.hint.length > 0) {
|
||||||
|
return probe.hint;
|
||||||
|
}
|
||||||
|
return `probe_alive=${probe.alive};pane_active=${probe.paneActive}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function sessionState(
|
||||||
|
input: CanonicalRunSingleFakePhaseInput,
|
||||||
|
sessionId: string,
|
||||||
|
): Promise<string | undefined> {
|
||||||
|
const [session] = await input.db
|
||||||
|
.select({ state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
return session?.state;
|
||||||
}
|
}
|
||||||
|
|
||||||
function isBackendReadinessUnknown(probe: ProbeResult): boolean {
|
function isBackendReadinessUnknown(probe: ProbeResult): boolean {
|
||||||
@@ -2312,6 +2552,13 @@ async function setSessionStateIfRunActive(
|
|||||||
) {
|
) {
|
||||||
await input.db.transaction(async (tx) => {
|
await input.db.transaction(async (tx) => {
|
||||||
await assertRunCanMutatePhaseInTransaction(input, tx);
|
await assertRunCanMutatePhaseInTransaction(input, tx);
|
||||||
|
const [session] = await tx
|
||||||
|
.select({ state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
if (session !== undefined) {
|
||||||
|
assertSessionStateAssignment(session.state, state);
|
||||||
|
}
|
||||||
await tx.update(tuiSessions).set({ state }).where(eq(tuiSessions.id, sessionId));
|
await tx.update(tuiSessions).set({ state }).where(eq(tuiSessions.id, sessionId));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -2337,46 +2584,31 @@ async function probeWithTypedError(
|
|||||||
async function rebootstrapWithRetry(
|
async function rebootstrapWithRetry(
|
||||||
sessions: SessionRuntime,
|
sessions: SessionRuntime,
|
||||||
handle: { sessionId: string },
|
handle: { sessionId: string },
|
||||||
): Promise<boolean> {
|
): Promise<void> {
|
||||||
for (let attemptsRemaining = 2; attemptsRemaining > 0; attemptsRemaining -= 1) {
|
try {
|
||||||
try {
|
await retryRecoverable("rebootstrap", async () => {
|
||||||
await sessions.rebootstrap(handle);
|
await sessions.rebootstrap(handle);
|
||||||
return true;
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (!(error instanceof DevflowError)) {
|
if (!(error instanceof DevflowError)) {
|
||||||
throw new DevflowError("Unclassified rebootstrap failure", {
|
throw new DevflowError("Unclassified rebootstrap failure", {
|
||||||
class: "fatal",
|
class: "fatal",
|
||||||
code: "internal_state_corruption",
|
code: "internal_state_corruption",
|
||||||
cause: error,
|
cause: error,
|
||||||
});
|
});
|
||||||
}
|
|
||||||
if (error.class !== "recoverable") {
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
// Retry budget is intentionally one rebootstrap retry after the first failure.
|
|
||||||
}
|
}
|
||||||
|
if (error.class !== "recoverable") {
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
throw error;
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function resumeWithRetry(
|
async function resumeWithRetry(
|
||||||
sessions: SessionRuntime,
|
sessions: SessionRuntime,
|
||||||
handle: { sessionId: string },
|
handle: { sessionId: string },
|
||||||
): Promise<SessionHandle> {
|
): Promise<SessionHandle> {
|
||||||
let lastError: unknown;
|
return retryRecoverable("resume", () => sessions.resume(handle));
|
||||||
for (let physicalAttempt = 0; physicalAttempt <= 2; physicalAttempt += 1) {
|
|
||||||
try {
|
|
||||||
return await sessions.resume(handle);
|
|
||||||
} catch (error) {
|
|
||||||
lastError = error;
|
|
||||||
if (!(error instanceof DevflowError) || error.class !== "recoverable") {
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
throw lastError;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function markSessionFailedNeedsHuman(
|
async function markSessionFailedNeedsHuman(
|
||||||
@@ -2384,6 +2616,13 @@ async function markSessionFailedNeedsHuman(
|
|||||||
eventRepository: RunEventRepository,
|
eventRepository: RunEventRepository,
|
||||||
sessionId: string,
|
sessionId: string,
|
||||||
) {
|
) {
|
||||||
|
const [existingSession] = await input.db
|
||||||
|
.select({ state: tuiSessions.state })
|
||||||
|
.from(tuiSessions)
|
||||||
|
.where(eq(tuiSessions.id, sessionId));
|
||||||
|
if (existingSession !== undefined) {
|
||||||
|
assertSessionStateAssignment(existingSession.state, "FAILED_NEEDS_HUMAN");
|
||||||
|
}
|
||||||
await input.db
|
await input.db
|
||||||
.insert(tuiSessions)
|
.insert(tuiSessions)
|
||||||
.values({
|
.values({
|
||||||
@@ -2416,12 +2655,15 @@ async function markAllSessionsFailedInTransaction(
|
|||||||
runId: string,
|
runId: string,
|
||||||
): Promise<string[]> {
|
): Promise<string[]> {
|
||||||
const sessions = await tx
|
const sessions = await tx
|
||||||
.select({ id: tuiSessions.id, roleId: tuiSessions.roleId })
|
.select({ id: tuiSessions.id, roleId: tuiSessions.roleId, state: tuiSessions.state })
|
||||||
.from(tuiSessions)
|
.from(tuiSessions)
|
||||||
.where(eq(tuiSessions.runId, runId));
|
.where(eq(tuiSessions.runId, runId));
|
||||||
if (sessions.length === 0) {
|
if (sessions.length === 0) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
for (const session of sessions) {
|
||||||
|
assertSessionStateAssignment(session.state, "FAILED_NEEDS_HUMAN");
|
||||||
|
}
|
||||||
|
|
||||||
await tx
|
await tx
|
||||||
.update(tuiSessions)
|
.update(tuiSessions)
|
||||||
@@ -2684,7 +2926,7 @@ function shouldCreateHumanGate(error: unknown): error is DevflowError {
|
|||||||
|
|
||||||
function toHumanRequiredRecoveryError(error: DevflowError): DevflowError {
|
function toHumanRequiredRecoveryError(error: DevflowError): DevflowError {
|
||||||
if (error.class === "human_required") {
|
if (error.class === "human_required") {
|
||||||
return error;
|
return ensureRecoveryHint(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
const options: ConstructorParameters<typeof DevflowError>[1] = {
|
const options: ConstructorParameters<typeof DevflowError>[1] = {
|
||||||
@@ -2708,7 +2950,7 @@ function toHumanRequiredRecoveryError(error: DevflowError): DevflowError {
|
|||||||
|
|
||||||
function toArtifactTimeoutRecoveryGateError(error: DevflowError): DevflowError {
|
function toArtifactTimeoutRecoveryGateError(error: DevflowError): DevflowError {
|
||||||
if (error.class === "human_required") {
|
if (error.class === "human_required") {
|
||||||
return error;
|
return ensureRecoveryHint(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
const options: ConstructorParameters<typeof DevflowError>[1] = {
|
const options: ConstructorParameters<typeof DevflowError>[1] = {
|
||||||
@@ -2727,6 +2969,26 @@ function toArtifactTimeoutRecoveryGateError(error: DevflowError): DevflowError {
|
|||||||
return new DevflowError("Artifact timeout recovery exhausted retry budget", options);
|
return new DevflowError("Artifact timeout recovery exhausted retry budget", options);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function ensureRecoveryHint(error: DevflowError): DevflowError {
|
||||||
|
if (error.recoveryHint !== undefined && error.recoveryHint.length > 0) {
|
||||||
|
return error;
|
||||||
|
}
|
||||||
|
|
||||||
|
const options: ConstructorParameters<typeof DevflowError>[1] = {
|
||||||
|
class: error.class,
|
||||||
|
code: error.code,
|
||||||
|
recoveryHint: error.message,
|
||||||
|
cause: error.cause,
|
||||||
|
};
|
||||||
|
if (error.runId !== undefined) {
|
||||||
|
options.runId = error.runId;
|
||||||
|
}
|
||||||
|
if (error.phaseId !== undefined) {
|
||||||
|
options.phaseId = error.phaseId;
|
||||||
|
}
|
||||||
|
return new DevflowError(error.message, options);
|
||||||
|
}
|
||||||
|
|
||||||
async function removeStaleArtifact(input: CanonicalRunSingleFakePhaseInput): Promise<void> {
|
async function removeStaleArtifact(input: CanonicalRunSingleFakePhaseInput): Promise<void> {
|
||||||
try {
|
try {
|
||||||
await unlink(input.expectedArtifactPath);
|
await unlink(input.expectedArtifactPath);
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
export * from "./adapter.js";
|
export * from "./adapter.js";
|
||||||
export * from "./fake.js";
|
export * from "./fake.js";
|
||||||
export * from "./manager.js";
|
export * from "./manager.js";
|
||||||
|
export * from "./recovery.js";
|
||||||
export * from "./transcript.js";
|
export * from "./transcript.js";
|
||||||
export * from "./tmux.js";
|
export * from "./tmux.js";
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ import type {
|
|||||||
TranscriptBaseline,
|
TranscriptBaseline,
|
||||||
TranscriptChunk,
|
TranscriptChunk,
|
||||||
} from "./adapter.js";
|
} from "./adapter.js";
|
||||||
|
import { assertSessionTransition, retryRecoverable } from "./recovery.js";
|
||||||
import { captureAndPersistTranscript } from "./transcript.js";
|
import { captureAndPersistTranscript } from "./transcript.js";
|
||||||
|
|
||||||
type Database = DbClient["db"];
|
type Database = DbClient["db"];
|
||||||
@@ -298,6 +299,12 @@ export class SessionManager implements SessionRuntime {
|
|||||||
if (this.db === undefined || !["CREATED", "BOOTSTRAPPING"].includes(session.state)) {
|
if (this.db === undefined || !["CREATED", "BOOTSTRAPPING"].includes(session.state)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (session.state === "CREATED") {
|
||||||
|
assertSessionTransition("CREATED", "BOOTSTRAPPING");
|
||||||
|
assertSessionTransition("BOOTSTRAPPING", "READY");
|
||||||
|
} else {
|
||||||
|
assertSessionTransition(session.state, "READY");
|
||||||
|
}
|
||||||
|
|
||||||
const eventRepository = new RunEventRepository(this.db);
|
const eventRepository = new RunEventRepository(this.db);
|
||||||
const sessionUpdate: {
|
const sessionUpdate: {
|
||||||
@@ -345,6 +352,7 @@ export class SessionManager implements SessionRuntime {
|
|||||||
backend: string;
|
backend: string;
|
||||||
cwd: string;
|
cwd: string;
|
||||||
recoveryAttempts: number;
|
recoveryAttempts: number;
|
||||||
|
state: string;
|
||||||
},
|
},
|
||||||
error: unknown,
|
error: unknown,
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
@@ -357,6 +365,7 @@ export class SessionManager implements SessionRuntime {
|
|||||||
const gateKey = "session_recovery_required";
|
const gateKey = "session_recovery_required";
|
||||||
const approvalIdempotencyKey = `${session.runId}:${gateKey}:${session.id}:${recoveryAttempts}`;
|
const approvalIdempotencyKey = `${session.runId}:${gateKey}:${session.id}:${recoveryAttempts}`;
|
||||||
const pauseCause = `session_recovery_failed:${session.id}:${recoveryAttempts}`;
|
const pauseCause = `session_recovery_failed:${session.id}:${recoveryAttempts}`;
|
||||||
|
assertSessionTransition(session.state, "FAILED_NEEDS_HUMAN");
|
||||||
await this.db.transaction(async (tx) => {
|
await this.db.transaction(async (tx) => {
|
||||||
await tx.execute(sql`SELECT 1 FROM ${runs} WHERE ${runs.id} = ${session.runId} FOR UPDATE`);
|
await tx.execute(sql`SELECT 1 FROM ${runs} WHERE ${runs.id} = ${session.runId} FOR UPDATE`);
|
||||||
const [run] = await tx
|
const [run] = await tx
|
||||||
@@ -431,18 +440,7 @@ export class SessionManager implements SessionRuntime {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private async resumeWithRetry(handle: SessionHandle): Promise<SessionHandle> {
|
private async resumeWithRetry(handle: SessionHandle): Promise<SessionHandle> {
|
||||||
let lastError: unknown;
|
return retryRecoverable("resume", () => this.track(this.adapter.resume(handle)));
|
||||||
for (let attempt = 0; attempt <= 2; attempt += 1) {
|
|
||||||
try {
|
|
||||||
return await this.track(this.adapter.resume(handle));
|
|
||||||
} catch (error) {
|
|
||||||
lastError = error;
|
|
||||||
if (!(error instanceof DevflowError) || error.class !== "recoverable") {
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
throw lastError;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private async loadTranscriptBaseline(
|
private async loadTranscriptBaseline(
|
||||||
|
|||||||
103
packages/session/src/recovery.test.ts
Normal file
103
packages/session/src/recovery.test.ts
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
|
||||||
|
import { DevflowError } from "@devflow/core";
|
||||||
|
|
||||||
|
import {
|
||||||
|
SessionRecoveryBudget,
|
||||||
|
assertSessionStateAssignment,
|
||||||
|
assertSessionTransition,
|
||||||
|
isSessionHung,
|
||||||
|
retryRecoverable,
|
||||||
|
} from "./recovery.js";
|
||||||
|
|
||||||
|
describe("session recovery policy", () => {
|
||||||
|
it("allows only locked session state-machine transitions", () => {
|
||||||
|
expect(() => assertSessionTransition("CREATED", "BOOTSTRAPPING")).not.toThrow();
|
||||||
|
expect(() => assertSessionTransition("BOOTSTRAPPING", "READY")).not.toThrow();
|
||||||
|
expect(() => assertSessionTransition("READY", "BUSY")).not.toThrow();
|
||||||
|
expect(() => assertSessionTransition("BUSY", "READY")).not.toThrow();
|
||||||
|
expect(() => assertSessionTransition("BUSY", "ARTIFACT_TIMEOUT")).not.toThrow();
|
||||||
|
expect(() => assertSessionTransition("ARTIFACT_TIMEOUT", "RESUMING")).not.toThrow();
|
||||||
|
expect(() => assertSessionTransition("RESUMING", "REBOOTSTRAPPED")).not.toThrow();
|
||||||
|
expect(() => assertSessionTransition("REBOOTSTRAPPED", "READY")).not.toThrow();
|
||||||
|
|
||||||
|
expect(() => assertSessionTransition("READY", "REBOOTSTRAPPED")).toThrow(
|
||||||
|
/Invalid session state transition/,
|
||||||
|
);
|
||||||
|
expect(() => assertSessionTransition("CRASHED", "CRASHED")).toThrow(
|
||||||
|
/Invalid session state transition/,
|
||||||
|
);
|
||||||
|
expect(() => assertSessionTransition("FAILED_NEEDS_HUMAN", "READY")).toThrow(
|
||||||
|
/Invalid session state transition/,
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("allows no-op state assignment without treating it as a transition", () => {
|
||||||
|
expect(() => assertSessionStateAssignment("READY", "READY")).not.toThrow();
|
||||||
|
expect(() => assertSessionStateAssignment("BUSY", "READY")).not.toThrow();
|
||||||
|
expect(() => assertSessionStateAssignment("FAILED_NEEDS_HUMAN", "READY")).toThrow(
|
||||||
|
/Invalid session state assignment/,
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("retries recoverable errors for one initial prompt send plus two retries", async () => {
|
||||||
|
let attempts = 0;
|
||||||
|
const result = await retryRecoverable("sendPrompt", async () => {
|
||||||
|
attempts += 1;
|
||||||
|
if (attempts < SessionRecoveryBudget.sendPrompt.physicalAttempts) {
|
||||||
|
throw new DevflowError("temporary prompt failure", {
|
||||||
|
class: "recoverable",
|
||||||
|
code: "prompt_send_transient",
|
||||||
|
});
|
||||||
|
}
|
||||||
|
return "sent";
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result).toBe("sent");
|
||||||
|
expect(attempts).toBe(3);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("throws the final recoverable error after the retry budget is exhausted", async () => {
|
||||||
|
let attempts = 0;
|
||||||
|
await expect(
|
||||||
|
retryRecoverable("rebootstrap", async () => {
|
||||||
|
attempts += 1;
|
||||||
|
throw new DevflowError("pane briefly unresponsive", {
|
||||||
|
class: "recoverable",
|
||||||
|
code: "pane_briefly_unresponsive",
|
||||||
|
});
|
||||||
|
}),
|
||||||
|
).rejects.toMatchObject({
|
||||||
|
class: "recoverable",
|
||||||
|
code: "pane_briefly_unresponsive",
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(attempts).toBe(SessionRecoveryBudget.rebootstrap.physicalAttempts);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("does not retry human-required or fatal errors", async () => {
|
||||||
|
let attempts = 0;
|
||||||
|
await expect(
|
||||||
|
retryRecoverable("resume", async () => {
|
||||||
|
attempts += 1;
|
||||||
|
throw new DevflowError("operator action required", {
|
||||||
|
class: "human_required",
|
||||||
|
code: "session_recovery_required",
|
||||||
|
});
|
||||||
|
}),
|
||||||
|
).rejects.toMatchObject({
|
||||||
|
class: "human_required",
|
||||||
|
code: "session_recovery_required",
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(attempts).toBe(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("uses the locked hung-session timeout boundary", () => {
|
||||||
|
const now = new Date("2026-05-13T10:20:00.000Z");
|
||||||
|
|
||||||
|
expect(isSessionHung(new Date("2026-05-13T10:00:00.000Z"), now)).toBe(true);
|
||||||
|
expect(isSessionHung(new Date("2026-05-13T10:00:01.000Z"), now)).toBe(false);
|
||||||
|
expect(isSessionHung(undefined, now)).toBe(false);
|
||||||
|
});
|
||||||
|
});
|
||||||
127
packages/session/src/recovery.ts
Normal file
127
packages/session/src/recovery.ts
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
import { DevflowError, SessionState, type SessionState as SessionStateName } from "@devflow/core";
|
||||||
|
|
||||||
|
export const SessionRecoveryBudget = Object.freeze({
|
||||||
|
sendPrompt: Object.freeze({ retries: 2, physicalAttempts: 3 }),
|
||||||
|
resume: Object.freeze({ retries: 2, physicalAttempts: 3 }),
|
||||||
|
rebootstrap: Object.freeze({ retries: 1, physicalAttempts: 2 }),
|
||||||
|
artifactRepair: Object.freeze({ retries: 1, physicalAttempts: 2 }),
|
||||||
|
maxHungMs: 20 * 60 * 1000,
|
||||||
|
});
|
||||||
|
|
||||||
|
export type SessionRetryOperation = "sendPrompt" | "resume" | "rebootstrap" | "artifactRepair";
|
||||||
|
|
||||||
|
const allowedSessionTransitions: ReadonlyMap<
|
||||||
|
SessionStateName,
|
||||||
|
ReadonlySet<SessionStateName>
|
||||||
|
> = new Map([
|
||||||
|
["CREATED", new Set(["BOOTSTRAPPING", "FAILED_NEEDS_HUMAN"])],
|
||||||
|
["BOOTSTRAPPING", new Set(["READY", "FAILED_NEEDS_HUMAN"])],
|
||||||
|
["READY", new Set(["BUSY", "FAILED_NEEDS_HUMAN"])],
|
||||||
|
[
|
||||||
|
"BUSY",
|
||||||
|
new Set([
|
||||||
|
"READY",
|
||||||
|
"WAITING_FOR_APPROVAL",
|
||||||
|
"ARTIFACT_TIMEOUT",
|
||||||
|
"HUNG",
|
||||||
|
"CRASHED",
|
||||||
|
"FAILED_NEEDS_HUMAN",
|
||||||
|
]),
|
||||||
|
],
|
||||||
|
["WAITING_FOR_APPROVAL", new Set(["READY", "FAILED_NEEDS_HUMAN"])],
|
||||||
|
["ARTIFACT_TIMEOUT", new Set(["RESUMING", "FAILED_NEEDS_HUMAN"])],
|
||||||
|
["HUNG", new Set(["RESUMING", "FAILED_NEEDS_HUMAN"])],
|
||||||
|
["CRASHED", new Set(["RESUMING", "FAILED_NEEDS_HUMAN"])],
|
||||||
|
["RESUMING", new Set(["READY", "REBOOTSTRAPPED", "FAILED_NEEDS_HUMAN"])],
|
||||||
|
["REBOOTSTRAPPED", new Set(["READY", "FAILED_NEEDS_HUMAN"])],
|
||||||
|
["FAILED_NEEDS_HUMAN", new Set()],
|
||||||
|
]);
|
||||||
|
|
||||||
|
export function isAllowedSessionTransition(from: string, to: string): boolean {
|
||||||
|
const parsedFrom = SessionState.safeParse(from);
|
||||||
|
const parsedTo = SessionState.safeParse(to);
|
||||||
|
if (!parsedFrom.success || !parsedTo.success) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return allowedSessionTransitions.get(parsedFrom.data)?.has(parsedTo.data) ?? false;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function isAllowedSessionStateAssignment(from: string, to: string): boolean {
|
||||||
|
const parsedFrom = SessionState.safeParse(from);
|
||||||
|
const parsedTo = SessionState.safeParse(to);
|
||||||
|
if (!parsedFrom.success || !parsedTo.success) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (parsedFrom.data === parsedTo.data) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return isAllowedSessionTransition(parsedFrom.data, parsedTo.data);
|
||||||
|
}
|
||||||
|
|
||||||
|
export function assertSessionTransition(from: string, to: string): void {
|
||||||
|
if (isAllowedSessionTransition(from, to)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new DevflowError("Invalid session state transition", {
|
||||||
|
class: "fatal",
|
||||||
|
code: "internal_state_corruption",
|
||||||
|
recoveryHint: `${from}->${to}`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
export function assertSessionStateAssignment(from: string, to: string): void {
|
||||||
|
if (isAllowedSessionStateAssignment(from, to)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new DevflowError("Invalid session state assignment", {
|
||||||
|
class: "fatal",
|
||||||
|
code: "internal_state_corruption",
|
||||||
|
recoveryHint: `${from}->${to}`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function retryRecoverable<T>(
|
||||||
|
operation: SessionRetryOperation,
|
||||||
|
run: (physicalAttempt: number) => Promise<T>,
|
||||||
|
): Promise<T> {
|
||||||
|
const physicalAttempts = recoveryPhysicalAttempts(operation);
|
||||||
|
let lastError: unknown;
|
||||||
|
for (let physicalAttempt = 1; physicalAttempt <= physicalAttempts; physicalAttempt += 1) {
|
||||||
|
try {
|
||||||
|
return await run(physicalAttempt);
|
||||||
|
} catch (error) {
|
||||||
|
lastError = error;
|
||||||
|
if (!(error instanceof DevflowError) || error.class !== "recoverable") {
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
throw lastError;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function isSessionHung(
|
||||||
|
lastOutputAt: Date | undefined,
|
||||||
|
now: Date,
|
||||||
|
maxHungMs = SessionRecoveryBudget.maxHungMs,
|
||||||
|
): boolean {
|
||||||
|
if (lastOutputAt === undefined) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return now.getTime() - lastOutputAt.getTime() >= maxHungMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
function recoveryPhysicalAttempts(operation: SessionRetryOperation): number {
|
||||||
|
switch (operation) {
|
||||||
|
case "sendPrompt":
|
||||||
|
return SessionRecoveryBudget.sendPrompt.physicalAttempts;
|
||||||
|
case "resume":
|
||||||
|
return SessionRecoveryBudget.resume.physicalAttempts;
|
||||||
|
case "rebootstrap":
|
||||||
|
return SessionRecoveryBudget.rebootstrap.physicalAttempts;
|
||||||
|
case "artifactRepair":
|
||||||
|
return SessionRecoveryBudget.artifactRepair.physicalAttempts;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -14,6 +14,9 @@ export interface DevflowActivityDependencies {
|
|||||||
workspaceRoot: string;
|
workspaceRoot: string;
|
||||||
availableBackends?: readonly BackendConfig[];
|
availableBackends?: readonly BackendConfig[];
|
||||||
maxConcurrentRuns?: number;
|
maxConcurrentRuns?: number;
|
||||||
|
recovery?: {
|
||||||
|
maxHungMs?: number;
|
||||||
|
};
|
||||||
wait?: {
|
wait?: {
|
||||||
timeoutMs?: number;
|
timeoutMs?: number;
|
||||||
pollIntervalMs?: number;
|
pollIntervalMs?: number;
|
||||||
@@ -50,6 +53,7 @@ export function createDevflowActivities(
|
|||||||
...(dependencies.maxConcurrentRuns === undefined
|
...(dependencies.maxConcurrentRuns === undefined
|
||||||
? {}
|
? {}
|
||||||
: { maxConcurrentRuns: dependencies.maxConcurrentRuns }),
|
: { maxConcurrentRuns: dependencies.maxConcurrentRuns }),
|
||||||
|
...(dependencies.recovery === undefined ? {} : { recovery: dependencies.recovery }),
|
||||||
...(activityWait === undefined ? {} : { wait: activityWait }),
|
...(activityWait === undefined ? {} : { wait: activityWait }),
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user