Skip to content

Commit abc450c

Browse files
committed
Fix race condition with sending a request while container is stopping
Due to some quirks in the runtime, it's possible for the DO to send a request to a container when it thinks the container is in a running state, but while the request is in flight, the container stops and the monitor promise resolves. This results in an error, and instead of retrying we throw a 500 error. Instead, recognize this case and restart the container. This is a bandage solution, but we will follow up with some improvements to the runtime that will clean up the state management required in this DO class.
1 parent 9885a4b commit abc450c

1 file changed

Lines changed: 98 additions & 2 deletions

File tree

src/lib/container.ts

Lines changed: 98 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,20 @@ const isRuntimeSignalledError = (error: unknown): boolean =>
7777
const isNotListeningError = (error: unknown): boolean => isErrorOfType(error, NOT_LISTENING_ERROR);
7878
const isContainerExitNonZeroError = (error: unknown): boolean =>
7979
isErrorOfType(error, UNEXPECTED_EXIT_ERROR);
80+
const isContainerNotRunningError = (error: unknown): boolean => {
81+
const patterns = [
82+
'the container is not running',
83+
'not expected to be running',
84+
'consider calling start()',
85+
];
86+
return patterns.some(pattern => isErrorOfType(error, pattern));
87+
};
88+
const isContainerCrashedDuringPortCheckError = (error: unknown): boolean =>
89+
isErrorOfType(error, 'container crashed while checking for ports');
90+
const isRecoverableContainerUnavailableError = (error: unknown): boolean =>
91+
isContainerNotRunningError(error) ||
92+
isNotListeningError(error) ||
93+
isContainerCrashedDuringPortCheckError(error);
8094

8195
function getExitCodeFromError(error: unknown): number | null {
8296
if (!(error instanceof Error)) {
@@ -512,7 +526,13 @@ export class Container<Env = Cloudflare.Env> extends DurableObject<Env> {
512526
*/
513527
public async stop(signal: Signal | SignalInteger = 'SIGTERM'): Promise<void> {
514528
if (this.container.running) {
515-
this.container.signal(typeof signal === 'string' ? signalToNumbers[signal] : signal);
529+
try {
530+
this.container.signal(typeof signal === 'string' ? signalToNumbers[signal] : signal);
531+
} catch (error) {
532+
if (!isRecoverableContainerUnavailableError(error)) {
533+
throw error;
534+
}
535+
}
516536
}
517537
await this.syncPendingStoppedEvents();
518538
}
@@ -690,7 +710,10 @@ export class Container<Env = Cloudflare.Env> extends DurableObject<Env> {
690710
const state = await this.state.getState();
691711
if (!this.container.running || state.status !== 'healthy') {
692712
try {
693-
await this.startAndWaitForPorts(port, { abort: request.signal });
713+
await this.startAndWaitForPortsWithRecovery(port, {
714+
abort: request.signal,
715+
context: 'startup',
716+
});
694717
} catch (e) {
695718
if (isNoInstanceError(e)) {
696719
return new Response(
@@ -721,6 +744,29 @@ export class Container<Env = Cloudflare.Env> extends DurableObject<Env> {
721744
throw e;
722745
}
723746

747+
// If container stopped or is no longer reachable during the request, restart and retry
748+
if (!this.container.running || isRecoverableContainerUnavailableError(e)) {
749+
try {
750+
await this.startAndWaitForPortsWithRecovery(port, {
751+
context: 'proxy',
752+
initialError: e,
753+
});
754+
const retryTcpPort = this.container.getTcpPort(port);
755+
return await retryTcpPort.fetch(containerUrl, request);
756+
} catch (retryError) {
757+
if (isNoInstanceError(retryError)) {
758+
return new Response(
759+
'There is no Container instance available at this time.\nThis is likely because you have reached your max concurrent instance count (set in wrangler config) or are you currently provisioning the Container.\nIf you are deploying your Container for the first time, check your dashboard to see provisioning status, this may take a few minutes.',
760+
{ status: 503 }
761+
);
762+
}
763+
return new Response(
764+
`Failed to restart container: ${retryError instanceof Error ? retryError.message : String(retryError)}`,
765+
{ status: 500 }
766+
);
767+
}
768+
}
769+
724770
// This error means that the container might've just restarted
725771
if (e.message.includes('Network connection lost.')) {
726772
return new Response('Container suddenly disconnected, try again', { status: 500 });
@@ -839,6 +885,56 @@ export class Container<Env = Cloudflare.Env> extends DurableObject<Env> {
839885
return { request, port };
840886
}
841887

888+
private async startAndWaitForPortsWithRecovery(
889+
port: number,
890+
options: {
891+
context: 'startup' | 'proxy';
892+
abort?: AbortSignal;
893+
initialError?: Error;
894+
}
895+
): Promise<void> {
896+
if (options.initialError) {
897+
console.debug(
898+
`Recoverable ${options.context} error for container ${this.ctx.id}, restarting and retrying: ${options.initialError.message}`
899+
);
900+
}
901+
902+
const attempts: (CancellationOptions | undefined)[] = [
903+
options.abort ? { abort: options.abort } : undefined,
904+
undefined,
905+
undefined,
906+
];
907+
908+
let lastError: unknown;
909+
910+
for (let index = 0; index < attempts.length; index++) {
911+
const cancellationOptions = attempts[index];
912+
913+
try {
914+
if (cancellationOptions) {
915+
await this.startAndWaitForPorts(port, cancellationOptions);
916+
} else {
917+
await this.startAndWaitForPorts(port);
918+
}
919+
return;
920+
} catch (error) {
921+
lastError = error;
922+
923+
if (!isRecoverableContainerUnavailableError(error) || index === attempts.length - 1) {
924+
throw error;
925+
}
926+
927+
console.debug(
928+
`Recoverable ${options.context} start error for container ${this.ctx.id}, retry ${index + 1}/${attempts.length - 1}: ${error instanceof Error ? error.message : String(error)}`
929+
);
930+
931+
await new Promise(resolve => setTimeout(resolve, 100));
932+
}
933+
}
934+
935+
throw lastError;
936+
}
937+
842938
/**
843939
*
844940
* The method prioritizes port sources in this order:

0 commit comments

Comments
 (0)