From a0062494b755399157b6c73f5bf2dd8f3f391cef Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Fri, 6 Feb 2026 14:05:53 -0600 Subject: [PATCH] fix(CQ-ORCH-7): Graceful Docker container shutdown before force remove Replace the always-force container removal (SIGKILL) with a two-phase approach: first attempt graceful stop (SIGTERM with configurable timeout), then remove without force. Falls back to force remove only if the graceful path fails. The graceful stop timeout is configurable via orchestrator.sandbox.gracefulStopTimeoutSeconds (default: 10s). Co-Authored-By: Claude Opus 4.6 --- .../spawner/docker-sandbox.service.spec.ts | 76 +++++++++++++++++-- .../src/spawner/docker-sandbox.service.ts | 33 +++++++- 2 files changed, 99 insertions(+), 10 deletions(-) diff --git a/apps/orchestrator/src/spawner/docker-sandbox.service.spec.ts b/apps/orchestrator/src/spawner/docker-sandbox.service.spec.ts index c7e2de1..49c51cf 100644 --- a/apps/orchestrator/src/spawner/docker-sandbox.service.spec.ts +++ b/apps/orchestrator/src/spawner/docker-sandbox.service.spec.ts @@ -233,19 +233,52 @@ describe("DockerSandboxService", () => { }); describe("removeContainer", () => { - it("should remove a container by ID", async () => { + it("should gracefully stop and remove a container by ID", async () => { const containerId = "container-123"; await service.removeContainer(containerId); expect(mockDocker.getContainer).toHaveBeenCalledWith(containerId); + expect(mockContainer.stop).toHaveBeenCalledWith({ t: 10 }); + expect(mockContainer.remove).toHaveBeenCalledWith({ force: false }); + }); + + it("should fall back to force remove when graceful stop fails", async () => { + const containerId = "container-123"; + + (mockContainer.stop as ReturnType).mockRejectedValueOnce( + new Error("Container already stopped") + ); + + await service.removeContainer(containerId); + + expect(mockContainer.stop).toHaveBeenCalledWith({ t: 10 }); expect(mockContainer.remove).toHaveBeenCalledWith({ force: true }); }); - it("should throw error if container removal fails", async () => { + it("should fall back to force remove when graceful remove fails", async () => { const containerId = "container-123"; - (mockContainer.remove as ReturnType).mockRejectedValue( + (mockContainer.remove as ReturnType) + .mockRejectedValueOnce(new Error("Container still running")) + .mockResolvedValueOnce(undefined); + + await service.removeContainer(containerId); + + expect(mockContainer.stop).toHaveBeenCalledWith({ t: 10 }); + // First call: graceful remove (force: false) - fails + expect(mockContainer.remove).toHaveBeenNthCalledWith(1, { force: false }); + // Second call: force remove (force: true) - succeeds + expect(mockContainer.remove).toHaveBeenNthCalledWith(2, { force: true }); + }); + + it("should throw error if both graceful and force removal fail", async () => { + const containerId = "container-123"; + + (mockContainer.stop as ReturnType).mockRejectedValueOnce( + new Error("Stop failed") + ); + (mockContainer.remove as ReturnType).mockRejectedValueOnce( new Error("Container not found") ); @@ -253,6 +286,31 @@ describe("DockerSandboxService", () => { "Failed to remove container container-123" ); }); + + it("should use configurable graceful stop timeout", async () => { + const customConfigService = { + get: vi.fn((key: string, defaultValue?: unknown) => { + const config: Record = { + "orchestrator.docker.socketPath": "/var/run/docker.sock", + "orchestrator.sandbox.enabled": true, + "orchestrator.sandbox.defaultImage": "node:20-alpine", + "orchestrator.sandbox.defaultMemoryMB": 512, + "orchestrator.sandbox.defaultCpuLimit": 1.0, + "orchestrator.sandbox.networkMode": "bridge", + "orchestrator.sandbox.gracefulStopTimeoutSeconds": 30, + }; + return config[key] !== undefined ? config[key] : defaultValue; + }), + } as unknown as ConfigService; + + const customService = new DockerSandboxService(customConfigService, mockDocker); + const containerId = "container-123"; + + await customService.removeContainer(containerId); + + expect(mockContainer.stop).toHaveBeenCalledWith({ t: 30 }); + expect(mockContainer.remove).toHaveBeenCalledWith({ force: false }); + }); }); describe("getContainerStatus", () => { @@ -280,24 +338,30 @@ describe("DockerSandboxService", () => { }); describe("cleanup", () => { - it("should stop and remove container", async () => { + it("should stop and remove container gracefully", async () => { const containerId = "container-123"; await service.cleanup(containerId); + // cleanup calls stopContainer first, then removeContainer + // stopContainer sends stop({ t: 10 }) + // removeContainer also tries stop({ t: 10 }) then remove({ force: false }) expect(mockContainer.stop).toHaveBeenCalledWith({ t: 10 }); - expect(mockContainer.remove).toHaveBeenCalledWith({ force: true }); + expect(mockContainer.remove).toHaveBeenCalledWith({ force: false }); }); - it("should remove container even if stop fails", async () => { + it("should remove container even if initial stop fails", async () => { const containerId = "container-123"; + // First stop call (from cleanup's stopContainer) fails + // Second stop call (from removeContainer's graceful attempt) also fails (mockContainer.stop as ReturnType).mockRejectedValue( new Error("Container already stopped") ); await service.cleanup(containerId); + // removeContainer falls back to force remove after graceful stop fails expect(mockContainer.remove).toHaveBeenCalledWith({ force: true }); }); diff --git a/apps/orchestrator/src/spawner/docker-sandbox.service.ts b/apps/orchestrator/src/spawner/docker-sandbox.service.ts index d0eabbd..fd52c18 100644 --- a/apps/orchestrator/src/spawner/docker-sandbox.service.ts +++ b/apps/orchestrator/src/spawner/docker-sandbox.service.ts @@ -81,6 +81,7 @@ export class DockerSandboxService { private readonly defaultNetworkMode: string; private readonly envWhitelist: readonly string[]; private readonly defaultSecurityOptions: Required; + private readonly gracefulStopTimeoutSeconds: number; constructor( private readonly configService: ConfigService, @@ -144,6 +145,11 @@ export class DockerSandboxService { noNewPrivileges: configNoNewPrivileges ?? DEFAULT_SECURITY_OPTIONS.noNewPrivileges, }; + this.gracefulStopTimeoutSeconds = this.configService.get( + "orchestrator.sandbox.gracefulStopTimeoutSeconds", + 10 + ); + // Validate default image tag at startup to fail fast on misconfiguration this.validateImageTag(this.defaultImage); @@ -336,15 +342,34 @@ export class DockerSandboxService { } /** - * Remove a Docker container + * Remove a Docker container with graceful shutdown. + * First attempts to gracefully stop the container (SIGTERM with configurable timeout), + * then removes it without force. If graceful stop fails, falls back to force remove (SIGKILL). * @param containerId Container ID to remove */ async removeContainer(containerId: string): Promise { + this.logger.log(`Removing container: ${containerId}`); + const container = this.docker.getContainer(containerId); + + // Try graceful stop first (SIGTERM with timeout), then non-force remove + try { + this.logger.log( + `Attempting graceful stop of container ${containerId} (timeout: ${this.gracefulStopTimeoutSeconds.toString()}s)` + ); + await container.stop({ t: this.gracefulStopTimeoutSeconds }); + await container.remove({ force: false }); + this.logger.log(`Container gracefully stopped and removed: ${containerId}`); + return; + } catch (gracefulError) { + this.logger.warn( + `Graceful stop failed for container ${containerId}, falling back to force remove: ${gracefulError instanceof Error ? gracefulError.message : String(gracefulError)}` + ); + } + + // Fallback: force remove (SIGKILL) try { - this.logger.log(`Removing container: ${containerId}`); - const container = this.docker.getContainer(containerId); await container.remove({ force: true }); - this.logger.log(`Container removed successfully: ${containerId}`); + this.logger.log(`Container force-removed: ${containerId}`); } catch (error) { const enhancedError = error instanceof Error ? error : new Error(String(error)); enhancedError.message = `Failed to remove container ${containerId}: ${enhancedError.message}`;