fix(CQ-ORCH-7): Graceful Docker container shutdown before force remove
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

Replace the always-force container removal (SIGKILL) with a two-phase
approach: first attempt graceful stop (SIGTERM with configurable timeout),
then remove without force. Falls back to force remove only if the graceful
path fails. The graceful stop timeout is configurable via
orchestrator.sandbox.gracefulStopTimeoutSeconds (default: 10s).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jason Woltje
2026-02-06 14:05:53 -06:00
parent 2b356f6ca2
commit a0062494b7
2 changed files with 99 additions and 10 deletions

View File

@@ -233,19 +233,52 @@ describe("DockerSandboxService", () => {
});
describe("removeContainer", () => {
it("should remove a container by ID", async () => {
it("should gracefully stop and remove a container by ID", async () => {
const containerId = "container-123";
await service.removeContainer(containerId);
expect(mockDocker.getContainer).toHaveBeenCalledWith(containerId);
expect(mockContainer.stop).toHaveBeenCalledWith({ t: 10 });
expect(mockContainer.remove).toHaveBeenCalledWith({ force: false });
});
it("should fall back to force remove when graceful stop fails", async () => {
const containerId = "container-123";
(mockContainer.stop as ReturnType<typeof vi.fn>).mockRejectedValueOnce(
new Error("Container already stopped")
);
await service.removeContainer(containerId);
expect(mockContainer.stop).toHaveBeenCalledWith({ t: 10 });
expect(mockContainer.remove).toHaveBeenCalledWith({ force: true });
});
it("should throw error if container removal fails", async () => {
it("should fall back to force remove when graceful remove fails", async () => {
const containerId = "container-123";
(mockContainer.remove as ReturnType<typeof vi.fn>).mockRejectedValue(
(mockContainer.remove as ReturnType<typeof vi.fn>)
.mockRejectedValueOnce(new Error("Container still running"))
.mockResolvedValueOnce(undefined);
await service.removeContainer(containerId);
expect(mockContainer.stop).toHaveBeenCalledWith({ t: 10 });
// First call: graceful remove (force: false) - fails
expect(mockContainer.remove).toHaveBeenNthCalledWith(1, { force: false });
// Second call: force remove (force: true) - succeeds
expect(mockContainer.remove).toHaveBeenNthCalledWith(2, { force: true });
});
it("should throw error if both graceful and force removal fail", async () => {
const containerId = "container-123";
(mockContainer.stop as ReturnType<typeof vi.fn>).mockRejectedValueOnce(
new Error("Stop failed")
);
(mockContainer.remove as ReturnType<typeof vi.fn>).mockRejectedValueOnce(
new Error("Container not found")
);
@@ -253,6 +286,31 @@ describe("DockerSandboxService", () => {
"Failed to remove container container-123"
);
});
it("should use configurable graceful stop timeout", async () => {
const customConfigService = {
get: vi.fn((key: string, defaultValue?: unknown) => {
const config: Record<string, unknown> = {
"orchestrator.docker.socketPath": "/var/run/docker.sock",
"orchestrator.sandbox.enabled": true,
"orchestrator.sandbox.defaultImage": "node:20-alpine",
"orchestrator.sandbox.defaultMemoryMB": 512,
"orchestrator.sandbox.defaultCpuLimit": 1.0,
"orchestrator.sandbox.networkMode": "bridge",
"orchestrator.sandbox.gracefulStopTimeoutSeconds": 30,
};
return config[key] !== undefined ? config[key] : defaultValue;
}),
} as unknown as ConfigService;
const customService = new DockerSandboxService(customConfigService, mockDocker);
const containerId = "container-123";
await customService.removeContainer(containerId);
expect(mockContainer.stop).toHaveBeenCalledWith({ t: 30 });
expect(mockContainer.remove).toHaveBeenCalledWith({ force: false });
});
});
describe("getContainerStatus", () => {
@@ -280,24 +338,30 @@ describe("DockerSandboxService", () => {
});
describe("cleanup", () => {
it("should stop and remove container", async () => {
it("should stop and remove container gracefully", async () => {
const containerId = "container-123";
await service.cleanup(containerId);
// cleanup calls stopContainer first, then removeContainer
// stopContainer sends stop({ t: 10 })
// removeContainer also tries stop({ t: 10 }) then remove({ force: false })
expect(mockContainer.stop).toHaveBeenCalledWith({ t: 10 });
expect(mockContainer.remove).toHaveBeenCalledWith({ force: true });
expect(mockContainer.remove).toHaveBeenCalledWith({ force: false });
});
it("should remove container even if stop fails", async () => {
it("should remove container even if initial stop fails", async () => {
const containerId = "container-123";
// First stop call (from cleanup's stopContainer) fails
// Second stop call (from removeContainer's graceful attempt) also fails
(mockContainer.stop as ReturnType<typeof vi.fn>).mockRejectedValue(
new Error("Container already stopped")
);
await service.cleanup(containerId);
// removeContainer falls back to force remove after graceful stop fails
expect(mockContainer.remove).toHaveBeenCalledWith({ force: true });
});

View File

@@ -81,6 +81,7 @@ export class DockerSandboxService {
private readonly defaultNetworkMode: string;
private readonly envWhitelist: readonly string[];
private readonly defaultSecurityOptions: Required<DockerSecurityOptions>;
private readonly gracefulStopTimeoutSeconds: number;
constructor(
private readonly configService: ConfigService,
@@ -144,6 +145,11 @@ export class DockerSandboxService {
noNewPrivileges: configNoNewPrivileges ?? DEFAULT_SECURITY_OPTIONS.noNewPrivileges,
};
this.gracefulStopTimeoutSeconds = this.configService.get<number>(
"orchestrator.sandbox.gracefulStopTimeoutSeconds",
10
);
// Validate default image tag at startup to fail fast on misconfiguration
this.validateImageTag(this.defaultImage);
@@ -336,15 +342,34 @@ export class DockerSandboxService {
}
/**
* Remove a Docker container
* Remove a Docker container with graceful shutdown.
* First attempts to gracefully stop the container (SIGTERM with configurable timeout),
* then removes it without force. If graceful stop fails, falls back to force remove (SIGKILL).
* @param containerId Container ID to remove
*/
async removeContainer(containerId: string): Promise<void> {
this.logger.log(`Removing container: ${containerId}`);
const container = this.docker.getContainer(containerId);
// Try graceful stop first (SIGTERM with timeout), then non-force remove
try {
this.logger.log(
`Attempting graceful stop of container ${containerId} (timeout: ${this.gracefulStopTimeoutSeconds.toString()}s)`
);
await container.stop({ t: this.gracefulStopTimeoutSeconds });
await container.remove({ force: false });
this.logger.log(`Container gracefully stopped and removed: ${containerId}`);
return;
} catch (gracefulError) {
this.logger.warn(
`Graceful stop failed for container ${containerId}, falling back to force remove: ${gracefulError instanceof Error ? gracefulError.message : String(gracefulError)}`
);
}
// Fallback: force remove (SIGKILL)
try {
this.logger.log(`Removing container: ${containerId}`);
const container = this.docker.getContainer(containerId);
await container.remove({ force: true });
this.logger.log(`Container removed successfully: ${containerId}`);
this.logger.log(`Container force-removed: ${containerId}`);
} catch (error) {
const enhancedError = error instanceof Error ? error : new Error(String(error));
enhancedError.message = `Failed to remove container ${containerId}: ${enhancedError.message}`;