chore: upgrade Node.js runtime to v24 across codebase #419
933
apps/api/src/speech/speech.integration.spec.ts
Normal file
933
apps/api/src/speech/speech.integration.spec.ts
Normal file
@@ -0,0 +1,933 @@
|
||||
/**
|
||||
* Speech Services E2E Integration Tests
|
||||
*
|
||||
* Tests the full speech pipeline from API endpoints through to mocked external providers.
|
||||
* Covers REST transcription, synthesis, provider fallback, WebSocket streaming,
|
||||
* audio validation, file size limits, authentication, voice listing, and health checks.
|
||||
*
|
||||
* Uses NestJS testing module with supertest for HTTP testing and direct gateway
|
||||
* invocation for WebSocket streaming tests.
|
||||
*
|
||||
* Issue #405
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, beforeEach, afterAll, vi } from "vitest";
|
||||
import { Test } from "@nestjs/testing";
|
||||
import {
|
||||
type INestApplication,
|
||||
type CanActivate,
|
||||
type ExecutionContext,
|
||||
UnauthorizedException,
|
||||
ValidationPipe,
|
||||
} from "@nestjs/common";
|
||||
import request from "supertest";
|
||||
import type { App } from "supertest/types";
|
||||
|
||||
import { SpeechController } from "./speech.controller";
|
||||
import { SpeechService } from "./speech.service";
|
||||
import { SpeechGateway } from "./speech.gateway";
|
||||
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
|
||||
import { speechConfig } from "./speech.config";
|
||||
import type { SpeechConfig } from "./speech.config";
|
||||
import type { ISTTProvider } from "./interfaces/stt-provider.interface";
|
||||
import type { ITTSProvider } from "./interfaces/tts-provider.interface";
|
||||
import type {
|
||||
TranscriptionResult,
|
||||
SynthesisResult,
|
||||
VoiceInfo,
|
||||
SpeechTier,
|
||||
} from "./interfaces/speech-types";
|
||||
import { AuthGuard } from "../auth/guards/auth.guard";
|
||||
import { WorkspaceGuard, PermissionGuard } from "../common/guards";
|
||||
import { AuthService } from "../auth/auth.service";
|
||||
import { PrismaService } from "../prisma/prisma.service";
|
||||
|
||||
// ==========================================
|
||||
// Test Fixtures
|
||||
// ==========================================
|
||||
|
||||
/**
 * Placeholder upload payload: 1024 zero bytes.
 *
 * This is NOT a structurally valid WAV file (it contains no RIFF/WAVE header).
 * The tests in this file attach it with an explicitly declared content type and
 * the STT provider is mocked, so they do not rely on the bytes being decodable audio.
 */
const TEST_AUDIO_BUFFER = Buffer.alloc(1024, 0);
|
||||
|
||||
/** Workspace id used as the default by TestWorkspaceGuard and returned by the mocked membership lookup. */
const MOCK_WORKSPACE_ID = "550e8400-e29b-41d4-a716-446655440001";

/** User id shared by the HTTP test user and the mocked session / membership records. */
const MOCK_USER_ID = "550e8400-e29b-41d4-a716-446655440002";

/** User object that TestAuthGuard copies onto `req.user` for authenticated requests. */
const MOCK_USER = {
  id: MOCK_USER_ID,
  email: "test@example.com",
  name: "Test User",
  workspaceId: MOCK_WORKSPACE_ID,
};
|
||||
|
||||
/** Canned transcription that the mock STT provider resolves with. */
const MOCK_TRANSCRIPTION_RESULT: TranscriptionResult = {
  text: "Hello, this is a test transcription.",
  language: "en",
  durationSeconds: 3.2,
  confidence: 0.97,
  // A single segment spanning the whole utterance.
  segments: [
    {
      text: "Hello, this is a test transcription.",
      start: 0,
      end: 3.2,
      confidence: 0.97,
    },
  ],
};
|
||||
|
||||
/**
 * Canned synthesis result used as the base for every mock TTS provider.
 * The `tier` field is overridden per provider where this object is spread.
 */
const MOCK_SYNTHESIS_RESULT: SynthesisResult = {
  audio: Buffer.from("fake-synthesized-audio-data-mp3"),
  format: "mp3",
  voice: "af_heart",
  tier: "default",
  durationSeconds: 2.1,
};
|
||||
|
||||
/**
 * Voice catalogue shared by the mock TTS providers: two "default"-tier voices and
 * one "premium"-tier voice. No voice is defined for the "fallback" tier.
 */
const MOCK_VOICES: VoiceInfo[] = [
  {
    id: "af_heart",
    name: "Heart",
    language: "en",
    tier: "default",
    isDefault: true,
  },
  {
    id: "af_sky",
    name: "Sky",
    language: "en",
    tier: "default",
    isDefault: false,
  },
  {
    id: "chatterbox-default",
    name: "Chatterbox",
    language: "en",
    tier: "premium",
    isDefault: true,
  },
];
|
||||
|
||||
/** Speech configuration injected under `speechConfig.KEY`; every tier is enabled. */
const MOCK_SPEECH_CONFIG: SpeechConfig = {
  stt: {
    enabled: true,
    baseUrl: "http://speaches:8000/v1",
    model: "test-model",
    language: "en",
  },
  tts: {
    default: {
      enabled: true,
      url: "http://kokoro:8880/v1",
      voice: "af_heart",
      format: "mp3",
    },
    premium: {
      enabled: true,
      url: "http://chatterbox:8881/v1",
    },
    fallback: {
      enabled: true,
      url: "http://openedai:8000/v1",
    },
  },
  limits: {
    maxUploadSize: 25_000_000,
    maxDurationSeconds: 600,
    maxTextLength: 4096,
  },
};
|
||||
|
||||
// ==========================================
|
||||
// Mock Providers
|
||||
// ==========================================
|
||||
|
||||
/**
 * Builds an STT provider whose methods are vi.fn() stubs.
 *
 * @returns A provider named "mock-stt" that resolves `transcribe` with
 *          MOCK_TRANSCRIPTION_RESULT and `isHealthy` with `true`.
 */
function createMockSTTProvider(): ISTTProvider {
  const transcribe = vi.fn().mockResolvedValue(MOCK_TRANSCRIPTION_RESULT);
  const isHealthy = vi.fn().mockResolvedValue(true);

  return { name: "mock-stt", transcribe, isHealthy };
}
|
||||
|
||||
/**
 * Builds a TTS provider for one tier whose methods are vi.fn() stubs.
 *
 * @param tier - Tier the provider reports and stamps onto its synthesis result.
 * @param name - Provider name to expose.
 * @returns A provider that resolves `synthesize` with MOCK_SYNTHESIS_RESULT (with
 *          `tier` overridden), `listVoices` with the MOCK_VOICES entries of that
 *          tier, and `isHealthy` with `true`.
 */
function createMockTTSProvider(tier: SpeechTier, name: string): ITTSProvider {
  const tierVoices = MOCK_VOICES.filter((voice) => voice.tier === tier);
  const synthesisResult = { ...MOCK_SYNTHESIS_RESULT, tier };

  return {
    name,
    tier,
    synthesize: vi.fn().mockResolvedValue(synthesisResult),
    listVoices: vi.fn().mockResolvedValue(tierVoices),
    isHealthy: vi.fn().mockResolvedValue(true),
  };
}
|
||||
|
||||
// ==========================================
|
||||
// Test Guards
|
||||
// ==========================================
|
||||
|
||||
/**
 * Stand-in for the real AuthGuard.
 *
 * A request is authenticated when it carries either the header
 * `Authorization: Bearer test-token` or a `better-auth.session_token` cookie whose
 * value is `test-token`; a copy of MOCK_USER is then attached as `req.user`.
 *
 * Every other request, including one with a wrong token, is rejected with
 * UnauthorizedException("No authentication token provided").
 */
class TestAuthGuard implements CanActivate {
  canActivate(context: ExecutionContext): boolean {
    const httpRequest = context.switchToHttp().getRequest<{
      headers: Record<string, string | undefined>;
      user?: typeof MOCK_USER;
      cookies?: Record<string, string>;
    }>();

    const hasValidBearer = httpRequest.headers.authorization === "Bearer test-token";
    // `cookies` is only present when something has populated it on the request.
    const hasValidCookie = httpRequest.cookies?.["better-auth.session_token"] === "test-token";

    if (!hasValidBearer && !hasValidCookie) {
      throw new UnauthorizedException("No authentication token provided");
    }

    // Attach a copy so a test mutating req.user cannot affect the shared fixture.
    httpRequest.user = { ...MOCK_USER };
    return true;
  }
}
|
||||
|
||||
/**
 * Stand-in for the real WorkspaceGuard. It never rejects: it sets `req.workspace`
 * to the id from the `x-workspace-id` header, or to MOCK_WORKSPACE_ID when the
 * header is absent.
 */
class TestWorkspaceGuard implements CanActivate {
  canActivate(context: ExecutionContext): boolean {
    const httpRequest = context.switchToHttp().getRequest<{
      workspace?: { id: string };
      headers: Record<string, string | undefined>;
    }>();

    const id = httpRequest.headers["x-workspace-id"] ?? MOCK_WORKSPACE_ID;
    httpRequest.workspace = { id };

    return true;
  }
}
|
||||
|
||||
/**
 * Stand-in for the real PermissionGuard: grants access unconditionally so the
 * tests in this file are not affected by permission checks.
 */
class TestPermissionGuard implements CanActivate {
  canActivate(): boolean {
    // No inspection of the request; every caller is allowed through.
    return true;
  }
}
|
||||
|
||||
// ==========================================
|
||||
// Tests
|
||||
// ==========================================
|
||||
|
||||
describe("Speech Services E2E Integration", () => {
|
||||
let app: INestApplication;
let mockSTTProvider: ISTTProvider;
let defaultTTSProvider: ITTSProvider;
let premiumTTSProvider: ITTSProvider;
let fallbackTTSProvider: ITTSProvider;
let ttsProvidersMap: Map<SpeechTier, ITTSProvider>;

// Gateway instance that the WebSocket scenario invokes directly.
let speechGateway: SpeechGateway;

/**
 * Builds a single Nest application for the whole suite.
 *
 * The real SpeechController and SpeechService are wired to mock STT/TTS providers
 * and MOCK_SPEECH_CONFIG; AuthGuard, WorkspaceGuard and PermissionGuard are replaced
 * by the Test* guards; SpeechGateway is constructed with stubbed AuthService and
 * PrismaService.
 */
beforeAll(async () => {
  // Mock speech providers, one TTS provider per tier.
  mockSTTProvider = createMockSTTProvider();
  defaultTTSProvider = createMockTTSProvider("default", "mock-kokoro");
  premiumTTSProvider = createMockTTSProvider("premium", "mock-chatterbox");
  fallbackTTSProvider = createMockTTSProvider("fallback", "mock-piper");

  ttsProvidersMap = new Map<SpeechTier, ITTSProvider>([
    ["default", defaultTTSProvider],
    ["premium", premiumTTSProvider],
    ["fallback", fallbackTTSProvider],
  ]);

  // Stubs for the collaborators SpeechGateway is constructed with.
  const authServiceStub = {
    verifySession: vi.fn().mockResolvedValue({
      user: { id: MOCK_USER_ID, email: "test@example.com", name: "Test User" },
      session: { id: "test-session" },
    }),
  };
  const prismaStub = {
    workspaceMember: {
      findFirst: vi.fn().mockResolvedValue({
        userId: MOCK_USER_ID,
        workspaceId: MOCK_WORKSPACE_ID,
        role: "MEMBER",
      }),
    },
  };

  const moduleRef = await Test.createTestingModule({
    controllers: [SpeechController],
    providers: [
      SpeechService,
      { provide: speechConfig.KEY, useValue: MOCK_SPEECH_CONFIG },
      { provide: STT_PROVIDER, useValue: mockSTTProvider },
      { provide: TTS_PROVIDERS, useValue: ttsProvidersMap },
      // Gateway dependencies (not tested via HTTP but needed for DI)
      {
        provide: SpeechGateway,
        useFactory: (
          authService: AuthService,
          prisma: PrismaService,
          speechService: SpeechService,
          config: SpeechConfig
        ): SpeechGateway => new SpeechGateway(authService, prisma, speechService, config),
        inject: [AuthService, PrismaService, SpeechService, speechConfig.KEY],
      },
      { provide: AuthService, useValue: authServiceStub },
      { provide: PrismaService, useValue: prismaStub },
    ],
  })
    .overrideGuard(AuthGuard)
    .useClass(TestAuthGuard)
    .overrideGuard(WorkspaceGuard)
    .useClass(TestWorkspaceGuard)
    .overrideGuard(PermissionGuard)
    .useClass(TestPermissionGuard)
    .compile();

  app = moduleRef.createNestApplication();
  app.useGlobalPipes(new ValidationPipe({ transform: true, whitelist: true }));
  await app.init();

  // Capture the gateway for the WebSocket tests.
  // (The previously captured `mockSpeechService` reference was never read and has been removed.)
  speechGateway = moduleRef.get(SpeechGateway);
});
|
||||
|
||||
/**
 * Clears recorded mock calls and restores the happy-path stubs before each test,
 * because individual tests replace them (for example with mockRejectedValue).
 */
beforeEach(() => {
  // View a provider method as the vi.fn() mock it actually is, so it can be re-stubbed.
  const asMock = (fn: unknown): ReturnType<typeof vi.fn> => fn as ReturnType<typeof vi.fn>;
  const voicesForTier = (tier: SpeechTier): VoiceInfo[] =>
    MOCK_VOICES.filter((voice) => voice.tier === tier);

  vi.clearAllMocks();

  // Reset default mock behaviors
  asMock(mockSTTProvider.transcribe).mockResolvedValue(MOCK_TRANSCRIPTION_RESULT);

  const ttsByTier: Array<[SpeechTier, ITTSProvider]> = [
    ["default", defaultTTSProvider],
    ["premium", premiumTTSProvider],
    ["fallback", fallbackTTSProvider],
  ];
  for (const [tier, provider] of ttsByTier) {
    asMock(provider.synthesize).mockResolvedValue({ ...MOCK_SYNTHESIS_RESULT, tier });
  }

  asMock(defaultTTSProvider.listVoices).mockResolvedValue(voicesForTier("default"));
  asMock(premiumTTSProvider.listVoices).mockResolvedValue(voicesForTier("premium"));
  asMock(fallbackTTSProvider.listVoices).mockResolvedValue([]);
});
|
||||
|
||||
/** Shuts the Nest application down once the suite has finished. */
afterAll(async () => {
  // `app` is still unset if beforeAll failed before creating it.
  if (!app) {
    return;
  }
  await app.close();
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 1: REST Transcription
|
||||
// ==========================================
|
||||
describe("Scenario 1: REST Transcription (POST /speech/transcribe)", () => {
  /** Starts an authenticated POST /speech/transcribe request. */
  const postTranscribe = () =>
    request(app.getHttpServer() as App)
      .post("/speech/transcribe")
      .set("Authorization", "Bearer test-token");

  it("should transcribe an uploaded audio file and return the transcription result", async () => {
    const { body } = await postTranscribe()
      .attach("file", TEST_AUDIO_BUFFER, { filename: "test.wav", contentType: "audio/wav" })
      .expect(201);

    // The payload is wrapped in `data` and mirrors the mocked provider result.
    const { text, language, durationSeconds, confidence } = MOCK_TRANSCRIPTION_RESULT;
    expect(body).toHaveProperty("data");
    expect(body.data).toMatchObject({ text, language, durationSeconds, confidence });
    expect(body.data.segments).toBeDefined();
    expect(body.data.segments).toHaveLength(1);

    // The provider received the uploaded bytes together with the declared MIME type.
    expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
      expect.any(Buffer),
      expect.objectContaining({ mimeType: "audio/wav" })
    );
  });

  it("should pass optional transcription parameters to the service", async () => {
    const { body } = await postTranscribe()
      .attach("file", TEST_AUDIO_BUFFER, { filename: "test.mp3", contentType: "audio/mpeg" })
      .field("language", "fr")
      .field("model", "whisper-large-v3")
      .field("prompt", "Meeting transcript")
      .field("temperature", "0.3")
      .expect(201);

    expect(body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text);

    // `temperature` is sent as the string "0.3" and is expected to reach the provider as a number.
    const expectedOptions = {
      mimeType: "audio/mpeg",
      language: "fr",
      model: "whisper-large-v3",
      prompt: "Meeting transcript",
      temperature: 0.3,
    };
    expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
      expect.any(Buffer),
      expect.objectContaining(expectedOptions)
    );
  });

  it("should reject request without an audio file", async () => {
    const { body } = await postTranscribe().expect(400);

    expect(body).toHaveProperty("message");
  });
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 2: REST Synthesis
|
||||
// ==========================================
|
||||
describe("Scenario 2: REST Synthesis (POST /speech/synthesize)", () => {
  /** Sends an authenticated POST /speech/synthesize request with the given JSON body. */
  const postSynthesize = (payload: Record<string, unknown>) =>
    request(app.getHttpServer() as App)
      .post("/speech/synthesize")
      .set("Authorization", "Bearer test-token")
      .send(payload);

  it("should synthesize text and return audio binary response", async () => {
    const response = await postSynthesize({ text: "Hello, world!" }).expect(201);

    // Response should be binary audio
    expect(response.headers["content-type"]).toContain("audio/mpeg");
    expect(response.headers["content-disposition"]).toContain("attachment");
    expect(response.headers["content-disposition"]).toContain("speech.mp3");
    expect(response.body).toBeDefined();
    // Buffer.isBuffer() is sufficient on its own; the former extra `instanceof Buffer`
    // alternative could never change the outcome.
    expect(Buffer.isBuffer(response.body)).toBe(true);
  });

  // NOTE(review): despite its title, this test only checks the response headers. It does
  // not assert which options reached the provider, and the request does not send `tier`.
  it("should pass voice, speed, format, and tier options to the service", async () => {
    (defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
      audio: Buffer.from("wav-audio-data"),
      format: "wav",
      voice: "af_sky",
      tier: "default",
      durationSeconds: 1.5,
    });

    const response = await postSynthesize({
      text: "Test with options",
      voice: "af_sky",
      speed: 1.5,
      format: "wav",
    }).expect(201);

    // Headers follow the format reported by the provider result.
    expect(response.headers["content-type"]).toContain("audio/wav");
    expect(response.headers["content-disposition"]).toContain("speech.wav");
  });

  it("should accept empty text (validation delegated to service)", async () => {
    // The SynthesizeDto allows empty strings (no @IsNotEmpty decorator).
    // The service/provider handles empty text semantics.
    const response = await postSynthesize({ text: "" }).expect(201);

    expect(response.headers["content-type"]).toContain("audio/mpeg");
  });

  it("should reject missing text field", async () => {
    await postSynthesize({}).expect(400);
  });
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 3: Provider Fallback
|
||||
// ==========================================
|
||||
describe("Scenario 3: Provider Fallback", () => {
  /** Makes the given provider's synthesize() reject with an Error carrying `reason`. */
  const failSynthesis = (provider: ITTSProvider, reason: string): void => {
    (provider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(new Error(reason));
  };

  /** Makes the given provider's synthesize() resolve with `result`. */
  const succeedSynthesis = (provider: ITTSProvider, result: Record<string, unknown>): void => {
    (provider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue(result);
  };

  /** Sends an authenticated synthesis request that asks for the premium tier. */
  const synthesizePremium = (text: string) =>
    request(app.getHttpServer() as App)
      .post("/speech/synthesize")
      .set("Authorization", "Bearer test-token")
      .send({ text, tier: "premium" });

  it("should fall back from premium to default when premium fails", async () => {
    // Premium is down, default answers.
    failSynthesis(premiumTTSProvider, "Premium provider unavailable");
    succeedSynthesis(defaultTTSProvider, {
      audio: Buffer.from("fallback-audio"),
      format: "mp3",
      voice: "af_heart",
      tier: "default",
    });

    const response = await synthesizePremium("Fallback test").expect(201);

    // Premium was attempted first
    expect(premiumTTSProvider.synthesize).toHaveBeenCalled();
    // Then default succeeded
    expect(defaultTTSProvider.synthesize).toHaveBeenCalled();
    expect(response.headers["content-type"]).toContain("audio/mpeg");
  });

  it("should fall back through entire chain: premium -> default -> fallback", async () => {
    // Premium and default are down, only the fallback tier answers.
    failSynthesis(premiumTTSProvider, "Premium down");
    failSynthesis(defaultTTSProvider, "Default down");
    succeedSynthesis(fallbackTTSProvider, {
      audio: Buffer.from("fallback-piper-audio"),
      format: "mp3",
      voice: "piper-default",
      tier: "fallback",
    });

    const response = await synthesizePremium("Full fallback chain test").expect(201);

    expect(premiumTTSProvider.synthesize).toHaveBeenCalled();
    expect(defaultTTSProvider.synthesize).toHaveBeenCalled();
    expect(fallbackTTSProvider.synthesize).toHaveBeenCalled();
    expect(response.headers["content-type"]).toContain("audio/mpeg");
  });

  it("should return 503 when all TTS providers fail", async () => {
    failSynthesis(premiumTTSProvider, "Premium down");
    failSynthesis(defaultTTSProvider, "Default down");
    failSynthesis(fallbackTTSProvider, "Fallback down");

    const { body } = await synthesizePremium("All providers down").expect(503);

    expect(body).toHaveProperty("message");
    expect(body.message).toContain("All TTS providers failed");
  });
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 4: WebSocket Streaming Transcription
|
||||
// ==========================================
|
||||
describe("Scenario 4: WebSocket Streaming Transcription", () => {
  /** Minimal stand-in for the socket object handed to the gateway handlers. */
  interface MockSocket {
    id: string;
    join: ReturnType<typeof vi.fn>;
    leave: ReturnType<typeof vi.fn>;
    emit: ReturnType<typeof vi.fn>;
    disconnect: ReturnType<typeof vi.fn>;
    data: { userId?: string; workspaceId?: string };
    handshake: {
      auth: Record<string, unknown>;
      query: Record<string, unknown>;
      headers: Record<string, unknown>;
    };
  }

  /**
   * Creates a mock socket whose handshake carries `auth.token = "valid-token"`.
   * Any top-level property can be replaced through `overrides`.
   */
  function createTestSocket(overrides?: Partial<MockSocket>): MockSocket {
    const defaults: MockSocket = {
      id: "e2e-test-socket",
      join: vi.fn(),
      leave: vi.fn(),
      emit: vi.fn(),
      disconnect: vi.fn(),
      data: {},
      handshake: {
        auth: { token: "valid-token" },
        query: {},
        headers: {},
      },
    };
    return { ...defaults, ...overrides };
  }

  it("should complete the full streaming transcription lifecycle", async () => {
    const socket = createTestSocket();

    // Step 1: connect; the gateway authenticates the client and fills socket.data.
    await speechGateway.handleConnection(socket as never);

    expect(socket.data.userId).toBe(MOCK_USER_ID);
    expect(socket.data.workspaceId).toBe(MOCK_WORKSPACE_ID);
    expect(socket.disconnect).not.toHaveBeenCalled();

    // Step 2: open a transcription session; the socket id is reported as session id.
    speechGateway.handleStartTranscription(socket as never, { language: "en" });

    expect(socket.emit).toHaveBeenCalledWith(
      "transcription-started",
      expect.objectContaining({ sessionId: "e2e-test-socket" })
    );

    // Step 3: stream three audio chunks.
    const chunks = ["audio-data-chunk-1", "audio-data-chunk-2", "audio-data-chunk-3"].map(
      (payload) => Buffer.from(payload)
    );
    for (const chunk of chunks) {
      speechGateway.handleAudioChunk(socket as never, chunk);
    }

    // No errors should have been emitted for chunks
    const errorEmits = socket.emit.mock.calls.filter(
      ([eventName]: unknown[]) => eventName === "transcription-error"
    );
    expect(errorEmits).toHaveLength(0);

    // Forget the calls recorded so far so the assertions below only see the stop phase.
    vi.clearAllMocks();
    (mockSTTProvider.transcribe as ReturnType<typeof vi.fn>).mockResolvedValue(
      MOCK_TRANSCRIPTION_RESULT
    );

    // Step 4: stop; this should trigger the full transcription pipeline.
    await speechGateway.handleStopTranscription(socket as never);

    // The provider was invoked with a Buffer and the session language.
    expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
      expect.any(Buffer),
      expect.objectContaining({ language: "en" })
    );

    // The final result was emitted back to the client.
    const expectedFinal = {
      text: MOCK_TRANSCRIPTION_RESULT.text,
      language: "en",
      durationSeconds: 3.2,
      confidence: 0.97,
    };
    expect(socket.emit).toHaveBeenCalledWith(
      "transcription-final",
      expect.objectContaining(expectedFinal)
    );
  });

  it("should clean up session on disconnect", async () => {
    const socket = createTestSocket({ id: "disconnect-test" });
    await speechGateway.handleConnection(socket as never);

    // Open a session, push one chunk, then drop the connection.
    speechGateway.handleStartTranscription(socket as never, {});
    speechGateway.handleAudioChunk(socket as never, Buffer.from("data"));
    speechGateway.handleDisconnect(socket as never);

    // Trying to send more chunks should fail (session cleaned up)
    vi.clearAllMocks();
    speechGateway.handleAudioChunk(socket as never, Buffer.from("more-data"));

    expect(socket.emit).toHaveBeenCalledWith(
      "transcription-error",
      expect.objectContaining({
        message: expect.stringContaining("No active transcription session"),
      })
    );
  });

  it("should reject unauthenticated WebSocket clients", async () => {
    // Handshake without any token.
    const emptyHandshake = { auth: {}, query: {}, headers: {} };
    const socket = createTestSocket({ id: "unauth-ws-client", handshake: emptyHandshake });

    await speechGateway.handleConnection(socket as never);

    expect(socket.disconnect).toHaveBeenCalled();
    expect(socket.data.userId).toBeUndefined();
  });
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 5: Audio Validation (Invalid MIME Type)
|
||||
// ==========================================
|
||||
describe("Scenario 5: Audio Validation", () => {
  /** Uploads `content` to POST /speech/transcribe (authenticated) under the given name and MIME type. */
  const upload = (content: Buffer, filename: string, contentType: string) =>
    request(app.getHttpServer() as App)
      .post("/speech/transcribe")
      .set("Authorization", "Bearer test-token")
      .attach("file", content, { filename, contentType });

  it("should reject files with unsupported MIME types", async () => {
    const { body } = await upload(
      Buffer.from("not-audio"),
      "document.pdf",
      "application/pdf"
    ).expect(400);

    // The error names both the problem and the offending MIME type.
    expect(body).toHaveProperty("message");
    expect(body.message).toContain("Unsupported audio format");
    expect(body.message).toContain("application/pdf");
  });

  it("should reject files with text/plain MIME type", async () => {
    const { body } = await upload(
      Buffer.from("plain text content"),
      "notes.txt",
      "text/plain"
    ).expect(400);

    expect(body.message).toContain("Unsupported audio format");
  });

  it("should reject video MIME types", async () => {
    const { body } = await upload(Buffer.from("video-data"), "video.mp4", "video/mp4").expect(400);

    expect(body.message).toContain("Unsupported audio format");
  });

  it("should accept valid audio MIME types", async () => {
    // [MIME type, file extension] pairs, uploaded one after another.
    const accepted: Array<[string, string]> = [
      ["audio/wav", "wav"],
      ["audio/mpeg", "mp3"],
      ["audio/webm", "webm"],
      ["audio/ogg", "ogg"],
      ["audio/flac", "flac"],
    ];

    for (const [mime, ext] of accepted) {
      const { body } = await upload(TEST_AUDIO_BUFFER, `test.${ext}`, mime).expect(201);

      expect(body).toHaveProperty("data");
      expect(body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text);
    }
  });
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 6: File Size Limits
|
||||
// ==========================================
|
||||
describe("Scenario 6: File Size Limits", () => {
  /** Uploads `content` as an authenticated audio/wav file to POST /speech/transcribe. */
  const uploadWav = (content: Buffer, filename: string) =>
    request(app.getHttpServer() as App)
      .post("/speech/transcribe")
      .set("Authorization", "Bearer test-token")
      .attach("file", content, { filename, contentType: "audio/wav" });

  it("should reject files exceeding the maximum upload size (25 MB)", async () => {
    // One byte over the 25_000_000-byte limit set in MOCK_SPEECH_CONFIG.limits.maxUploadSize.
    const oversizedBuffer = Buffer.alloc(25_000_001, 0);

    const response = await uploadWav(oversizedBuffer, "large-audio.wav").expect(400);

    expect(response.body).toHaveProperty("message");
    expect(response.body.message).toContain("exceeds maximum allowed size");
  });

  it("should accept files within the size limit", async () => {
    // A 1 KiB payload, far below the limit. The exact-limit boundary (25_000_000 bytes)
    // is not exercised by this test.
    const smallBuffer = Buffer.alloc(1024, 0);

    const response = await uploadWav(smallBuffer, "acceptable-audio.wav").expect(201);

    expect(response.body).toHaveProperty("data");
  });
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 7: Authentication
|
||||
// ==========================================
|
||||
describe("Scenario 7: Authentication", () => {
  /** Fresh supertest agent bound to the application's HTTP server (no credentials set). */
  const api = () => request(app.getHttpServer() as App);

  it("should reject POST /speech/transcribe without authentication", async () => {
    const { body } = await api()
      .post("/speech/transcribe")
      .attach("file", TEST_AUDIO_BUFFER, { filename: "test.wav", contentType: "audio/wav" })
      .expect(401);

    expect(body).toHaveProperty("message");
    expect(body.message).toContain("No authentication token provided");
  });

  it("should reject POST /speech/synthesize without authentication", async () => {
    const { body } = await api().post("/speech/synthesize").send({ text: "Hello" }).expect(401);

    expect(body.message).toContain("No authentication token provided");
  });

  it("should reject GET /speech/voices without authentication", async () => {
    const { body } = await api().get("/speech/voices").expect(401);

    expect(body.message).toContain("No authentication token provided");
  });

  it("should reject GET /speech/health without authentication", async () => {
    const { body } = await api().get("/speech/health").expect(401);

    expect(body.message).toContain("No authentication token provided");
  });

  it("should reject requests with an invalid token", async () => {
    // TestAuthGuard throws the same message for a wrong token as for a missing one.
    const { body } = await api()
      .get("/speech/voices")
      .set("Authorization", "Bearer invalid-token-xyz")
      .expect(401);

    expect(body.message).toContain("No authentication token provided");
  });
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 8: Voice Listing
|
||||
// ==========================================
|
||||
describe("Scenario 8: Voice Listing (GET /speech/voices)", () => {
  /** Issues an authenticated GET against `path` and expects HTTP 200. */
  const getOk = (path: string) =>
    request(app.getHttpServer() as App)
      .get(path)
      .set("Authorization", "Bearer test-token")
      .expect(200);

  it("should return all voices when no tier filter is provided", async () => {
    const { body } = await getOk("/speech/voices");

    expect(body).toHaveProperty("data");
    expect(Array.isArray(body.data)).toBe(true);

    // Should have voices from all providers that returned voices
    const voices = body.data as VoiceInfo[];
    expect(voices.length).toBeGreaterThan(0);

    // Every entry exposes the basic voice fields.
    voices.forEach((voice) => {
      expect(voice).toHaveProperty("id");
      expect(voice).toHaveProperty("name");
      expect(voice).toHaveProperty("tier");
    });
  });

  it("should filter voices by tier when tier query param is provided", async () => {
    const { body } = await getOk("/speech/voices?tier=default");

    const voices = body.data as VoiceInfo[];
    expect(voices.length).toBeGreaterThan(0);

    voices.forEach((voice) => {
      expect(voice.tier).toBe("default");
    });

    expect(defaultTTSProvider.listVoices).toHaveBeenCalled();
  });

  it("should return empty array for tier with no voices", async () => {
    // The fallback provider's listVoices stub resolves with an empty list.
    const { body } = await getOk("/speech/voices?tier=fallback");

    expect(body.data).toEqual([]);
  });

  it("should include voice metadata (id, name, language, tier, isDefault)", async () => {
    const { body } = await getOk("/speech/voices?tier=default");

    const voices = body.data as VoiceInfo[];
    const flaggedDefault = voices.find((voice) => voice.isDefault === true);

    expect(flaggedDefault).toBeDefined();
    expect(flaggedDefault).toMatchObject({
      id: "af_heart",
      name: "Heart",
      language: "en",
      tier: "default",
      isDefault: true,
    });
  });
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 9: Health Check
|
||||
// ==========================================
|
||||
describe("Scenario 9: Health Check (GET /speech/health)", () => {
  /** Issues an authenticated GET /speech/health and expects HTTP 200. */
  const getHealth = () =>
    request(app.getHttpServer() as App)
      .get("/speech/health")
      .set("Authorization", "Bearer test-token")
      .expect(200);

  it("should return health status for both STT and TTS providers", async () => {
    const { body } = await getHealth();

    expect(body).toHaveProperty("data");
    expect(body.data).toHaveProperty("stt");
    expect(body.data).toHaveProperty("tts");

    expect(body.data.stt).toHaveProperty("available");
    expect(body.data.tts).toHaveProperty("available");

    // Both should be available since we have mock providers registered and config enabled
    expect(body.data.stt.available).toBe(true);
    expect(body.data.tts.available).toBe(true);
  });

  it("should return consistent health check format", async () => {
    const { body } = await getHealth();

    // Exact shape: nothing besides `stt.available` and `tts.available` under `data`.
    const expectedShape = {
      data: {
        stt: { available: expect.any(Boolean) },
        tts: { available: expect.any(Boolean) },
      },
    };
    expect(body).toEqual(expectedShape);
  });
});
|
||||
});
|
||||
Reference in New Issue
Block a user