feat: M13-SpeechServices — TTS & STT integration #409
933
apps/api/src/speech/speech.integration.spec.ts
Normal file
933
apps/api/src/speech/speech.integration.spec.ts
Normal file
@@ -0,0 +1,933 @@
|
|||||||
|
/**
|
||||||
|
* Speech Services E2E Integration Tests
|
||||||
|
*
|
||||||
|
* Tests the full speech pipeline from API endpoints through to mocked external providers.
|
||||||
|
* Covers REST transcription, synthesis, provider fallback, WebSocket streaming,
|
||||||
|
* audio validation, file size limits, authentication, voice listing, and health checks.
|
||||||
|
*
|
||||||
|
* Uses NestJS testing module with supertest for HTTP testing and direct gateway
|
||||||
|
* invocation for WebSocket streaming tests.
|
||||||
|
*
|
||||||
|
* Issue #405
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, it, expect, beforeAll, beforeEach, afterAll, vi } from "vitest";
|
||||||
|
import { Test } from "@nestjs/testing";
|
||||||
|
import {
|
||||||
|
type INestApplication,
|
||||||
|
type CanActivate,
|
||||||
|
type ExecutionContext,
|
||||||
|
UnauthorizedException,
|
||||||
|
ValidationPipe,
|
||||||
|
} from "@nestjs/common";
|
||||||
|
import request from "supertest";
|
||||||
|
import type { App } from "supertest/types";
|
||||||
|
|
||||||
|
import { SpeechController } from "./speech.controller";
|
||||||
|
import { SpeechService } from "./speech.service";
|
||||||
|
import { SpeechGateway } from "./speech.gateway";
|
||||||
|
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
|
||||||
|
import { speechConfig } from "./speech.config";
|
||||||
|
import type { SpeechConfig } from "./speech.config";
|
||||||
|
import type { ISTTProvider } from "./interfaces/stt-provider.interface";
|
||||||
|
import type { ITTSProvider } from "./interfaces/tts-provider.interface";
|
||||||
|
import type {
|
||||||
|
TranscriptionResult,
|
||||||
|
SynthesisResult,
|
||||||
|
VoiceInfo,
|
||||||
|
SpeechTier,
|
||||||
|
} from "./interfaces/speech-types";
|
||||||
|
import { AuthGuard } from "../auth/guards/auth.guard";
|
||||||
|
import { WorkspaceGuard, PermissionGuard } from "../common/guards";
|
||||||
|
import { AuthService } from "../auth/auth.service";
|
||||||
|
import { PrismaService } from "../prisma/prisma.service";
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Test Fixtures
|
||||||
|
// ==========================================
|
||||||
|
|
||||||
|
/**
 * Placeholder audio payload: 1024 zero-filled bytes.
 * It contains no WAV header and is not decodable audio. Requests declare the
 * MIME type explicitly via `contentType`, and the STT provider is mocked.
 */
|
||||||
|
const TEST_AUDIO_BUFFER = Buffer.alloc(1024, 0);

// Fixed identifiers reused by the fixtures, guards, and assertions in this file.
const MOCK_WORKSPACE_ID = "550e8400-e29b-41d4-a716-446655440001";
const MOCK_USER_ID = "550e8400-e29b-41d4-a716-446655440002";

// User fixture; its workspaceId points at MOCK_WORKSPACE_ID.
const MOCK_USER = {
  id: MOCK_USER_ID,
  email: "test@example.com",
  name: "Test User",
  workspaceId: MOCK_WORKSPACE_ID,
};
|
||||||
|
|
||||||
|
// Canned transcription: a single segment that spans the whole 3.2 s result.
const MOCK_TRANSCRIPTION_RESULT: TranscriptionResult = {
  text: "Hello, this is a test transcription.",
  language: "en",
  durationSeconds: 3.2,
  confidence: 0.97,
  segments: [
    {
      text: "Hello, this is a test transcription.",
      start: 0,
      end: 3.2,
      confidence: 0.97,
    },
  ],
};
|
||||||
|
|
||||||
|
// Canned synthesis result. The `SynthesisResult` annotation type-checks every
// field (including `tier`), so no `as SpeechTier` assertion is needed.
const MOCK_SYNTHESIS_RESULT: SynthesisResult = {
  audio: Buffer.from("fake-synthesized-audio-data-mp3"),
  format: "mp3",
  voice: "af_heart",
  tier: "default",
  durationSeconds: 2.1,
};
|
||||||
|
|
||||||
|
// Voice catalogue: two "default"-tier voices, one "premium", none for "fallback".
const MOCK_VOICES: VoiceInfo[] = [
  // default tier
  { id: "af_heart", name: "Heart", language: "en", tier: "default", isDefault: true },
  { id: "af_sky", name: "Sky", language: "en", tier: "default", isDefault: false },
  // premium tier
  {
    id: "chatterbox-default",
    name: "Chatterbox",
    language: "en",
    tier: "premium",
    isDefault: true,
  },
];
|
||||||
|
|
||||||
|
// Speech configuration injected under `speechConfig.KEY`: STT and all three
// TTS tiers enabled, with a 25,000,000-byte upload limit.
const MOCK_SPEECH_CONFIG: SpeechConfig = {
  stt: {
    enabled: true,
    baseUrl: "http://speaches:8000/v1",
    model: "test-model",
    language: "en",
  },
  tts: {
    default: {
      enabled: true,
      url: "http://kokoro:8880/v1",
      voice: "af_heart",
      format: "mp3",
    },
    premium: {
      enabled: true,
      url: "http://chatterbox:8881/v1",
    },
    fallback: {
      enabled: true,
      url: "http://openedai:8000/v1",
    },
  },
  limits: {
    maxUploadSize: 25_000_000,
    maxDurationSeconds: 600,
    maxTextLength: 4096,
  },
};
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Mock Providers
|
||||||
|
// ==========================================
|
||||||
|
|
||||||
|
/**
 * Builds an STT provider double named "mock-stt".
 *
 * @returns A provider whose `transcribe` mock resolves to
 *   MOCK_TRANSCRIPTION_RESULT and whose `isHealthy` mock resolves to `true`.
 */
function createMockSTTProvider(): ISTTProvider {
  const transcribe = vi.fn().mockResolvedValue(MOCK_TRANSCRIPTION_RESULT);
  const isHealthy = vi.fn().mockResolvedValue(true);

  return { name: "mock-stt", transcribe, isHealthy };
}
|
||||||
|
|
||||||
|
/**
 * Builds a TTS provider double for one tier.
 *
 * @param tier - Tier reported by the provider and stamped onto its synthesis result.
 * @param name - Provider name exposed on the returned object.
 * @returns A provider whose `synthesize` mock resolves to MOCK_SYNTHESIS_RESULT
 *   with `tier` overridden, whose `listVoices` mock resolves to the MOCK_VOICES
 *   entries of that tier, and whose `isHealthy` mock resolves to `true`.
 */
function createMockTTSProvider(tier: SpeechTier, name: string): ITTSProvider {
  const tierVoices = MOCK_VOICES.filter((voice) => voice.tier === tier);
  const synthesisResult = { ...MOCK_SYNTHESIS_RESULT, tier };

  return {
    name,
    tier,
    synthesize: vi.fn().mockResolvedValue(synthesisResult),
    listVoices: vi.fn().mockResolvedValue(tierVoices),
    isHealthy: vi.fn().mockResolvedValue(true),
  };
}
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Test Guards
|
||||||
|
// ==========================================
|
||||||
|
|
||||||
|
/**
 * Conditional auth guard for testing.
 *
 * A request is accepted when it carries either the header
 * `Authorization: Bearer test-token` or a `better-auth.session_token` cookie
 * equal to `test-token`; a copy of MOCK_USER is then attached as `req.user`.
 * Every other request, whether the credential is missing or simply wrong, is
 * rejected with UnauthorizedException("No authentication token provided").
 */
class TestAuthGuard implements CanActivate {
  canActivate(context: ExecutionContext): boolean {
    const req = context.switchToHttp().getRequest<{
      headers: Record<string, string | undefined>;
      user?: typeof MOCK_USER;
      cookies?: Record<string, string>;
    }>();

    const hasBearerToken = req.headers.authorization === "Bearer test-token";
    const hasSessionCookie = req.cookies?.["better-auth.session_token"] === "test-token";

    if (!hasBearerToken && !hasSessionCookie) {
      throw new UnauthorizedException("No authentication token provided");
    }

    // Attach a shallow copy rather than the shared fixture object itself.
    req.user = { ...MOCK_USER };
    return true;
  }
}
|
||||||
|
|
||||||
|
/**
 * Test workspace guard that attaches a mock workspace to the request.
 *
 * The workspace id is taken from the `x-workspace-id` request header when
 * present, otherwise MOCK_WORKSPACE_ID is used. The guard never rejects.
 */
class TestWorkspaceGuard implements CanActivate {
  canActivate(context: ExecutionContext): boolean {
    const req = context.switchToHttp().getRequest<{
      workspace?: { id: string };
      headers: Record<string, string | undefined>;
    }>();

    // `??` already narrows the header lookup to `string`; no type assertion needed.
    const workspaceId = req.headers["x-workspace-id"] ?? MOCK_WORKSPACE_ID;
    req.workspace = { id: workspaceId };
    return true;
  }
}
|
||||||
|
|
||||||
|
/**
 * Permission guard substitute for tests: grants access unconditionally and
 * does not inspect the execution context.
 */
class TestPermissionGuard implements CanActivate {
  canActivate(): boolean {
    const allowed = true;
    return allowed;
  }
}
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Tests
|
||||||
|
// ==========================================
|
||||||
|
|
||||||
|
describe("Speech Services E2E Integration", () => {
|
||||||
|
let app: INestApplication;
|
||||||
|
let mockSTTProvider: ISTTProvider;
|
||||||
|
let defaultTTSProvider: ITTSProvider;
|
||||||
|
let premiumTTSProvider: ITTSProvider;
|
||||||
|
let fallbackTTSProvider: ITTSProvider;
|
||||||
|
let ttsProvidersMap: Map<SpeechTier, ITTSProvider>;
|
||||||
|
|
||||||
|
// WebSocket gateway test dependencies
|
||||||
|
let speechGateway: SpeechGateway;
|
||||||
|
let mockSpeechService: SpeechService;
|
||||||
|
|
||||||
|
beforeAll(async () => {
  // One STT double and one TTS double per tier, shared by every test below.
  mockSTTProvider = createMockSTTProvider();
  defaultTTSProvider = createMockTTSProvider("default", "mock-kokoro");
  premiumTTSProvider = createMockTTSProvider("premium", "mock-chatterbox");
  fallbackTTSProvider = createMockTTSProvider("fallback", "mock-piper");

  ttsProvidersMap = new Map<SpeechTier, ITTSProvider>();
  ttsProvidersMap.set("default", defaultTTSProvider);
  ttsProvidersMap.set("premium", premiumTTSProvider);
  ttsProvidersMap.set("fallback", fallbackTTSProvider);

  // Stand-ins for the services SpeechGateway is constructed with.
  const authServiceStub = {
    verifySession: vi.fn().mockResolvedValue({
      user: { id: MOCK_USER_ID, email: "test@example.com", name: "Test User" },
      session: { id: "test-session" },
    }),
  };
  const prismaStub = {
    workspaceMember: {
      findFirst: vi.fn().mockResolvedValue({
        userId: MOCK_USER_ID,
        workspaceId: MOCK_WORKSPACE_ID,
        role: "MEMBER",
      }),
    },
  };

  const moduleRef = await Test.createTestingModule({
    controllers: [SpeechController],
    providers: [
      SpeechService,
      { provide: speechConfig.KEY, useValue: MOCK_SPEECH_CONFIG },
      { provide: STT_PROVIDER, useValue: mockSTTProvider },
      { provide: TTS_PROVIDERS, useValue: ttsProvidersMap },
      // Gateway dependencies (not tested via HTTP but needed for DI).
      {
        provide: SpeechGateway,
        useFactory: (
          authService: AuthService,
          prisma: PrismaService,
          speechService: SpeechService,
          config: SpeechConfig
        ): SpeechGateway => new SpeechGateway(authService, prisma, speechService, config),
        inject: [AuthService, PrismaService, SpeechService, speechConfig.KEY],
      },
      { provide: AuthService, useValue: authServiceStub },
      { provide: PrismaService, useValue: prismaStub },
    ],
  })
    // Swap the production guards for the test doubles declared in this file.
    .overrideGuard(AuthGuard)
    .useClass(TestAuthGuard)
    .overrideGuard(WorkspaceGuard)
    .useClass(TestWorkspaceGuard)
    .overrideGuard(PermissionGuard)
    .useClass(TestPermissionGuard)
    .compile();

  app = moduleRef.createNestApplication();
  app.useGlobalPipes(new ValidationPipe({ transform: true, whitelist: true }));
  await app.init();

  // Capture references for WebSocket tests, which call the gateway directly.
  speechGateway = moduleRef.get(SpeechGateway);
  mockSpeechService = moduleRef.get(SpeechService);
});
|
||||||
|
|
||||||
|
beforeEach(() => {
  vi.clearAllMocks();

  // Views a provider method as the vi.fn() mock it was created as.
  const asMock = (fn: unknown) => fn as ReturnType<typeof vi.fn>;

  // Re-install the default resolved values; individual tests override them
  // (for example with mockRejectedValue) and must not leak into the next test.
  asMock(mockSTTProvider.transcribe).mockResolvedValue(MOCK_TRANSCRIPTION_RESULT);

  const providersByTier: Array<[SpeechTier, ITTSProvider]> = [
    ["default", defaultTTSProvider],
    ["premium", premiumTTSProvider],
    ["fallback", fallbackTTSProvider],
  ];
  for (const [tier, provider] of providersByTier) {
    asMock(provider.synthesize).mockResolvedValue({ ...MOCK_SYNTHESIS_RESULT, tier });
  }

  asMock(defaultTTSProvider.listVoices).mockResolvedValue(
    MOCK_VOICES.filter((voice) => voice.tier === "default")
  );
  asMock(premiumTTSProvider.listVoices).mockResolvedValue(
    MOCK_VOICES.filter((voice) => voice.tier === "premium")
  );
  asMock(fallbackTTSProvider.listVoices).mockResolvedValue([]);
});
|
||||||
|
|
||||||
|
afterAll(async () => {
  // `app` is only assigned inside beforeAll, so it may be unset if setup threw.
  if (!app) {
    return;
  }
  await app.close();
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Scenario 1: REST Transcription
|
||||||
|
// ==========================================
|
||||||
|
describe("Scenario 1: REST Transcription (POST /speech/transcribe)", () => {
  // Starts an authenticated POST to the endpoint; each test adds its own file and fields.
  const postTranscribe = () =>
    request(app.getHttpServer() as App)
      .post("/speech/transcribe")
      .set("Authorization", "Bearer test-token");

  it("should transcribe an uploaded audio file and return the transcription result", async () => {
    const response = await postTranscribe()
      .attach("file", TEST_AUDIO_BUFFER, { filename: "test.wav", contentType: "audio/wav" })
      .expect(201);

    const { text, language, durationSeconds, confidence } = MOCK_TRANSCRIPTION_RESULT;

    expect(response.body).toHaveProperty("data");
    expect(response.body.data).toMatchObject({ text, language, durationSeconds, confidence });
    expect(response.body.data.segments).toBeDefined();
    expect(response.body.data.segments).toHaveLength(1);

    // The provider must receive a Buffer plus the MIME type declared on the upload.
    expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
      expect.any(Buffer),
      expect.objectContaining({ mimeType: "audio/wav" })
    );
  });

  it("should pass optional transcription parameters to the service", async () => {
    const response = await postTranscribe()
      .attach("file", TEST_AUDIO_BUFFER, { filename: "test.mp3", contentType: "audio/mpeg" })
      .field("language", "fr")
      .field("model", "whisper-large-v3")
      .field("prompt", "Meeting transcript")
      .field("temperature", "0.3")
      .expect(201);

    expect(response.body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text);

    // `temperature` is sent as the string "0.3" and expected at the provider as the number 0.3.
    const expectedOptions = {
      mimeType: "audio/mpeg",
      language: "fr",
      model: "whisper-large-v3",
      prompt: "Meeting transcript",
      temperature: 0.3,
    };
    expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
      expect.any(Buffer),
      expect.objectContaining(expectedOptions)
    );
  });

  it("should reject request without an audio file", async () => {
    const response = await postTranscribe().expect(400);

    expect(response.body).toHaveProperty("message");
  });
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Scenario 2: REST Synthesis
|
||||||
|
// ==========================================
|
||||||
|
describe("Scenario 2: REST Synthesis (POST /speech/synthesize)", () => {
  it("should synthesize text and return audio binary response", async () => {
    const response = await request(app.getHttpServer() as App)
      .post("/speech/synthesize")
      .set("Authorization", "Bearer test-token")
      .send({ text: "Hello, world!" })
      .expect(201);

    // Response should be binary audio served as a downloadable attachment.
    expect(response.headers["content-type"]).toContain("audio/mpeg");
    expect(response.headers["content-disposition"]).toContain("attachment");
    expect(response.headers["content-disposition"]).toContain("speech.mp3");
    expect(response.body).toBeDefined();
    // Buffer.isBuffer already covers the `instanceof Buffer` case.
    expect(Buffer.isBuffer(response.body)).toBe(true);
  });

  // NOTE(review): the title mentions tier and option pass-through, but no tier is
  // sent and only the response headers are asserted. Consider asserting the
  // arguments of `defaultTTSProvider.synthesize` once its signature is confirmed.
  it("should pass voice, speed, format, and tier options to the service", async () => {
    (defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
      audio: Buffer.from("wav-audio-data"),
      format: "wav",
      voice: "af_sky",
      tier: "default",
      durationSeconds: 1.5,
    });

    const response = await request(app.getHttpServer() as App)
      .post("/speech/synthesize")
      .set("Authorization", "Bearer test-token")
      .send({
        text: "Test with options",
        voice: "af_sky",
        speed: 1.5,
        format: "wav",
      })
      .expect(201);

    // Headers follow the format reported in the provider's result ("wav").
    expect(response.headers["content-type"]).toContain("audio/wav");
    expect(response.headers["content-disposition"]).toContain("speech.wav");
  });

  it("should accept empty text (validation delegated to service)", async () => {
    // NOTE(review): relies on SynthesizeDto accepting empty strings (i.e. no
    // @IsNotEmpty decorator); the DTO is not visible from this file.
    const response = await request(app.getHttpServer() as App)
      .post("/speech/synthesize")
      .set("Authorization", "Bearer test-token")
      .send({ text: "" })
      .expect(201);

    expect(response.headers["content-type"]).toContain("audio/mpeg");
  });

  it("should reject missing text field", async () => {
    await request(app.getHttpServer() as App)
      .post("/speech/synthesize")
      .set("Authorization", "Bearer test-token")
      .send({})
      .expect(400);
  });
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Scenario 3: Provider Fallback
|
||||||
|
// ==========================================
|
||||||
|
describe("Scenario 3: Provider Fallback", () => {
  // Views a provider method as the vi.fn() mock it was created as.
  const asMock = (fn: unknown) => fn as ReturnType<typeof vi.fn>;

  // Authenticated synthesis request that explicitly asks for the premium tier.
  const requestPremiumSynthesis = (text: string) =>
    request(app.getHttpServer() as App)
      .post("/speech/synthesize")
      .set("Authorization", "Bearer test-token")
      .send({ text, tier: "premium" });

  it("should fall back from premium to default when premium fails", async () => {
    // Premium fails, default succeeds.
    asMock(premiumTTSProvider.synthesize).mockRejectedValue(
      new Error("Premium provider unavailable")
    );
    asMock(defaultTTSProvider.synthesize).mockResolvedValue({
      audio: Buffer.from("fallback-audio"),
      format: "mp3",
      voice: "af_heart",
      tier: "default",
    });

    const response = await requestPremiumSynthesis("Fallback test").expect(201);

    // Premium was attempted first, then default succeeded.
    expect(premiumTTSProvider.synthesize).toHaveBeenCalled();
    expect(defaultTTSProvider.synthesize).toHaveBeenCalled();
    // The chain must stop at the first success: the last tier stays untouched.
    expect(fallbackTTSProvider.synthesize).not.toHaveBeenCalled();
    expect(response.headers["content-type"]).toContain("audio/mpeg");
  });

  it("should fall back through entire chain: premium -> default -> fallback", async () => {
    // Premium and default fail, fallback succeeds.
    asMock(premiumTTSProvider.synthesize).mockRejectedValue(new Error("Premium down"));
    asMock(defaultTTSProvider.synthesize).mockRejectedValue(new Error("Default down"));
    asMock(fallbackTTSProvider.synthesize).mockResolvedValue({
      audio: Buffer.from("fallback-piper-audio"),
      format: "mp3",
      voice: "piper-default",
      tier: "fallback",
    });

    const response = await requestPremiumSynthesis("Full fallback chain test").expect(201);

    expect(premiumTTSProvider.synthesize).toHaveBeenCalled();
    expect(defaultTTSProvider.synthesize).toHaveBeenCalled();
    expect(fallbackTTSProvider.synthesize).toHaveBeenCalled();
    expect(response.headers["content-type"]).toContain("audio/mpeg");
  });

  it("should return 503 when all TTS providers fail", async () => {
    asMock(premiumTTSProvider.synthesize).mockRejectedValue(new Error("Premium down"));
    asMock(defaultTTSProvider.synthesize).mockRejectedValue(new Error("Default down"));
    asMock(fallbackTTSProvider.synthesize).mockRejectedValue(new Error("Fallback down"));

    const response = await requestPremiumSynthesis("All providers down").expect(503);

    expect(response.body).toHaveProperty("message");
    expect(response.body.message).toContain("All TTS providers failed");
  });
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Scenario 4: WebSocket Streaming Transcription
|
||||||
|
// ==========================================
|
||||||
|
describe("Scenario 4: WebSocket Streaming Transcription", () => {
  /** Shape of the socket double handed to the gateway handlers in these tests. */
  interface MockSocket {
    id: string;
    join: ReturnType<typeof vi.fn>;
    leave: ReturnType<typeof vi.fn>;
    emit: ReturnType<typeof vi.fn>;
    disconnect: ReturnType<typeof vi.fn>;
    data: { userId?: string; workspaceId?: string };
    handshake: {
      auth: Record<string, unknown>;
      query: Record<string, unknown>;
      headers: Record<string, unknown>;
    };
  }

  /**
   * Builds a socket double whose handshake carries `auth.token = "valid-token"`.
   * Top-level fields given in `overrides` replace the defaults wholesale.
   */
  function createTestSocket(overrides?: Partial<MockSocket>): MockSocket {
    const defaults: MockSocket = {
      id: "e2e-test-socket",
      join: vi.fn(),
      leave: vi.fn(),
      emit: vi.fn(),
      disconnect: vi.fn(),
      data: {},
      handshake: {
        auth: { token: "valid-token" },
        query: {},
        headers: {},
      },
    };
    return { ...defaults, ...overrides };
  }

  it("should complete the full streaming transcription lifecycle", async () => {
    const client = createTestSocket();

    // 1. Connect: the gateway should populate client.data and keep the socket open.
    await speechGateway.handleConnection(client as never);

    expect(client.data.userId).toBe(MOCK_USER_ID);
    expect(client.data.workspaceId).toBe(MOCK_WORKSPACE_ID);
    expect(client.disconnect).not.toHaveBeenCalled();

    // 2. Start a session; the session id echoed back is the socket id.
    speechGateway.handleStartTranscription(client as never, { language: "en" });

    expect(client.emit).toHaveBeenCalledWith(
      "transcription-started",
      expect.objectContaining({ sessionId: "e2e-test-socket" })
    );

    // 3. Stream three chunks.
    const chunks = [
      Buffer.from("audio-data-chunk-1"),
      Buffer.from("audio-data-chunk-2"),
      Buffer.from("audio-data-chunk-3"),
    ];
    for (const chunk of chunks) {
      speechGateway.handleAudioChunk(client as never, chunk);
    }

    // None of the chunks may have produced an error event.
    const emittedErrors = client.emit.mock.calls.filter(
      (args: unknown[]) => args[0] === "transcription-error"
    );
    expect(emittedErrors).toHaveLength(0);

    // Drop everything recorded so far so the assertions below only see what
    // stopping the session triggers, then restore the STT default.
    vi.clearAllMocks();
    (mockSTTProvider.transcribe as ReturnType<typeof vi.fn>).mockResolvedValue(
      MOCK_TRANSCRIPTION_RESULT
    );

    // 4. Stop: this should run the transcription pipeline.
    await speechGateway.handleStopTranscription(client as never);

    // The provider is called with a Buffer and the language chosen at start.
    expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
      expect.any(Buffer),
      expect.objectContaining({ language: "en" })
    );

    // The final result is emitted back to the client.
    expect(client.emit).toHaveBeenCalledWith(
      "transcription-final",
      expect.objectContaining({
        text: MOCK_TRANSCRIPTION_RESULT.text,
        language: "en",
        durationSeconds: 3.2,
        confidence: 0.97,
      })
    );
  });

  it("should clean up session on disconnect", async () => {
    const client = createTestSocket({ id: "disconnect-test" });
    await speechGateway.handleConnection(client as never);

    speechGateway.handleStartTranscription(client as never, {});
    speechGateway.handleAudioChunk(client as never, Buffer.from("data"));

    speechGateway.handleDisconnect(client as never);

    // With the session gone, a further chunk must be answered with an error.
    vi.clearAllMocks();
    speechGateway.handleAudioChunk(client as never, Buffer.from("more-data"));

    expect(client.emit).toHaveBeenCalledWith(
      "transcription-error",
      expect.objectContaining({
        message: expect.stringContaining("No active transcription session"),
      })
    );
  });

  it("should reject unauthenticated WebSocket clients", async () => {
    // Handshake without any token in auth, query, or headers.
    const emptyHandshake = { auth: {}, query: {}, headers: {} };
    const client = createTestSocket({ id: "unauth-ws-client", handshake: emptyHandshake });

    await speechGateway.handleConnection(client as never);

    expect(client.disconnect).toHaveBeenCalled();
    expect(client.data.userId).toBeUndefined();
  });
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Scenario 5: Audio Validation (Invalid MIME Type)
|
||||||
|
// ==========================================
|
||||||
|
describe("Scenario 5: Audio Validation", () => {
  // Authenticated upload of `content` under the given filename and declared MIME type.
  const upload = (content: Buffer, filename: string, contentType: string) =>
    request(app.getHttpServer() as App)
      .post("/speech/transcribe")
      .set("Authorization", "Bearer test-token")
      .attach("file", content, { filename, contentType });

  it("should reject files with unsupported MIME types", async () => {
    const response = await upload(
      Buffer.from("not-audio"),
      "document.pdf",
      "application/pdf"
    ).expect(400);

    // The error message names the offending MIME type.
    expect(response.body).toHaveProperty("message");
    expect(response.body.message).toContain("Unsupported audio format");
    expect(response.body.message).toContain("application/pdf");
  });

  it("should reject files with text/plain MIME type", async () => {
    const response = await upload(
      Buffer.from("plain text content"),
      "notes.txt",
      "text/plain"
    ).expect(400);

    expect(response.body.message).toContain("Unsupported audio format");
  });

  it("should reject video MIME types", async () => {
    const response = await upload(Buffer.from("video-data"), "video.mp4", "video/mp4").expect(400);

    expect(response.body.message).toContain("Unsupported audio format");
  });

  it("should accept valid audio MIME types", async () => {
    const acceptedFormats = [
      { mime: "audio/wav", ext: "wav" },
      { mime: "audio/mpeg", ext: "mp3" },
      { mime: "audio/webm", ext: "webm" },
      { mime: "audio/ogg", ext: "ogg" },
      { mime: "audio/flac", ext: "flac" },
    ];

    // Requests run one after another so a failure points at a single format.
    for (const { mime, ext } of acceptedFormats) {
      const response = await upload(TEST_AUDIO_BUFFER, `test.${ext}`, mime).expect(201);

      expect(response.body).toHaveProperty("data");
      expect(response.body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text);
    }
  });
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Scenario 6: File Size Limits
|
||||||
|
// ==========================================
|
||||||
|
describe("Scenario 6: File Size Limits", () => {
  it("should reject files exceeding the maximum upload size (25 MB)", async () => {
    // One byte over the configured limit, derived from the config so the test
    // keeps tracking the limit if the fixture changes.
    const oversizedBuffer = Buffer.alloc(MOCK_SPEECH_CONFIG.limits.maxUploadSize + 1, 0);

    const response = await request(app.getHttpServer() as App)
      .post("/speech/transcribe")
      .set("Authorization", "Bearer test-token")
      .attach("file", oversizedBuffer, {
        filename: "large-audio.wav",
        contentType: "audio/wav",
      })
      .expect(400);

    expect(response.body).toHaveProperty("message");
    expect(response.body.message).toContain("exceeds maximum allowed size");
  });

  it("should accept files within the size limit", async () => {
    // A 1 KiB upload, far below the limit (this is not a boundary test).
    const smallBuffer = Buffer.alloc(1024, 0);

    const response = await request(app.getHttpServer() as App)
      .post("/speech/transcribe")
      .set("Authorization", "Bearer test-token")
      .attach("file", smallBuffer, {
        filename: "acceptable-audio.wav",
        contentType: "audio/wav",
      })
      .expect(201);

    expect(response.body).toHaveProperty("data");
  });
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Scenario 7: Authentication
|
||||||
|
// ==========================================
|
||||||
|
describe("Scenario 7: Authentication", () => {
  // Message asserted for every rejected request in this scenario, including
  // the one that presents an unrecognised token.
  const rejectionMessage = "No authentication token provided";

  const server = () => app.getHttpServer() as App;

  it("should reject POST /speech/transcribe without authentication", async () => {
    const response = await request(server())
      .post("/speech/transcribe")
      .attach("file", TEST_AUDIO_BUFFER, { filename: "test.wav", contentType: "audio/wav" })
      .expect(401);

    expect(response.body).toHaveProperty("message");
    expect(response.body.message).toContain(rejectionMessage);
  });

  it("should reject POST /speech/synthesize without authentication", async () => {
    const response = await request(server())
      .post("/speech/synthesize")
      .send({ text: "Hello" })
      .expect(401);

    expect(response.body.message).toContain(rejectionMessage);
  });

  it("should reject GET /speech/voices without authentication", async () => {
    const response = await request(server()).get("/speech/voices").expect(401);

    expect(response.body.message).toContain(rejectionMessage);
  });

  it("should reject GET /speech/health without authentication", async () => {
    const response = await request(server()).get("/speech/health").expect(401);

    expect(response.body.message).toContain(rejectionMessage);
  });

  it("should reject requests with an invalid token", async () => {
    const response = await request(server())
      .get("/speech/voices")
      .set("Authorization", "Bearer invalid-token-xyz")
      .expect(401);

    expect(response.body.message).toContain(rejectionMessage);
  });
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Scenario 8: Voice Listing
|
||||||
|
// ==========================================
|
||||||
|
// Voice catalogue endpoint: unfiltered listing, tier filtering, and metadata shape.
describe("Scenario 8: Voice Listing (GET /speech/voices)", () => {
  // Starts an authenticated GET against the running test application.
  const authedGet = (path: string) =>
    request(app.getHttpServer() as App).get(path).set("Authorization", "Bearer test-token");

  it("should return all voices when no tier filter is provided", async () => {
    const { body } = await authedGet("/speech/voices").expect(200);

    expect(body).toHaveProperty("data");
    expect(Array.isArray(body.data)).toBe(true);

    // At least one voice must come back when no filter is applied.
    const allVoices = body.data as VoiceInfo[];
    expect(allVoices.length).toBeGreaterThan(0);

    // Each entry must expose the minimal identifying fields.
    allVoices.forEach((entry) => {
      expect(entry).toHaveProperty("id");
      expect(entry).toHaveProperty("name");
      expect(entry).toHaveProperty("tier");
    });
  });

  it("should filter voices by tier when tier query param is provided", async () => {
    const { body } = await authedGet("/speech/voices?tier=default").expect(200);

    const filtered = body.data as VoiceInfo[];
    expect(filtered.length).toBeGreaterThan(0);

    // Nothing outside the requested tier may appear in the result.
    filtered.forEach((entry) => {
      expect(entry.tier).toBe("default");
    });

    // The default-tier provider mock must have been asked for its voices.
    expect(defaultTTSProvider.listVoices).toHaveBeenCalled();
  });

  it("should return empty array for tier with no voices", async () => {
    const { body } = await authedGet("/speech/voices?tier=fallback").expect(200);

    expect(body.data).toEqual([]);
  });

  it("should include voice metadata (id, name, language, tier, isDefault)", async () => {
    const { body } = await authedGet("/speech/voices?tier=default").expect(200);

    const filtered = body.data as VoiceInfo[];
    // Pick the entry explicitly flagged as the default voice.
    const flagged = filtered.find((entry) => entry.isDefault === true);

    expect(flagged).toBeDefined();
    expect(flagged).toMatchObject({
      id: "af_heart",
      name: "Heart",
      language: "en",
      tier: "default",
      isDefault: true,
    });
  });
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Scenario 9: Health Check
|
||||||
|
// ==========================================
|
||||||
|
// Health endpoint: reports availability for the STT and TTS sides of the service.
describe("Scenario 9: Health Check (GET /speech/health)", () => {
  // Issues the authenticated health request and requires a 200 response.
  const fetchHealth = () =>
    request(app.getHttpServer() as App)
      .get("/speech/health")
      .set("Authorization", "Bearer test-token")
      .expect(200);

  it("should return health status for both STT and TTS providers", async () => {
    const { body } = await fetchHealth();

    expect(body).toHaveProperty("data");
    const health = body.data;
    expect(health).toHaveProperty("stt");
    expect(health).toHaveProperty("tts");

    expect(health.stt).toHaveProperty("available");
    expect(health.tts).toHaveProperty("available");

    // Both sides are expected to report as available in this test setup.
    expect(health.stt.available).toBe(true);
    expect(health.tts.available).toBe(true);
  });

  it("should return consistent health check format", async () => {
    const { body } = await fetchHealth();

    // Exact shape check: no extra keys allowed, `available` must be a boolean.
    const expectedShape = {
      data: {
        stt: { available: expect.any(Boolean) },
        tts: { available: expect.any(Boolean) },
      },
    };
    expect(body).toEqual(expectedShape);
  });
});
|
||||||
|
});
|
||||||
Reference in New Issue
Block a user