Compare commits

...

2 Commits

Author SHA1 Message Date
3688f89c37 feat(api): add CryptoService for secret encryption (MS22-P1b)
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
2026-03-01 08:41:28 -06:00
4294deda49 docs(design): MS22 DB-centric agent fleet architecture (#604)
Co-authored-by: Jason Woltje <jason@diversecanvas.com>
Co-committed-by: Jason Woltje <jason@diversecanvas.com>
2026-03-01 14:35:14 +00:00
5 changed files with 578 additions and 0 deletions

View File

@@ -39,6 +39,7 @@ import { JobStepsModule } from "./job-steps/job-steps.module";
import { CoordinatorIntegrationModule } from "./coordinator-integration/coordinator-integration.module"; import { CoordinatorIntegrationModule } from "./coordinator-integration/coordinator-integration.module";
import { FederationModule } from "./federation/federation.module"; import { FederationModule } from "./federation/federation.module";
import { CredentialsModule } from "./credentials/credentials.module"; import { CredentialsModule } from "./credentials/credentials.module";
import { CryptoModule } from "./crypto/crypto.module";
import { MosaicTelemetryModule } from "./mosaic-telemetry"; import { MosaicTelemetryModule } from "./mosaic-telemetry";
import { SpeechModule } from "./speech/speech.module"; import { SpeechModule } from "./speech/speech.module";
import { DashboardModule } from "./dashboard/dashboard.module"; import { DashboardModule } from "./dashboard/dashboard.module";
@@ -111,6 +112,7 @@ import { RlsContextInterceptor } from "./common/interceptors/rls-context.interce
CoordinatorIntegrationModule, CoordinatorIntegrationModule,
FederationModule, FederationModule,
CredentialsModule, CredentialsModule,
CryptoModule,
MosaicTelemetryModule, MosaicTelemetryModule,
SpeechModule, SpeechModule,
DashboardModule, DashboardModule,

View File

@@ -0,0 +1,10 @@
import { Module } from "@nestjs/common";
import { ConfigModule } from "@nestjs/config";
import { CryptoService } from "./crypto.service";
@Module({
imports: [ConfigModule],
providers: [CryptoService],
exports: [CryptoService],
})
export class CryptoModule {}

View File

@@ -0,0 +1,71 @@
import { describe, it, expect, beforeEach } from "vitest";
import { ConfigService } from "@nestjs/config";
import { CryptoService } from "./crypto.service";
function createConfigService(secret?: string): ConfigService {
return {
get: (key: string) => {
if (key === "MOSAIC_SECRET_KEY") {
return secret;
}
return undefined;
},
} as unknown as ConfigService;
}
describe("CryptoService", () => {
let service: CryptoService;
beforeEach(() => {
service = new CryptoService(createConfigService("this-is-a-test-secret-key-with-32+chars"));
});
it("encrypt -> decrypt roundtrip", () => {
const plaintext = "my-secret-api-key";
const encrypted = service.encrypt(plaintext);
const decrypted = service.decrypt(encrypted);
expect(encrypted.startsWith("enc:")).toBe(true);
expect(decrypted).toBe(plaintext);
});
it("decrypt rejects tampered ciphertext", () => {
const encrypted = service.encrypt("sensitive-token");
const payload = encrypted.slice(4);
const bytes = Buffer.from(payload, "base64");
bytes[bytes.length - 1] = bytes[bytes.length - 1]! ^ 0xff;
const tampered = `enc:${bytes.toString("base64")}`;
expect(() => service.decrypt(tampered)).toThrow();
});
it("decrypt rejects non-encrypted string", () => {
expect(() => service.decrypt("plain-text-value")).toThrow();
});
it("isEncrypted detects prefix correctly", () => {
expect(service.isEncrypted("enc:abc")).toBe(true);
expect(service.isEncrypted("ENC:abc")).toBe(false);
expect(service.isEncrypted("plain-text")).toBe(false);
});
it("generateToken returns 64-char hex string", () => {
const token = service.generateToken();
expect(token).toMatch(/^[0-9a-f]{64}$/);
});
it("different plaintexts produce different ciphertexts (random IV)", () => {
const encryptedA = service.encrypt("value-a");
const encryptedB = service.encrypt("value-b");
expect(encryptedA).not.toBe(encryptedB);
});
it("missing MOSAIC_SECRET_KEY throws on construction", () => {
expect(() => new CryptoService(createConfigService(undefined))).toThrow();
});
});

View File

@@ -0,0 +1,82 @@
import { Injectable } from "@nestjs/common";
import { ConfigService } from "@nestjs/config";
import { createCipheriv, createDecipheriv, hkdfSync, randomBytes } from "crypto";
const ALGORITHM = "aes-256-gcm";
const ENCRYPTED_PREFIX = "enc:";
const IV_LENGTH = 12;
const AUTH_TAG_LENGTH = 16;
const DERIVED_KEY_LENGTH = 32;
const HKDF_SALT = "mosaic.crypto.v1";
const HKDF_INFO = "mosaic-db-secret-encryption";
@Injectable()
export class CryptoService {
private readonly key: Buffer;
constructor(private readonly config: ConfigService) {
const secret = this.config.get<string>("MOSAIC_SECRET_KEY");
if (!secret) {
throw new Error("MOSAIC_SECRET_KEY environment variable is required");
}
if (secret.length < 32) {
throw new Error("MOSAIC_SECRET_KEY must be at least 32 characters");
}
this.key = Buffer.from(
hkdfSync(
"sha256",
Buffer.from(secret, "utf8"),
Buffer.from(HKDF_SALT, "utf8"),
Buffer.from(HKDF_INFO, "utf8"),
DERIVED_KEY_LENGTH
)
);
}
encrypt(plaintext: string): string {
const iv = randomBytes(IV_LENGTH);
const cipher = createCipheriv(ALGORITHM, this.key, iv);
const ciphertext = Buffer.concat([cipher.update(plaintext, "utf8"), cipher.final()]);
const authTag = cipher.getAuthTag();
const payload = Buffer.concat([iv, ciphertext, authTag]).toString("base64");
return `${ENCRYPTED_PREFIX}${payload}`;
}
decrypt(encrypted: string): string {
if (!this.isEncrypted(encrypted)) {
throw new Error("Value is not encrypted");
}
const payloadBase64 = encrypted.slice(ENCRYPTED_PREFIX.length);
try {
const payload = Buffer.from(payloadBase64, "base64");
if (payload.length < IV_LENGTH + AUTH_TAG_LENGTH) {
throw new Error("Encrypted payload is too short");
}
const iv = payload.subarray(0, IV_LENGTH);
const authTag = payload.subarray(payload.length - AUTH_TAG_LENGTH);
const ciphertext = payload.subarray(IV_LENGTH, payload.length - AUTH_TAG_LENGTH);
const decipher = createDecipheriv(ALGORITHM, this.key, iv);
decipher.setAuthTag(authTag);
return Buffer.concat([decipher.update(ciphertext), decipher.final()]).toString("utf8");
} catch {
throw new Error("Failed to decrypt value");
}
}
isEncrypted(value: string): boolean {
return value.startsWith(ENCRYPTED_PREFIX);
}
generateToken(): string {
return randomBytes(32).toString("hex");
}
}

View File

@@ -0,0 +1,413 @@
# MS22 Phase 1: DB-Centric Multi-User Agent Architecture
## Design Principles
1. **2 env vars to bootstrap**`DATABASE_URL` + `MOSAIC_SECRET_KEY`
2. **DB-centric config** — All runtime config in Postgres, managed via WebUI
3. **Mosaic is the gatekeeper** — Users authenticate to Mosaic, never to OpenClaw directly
4. **Per-user agent isolation** — Each user gets their own OpenClaw container(s) with their own credentials
5. **Onboarding-first** — Breakglass user + wizard on first boot
6. **Generic product** — No hardcoded names, models, providers, or endpoints
## Architecture Overview
```
┌─────────────────────────────────────────────────────┐
│ MOSAIC WEBUI │
│ (Auth: breakglass local + OIDC via settings) │
└──────────────────────┬──────────────────────────────┘
┌─────────────────────────────────────────────────────┐
│ MOSAIC API │
│ │
│ ┌──────────────┐ ┌────────────────┐ ┌─────────┐ │
│ │ Onboarding │ │ Container │ │ Config │ │
│ │ Wizard │ │ Lifecycle Mgr │ │ Store │ │
│ └──────────────┘ └───────┬────────┘ └─────────┘ │
│ │ │
└────────────────────────────┼────────────────────────┘
│ Docker API
┌──────────────────┼──────────────────┐
│ │ │
▼ ▼ ▼
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ OpenClaw │ │ OpenClaw │ │ OpenClaw │
│ User A │ │ User B │ │ System │
│ │ │ │ │ (admin) │
│ Claude Max │ │ Z.ai key │ │ Shared key │
│ own memory │ │ own memory │ │ monitoring │
└─────────────┘ └─────────────┘ └─────────────┘
Scale to zero Scale to zero Always on
after idle after idle
```
## Container Lifecycle
### User containers (on-demand)
1. User logs in → Mosaic checks `UserContainer` table
2. No running container → Mosaic calls Docker API to create one
3. Injects user's encrypted API keys via config endpoint
4. Routes chat requests to user's container
5. Idle timeout (configurable, default 30min) → scale to zero
6. State volume persists (sessions, memory, auth tokens)
7. Next request → container restarts, picks up state from volume
### System containers (always-on, optional)
- Admin-provisioned for system tasks (monitoring, scheduled jobs)
- Use admin-configured shared API keys
- Not tied to any user
## Auth Layers
| Flow | Method |
| ------------------------------- | ---------------------------------------------------------------------- |
| User → Mosaic WebUI | Breakglass (local) or OIDC (configured in settings) |
| Mosaic API → OpenClaw container | Bearer token (auto-generated per container, stored encrypted in DB) |
| OpenClaw → LLM providers | User's own API keys (delivered via config endpoint, decrypted from DB) |
| Admin → System settings | RBAC (admin role required) |
| Internal config endpoint | Bearer token (container authenticates to fetch its config) |
## Database Schema
### System Tables
```prisma
model SystemConfig {
id String @id @default(cuid())
key String @unique // "oidc.issuerUrl", "oidc.clientId", "onboarding.completed"
value String // plaintext or encrypted (prefix: "enc:")
encrypted Boolean @default(false)
updatedAt DateTime @updatedAt
}
model BreakglassUser {
id String @id @default(cuid())
username String @unique
passwordHash String // bcrypt
isActive Boolean @default(true)
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
}
```
### Provider Tables (per-user)
```prisma
model LlmProvider {
id String @id @default(cuid())
userId String // owner — each user manages their own providers
name String // "my-zai", "work-openai", "local-ollama"
displayName String // "Z.ai", "OpenAI (Work)", "Local Ollama"
type String // "zai" | "openai" | "anthropic" | "ollama" | "custom"
baseUrl String? // null for built-in, URL for custom/ollama
apiKey String? // encrypted
apiType String @default("openai-completions")
models Json @default("[]") // [{id, name, contextWindow, maxTokens}]
isActive Boolean @default(true)
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
@@unique([userId, name])
}
```
### Container Tables
```prisma
model UserContainer {
id String @id @default(cuid())
userId String @unique
containerId String? // Docker container ID (null = not running)
containerName String // "mosaic-user-{userId}"
gatewayPort Int? // assigned port (null = not running)
gatewayToken String // encrypted — auto-generated
status String @default("stopped") // "running" | "stopped" | "starting" | "error"
lastActiveAt DateTime?
idleTimeoutMin Int @default(30)
config Json @default("{}") // cached openclaw.json for this user
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
}
model SystemContainer {
id String @id @default(cuid())
name String @unique // "mosaic-system-ops", "mosaic-system-monitor"
role String // "operations" | "monitor" | "scheduler"
containerId String?
gatewayPort Int?
gatewayToken String // encrypted
status String @default("stopped")
providerId String? // references admin-level LlmProvider
primaryModel String // "zai/glm-5", etc.
isActive Boolean @default(true)
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
}
```
### User Agent Preferences
```prisma
model UserAgentConfig {
id String @id @default(cuid())
userId String @unique
primaryModel String? // user's preferred model
fallbackModels Json @default("[]")
personality String? // custom SOUL.md content
providerId String? // default provider for this user
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
}
```
## Internal Config Endpoint
`GET /api/internal/agent-config/:containerType/:id`
- Auth: Bearer token (container's own gateway token)
- Returns: Complete `openclaw.json` generated from DB
- For user containers: includes user's providers, model prefs, personality
- For system containers: includes admin provider config
Response assembles openclaw.json dynamically:
```json
{
"gateway": { "mode": "local", "port": 18789, "bind": "lan", "auth": { "mode": "token" } ... },
"agents": { "defaults": { "model": { "primary": "<from UserAgentConfig>" } } },
"models": { "providers": { "<from LlmProvider rows>": { ... } } }
}
```
## Container Lifecycle Manager
NestJS service that manages Docker containers:
```typescript
class ContainerLifecycleService {
// Create and start a user's OpenClaw container
async ensureRunning(userId: string): Promise<{ url: string; token: string }>;
// Stop idle containers (called by cron/scheduler)
async reapIdle(): Promise<number>;
// Stop a specific user's container
async stop(userId: string): Promise<void>;
// Health check all running containers
async healthCheckAll(): Promise<HealthStatus[]>;
// Restart container with updated config
async restart(userId: string): Promise<void>;
}
```
Uses Docker Engine API (`/var/run/docker.sock` or TCP) via `dockerode` npm package.
## Onboarding Wizard
### First-Boot Detection
- API checks: `SystemConfig.get("onboarding.completed")` → null = first boot
- WebUI redirects to `/onboarding` if not completed
### Steps
**Step 1: Create Breakglass Admin**
- Username + password → bcrypt → `BreakglassUser` table
- This user always works, even if OIDC is misconfigured
**Step 2: Configure Authentication (optional)**
- OIDC: provider URL, client ID, client secret → encrypted in `SystemConfig`
- Skip = breakglass-only auth (can add OIDC later in settings)
**Step 3: Add Your First LLM Provider**
- Pick type → enter API key/endpoint → test connection → save to `LlmProvider`
- This becomes the admin's default provider
**Step 4: System Agents (optional)**
- Configure always-on system agents for monitoring/ops
- Or skip — users can just use their own personal agents
**Step 5: Complete**
- Sets `SystemConfig("onboarding.completed") = true`
- Redirects to dashboard
### Post-Onboarding: User Self-Service
- Each user adds their own LLM providers in profile settings
- Each user configures their preferred model, personality
- First chat request triggers container creation
## Docker Compose (final)
```yaml
services:
mosaic-api:
image: mosaic/api:latest
environment:
DATABASE_URL: ${DATABASE_URL}
MOSAIC_SECRET_KEY: ${MOSAIC_SECRET_KEY}
volumes:
- /var/run/docker.sock:/var/run/docker.sock # Docker API access
networks:
- internal
mosaic-web:
image: mosaic/web:latest
environment:
NEXT_PUBLIC_API_URL: http://mosaic-api:4000
networks:
- internal
postgres:
image: postgres:17
environment:
POSTGRES_DB: mosaic
POSTGRES_USER: mosaic
POSTGRES_PASSWORD: ${DATABASE_PASSWORD}
volumes:
- postgres-data:/var/lib/postgresql/data
networks:
- internal
# System agent (optional, admin-provisioned)
# mosaic-system:
# image: alpine/openclaw:latest
# ... (managed by ContainerLifecycleService)
# User containers are NOT in this file —
# they are dynamically created by ContainerLifecycleService
# via the Docker API at runtime.
networks:
internal:
driver: overlay
volumes:
postgres-data:
```
Note: User OpenClaw containers are **not** defined in docker-compose. They are
created dynamically by the `ContainerLifecycleService` when users start chatting.
## Entrypoint (for dynamically created containers)
```sh
#!/bin/sh
set -e
: "${MOSAIC_API_URL:?required}"
: "${AGENT_TOKEN:?required}"
: "${AGENT_ID:?required}"
# Fetch config from Mosaic API
curl -sf "${MOSAIC_API_URL}/api/internal/agent-config/${AGENT_ID}" \
-H "Authorization: Bearer ${AGENT_TOKEN}" \
-o /tmp/openclaw.json
export OPENCLAW_CONFIG_PATH=/tmp/openclaw.json
exec openclaw gateway run --bind lan --auth token
```
Container env vars (injected by ContainerLifecycleService):
- `MOSAIC_API_URL` — internal API URL
- `AGENT_TOKEN` — this container's bearer token (from DB)
- `AGENT_ID` — container ID for config lookup
## Config Update Strategy
When a user changes settings (model, provider, personality):
1. Mosaic API updates DB
2. API calls `ContainerLifecycleService.restart(userId)`
3. Container restarts, fetches fresh config from API
4. OpenClaw gateway starts with new config
5. State volume preserves sessions/memory across restarts
## Task Breakdown
| Task | Phase | Scope | Dependencies |
| -------- | -------------- | --------------------------------------------------------------------------------------------------------------------- | ------------ |
| MS22-P1a | Schema | Prisma models: SystemConfig, BreakglassUser, LlmProvider, UserContainer, SystemContainer, UserAgentConfig. Migration. | — |
| MS22-P1b | Crypto | Encryption service for API keys/tokens (AES-256-GCM using MOSAIC_SECRET_KEY) | P1a |
| MS22-P1c | Config API | Internal config endpoint: assembles openclaw.json from DB | P1a, P1b |
| MS22-P1d | Container Mgr | ContainerLifecycleService: Docker API integration (dockerode), start/stop/health/reap | P1a |
| MS22-P1e | Onboarding API | Onboarding endpoints: breakglass, OIDC, provider, complete | P1a, P1b |
| MS22-P1f | Onboarding UI | Multi-step wizard in WebUI | P1e |
| MS22-P1g | Settings API | CRUD: providers, agent config, OIDC, breakglass | P1a, P1b |
| MS22-P1h | Settings UI | Settings pages: Providers, Agent Config, Auth | P1g |
| MS22-P1i | Chat Proxy | Route WebUI chat → user's OpenClaw container (SSE) | P1c, P1d |
| MS22-P1j | Docker | Entrypoint script, health checks, compose for core services | P1c |
| MS22-P1k | Idle Reaper | Cron service to stop idle user containers | P1d |
## Open Questions (Resolved)
1. ~~Config updates → restart?~~ **Yes.** Mosaic restarts the container, fresh config on boot.
2. ~~CLI alternative for breakglass?~~ **Yes.** Both WebUI wizard and CLI (`mosaic admin create-breakglass`).
3. ~~Config cache TTL?~~ **Yes.** Config fetched once at startup, changes trigger restart.
## Security Isolation Model
### Core Principle: ZERO cross-user access
Every user is fully sandboxed. No exceptions.
### Container Isolation
- Each user gets their **own** OpenClaw container (separate process, PID namespace)
- Each container has its **own** Docker volume (sessions, memory, workspace)
- Containers run on an **internal-only** Docker network — no external exposure
- Users NEVER talk to OpenClaw directly — Mosaic proxies all requests
- Container gateway tokens are unique per-user and single-purpose
### Data Isolation (enforced at API + DB level)
| Data | Isolation | Enforcement |
| ---------------- | ------------------------- | --------------------------------------------------------------------------------- |
| LLM API keys | Per-user, encrypted | `LlmProvider.userId` — all queries scoped by authenticated user |
| Chat history | Per-user container volume | Separate Docker volume per user, not shared |
| Agent memory | Per-user container volume | Separate Docker volume per user |
| Agent config | Per-user | `UserAgentConfig.userId` — scoped queries |
| Container access | Per-user | `UserContainer.userId` — Mosaic validates user owns the container before proxying |
### API Enforcement
- **All user-facing endpoints** include `WHERE userId = authenticatedUser.id`
- **No admin endpoint** exposes another user's API keys (even to admins)
- **Chat proxy** validates: authenticated user → owns target container → forwards request
- **Config endpoint** validates: container token matches the container requesting config
- **Provider CRUD** is fully user-scoped — User A cannot list, read, or modify User B's providers
### What admins CAN see
- Container status (running/stopped) — not contents
- User list and roles
- System-level config (OIDC, system agents)
- Aggregate usage metrics (not individual conversations)
### What admins CANNOT see
- Other users' API keys (encrypted, no decrypt endpoint)
- Other users' chat history (in container volumes, not in Mosaic DB)
- Other users' agent memory/workspace contents
### Future: Team Workspaces (NOT in scope)
Team/shared workspaces are a potential future feature where users opt-in to
shared agent contexts. This requires explicit consent, shared-key management,
and a different isolation model. **Not designed here. Not built now.**
### Attack Surface Notes
- Docker socket access (`/var/run/docker.sock`) is required by Mosaic API for container management. This is a privileged operation — the Mosaic API container must be trusted.
- `MOSAIC_SECRET_KEY` is the root of trust for encryption. Rotation requires re-encrypting all secrets in DB.
- Container-to-container communication is blocked by default (no shared network between user containers unless explicitly configured).