From b6841a877ad7b9d65584e37596ec75c7a98691fe Mon Sep 17 00:00:00 2001 From: Amit Saroj Date: Wed, 3 Jun 2026 23:06:30 +0530 Subject: [PATCH 1/3] fix(ai-openai): migrate WebRTC realtime adapter to OpenAI GA API --- packages/ai-openai/src/realtime/adapter.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/ai-openai/src/realtime/adapter.ts b/packages/ai-openai/src/realtime/adapter.ts index 5f8a5c09c..5830ce37c 100644 --- a/packages/ai-openai/src/realtime/adapter.ts +++ b/packages/ai-openai/src/realtime/adapter.ts @@ -190,7 +190,7 @@ async function createWebRTCConnection( // Send SDP to OpenAI and get answer. `offer.sdp` is `string | undefined` per // the WebRTC type definitions; coerce to `null` (which `RequestInit.body` // accepts) under exactOptionalPropertyTypes. - const sdpResponse = await fetch(`${OPENAI_REALTIME_URL}?model=${model}`, { + const sdpResponse = await fetch(`${OPENAI_REALTIME_URL}/calls`, { method: 'POST', headers: { Authorization: `Bearer ${token.token}`, @@ -586,7 +586,7 @@ async function createWebRTCConnection( }, updateSession(config: Partial) { - const sessionUpdate: Record = {} + const sessionUpdate: Record = { type: 'realtime' } if (config.instructions) { sessionUpdate.instructions = config.instructions From 676dd84a067a1d820d800224494f56117b36185e Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Wed, 10 Jun 2026 14:06:32 +1000 Subject: [PATCH 2/3] fix(ai-openai): complete realtime Beta-to-GA migration Completes the GA migration started in this PR so the whole realtime flow works against OpenAI's GA API (the Beta shape was shut down 2026-05-12): - openaiRealtimeToken() mints ephemeral keys via POST /v1/realtime/client_secrets (the Beta /v1/realtime/sessions endpoint is retired) and parses the GA top-level value/expires_at response shape - session.update payloads use the GA shape via a new pure buildSessionUpdate() helper: required session.type, audio.input.*, audio.output.voice, output_modalities, max_output_tokens; temperature (removed in GA) is dropped with a debug log instead of getting the whole update rejected with unknown_parameter - server events handled under GA names (response.output_audio_transcript.*, response.output_audio.*, output_text/output_audio content parts) - removed the now-unused model local in createWebRTCConnection (the GA /calls endpoint rejects ?model=; the model is bound to the ephemeral key) - default model gpt-realtime; dead gpt-4o-(mini-)realtime-preview ids (shut down 2026-05-07) removed from OpenAIRealtimeModel, docs, and examples - unit tests for the session.update payload and client-secret request/response shapes; changeset added Live-verified against the OpenAI API: client_secrets 200 (ek_ token), /v1/realtime/calls 201 with SDP answer, and session.updated echoing voice, semantic VAD, tools, output_modalities, and max_output_tokens. Co-Authored-By: Claude Fable 5 --- .changeset/openai-realtime-ga-migration.md | 12 ++ docs/media/realtime-chat.md | 10 +- docs/reference/functions/realtimeToken.md | 4 +- .../_execute-prompt/api.realtime-token.ts | 2 +- .../ts-react-chat/src/lib/use-realtime.ts | 2 +- packages/ai-openai/src/realtime/adapter.ts | 92 ++++---------- .../ai-openai/src/realtime/session-update.ts | 76 ++++++++++++ packages/ai-openai/src/realtime/token.ts | 94 +++++++++----- packages/ai-openai/src/realtime/types.ts | 57 +++------ .../tests/realtime-session-update.test.ts | 115 ++++++++++++++++++ .../ai-openai/tests/realtime-token.test.ts | 55 +++++++++ packages/ai/src/realtime/index.ts | 4 +- 12 files changed, 372 insertions(+), 151 deletions(-) create mode 100644 .changeset/openai-realtime-ga-migration.md create mode 100644 packages/ai-openai/src/realtime/session-update.ts create mode 100644 packages/ai-openai/tests/realtime-session-update.test.ts create mode 100644 packages/ai-openai/tests/realtime-token.test.ts diff --git a/.changeset/openai-realtime-ga-migration.md b/.changeset/openai-realtime-ga-migration.md new file mode 100644 index 000000000..85e40347b --- /dev/null +++ b/.changeset/openai-realtime-ga-migration.md @@ -0,0 +1,12 @@ +--- +'@tanstack/ai-openai': patch +'@tanstack/ai': patch +--- + +Migrate the OpenAI realtime adapters from the retired Beta API (shut down 2026-05-12) to the GA API: + +- `openaiRealtime()` now exchanges WebRTC SDP via `POST /v1/realtime/calls` (the Beta `?model=` shape returned `beta_api_shape_disabled`). +- `openaiRealtimeToken()` now mints ephemeral keys via `POST /v1/realtime/client_secrets` instead of the retired `/v1/realtime/sessions`, and parses the GA top-level `value`/`expires_at` response shape. +- `session.update` payloads use the GA shape: required `session.type`, `audio.input.transcription`, `audio.input.turn_detection`, `audio.output.voice`, `output_modalities`, and `max_output_tokens`. `temperature` was removed from the GA session config and is no longer sent (a debug log notes when it is dropped). +- Server events are handled under their GA names (`response.output_audio_transcript.*`, `response.output_audio.*`, `output_text`/`output_audio` content parts). +- The default realtime model is now `gpt-realtime`; the `gpt-4o-(mini-)realtime-preview` ids (shut down by OpenAI on 2026-05-07) were removed from `OpenAIRealtimeModel`. diff --git a/docs/media/realtime-chat.md b/docs/media/realtime-chat.md index b625d5189..1b5fe9779 100644 --- a/docs/media/realtime-chat.md +++ b/docs/media/realtime-chat.md @@ -48,7 +48,7 @@ const getRealtimeToken = createServerFn({ method: 'POST' }) .handler(async () => { return realtimeToken({ adapter: openaiRealtimeToken({ - model: 'gpt-4o-realtime-preview', + model: 'gpt-realtime', }), }) }) @@ -119,7 +119,7 @@ import { openaiRealtimeToken } from '@tanstack/ai-openai' const token = await realtimeToken({ adapter: openaiRealtimeToken({ - model: 'gpt-4o-realtime-preview', + model: 'gpt-realtime', }), }) ``` @@ -138,10 +138,8 @@ const adapter = openaiRealtime() | Model | Description | |-------|-------------| -| `gpt-4o-realtime-preview` | Full realtime model | -| `gpt-4o-mini-realtime-preview` | Smaller, faster realtime model | -| `gpt-realtime` | Latest realtime model | -| `gpt-realtime-mini` | Latest mini realtime model | +| `gpt-realtime` | Full realtime model | +| `gpt-realtime-mini` | Smaller, faster realtime model | **Available voices:** `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, `cedar` diff --git a/docs/reference/functions/realtimeToken.md b/docs/reference/functions/realtimeToken.md index dd4f6b574..e1548e82f 100644 --- a/docs/reference/functions/realtimeToken.md +++ b/docs/reference/functions/realtimeToken.md @@ -41,9 +41,7 @@ export const getRealtimeToken = createServerFn() .handler(async () => { return realtimeToken({ adapter: openaiRealtimeToken({ - model: 'gpt-4o-realtime-preview', - voice: 'alloy', - instructions: 'You are a helpful assistant...', + model: 'gpt-realtime', }), }) }) diff --git a/examples/ts-code-mode-web/src/routes/_execute-prompt/api.realtime-token.ts b/examples/ts-code-mode-web/src/routes/_execute-prompt/api.realtime-token.ts index d1cd1ceea..9b4b38caf 100644 --- a/examples/ts-code-mode-web/src/routes/_execute-prompt/api.realtime-token.ts +++ b/examples/ts-code-mode-web/src/routes/_execute-prompt/api.realtime-token.ts @@ -11,7 +11,7 @@ export const Route = createFileRoute( try { const token = await realtimeToken({ adapter: openaiRealtimeToken({ - model: 'gpt-4o-realtime-preview', + model: 'gpt-realtime', }), }) return new Response(JSON.stringify(token), { diff --git a/examples/ts-react-chat/src/lib/use-realtime.ts b/examples/ts-react-chat/src/lib/use-realtime.ts index dbe60fc24..c32dd5031 100644 --- a/examples/ts-react-chat/src/lib/use-realtime.ts +++ b/examples/ts-react-chat/src/lib/use-realtime.ts @@ -20,7 +20,7 @@ const getRealtimeTokenFn = createServerFn({ method: 'POST' }) if (data.provider === 'openai') { return realtimeToken({ adapter: openaiRealtimeToken({ - model: 'gpt-4o-realtime-preview', + model: 'gpt-realtime', }), }) } diff --git a/packages/ai-openai/src/realtime/adapter.ts b/packages/ai-openai/src/realtime/adapter.ts index 5830ce37c..7d9965243 100644 --- a/packages/ai-openai/src/realtime/adapter.ts +++ b/packages/ai-openai/src/realtime/adapter.ts @@ -1,4 +1,5 @@ import { resolveDebugOption } from '@tanstack/ai/adapter-internals' +import { buildSessionUpdate } from './session-update' import type { AnyClientTool, AudioVisualization, @@ -47,7 +48,7 @@ export function openaiRealtime( token: RealtimeToken, _clientTools?: ReadonlyArray, ): Promise { - const model = token.config.model ?? 'gpt-4o-realtime-preview' + const model = token.config.model ?? 'gpt-realtime' logger.request(`activity=realtime provider=openai model=${model}`, { provider: 'openai', model, @@ -73,7 +74,6 @@ async function createWebRTCConnection( token: RealtimeToken, logger: InternalLogger, ): Promise { - const model = token.config.model ?? 'gpt-4o-realtime-preview' const eventHandlers = new Map>>() // WebRTC peer connection @@ -187,9 +187,11 @@ async function createWebRTCConnection( const offer = await pc.createOffer() await pc.setLocalDescription(offer) - // Send SDP to OpenAI and get answer. `offer.sdp` is `string | undefined` per - // the WebRTC type definitions; coerce to `null` (which `RequestInit.body` - // accepts) under exactOptionalPropertyTypes. + // Send SDP to OpenAI's GA `/calls` endpoint and get the answer. The model + // is bound to the ephemeral token (minted via `/v1/realtime/client_secrets`), + // so it must NOT be passed as a query param — GA rejects `?model=` with a + // 400. `offer.sdp` is `string | undefined` per the WebRTC type definitions; + // coerce to `null`, which `RequestInit.body` accepts. const sdpResponse = await fetch(`${OPENAI_REALTIME_URL}/calls`, { method: 'POST', headers: { @@ -260,7 +262,7 @@ async function createWebRTCConnection( break } - case 'response.audio_transcript.delta': { + case 'response.output_audio_transcript.delta': { const delta = event.delta as string emit('transcript', { role: 'assistant', @@ -270,7 +272,7 @@ async function createWebRTCConnection( break } - case 'response.audio_transcript.done': { + case 'response.output_audio_transcript.done': { const transcript = event.transcript as string emit('transcript', { role: 'assistant', transcript, isFinal: true }) break @@ -296,14 +298,14 @@ async function createWebRTCConnection( break } - case 'response.audio.delta': + case 'response.output_audio.delta': if (currentMode !== 'speaking') { currentMode = 'speaking' emit('mode_change', { mode: 'speaking' }) } break - case 'response.audio.done': + case 'response.output_audio.done': break case 'response.function_call_arguments.done': { @@ -359,12 +361,14 @@ async function createWebRTCConnection( if (item.type === 'message' && item.content) { const content = item.content as Array> for (const part of content) { - if (part.type === 'audio' && part.transcript) { + // GA renamed assistant content types: `audio` -> `output_audio`, + // `text` -> `output_text` + if (part.type === 'output_audio' && part.transcript) { message.parts.push({ type: 'audio', transcript: part.transcript as string, }) - } else if (part.type === 'text' && part.text) { + } else if (part.type === 'output_text' && part.text) { message.parts.push({ type: 'text', content: part.text as string, @@ -586,65 +590,19 @@ async function createWebRTCConnection( }, updateSession(config: Partial) { - const sessionUpdate: Record = { type: 'realtime' } - - if (config.instructions) { - sessionUpdate.instructions = config.instructions - } - - if (config.voice) { - sessionUpdate.voice = config.voice - } - - if (config.vadMode) { - if (config.vadMode === 'semantic') { - sessionUpdate.turn_detection = { - type: 'semantic_vad', - eagerness: config.semanticEagerness ?? 'medium', - } - } else if (config.vadMode === 'server') { - sessionUpdate.turn_detection = { - type: 'server_vad', - threshold: config.vadConfig?.threshold ?? 0.5, - prefix_padding_ms: config.vadConfig?.prefixPaddingMs ?? 300, - silence_duration_ms: config.vadConfig?.silenceDurationMs ?? 500, - } - } else { - sessionUpdate.turn_detection = null - } - } - - if (config.tools !== undefined) { - sessionUpdate.tools = config.tools.map((t) => ({ - type: 'function', - name: t.name, - description: t.description, - parameters: t.inputSchema ?? { type: 'object', properties: {} }, - })) - sessionUpdate.tool_choice = 'auto' - } - - if (config.outputModalities) { - sessionUpdate.modalities = config.outputModalities - } - if (config.temperature !== undefined) { - sessionUpdate.temperature = config.temperature - } - - if (config.maxOutputTokens !== undefined) { - sessionUpdate.max_response_output_tokens = config.maxOutputTokens + // The GA API removed `temperature` from session config; sending it + // would get the whole update rejected with `unknown_parameter`. + logger.provider( + 'provider=openai direction=out type=session.update dropped `temperature` (removed in the GA realtime API)', + { frame: { temperature: config.temperature } }, + ) } - // Always enable input audio transcription so user speech is transcribed - sessionUpdate.input_audio_transcription = { model: 'whisper-1' } - - if (Object.keys(sessionUpdate).length > 0) { - sendEvent({ - type: 'session.update', - session: sessionUpdate, - }) - } + sendEvent({ + type: 'session.update', + session: buildSessionUpdate(config), + }) }, interrupt() { diff --git a/packages/ai-openai/src/realtime/session-update.ts b/packages/ai-openai/src/realtime/session-update.ts new file mode 100644 index 000000000..5d0118f2d --- /dev/null +++ b/packages/ai-openai/src/realtime/session-update.ts @@ -0,0 +1,76 @@ +import type { RealtimeSessionConfig } from '@tanstack/ai' + +/** + * Builds the GA-shaped `session.update` payload for OpenAI's realtime API. + * + * The GA API requires `session.type` on every update and nests audio + * settings under `audio.input` / `audio.output` (the flat Beta field names + * were retired when the Beta shape was shut down on 2026-05-12). A + * `session.update` containing unknown fields is rejected with + * `unknown_parameter` and none of the config is applied, so the exact field + * names here are load-bearing. + * + * `temperature` was removed from the GA session config and is intentionally + * never sent; the adapter logs when it drops the option. + */ +export function buildSessionUpdate( + config: Partial, +): Record { + // Always enable input audio transcription so user speech is transcribed + const audioInput: Record = { + transcription: { model: 'whisper-1' }, + } + + if (config.vadMode) { + if (config.vadMode === 'semantic') { + audioInput.turn_detection = { + type: 'semantic_vad', + eagerness: config.semanticEagerness ?? 'medium', + } + } else if (config.vadMode === 'server') { + audioInput.turn_detection = { + type: 'server_vad', + threshold: config.vadConfig?.threshold ?? 0.5, + prefix_padding_ms: config.vadConfig?.prefixPaddingMs ?? 300, + silence_duration_ms: config.vadConfig?.silenceDurationMs ?? 500, + } + } else { + audioInput.turn_detection = null + } + } + + const audio: Record = { input: audioInput } + + if (config.voice) { + audio.output = { voice: config.voice } + } + + const sessionUpdate: Record = { + type: 'realtime', + audio, + } + + if (config.instructions) { + sessionUpdate.instructions = config.instructions + } + + if (config.tools !== undefined) { + sessionUpdate.tools = config.tools.map((t) => ({ + type: 'function', + name: t.name, + description: t.description, + parameters: t.inputSchema ?? { type: 'object', properties: {} }, + })) + sessionUpdate.tool_choice = 'auto' + } + + if (config.outputModalities) { + sessionUpdate.output_modalities = config.outputModalities + } + + if (config.maxOutputTokens !== undefined) { + sessionUpdate.max_output_tokens = config.maxOutputTokens + } + + return sessionUpdate +} diff --git a/packages/ai-openai/src/realtime/token.ts b/packages/ai-openai/src/realtime/token.ts index 6bff9c9c2..816ac8b1d 100644 --- a/packages/ai-openai/src/realtime/token.ts +++ b/packages/ai-openai/src/realtime/token.ts @@ -1,19 +1,67 @@ import { getOpenAIApiKeyFromEnv } from '../utils/client' import type { RealtimeToken, RealtimeTokenAdapter } from '@tanstack/ai' import type { + OpenAIRealtimeClientSecretResponse, OpenAIRealtimeModel, - OpenAIRealtimeSessionResponse, OpenAIRealtimeTokenOptions, } from './types' -const OPENAI_REALTIME_SESSIONS_URL = - 'https://api.openai.com/v1/realtime/sessions' +const OPENAI_REALTIME_CLIENT_SECRETS_URL = + 'https://api.openai.com/v1/realtime/client_secrets' + +/** + * Builds the GA `/v1/realtime/client_secrets` request body. + * + * The session config (including its required `type`) is nested under the + * `session` key. The model is bound to the resulting ephemeral key, so the + * client never sends it during the WebRTC SDP exchange. + */ +export function buildClientSecretRequest( + model: OpenAIRealtimeModel, +): Record { + return { session: { type: 'realtime', model } } +} + +/** + * Parses the GA client secret response into a {@link RealtimeToken}. + * + * GA returns the ephemeral key at the top level (`value` / `expires_at`), + * not nested under `client_secret` like the retired Beta + * `/v1/realtime/sessions` response did. + */ +export function parseClientSecretResponse( + data: Partial | undefined, + fallbackModel: OpenAIRealtimeModel, +): RealtimeToken { + // Validate shape before dereferencing — the API could return an error + // envelope with 200 status, or a partial response under protocol drift. + if ( + !data || + typeof data.value !== 'string' || + typeof data.expires_at !== 'number' || + !Number.isFinite(data.expires_at) + ) { + throw new Error( + 'OpenAI realtime client secret response missing or malformed `value`/`expires_at`', + ) + } + + return { + provider: 'openai', + token: data.value, + expiresAt: data.expires_at * 1000, + config: { + model: data.session?.model ?? fallbackModel, + }, + } +} /** * Creates an OpenAI realtime token adapter. * - * This adapter generates ephemeral tokens for client-side WebRTC connections. - * The token is valid for 10 minutes. + * This adapter generates ephemeral keys for client-side WebRTC connections + * via the GA `/v1/realtime/client_secrets` endpoint. The key is valid for + * 10 minutes by default. * * @param options - Configuration options for the realtime session * @returns A RealtimeTokenAdapter for use with realtimeToken() @@ -24,15 +72,7 @@ const OPENAI_REALTIME_SESSIONS_URL = * import { openaiRealtimeToken } from '@tanstack/ai-openai' * * const token = await realtimeToken({ - * adapter: openaiRealtimeToken({ - * model: 'gpt-4o-realtime-preview', - * voice: 'alloy', - * instructions: 'You are a helpful assistant.', - * turnDetection: { - * type: 'semantic_vad', - * eagerness: 'medium', - * }, - * }), + * adapter: openaiRealtimeToken({ model: 'gpt-realtime' }), * }) * ``` */ @@ -45,38 +85,32 @@ export function openaiRealtimeToken( provider: 'openai', async generateToken(): Promise { - const model: OpenAIRealtimeModel = - options.model ?? 'gpt-4o-realtime-preview' + const model: OpenAIRealtimeModel = options.model ?? 'gpt-realtime' - // Call OpenAI API to create session and get ephemeral token. // Only the model is sent server-side; all other session config - // (instructions, voice, tools, VAD) is applied client-side via session.update. - const response = await fetch(OPENAI_REALTIME_SESSIONS_URL, { + // (instructions, voice, tools, VAD) is applied client-side via + // session.update. + const response = await fetch(OPENAI_REALTIME_CLIENT_SECRETS_URL, { method: 'POST', headers: { Authorization: `Bearer ${apiKey}`, 'Content-Type': 'application/json', }, - body: JSON.stringify({ model }), + body: JSON.stringify(buildClientSecretRequest(model)), }) if (!response.ok) { const errorText = await response.text() throw new Error( - `OpenAI realtime session creation failed: ${response.status} ${errorText}`, + `OpenAI realtime client secret creation failed: ${response.status} ${errorText}`, ) } - const sessionData: OpenAIRealtimeSessionResponse = await response.json() + const data = (await response.json()) as + | Partial + | undefined - return { - provider: 'openai', - token: sessionData.client_secret.value, - expiresAt: sessionData.client_secret.expires_at * 1000, - config: { - model: sessionData.model, - }, - } + return parseClientSecretResponse(data, model) }, } } diff --git a/packages/ai-openai/src/realtime/types.ts b/packages/ai-openai/src/realtime/types.ts index be0bf856c..1adcf3b6d 100644 --- a/packages/ai-openai/src/realtime/types.ts +++ b/packages/ai-openai/src/realtime/types.ts @@ -16,15 +16,12 @@ export type OpenAIRealtimeVoice = | 'cedar' /** - * OpenAI realtime model options + * OpenAI realtime model options. + * + * The `gpt-4o-(mini-)realtime-preview` models were shut down by OpenAI on + * 2026-05-07 and are no longer listed here. */ -export type OpenAIRealtimeModel = - | 'gpt-4o-realtime-preview' - | 'gpt-4o-realtime-preview-2024-10-01' - | 'gpt-4o-mini-realtime-preview' - | 'gpt-4o-mini-realtime-preview-2024-12-17' - | 'gpt-realtime' - | 'gpt-realtime-mini' +export type OpenAIRealtimeModel = 'gpt-realtime' | 'gpt-realtime-mini' /** * OpenAI semantic VAD configuration @@ -54,7 +51,7 @@ export type OpenAITurnDetection = * Options for the OpenAI realtime token adapter */ export interface OpenAIRealtimeTokenOptions { - /** Model to use (default: 'gpt-4o-realtime-preview') */ + /** Model to use (default: 'gpt-realtime') */ model?: OpenAIRealtimeModel } @@ -74,38 +71,18 @@ export interface OpenAIRealtimeOptions { } /** - * OpenAI realtime session response from the API + * OpenAI GA realtime client secret response from + * `POST /v1/realtime/client_secrets`. Minimal shape — only the fields the + * token adapter reads. */ -export interface OpenAIRealtimeSessionResponse { - id: string - object: 'realtime.session' - model: string - modalities: Array - instructions: string - voice: string - input_audio_format: string - output_audio_format: string - input_audio_transcription: { - model: string - } | null - turn_detection: { - type: string - threshold?: number - prefix_padding_ms?: number - silence_duration_ms?: number - eagerness?: string - } | null - tools: Array<{ +export interface OpenAIRealtimeClientSecretResponse { + /** Ephemeral key (`ek_…`) used as the bearer token for the WebRTC SDP exchange */ + value: string + /** Unix timestamp (seconds) when the ephemeral key expires */ + expires_at: number + /** Effective session config the key was minted for */ + session: { type: string - name: string - description: string - parameters: Record - }> - tool_choice: string - temperature: number - max_response_output_tokens: number | string - client_secret: { - value: string - expires_at: number + model: string } } diff --git a/packages/ai-openai/tests/realtime-session-update.test.ts b/packages/ai-openai/tests/realtime-session-update.test.ts new file mode 100644 index 000000000..f2bb6d696 --- /dev/null +++ b/packages/ai-openai/tests/realtime-session-update.test.ts @@ -0,0 +1,115 @@ +import { describe, expect, it } from 'vitest' +import { buildSessionUpdate } from '../src/realtime/session-update' + +describe('buildSessionUpdate (GA session.update shape)', () => { + it('always stamps session.type="realtime" and enables input transcription', () => { + expect(buildSessionUpdate({})).toEqual({ + type: 'realtime', + audio: { input: { transcription: { model: 'whisper-1' } } }, + }) + }) + + it('nests voice under audio.output.voice', () => { + const session = buildSessionUpdate({ voice: 'marin' }) + expect(session.audio).toEqual({ + input: { transcription: { model: 'whisper-1' } }, + output: { voice: 'marin' }, + }) + }) + + it('nests semantic turn detection under audio.input.turn_detection', () => { + const session = buildSessionUpdate({ + vadMode: 'semantic', + semanticEagerness: 'high', + }) + expect(session.audio).toEqual({ + input: { + transcription: { model: 'whisper-1' }, + turn_detection: { type: 'semantic_vad', eagerness: 'high' }, + }, + }) + }) + + it('applies server VAD defaults under audio.input.turn_detection', () => { + const session = buildSessionUpdate({ vadMode: 'server' }) + expect(session.audio).toEqual({ + input: { + transcription: { model: 'whisper-1' }, + turn_detection: { + type: 'server_vad', + threshold: 0.5, + prefix_padding_ms: 300, + silence_duration_ms: 500, + }, + }, + }) + }) + + it('disables turn detection for manual VAD mode', () => { + const session = buildSessionUpdate({ vadMode: 'manual' }) + expect(session.audio).toEqual({ + input: { + transcription: { model: 'whisper-1' }, + turn_detection: null, + }, + }) + }) + + it('uses GA field names output_modalities and max_output_tokens', () => { + const session = buildSessionUpdate({ + outputModalities: ['audio'], + maxOutputTokens: 4096, + }) + expect(session.output_modalities).toEqual(['audio']) + expect(session.max_output_tokens).toBe(4096) + }) + + it('maps tools to the realtime function shape with tool_choice auto', () => { + const session = buildSessionUpdate({ + tools: [ + { + name: 'getWeather', + description: 'Get the weather', + inputSchema: { type: 'object', properties: { city: {} } }, + }, + { name: 'noSchema', description: 'No schema tool' }, + ], + }) + expect(session.tools).toEqual([ + { + type: 'function', + name: 'getWeather', + description: 'Get the weather', + parameters: { type: 'object', properties: { city: {} } }, + }, + { + type: 'function', + name: 'noSchema', + description: 'No schema tool', + parameters: { type: 'object', properties: {} }, + }, + ]) + expect(session.tool_choice).toBe('auto') + }) + + it('never emits Beta field names (GA rejects the whole update on unknown_parameter)', () => { + const session = buildSessionUpdate({ + instructions: 'Be helpful.', + voice: 'marin', + vadMode: 'server', + outputModalities: ['audio', 'text'], + temperature: 0.7, + maxOutputTokens: 1024, + }) + for (const betaField of [ + 'voice', + 'modalities', + 'turn_detection', + 'input_audio_transcription', + 'max_response_output_tokens', + 'temperature', + ]) { + expect(session).not.toHaveProperty(betaField) + } + }) +}) diff --git a/packages/ai-openai/tests/realtime-token.test.ts b/packages/ai-openai/tests/realtime-token.test.ts new file mode 100644 index 000000000..e355e6a83 --- /dev/null +++ b/packages/ai-openai/tests/realtime-token.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, it } from 'vitest' +import { + buildClientSecretRequest, + parseClientSecretResponse, +} from '../src/realtime/token' + +describe('buildClientSecretRequest', () => { + it('nests session config with the required type under the `session` key', () => { + expect(buildClientSecretRequest('gpt-realtime-mini')).toEqual({ + session: { type: 'realtime', model: 'gpt-realtime-mini' }, + }) + }) +}) + +describe('parseClientSecretResponse', () => { + it('reads the GA top-level value/expires_at shape and converts seconds to ms', () => { + const token = parseClientSecretResponse( + { + value: 'ek_test_123', + expires_at: 1_700_000_000, + session: { type: 'realtime', model: 'gpt-realtime' }, + }, + 'gpt-realtime-mini', + ) + expect(token).toEqual({ + provider: 'openai', + token: 'ek_test_123', + expiresAt: 1_700_000_000_000, + config: { model: 'gpt-realtime' }, + }) + }) + + it('falls back to the requested model when the response omits session.model', () => { + const token = parseClientSecretResponse( + { value: 'ek_test_123', expires_at: 1_700_000_000 }, + 'gpt-realtime-mini', + ) + expect(token.config.model).toBe('gpt-realtime-mini') + }) + + it('throws on a missing or malformed response instead of returning a broken token', () => { + expect(() => parseClientSecretResponse(undefined, 'gpt-realtime')).toThrow( + /missing or malformed/, + ) + expect(() => + parseClientSecretResponse({ expires_at: 1_700_000_000 }, 'gpt-realtime'), + ).toThrow(/missing or malformed/) + expect(() => + parseClientSecretResponse( + { value: 'ek_test_123', expires_at: Number.NaN }, + 'gpt-realtime', + ), + ).toThrow(/missing or malformed/) + }) +}) diff --git a/packages/ai/src/realtime/index.ts b/packages/ai/src/realtime/index.ts index e3970285e..54a7a6439 100644 --- a/packages/ai/src/realtime/index.ts +++ b/packages/ai/src/realtime/index.ts @@ -22,9 +22,7 @@ export type * from './types' * .handler(async () => { * return realtimeToken({ * adapter: openaiRealtimeToken({ - * model: 'gpt-4o-realtime-preview', - * voice: 'alloy', - * instructions: 'You are a helpful assistant...', + * model: 'gpt-realtime', * }), * }) * }) From c2dbee8bc1012c2ab8aee8b2f58626f3421fc947 Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Wed, 10 Jun 2026 14:46:02 +1000 Subject: [PATCH 3/3] fix(ai-openai): collapse output modalities to single GA-supported value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GA realtime API only accepts ['audio'] or ['text'] for output_modalities; the Beta API accepted ['audio', 'text'] and the provider-agnostic RealtimeSessionConfig still legitimately produces it (e.g. the example UI's audio+text mode). Sending both got the whole session.update rejected with: Invalid modalities: ['audio', 'text']. Collapse to ['audio'] when audio is requested — GA audio replies still stream text via response.output_audio_transcript.* events, so visible behavior is unchanged. Live-verified: session.updated accepted. Co-Authored-By: Claude Fable 5 --- packages/ai-openai/src/realtime/session-update.ts | 8 +++++++- .../ai-openai/tests/realtime-session-update.test.ts | 10 ++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/packages/ai-openai/src/realtime/session-update.ts b/packages/ai-openai/src/realtime/session-update.ts index 5d0118f2d..0d9e9f68a 100644 --- a/packages/ai-openai/src/realtime/session-update.ts +++ b/packages/ai-openai/src/realtime/session-update.ts @@ -65,7 +65,13 @@ export function buildSessionUpdate( } if (config.outputModalities) { - sessionUpdate.output_modalities = config.outputModalities + // GA only supports a single output modality: ['audio'] or ['text'] + // (Beta accepted ['audio', 'text']). Audio replies still stream text + // via `response.output_audio_transcript.*` events, so collapsing + // ['audio', 'text'] to ['audio'] preserves the visible behavior. + sessionUpdate.output_modalities = config.outputModalities.includes('audio') + ? ['audio'] + : ['text'] } if (config.maxOutputTokens !== undefined) { diff --git a/packages/ai-openai/tests/realtime-session-update.test.ts b/packages/ai-openai/tests/realtime-session-update.test.ts index f2bb6d696..3fb6356a0 100644 --- a/packages/ai-openai/tests/realtime-session-update.test.ts +++ b/packages/ai-openai/tests/realtime-session-update.test.ts @@ -64,6 +64,16 @@ describe('buildSessionUpdate (GA session.update shape)', () => { expect(session.max_output_tokens).toBe(4096) }) + it('collapses ["audio", "text"] to ["audio"] (GA supports a single output modality)', () => { + expect( + buildSessionUpdate({ outputModalities: ['audio', 'text'] }) + .output_modalities, + ).toEqual(['audio']) + expect( + buildSessionUpdate({ outputModalities: ['text'] }).output_modalities, + ).toEqual(['text']) + }) + it('maps tools to the realtime function shape with tool_choice auto', () => { const session = buildSessionUpdate({ tools: [