Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .changeset/openai-realtime-ga-migration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
'@tanstack/ai-openai': patch
'@tanstack/ai': patch
---

Migrate the OpenAI realtime adapters from the retired Beta API (shut down 2026-05-12) to the GA API:

- `openaiRealtime()` now exchanges WebRTC SDP via `POST /v1/realtime/calls` (the Beta `?model=` shape returned `beta_api_shape_disabled`).
- `openaiRealtimeToken()` now mints ephemeral keys via `POST /v1/realtime/client_secrets` instead of the retired `/v1/realtime/sessions`, and parses the GA top-level `value`/`expires_at` response shape.
- `session.update` payloads use the GA shape: required `session.type`, `audio.input.transcription`, `audio.input.turn_detection`, `audio.output.voice`, `output_modalities`, and `max_output_tokens`. `temperature` was removed from the GA session config and is no longer sent (a debug log notes when it is dropped).
- Server events are handled under their GA names (`response.output_audio_transcript.*`, `response.output_audio.*`, `output_text`/`output_audio` content parts).
- The default realtime model is now `gpt-realtime`; the `gpt-4o-(mini-)realtime-preview` ids (shut down by OpenAI on 2026-05-07) were removed from `OpenAIRealtimeModel`.
10 changes: 4 additions & 6 deletions docs/media/realtime-chat.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ const getRealtimeToken = createServerFn({ method: 'POST' })
.handler(async () => {
return realtimeToken({
adapter: openaiRealtimeToken({
model: 'gpt-4o-realtime-preview',
model: 'gpt-realtime',
}),
})
})
Expand Down Expand Up @@ -119,7 +119,7 @@ import { openaiRealtimeToken } from '@tanstack/ai-openai'

const token = await realtimeToken({
adapter: openaiRealtimeToken({
model: 'gpt-4o-realtime-preview',
model: 'gpt-realtime',
}),
})
```
Expand All @@ -138,10 +138,8 @@ const adapter = openaiRealtime()

| Model | Description |
|-------|-------------|
| `gpt-4o-realtime-preview` | Full realtime model |
| `gpt-4o-mini-realtime-preview` | Smaller, faster realtime model |
| `gpt-realtime` | Latest realtime model |
| `gpt-realtime-mini` | Latest mini realtime model |
| `gpt-realtime` | Full realtime model |
| `gpt-realtime-mini` | Smaller, faster realtime model |

**Available voices:** `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, `cedar`

Expand Down
4 changes: 1 addition & 3 deletions docs/reference/functions/realtimeToken.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,7 @@ export const getRealtimeToken = createServerFn()
.handler(async () => {
return realtimeToken({
adapter: openaiRealtimeToken({
model: 'gpt-4o-realtime-preview',
voice: 'alloy',
instructions: 'You are a helpful assistant...',
model: 'gpt-realtime',
}),
})
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ export const Route = createFileRoute(
try {
const token = await realtimeToken({
adapter: openaiRealtimeToken({
model: 'gpt-4o-realtime-preview',
model: 'gpt-realtime',
}),
})
return new Response(JSON.stringify(token), {
Expand Down
2 changes: 1 addition & 1 deletion examples/ts-react-chat/src/lib/use-realtime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ const getRealtimeTokenFn = createServerFn({ method: 'POST' })
if (data.provider === 'openai') {
return realtimeToken({
adapter: openaiRealtimeToken({
model: 'gpt-4o-realtime-preview',
model: 'gpt-realtime',
}),
})
}
Expand Down
94 changes: 26 additions & 68 deletions packages/ai-openai/src/realtime/adapter.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { resolveDebugOption } from '@tanstack/ai/adapter-internals'
import { buildSessionUpdate } from './session-update'
import type {
AnyClientTool,
AudioVisualization,
Expand Down Expand Up @@ -47,7 +48,7 @@ export function openaiRealtime(
token: RealtimeToken,
_clientTools?: ReadonlyArray<AnyClientTool>,
): Promise<RealtimeConnection> {
const model = token.config.model ?? 'gpt-4o-realtime-preview'
const model = token.config.model ?? 'gpt-realtime'
logger.request(`activity=realtime provider=openai model=${model}`, {
provider: 'openai',
model,
Expand All @@ -73,7 +74,6 @@ async function createWebRTCConnection(
token: RealtimeToken,
logger: InternalLogger,
): Promise<RealtimeConnection> {
const model = token.config.model ?? 'gpt-4o-realtime-preview'
const eventHandlers = new Map<RealtimeEvent, Set<RealtimeEventHandler<any>>>()

// WebRTC peer connection
Expand Down Expand Up @@ -187,10 +187,12 @@ async function createWebRTCConnection(
const offer = await pc.createOffer()
await pc.setLocalDescription(offer)

// Send SDP to OpenAI and get answer. `offer.sdp` is `string | undefined` per
// the WebRTC type definitions; coerce to `null` (which `RequestInit.body`
// accepts) under exactOptionalPropertyTypes.
const sdpResponse = await fetch(`${OPENAI_REALTIME_URL}?model=${model}`, {
// Send SDP to OpenAI's GA `/calls` endpoint and get the answer. The model
// is bound to the ephemeral token (minted via `/v1/realtime/client_secrets`),
// so it must NOT be passed as a query param — GA rejects `?model=` with a
// 400. `offer.sdp` is `string | undefined` per the WebRTC type definitions;
// coerce to `null`, which `RequestInit.body` accepts.
const sdpResponse = await fetch(`${OPENAI_REALTIME_URL}/calls`, {
method: 'POST',
headers: {
Authorization: `Bearer ${token.token}`,
Expand Down Expand Up @@ -260,7 +262,7 @@ async function createWebRTCConnection(
break
}

case 'response.audio_transcript.delta': {
case 'response.output_audio_transcript.delta': {
const delta = event.delta as string
emit('transcript', {
role: 'assistant',
Expand All @@ -270,7 +272,7 @@ async function createWebRTCConnection(
break
}

case 'response.audio_transcript.done': {
case 'response.output_audio_transcript.done': {
const transcript = event.transcript as string
emit('transcript', { role: 'assistant', transcript, isFinal: true })
break
Expand All @@ -296,14 +298,14 @@ async function createWebRTCConnection(
break
}

case 'response.audio.delta':
case 'response.output_audio.delta':
if (currentMode !== 'speaking') {
currentMode = 'speaking'
emit('mode_change', { mode: 'speaking' })
}
break

case 'response.audio.done':
case 'response.output_audio.done':
break

case 'response.function_call_arguments.done': {
Expand Down Expand Up @@ -359,12 +361,14 @@ async function createWebRTCConnection(
if (item.type === 'message' && item.content) {
const content = item.content as Array<Record<string, unknown>>
for (const part of content) {
if (part.type === 'audio' && part.transcript) {
// GA renamed assistant content types: `audio` -> `output_audio`,
// `text` -> `output_text`
if (part.type === 'output_audio' && part.transcript) {
message.parts.push({
type: 'audio',
transcript: part.transcript as string,
})
} else if (part.type === 'text' && part.text) {
} else if (part.type === 'output_text' && part.text) {
message.parts.push({
type: 'text',
content: part.text as string,
Expand Down Expand Up @@ -586,65 +590,19 @@ async function createWebRTCConnection(
},

updateSession(config: Partial<RealtimeSessionConfig>) {
const sessionUpdate: Record<string, unknown> = {}

if (config.instructions) {
sessionUpdate.instructions = config.instructions
}

if (config.voice) {
sessionUpdate.voice = config.voice
}

if (config.vadMode) {
if (config.vadMode === 'semantic') {
sessionUpdate.turn_detection = {
type: 'semantic_vad',
eagerness: config.semanticEagerness ?? 'medium',
}
} else if (config.vadMode === 'server') {
sessionUpdate.turn_detection = {
type: 'server_vad',
threshold: config.vadConfig?.threshold ?? 0.5,
prefix_padding_ms: config.vadConfig?.prefixPaddingMs ?? 300,
silence_duration_ms: config.vadConfig?.silenceDurationMs ?? 500,
}
} else {
sessionUpdate.turn_detection = null
}
}

if (config.tools !== undefined) {
sessionUpdate.tools = config.tools.map((t) => ({
type: 'function',
name: t.name,
description: t.description,
parameters: t.inputSchema ?? { type: 'object', properties: {} },
}))
sessionUpdate.tool_choice = 'auto'
}

if (config.outputModalities) {
sessionUpdate.modalities = config.outputModalities
}

if (config.temperature !== undefined) {
sessionUpdate.temperature = config.temperature
}

if (config.maxOutputTokens !== undefined) {
sessionUpdate.max_response_output_tokens = config.maxOutputTokens
// The GA API removed `temperature` from session config; sending it
// would get the whole update rejected with `unknown_parameter`.
logger.provider(
'provider=openai direction=out type=session.update dropped `temperature` (removed in the GA realtime API)',
{ frame: { temperature: config.temperature } },
)
}

// Always enable input audio transcription so user speech is transcribed
sessionUpdate.input_audio_transcription = { model: 'whisper-1' }

if (Object.keys(sessionUpdate).length > 0) {
sendEvent({
type: 'session.update',
session: sessionUpdate,
})
}
sendEvent({
type: 'session.update',
session: buildSessionUpdate(config),
})
},

interrupt() {
Expand Down
82 changes: 82 additions & 0 deletions packages/ai-openai/src/realtime/session-update.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import type { RealtimeSessionConfig } from '@tanstack/ai'

/**
 * Builds the GA-shaped `session.update` payload for OpenAI's realtime API.
 *
 * The GA API requires `session.type` on every update and nests audio
 * settings under `audio.input` / `audio.output` (the flat Beta field names
 * were retired when the Beta shape was shut down on 2026-05-12). A
 * `session.update` containing unknown fields is rejected with
 * `unknown_parameter` and none of the config is applied, so the exact field
 * names here are load-bearing.
 *
 * `temperature` was removed from the GA session config and is intentionally
 * never sent; the adapter logs when it drops the option.
 *
 * Field mapping performed here:
 * - always: `type: 'realtime'` and `audio.input.transcription`
 * - `vadMode` (+ `semanticEagerness` / `vadConfig`) -> `audio.input.turn_detection`
 * - `voice` -> `audio.output.voice`
 * - `instructions` -> `instructions`
 * - `tools` -> `tools` (function definitions) and `tool_choice: 'auto'`
 * - `outputModalities` -> `output_modalities` (collapsed to one modality)
 * - `maxOutputTokens` -> `max_output_tokens`
 *
 * @param config - Session options to apply. Options that are unset (or, for
 *   `voice` / `instructions` / `vadMode`, falsy) are left out of the payload.
 * @returns The object to send as `session` in a `session.update` event.
 */
export function buildSessionUpdate(
  config: Partial<RealtimeSessionConfig>,
): Record<string, unknown> {
  // Always enable input audio transcription so user speech is transcribed
  const audioInput: Record<string, unknown> = {
    transcription: { model: 'whisper-1' },
  }

  if (config.vadMode) {
    if (config.vadMode === 'semantic') {
      audioInput.turn_detection = {
        type: 'semantic_vad',
        eagerness: config.semanticEagerness ?? 'medium',
      }
    } else if (config.vadMode === 'server') {
      audioInput.turn_detection = {
        type: 'server_vad',
        threshold: config.vadConfig?.threshold ?? 0.5,
        prefix_padding_ms: config.vadConfig?.prefixPaddingMs ?? 300,
        silence_duration_ms: config.vadConfig?.silenceDurationMs ?? 500,
      }
    } else {
      // Any other mode turns turn detection off by sending an explicit null.
      audioInput.turn_detection = null
    }
  }

  const audio: Record<string, unknown> = { input: audioInput }

  if (config.voice) {
    audio.output = { voice: config.voice }
  }

  const sessionUpdate: Record<string, unknown> = {
    type: 'realtime',
    audio,
  }

  if (config.instructions) {
    sessionUpdate.instructions = config.instructions
  }

  if (config.tools !== undefined) {
    sessionUpdate.tools = config.tools.map((t) => ({
      type: 'function',
      name: t.name,
      description: t.description,
      // Tools without a declared schema are advertised as taking an empty object.
      parameters: t.inputSchema ?? { type: 'object', properties: {} },
    }))
    sessionUpdate.tool_choice = 'auto'
  }

  // An empty list carries no preference. Leave the field out instead of
  // letting the "no audio => text" fallback below silently switch the session
  // to text-only output.
  const modalities = config.outputModalities
  if (modalities && modalities.length > 0) {
    // GA only supports a single output modality: ['audio'] or ['text']
    // (Beta accepted ['audio', 'text']). Audio replies still stream text
    // via `response.output_audio_transcript.*` events, so collapsing
    // ['audio', 'text'] to ['audio'] preserves the visible behavior.
    sessionUpdate.output_modalities = modalities.includes('audio')
      ? ['audio']
      : ['text']
  }

  if (config.maxOutputTokens !== undefined) {
    sessionUpdate.max_output_tokens = config.maxOutputTokens
  }

  return sessionUpdate
}
Loading