# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

from __future__ import annotations

from typing import List, Union, Iterable
from typing_extensions import Literal

import httpx

from .... import _legacy_response
from ...._types import NOT_GIVEN, Body, Query, Headers, NotGiven
from ...._utils import maybe_transform, async_maybe_transform
from ...._compat import cached_property
from ...._resource import SyncAPIResource, AsyncAPIResource
from ...._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
from ...._base_client import make_request_options
from ....types.beta.realtime import session_create_params
from ....types.beta.realtime.session_create_response import SessionCreateResponse

__all__ = ["Sessions", "AsyncSessions"]


class Sessions(SyncAPIResource):
    @cached_property
    def with_raw_response(self) -> SessionsWithRawResponse:
        """
        This property can be used as a prefix for any HTTP method call to return
        the raw response object instead of the parsed content.

        For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
        """
        return SessionsWithRawResponse(self)

    @cached_property
    def with_streaming_response(self) -> SessionsWithStreamingResponse:
        """
        An alternative to `.with_raw_response` that doesn't eagerly read the response body.

        For more information, see https://www.github.com/openai/openai-python#with_streaming_response
        """
        return SessionsWithStreamingResponse(self)

    def create(
        self,
        *,
        client_secret: session_create_params.ClientSecret | NotGiven = NOT_GIVEN,
        input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
        input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN,
        input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
        instructions: str | NotGiven = NOT_GIVEN,
        max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN,
        modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
        model: Literal[
            "gpt-realtime",
            "gpt-realtime-2025-08-28",
            "gpt-4o-realtime-preview",
            "gpt-4o-realtime-preview-2024-10-01",
            "gpt-4o-realtime-preview-2024-12-17",
            "gpt-4o-realtime-preview-2025-06-03",
            "gpt-4o-mini-realtime-preview",
            "gpt-4o-mini-realtime-preview-2024-12-17",
        ]
        | NotGiven = NOT_GIVEN,
        output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
        speed: float | NotGiven = NOT_GIVEN,
        temperature: float | NotGiven = NOT_GIVEN,
        tool_choice: str | NotGiven = NOT_GIVEN,
        tools: Iterable[session_create_params.Tool] | NotGiven = NOT_GIVEN,
        tracing: session_create_params.Tracing | NotGiven = NOT_GIVEN,
        turn_detection: session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
        voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]]
        | NotGiven = NOT_GIVEN,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> SessionCreateResponse:
        """
        Create an ephemeral API token for use in client-side applications with the
        Realtime API. Can be configured with the same session parameters as the
        `session.update` client event.

        It responds with a session object, plus a `client_secret` key which contains a
        usable ephemeral API token that can be used to authenticate browser clients for
        the Realtime API.

        Args:
          client_secret: Configuration options for the generated client secret.

          input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
              `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
              (mono), and little-endian byte order.

          input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
              off. Noise reduction filters audio added to the input audio buffer before it is
              sent to VAD and the model. Filtering the audio can improve VAD and turn
              detection accuracy (reducing false positives) and model performance by improving
              perception of the input audio.

          input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to
              `null` to turn off once on. Input audio transcription is not native to the
              model, since the model consumes audio directly. Transcription runs
              asynchronously through
              [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
              and should be treated as guidance of input audio content rather than precisely
              what the model heard. The client can optionally set the language and prompt for
              transcription; these offer additional guidance to the transcription service.

          instructions: The default system instructions (i.e. system message) prepended to model calls.
              This field allows the client to guide the model on desired responses. The model
              can be instructed on response content and format (e.g. "be extremely succinct",
              "act friendly", "here are examples of good responses") and on audio behavior
              (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
              instructions are not guaranteed to be followed by the model, but they provide
              guidance to the model on the desired behavior.

              Note that the server sets default instructions which will be used if this field
              is not set and are visible in the `session.created` event at the start of the
              session.

          max_response_output_tokens: Maximum number of output tokens for a single assistant response, inclusive of
              tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
              `inf` for the maximum available tokens for a given model. Defaults to `inf`.

          modalities: The set of modalities the model can respond with. To disable audio, set this to
              ["text"].

          model: The Realtime model used for this session.

          output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
              For `pcm16`, output audio is sampled at a rate of 24kHz.

          speed: The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
              minimum speed. 1.5 is the maximum speed. This value can only be changed in
              between model turns, not while a response is in progress.

          temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
              temperature of 0.8 is highly recommended for best performance.

          tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify
              a function.

          tools: Tools (functions) available to the model.

          tracing: Configuration options for tracing. Set to null to disable tracing. Once tracing
              is enabled for a session, the configuration cannot be modified.

              `auto` will create a trace for the session with default values for the workflow
              name, group id, and metadata.

          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
              set to `null` to turn off, in which case the client must manually trigger model
              response. Server VAD means that the model will detect the start and end of
              speech based on audio volume and respond at the end of user speech. Semantic VAD
              is more advanced and uses a turn detection model (in conjunction with VAD) to
              semantically estimate whether the user has finished speaking, then dynamically
              sets a timeout based on this probability. For example, if user audio trails off
              with "uhhm", the model will score a low probability of turn end and wait longer
              for the user to continue speaking. This can be useful for more natural
              conversations, but may have a higher latency.

          voice: The voice the model uses to respond. Voice cannot be changed during the session
              once the model has responded with audio at least once. Current voice options are
              `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
        """
        extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
        return self._post(
            "/realtime/sessions",
            body=maybe_transform(
                {
                    "client_secret": client_secret,
                    "input_audio_format": input_audio_format,
                    "input_audio_noise_reduction": input_audio_noise_reduction,
                    "input_audio_transcription": input_audio_transcription,
                    "instructions": instructions,
                    "max_response_output_tokens": max_response_output_tokens,
                    "modalities": modalities,
                    "model": model,
                    "output_audio_format": output_audio_format,
                    "speed": speed,
                    "temperature": temperature,
                    "tool_choice": tool_choice,
                    "tools": tools,
                    "tracing": tracing,
                    "turn_detection": turn_detection,
                    "voice": voice,
                },
                session_create_params.SessionCreateParams,
            ),
            options=make_request_options(
                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
            ),
            cast_to=SessionCreateResponse,
        )


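# A minimal usage sketch for the synchronous resource above, kept as a comment so
# this generated module is unchanged at import time. It assumes an `OpenAI` client
# configured with an API key (e.g. via the OPENAI_API_KEY environment variable) and
# that this resource is exposed as `client.beta.realtime.sessions`:
#
#     from openai import OpenAI
#
#     client = OpenAI()
#     session = client.beta.realtime.sessions.create(
#         model="gpt-realtime",
#         modalities=["text", "audio"],
#         voice="alloy",
#     )
#     # The ephemeral token for browser clients is carried on the response's
#     # `client_secret` field.
#     print(session.client_secret)

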
class AsyncSessions(AsyncAPIResource):
    @cached_property
    def with_raw_response(self) -> AsyncSessionsWithRawResponse:
        """
        This property can be used as a prefix for any HTTP method call to return
        the raw response object instead of the parsed content.

        For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
        """
        return AsyncSessionsWithRawResponse(self)

    @cached_property
    def with_streaming_response(self) -> AsyncSessionsWithStreamingResponse:
        """
        An alternative to `.with_raw_response` that doesn't eagerly read the response body.

        For more information, see https://www.github.com/openai/openai-python#with_streaming_response
        """
        return AsyncSessionsWithStreamingResponse(self)

    async def create(
        self,
        *,
        client_secret: session_create_params.ClientSecret | NotGiven = NOT_GIVEN,
        input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
        input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN,
        input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
        instructions: str | NotGiven = NOT_GIVEN,
        max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN,
        modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
        model: Literal[
            "gpt-realtime",
            "gpt-realtime-2025-08-28",
            "gpt-4o-realtime-preview",
            "gpt-4o-realtime-preview-2024-10-01",
            "gpt-4o-realtime-preview-2024-12-17",
            "gpt-4o-realtime-preview-2025-06-03",
            "gpt-4o-mini-realtime-preview",
            "gpt-4o-mini-realtime-preview-2024-12-17",
        ]
        | NotGiven = NOT_GIVEN,
        output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
        speed: float | NotGiven = NOT_GIVEN,
        temperature: float | NotGiven = NOT_GIVEN,
        tool_choice: str | NotGiven = NOT_GIVEN,
        tools: Iterable[session_create_params.Tool] | NotGiven = NOT_GIVEN,
        tracing: session_create_params.Tracing | NotGiven = NOT_GIVEN,
        turn_detection: session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
        voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]]
        | NotGiven = NOT_GIVEN,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> SessionCreateResponse:
        """
        Create an ephemeral API token for use in client-side applications with the
        Realtime API. Can be configured with the same session parameters as the
        `session.update` client event.

        It responds with a session object, plus a `client_secret` key which contains a
        usable ephemeral API token that can be used to authenticate browser clients for
        the Realtime API.

        Args:
          client_secret: Configuration options for the generated client secret.

          input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
              `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
              (mono), and little-endian byte order.

          input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
              off. Noise reduction filters audio added to the input audio buffer before it is
              sent to VAD and the model. Filtering the audio can improve VAD and turn
              detection accuracy (reducing false positives) and model performance by improving
              perception of the input audio.

          input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to
              `null` to turn off once on. Input audio transcription is not native to the
              model, since the model consumes audio directly. Transcription runs
              asynchronously through
              [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
              and should be treated as guidance of input audio content rather than precisely
              what the model heard. The client can optionally set the language and prompt for
              transcription; these offer additional guidance to the transcription service.

          instructions: The default system instructions (i.e. system message) prepended to model calls.
              This field allows the client to guide the model on desired responses. The model
              can be instructed on response content and format (e.g. "be extremely succinct",
              "act friendly", "here are examples of good responses") and on audio behavior
              (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
              instructions are not guaranteed to be followed by the model, but they provide
              guidance to the model on the desired behavior.

              Note that the server sets default instructions which will be used if this field
              is not set and are visible in the `session.created` event at the start of the
              session.

          max_response_output_tokens: Maximum number of output tokens for a single assistant response, inclusive of
              tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
              `inf` for the maximum available tokens for a given model. Defaults to `inf`.

          modalities: The set of modalities the model can respond with. To disable audio, set this to
              ["text"].

          model: The Realtime model used for this session.

          output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
              For `pcm16`, output audio is sampled at a rate of 24kHz.

          speed: The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
              minimum speed. 1.5 is the maximum speed. This value can only be changed in
              between model turns, not while a response is in progress.

          temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
              temperature of 0.8 is highly recommended for best performance.

          tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify
              a function.

          tools: Tools (functions) available to the model.

          tracing: Configuration options for tracing. Set to null to disable tracing. Once tracing
              is enabled for a session, the configuration cannot be modified.

              `auto` will create a trace for the session with default values for the workflow
              name, group id, and metadata.

          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
              set to `null` to turn off, in which case the client must manually trigger model
              response. Server VAD means that the model will detect the start and end of
              speech based on audio volume and respond at the end of user speech. Semantic VAD
              is more advanced and uses a turn detection model (in conjunction with VAD) to
              semantically estimate whether the user has finished speaking, then dynamically
              sets a timeout based on this probability. For example, if user audio trails off
              with "uhhm", the model will score a low probability of turn end and wait longer
              for the user to continue speaking. This can be useful for more natural
              conversations, but may have a higher latency.

          voice: The voice the model uses to respond. Voice cannot be changed during the session
              once the model has responded with audio at least once. Current voice options are
              `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
        """
        extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
        return await self._post(
            "/realtime/sessions",
            body=await async_maybe_transform(
                {
                    "client_secret": client_secret,
                    "input_audio_format": input_audio_format,
                    "input_audio_noise_reduction": input_audio_noise_reduction,
                    "input_audio_transcription": input_audio_transcription,
                    "instructions": instructions,
                    "max_response_output_tokens": max_response_output_tokens,
                    "modalities": modalities,
                    "model": model,
                    "output_audio_format": output_audio_format,
                    "speed": speed,
                    "temperature": temperature,
                    "tool_choice": tool_choice,
                    "tools": tools,
                    "tracing": tracing,
                    "turn_detection": turn_detection,
                    "voice": voice,
                },
                session_create_params.SessionCreateParams,
            ),
            options=make_request_options(
                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
            ),
            cast_to=SessionCreateResponse,
        )


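# An async counterpart sketch (same assumptions as the synchronous example above,
# using `AsyncOpenAI` and awaiting the call):
#
#     import asyncio
#
#     from openai import AsyncOpenAI
#
#     async def main() -> None:
#         client = AsyncOpenAI()
#         session = await client.beta.realtime.sessions.create(model="gpt-realtime")
#         print(session.client_secret)
#
#     asyncio.run(main())

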
class SessionsWithRawResponse:
    def __init__(self, sessions: Sessions) -> None:
        self._sessions = sessions

        self.create = _legacy_response.to_raw_response_wrapper(
            sessions.create,
        )


class AsyncSessionsWithRawResponse:
    def __init__(self, sessions: AsyncSessions) -> None:
        self._sessions = sessions

        self.create = _legacy_response.async_to_raw_response_wrapper(
            sessions.create,
        )


class SessionsWithStreamingResponse:
    def __init__(self, sessions: Sessions) -> None:
        self._sessions = sessions

        self.create = to_streamed_response_wrapper(
            sessions.create,
        )


class AsyncSessionsWithStreamingResponse:
    def __init__(self, sessions: AsyncSessions) -> None:
        self._sessions = sessions

        self.create = async_to_streamed_response_wrapper(
            sessions.create,
        )
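

# A hedged sketch of how the wrapper classes above surface through the
# `.with_raw_response` and `.with_streaming_response` accessors (assumes the same
# configured `OpenAI` client as the earlier examples):
#
#     client = OpenAI()
#
#     # Raw response: headers are available up front and `.parse()` returns the
#     # typed SessionCreateResponse.
#     raw = client.beta.realtime.sessions.with_raw_response.create(model="gpt-realtime")
#     print(raw.headers.get("x-request-id"))
#     session = raw.parse()
#
#     # Streaming response: the body is not read eagerly; use it as a context manager.
#     with client.beta.realtime.sessions.with_streaming_response.create(
#         model="gpt-realtime",
#     ) as response:
#         print(response.headers.get("x-request-id"))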