# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

from __future__ import annotations

from typing import List, Union, Iterable
from typing_extensions import Literal

import httpx

from .... import _legacy_response
from ...._types import NOT_GIVEN, Body, Query, Headers, NotGiven
from ...._utils import maybe_transform, async_maybe_transform
from ...._compat import cached_property
from ...._resource import SyncAPIResource, AsyncAPIResource
from ...._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
from ...._base_client import make_request_options
from ....types.beta.realtime import session_create_params
from ....types.beta.realtime.session_create_response import SessionCreateResponse

__all__ = ["Sessions", "AsyncSessions"]


class Sessions(SyncAPIResource):
    @cached_property
    def with_raw_response(self) -> SessionsWithRawResponse:
        """
        This property can be used as a prefix for any HTTP method call to return
        the raw response object instead of the parsed content.

        For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
        """
        return SessionsWithRawResponse(self)

    @cached_property
    def with_streaming_response(self) -> SessionsWithStreamingResponse:
        """
        An alternative to `.with_raw_response` that doesn't eagerly read the response body.

        For more information, see https://www.github.com/openai/openai-python#with_streaming_response
        """
        return SessionsWithStreamingResponse(self)

    def create(
        self,
        *,
        client_secret: session_create_params.ClientSecret | NotGiven = NOT_GIVEN,
        input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
        input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN,
        input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
        instructions: str | NotGiven = NOT_GIVEN,
        max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN,
        modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
        model: Literal[
            "gpt-realtime",
            "gpt-realtime-2025-08-28",
            "gpt-4o-realtime-preview",
            "gpt-4o-realtime-preview-2024-10-01",
            "gpt-4o-realtime-preview-2024-12-17",
            "gpt-4o-realtime-preview-2025-06-03",
            "gpt-4o-mini-realtime-preview",
            "gpt-4o-mini-realtime-preview-2024-12-17",
        ]
        | NotGiven = NOT_GIVEN,
        output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
        speed: float | NotGiven = NOT_GIVEN,
        temperature: float | NotGiven = NOT_GIVEN,
        tool_choice: str | NotGiven = NOT_GIVEN,
        tools: Iterable[session_create_params.Tool] | NotGiven = NOT_GIVEN,
        tracing: session_create_params.Tracing | NotGiven = NOT_GIVEN,
        turn_detection: session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
        voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]]
        | NotGiven = NOT_GIVEN,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> SessionCreateResponse:
        """
        Create an ephemeral API token for use in client-side applications with the
        Realtime API. Can be configured with the same session parameters as the
        `session.update` client event.

        It responds with a session object, plus a `client_secret` key which contains a
        usable ephemeral API token that can be used to authenticate browser clients for
        the Realtime API.

        Args:
          client_secret: Configuration options for the generated client secret.

          input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
              `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
              (mono), and little-endian byte order.

          input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
              off. Noise reduction filters audio added to the input audio buffer before it is
              sent to VAD and the model. Filtering the audio can improve VAD and turn
              detection accuracy (reducing false positives) and model performance by improving
              perception of the input audio.

          input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to
              `null` to turn off once on. Input audio transcription is not native to the
              model, since the model consumes audio directly. Transcription runs
              asynchronously through
              [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
              and should be treated as guidance of input audio content rather than precisely
              what the model heard. The client can optionally set the language and prompt for
              transcription; these offer additional guidance to the transcription service.

          instructions: The default system instructions (i.e. system message) prepended to model calls.
              This field allows the client to guide the model on desired responses. The model
              can be instructed on response content and format (e.g. "be extremely succinct",
              "act friendly", "here are examples of good responses") and on audio behavior
              (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
              instructions are not guaranteed to be followed by the model, but they provide
              guidance to the model on the desired behavior.

              Note that the server sets default instructions which will be used if this field
              is not set and are visible in the `session.created` event at the start of the
              session.

          max_response_output_tokens: Maximum number of output tokens for a single assistant response, inclusive of
              tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
              `inf` for the maximum available tokens for a given model. Defaults to `inf`.

          modalities: The set of modalities the model can respond with. To disable audio, set this to
              ["text"].

          model: The Realtime model used for this session.

          output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
              For `pcm16`, output audio is sampled at a rate of 24kHz.

          speed: The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
              minimum speed. 1.5 is the maximum speed. This value can only be changed in
              between model turns, not while a response is in progress.

          temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
              temperature of 0.8 is highly recommended for best performance.

          tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify
              a function.

          tools: Tools (functions) available to the model.

          tracing: Configuration options for tracing. Set to null to disable tracing. Once tracing
              is enabled for a session, the configuration cannot be modified.

              `auto` will create a trace for the session with default values for the workflow
              name, group id, and metadata.

          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
              set to `null` to turn off, in which case the client must manually trigger model
              response. Server VAD means that the model will detect the start and end of
              speech based on audio volume and respond at the end of user speech. Semantic VAD
              is more advanced and uses a turn detection model (in conjunction with VAD) to
              semantically estimate whether the user has finished speaking, then dynamically
              sets a timeout based on this probability. For example, if user audio trails off
              with "uhhm", the model will score a low probability of turn end and wait longer
              for the user to continue speaking. This can be useful for more natural
              conversations, but may have a higher latency.

          voice: The voice the model uses to respond. Voice cannot be changed during the session
              once the model has responded with audio at least once. Current voice options are
              `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
        """
        extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
        return self._post(
            "/realtime/sessions",
            body=maybe_transform(
                {
                    "client_secret": client_secret,
                    "input_audio_format": input_audio_format,
                    "input_audio_noise_reduction": input_audio_noise_reduction,
                    "input_audio_transcription": input_audio_transcription,
                    "instructions": instructions,
                    "max_response_output_tokens": max_response_output_tokens,
                    "modalities": modalities,
                    "model": model,
                    "output_audio_format": output_audio_format,
                    "speed": speed,
                    "temperature": temperature,
                    "tool_choice": tool_choice,
                    "tools": tools,
                    "tracing": tracing,
                    "turn_detection": turn_detection,
                    "voice": voice,
                },
                session_create_params.SessionCreateParams,
            ),
            options=make_request_options(
                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
            ),
            cast_to=SessionCreateResponse,
        )


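# A minimal usage sketch for the synchronous resource above, kept as a comment so
# this generated module is unchanged at import time. It assumes an `OpenAI` client
# configured with an API key (e.g. via the OPENAI_API_KEY environment variable) and
# that this resource is exposed as `client.beta.realtime.sessions`:
#
#     from openai import OpenAI
#
#     client = OpenAI()
#     session = client.beta.realtime.sessions.create(
#         model="gpt-realtime",
#         modalities=["text", "audio"],
#         voice="alloy",
#     )
#     # The ephemeral token for browser clients is carried on the response's
#     # `client_secret` field.
#     print(session.client_secret)

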
class AsyncSessions(AsyncAPIResource):
    @cached_property
    def with_raw_response(self) -> AsyncSessionsWithRawResponse:
        """
        This property can be used as a prefix for any HTTP method call to return
        the raw response object instead of the parsed content.

        For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
        """
        return AsyncSessionsWithRawResponse(self)

    @cached_property
    def with_streaming_response(self) -> AsyncSessionsWithStreamingResponse:
        """
        An alternative to `.with_raw_response` that doesn't eagerly read the response body.

        For more information, see https://www.github.com/openai/openai-python#with_streaming_response
        """
        return AsyncSessionsWithStreamingResponse(self)

    async def create(
        self,
        *,
        client_secret: session_create_params.ClientSecret | NotGiven = NOT_GIVEN,
        input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
        input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN,
        input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
        instructions: str | NotGiven = NOT_GIVEN,
        max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN,
        modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
        model: Literal[
            "gpt-realtime",
            "gpt-realtime-2025-08-28",
            "gpt-4o-realtime-preview",
            "gpt-4o-realtime-preview-2024-10-01",
            "gpt-4o-realtime-preview-2024-12-17",
            "gpt-4o-realtime-preview-2025-06-03",
            "gpt-4o-mini-realtime-preview",
            "gpt-4o-mini-realtime-preview-2024-12-17",
        ]
        | NotGiven = NOT_GIVEN,
        output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
        speed: float | NotGiven = NOT_GIVEN,
        temperature: float | NotGiven = NOT_GIVEN,
        tool_choice: str | NotGiven = NOT_GIVEN,
        tools: Iterable[session_create_params.Tool] | NotGiven = NOT_GIVEN,
        tracing: session_create_params.Tracing | NotGiven = NOT_GIVEN,
        turn_detection: session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
        voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]]
        | NotGiven = NOT_GIVEN,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> SessionCreateResponse:
        """
        Create an ephemeral API token for use in client-side applications with the
        Realtime API. Can be configured with the same session parameters as the
        `session.update` client event.

        It responds with a session object, plus a `client_secret` key which contains a
        usable ephemeral API token that can be used to authenticate browser clients for
        the Realtime API.

        Args:
          client_secret: Configuration options for the generated client secret.

          input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
              `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
              (mono), and little-endian byte order.

          input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
              off. Noise reduction filters audio added to the input audio buffer before it is
              sent to VAD and the model. Filtering the audio can improve VAD and turn
              detection accuracy (reducing false positives) and model performance by improving
              perception of the input audio.

          input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to
              `null` to turn off once on. Input audio transcription is not native to the
              model, since the model consumes audio directly. Transcription runs
              asynchronously through
              [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
              and should be treated as guidance of input audio content rather than precisely
              what the model heard. The client can optionally set the language and prompt for
              transcription; these offer additional guidance to the transcription service.

          instructions: The default system instructions (i.e. system message) prepended to model calls.
              This field allows the client to guide the model on desired responses. The model
              can be instructed on response content and format (e.g. "be extremely succinct",
              "act friendly", "here are examples of good responses") and on audio behavior
              (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
              instructions are not guaranteed to be followed by the model, but they provide
              guidance to the model on the desired behavior.

              Note that the server sets default instructions which will be used if this field
              is not set and are visible in the `session.created` event at the start of the
              session.

          max_response_output_tokens: Maximum number of output tokens for a single assistant response, inclusive of
              tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
              `inf` for the maximum available tokens for a given model. Defaults to `inf`.

          modalities: The set of modalities the model can respond with. To disable audio, set this to
              ["text"].

          model: The Realtime model used for this session.

          output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
              For `pcm16`, output audio is sampled at a rate of 24kHz.

          speed: The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
              minimum speed. 1.5 is the maximum speed. This value can only be changed in
              between model turns, not while a response is in progress.

          temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
              temperature of 0.8 is highly recommended for best performance.

          tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify
              a function.

          tools: Tools (functions) available to the model.

          tracing: Configuration options for tracing. Set to null to disable tracing. Once tracing
              is enabled for a session, the configuration cannot be modified.

              `auto` will create a trace for the session with default values for the workflow
              name, group id, and metadata.

          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
              set to `null` to turn off, in which case the client must manually trigger model
              response. Server VAD means that the model will detect the start and end of
              speech based on audio volume and respond at the end of user speech. Semantic VAD
              is more advanced and uses a turn detection model (in conjunction with VAD) to
              semantically estimate whether the user has finished speaking, then dynamically
              sets a timeout based on this probability. For example, if user audio trails off
              with "uhhm", the model will score a low probability of turn end and wait longer
              for the user to continue speaking. This can be useful for more natural
              conversations, but may have a higher latency.

          voice: The voice the model uses to respond. Voice cannot be changed during the session
              once the model has responded with audio at least once. Current voice options are
              `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
        """
        extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
        return await self._post(
            "/realtime/sessions",
            body=await async_maybe_transform(
                {
                    "client_secret": client_secret,
                    "input_audio_format": input_audio_format,
                    "input_audio_noise_reduction": input_audio_noise_reduction,
                    "input_audio_transcription": input_audio_transcription,
                    "instructions": instructions,
                    "max_response_output_tokens": max_response_output_tokens,
                    "modalities": modalities,
                    "model": model,
                    "output_audio_format": output_audio_format,
                    "speed": speed,
                    "temperature": temperature,
                    "tool_choice": tool_choice,
                    "tools": tools,
                    "tracing": tracing,
                    "turn_detection": turn_detection,
                    "voice": voice,
                },
                session_create_params.SessionCreateParams,
            ),
            options=make_request_options(
                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
            ),
            cast_to=SessionCreateResponse,
        )


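# An async counterpart sketch (same assumptions as the synchronous example above,
# using `AsyncOpenAI` and awaiting the call):
#
#     import asyncio
#
#     from openai import AsyncOpenAI
#
#     async def main() -> None:
#         client = AsyncOpenAI()
#         session = await client.beta.realtime.sessions.create(model="gpt-realtime")
#         print(session.client_secret)
#
#     asyncio.run(main())

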
class SessionsWithRawResponse:
    def __init__(self, sessions: Sessions) -> None:
        self._sessions = sessions

        self.create = _legacy_response.to_raw_response_wrapper(
            sessions.create,
        )


class AsyncSessionsWithRawResponse:
    def __init__(self, sessions: AsyncSessions) -> None:
        self._sessions = sessions

        self.create = _legacy_response.async_to_raw_response_wrapper(
            sessions.create,
        )


class SessionsWithStreamingResponse:
    def __init__(self, sessions: Sessions) -> None:
        self._sessions = sessions

        self.create = to_streamed_response_wrapper(
            sessions.create,
        )


class AsyncSessionsWithStreamingResponse:
    def __init__(self, sessions: AsyncSessions) -> None:
        self._sessions = sessions

        self.create = async_to_streamed_response_wrapper(
            sessions.create,
        )
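

# A hedged sketch of how the wrapper classes above surface through the
# `.with_raw_response` and `.with_streaming_response` accessors (assumes the same
# configured `OpenAI` client as the earlier examples):
#
#     client = OpenAI()
#
#     # Raw response: headers are available up front and `.parse()` returns the
#     # typed SessionCreateResponse.
#     raw = client.beta.realtime.sessions.with_raw_response.create(model="gpt-realtime")
#     print(raw.headers.get("x-request-id"))
#     session = raw.parse()
#
#     # Streaming response: the body is not read eagerly; use it as a context manager.
#     with client.beta.realtime.sessions.with_streaming_response.create(
#         model="gpt-realtime",
#     ) as response:
#         print(response.headers.get("x-request-id"))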