# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

from __future__ import annotations

from typing import List
from typing_extensions import Literal

import httpx

from .... import _legacy_response
from ...._types import NOT_GIVEN, Body, Query, Headers, NotGiven
from ...._utils import maybe_transform, async_maybe_transform
from ...._compat import cached_property
from ...._resource import SyncAPIResource, AsyncAPIResource
from ...._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
from ...._base_client import make_request_options
from ....types.beta.realtime import transcription_session_create_params
from ....types.beta.realtime.transcription_session import TranscriptionSession

__all__ = ["TranscriptionSessions", "AsyncTranscriptionSessions"]


class TranscriptionSessions(SyncAPIResource):
    @cached_property
    def with_raw_response(self) -> TranscriptionSessionsWithRawResponse:
        """
        This property can be used as a prefix for any HTTP method call to return
        the raw response object instead of the parsed content.

        For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
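
        A minimal usage sketch (assumes a configured `client = OpenAI()`; the
        raw response exposes HTTP headers while `.parse()` still returns the
        typed object):

            response = client.beta.realtime.transcription_sessions.with_raw_response.create()
            print(response.headers)  # raw HTTP headers
            session = response.parse()  # parsed TranscriptionSession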
31 """
32 return TranscriptionSessionsWithRawResponse(self)
33
34 @cached_property
35 def with_streaming_response(self) -> TranscriptionSessionsWithStreamingResponse:
36 """
37 An alternative to `.with_raw_response` that doesn't eagerly read the response body.
38
39 For more information, see https://www.github.com/openai/openai-python#with_streaming_response
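
        A minimal usage sketch (assumes a configured `client = OpenAI()`). The
        wrapper returns a context manager, so the body is only read on demand:

            with client.beta.realtime.transcription_sessions.with_streaming_response.create() as response:
                session = response.parse()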
40 """
41 return TranscriptionSessionsWithStreamingResponse(self)
42
43 def create(
44 self,
45 *,
46 client_secret: transcription_session_create_params.ClientSecret | NotGiven = NOT_GIVEN,
47 include: List[str] | NotGiven = NOT_GIVEN,
48 input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
49 input_audio_noise_reduction: transcription_session_create_params.InputAudioNoiseReduction
50 | NotGiven = NOT_GIVEN,
51 input_audio_transcription: transcription_session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
52 modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
53 turn_detection: transcription_session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
54 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
55 # The extra values given here take precedence over values defined on the client or passed to this method.
56 extra_headers: Headers | None = None,
57 extra_query: Query | None = None,
58 extra_body: Body | None = None,
59 timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
60 ) -> TranscriptionSession:
61 """
62 Create an ephemeral API token for use in client-side applications with the
63 Realtime API specifically for realtime transcriptions. Can be configured with
64 the same session parameters as the `transcription_session.update` client event.
65
66 It responds with a session object, plus a `client_secret` key which contains a
67 usable ephemeral API token that can be used to authenticate browser clients for
68 the Realtime API.
69
70 Args:
71 client_secret: Configuration options for the generated client secret.
72
          include:
              The set of items to include in the transcription. Currently available items are:

              - `item.input_audio_transcription.logprobs`

          input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
              `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
              (mono), and little-endian byte order.

          input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
              off. Noise reduction filters audio added to the input audio buffer before it is
              sent to VAD and the model. Filtering the audio can improve VAD and turn
              detection accuracy (reducing false positives) and model performance by improving
              perception of the input audio.

          input_audio_transcription: Configuration for input audio transcription. The client can optionally set the
              language and prompt for transcription; these offer additional guidance to the
              transcription service.

          modalities: The set of modalities the model can respond with. To disable audio, set this to
              ["text"].

          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
              set to `null` to turn off, in which case the client must manually trigger model
              response. Server VAD means that the model will detect the start and end of
              speech based on audio volume and respond at the end of user speech. Semantic VAD
              is more advanced and uses a turn detection model (in conjunction with VAD) to
              semantically estimate whether the user has finished speaking, then dynamically
              sets a timeout based on this probability. For example, if user audio trails off
              with "uhhm", the model will score a low probability of turn end and wait longer
              for the user to continue speaking. This can be useful for more natural
              conversations, but may have a higher latency.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
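
        A minimal usage sketch (assumes a configured `client = OpenAI()`; the
        parameter values shown are illustrative, not required):

            session = client.beta.realtime.transcription_sessions.create(
                input_audio_format="pcm16",
                turn_detection={"type": "server_vad"},
            )
            token = session.client_secret.value  # ephemeral token for the browser client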
113 """
114 extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
115 return self._post(
116 "/realtime/transcription_sessions",
117 body=maybe_transform(
118 {
119 "client_secret": client_secret,
120 "include": include,
121 "input_audio_format": input_audio_format,
122 "input_audio_noise_reduction": input_audio_noise_reduction,
123 "input_audio_transcription": input_audio_transcription,
124 "modalities": modalities,
125 "turn_detection": turn_detection,
126 },
127 transcription_session_create_params.TranscriptionSessionCreateParams,
128 ),
129 options=make_request_options(
130 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
131 ),
132 cast_to=TranscriptionSession,
133 )
134
135
136class AsyncTranscriptionSessions(AsyncAPIResource):
137 @cached_property
138 def with_raw_response(self) -> AsyncTranscriptionSessionsWithRawResponse:
139 """
140 This property can be used as a prefix for any HTTP method call to return
141 the raw response object instead of the parsed content.
142
143 For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
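
        A minimal usage sketch (assumes a configured `client = AsyncOpenAI()`):

            response = await client.beta.realtime.transcription_sessions.with_raw_response.create()
            print(response.headers)  # raw HTTP headers
            session = response.parse()  # parsed TranscriptionSession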
144 """
145 return AsyncTranscriptionSessionsWithRawResponse(self)
146
147 @cached_property
148 def with_streaming_response(self) -> AsyncTranscriptionSessionsWithStreamingResponse:
149 """
150 An alternative to `.with_raw_response` that doesn't eagerly read the response body.
151
152 For more information, see https://www.github.com/openai/openai-python#with_streaming_response
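
        A minimal usage sketch (assumes a configured `client = AsyncOpenAI()`).
        The wrapper returns an async context manager, so the body is only read
        on demand:

            async with client.beta.realtime.transcription_sessions.with_streaming_response.create() as response:
                session = await response.parse()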
153 """
154 return AsyncTranscriptionSessionsWithStreamingResponse(self)
155
156 async def create(
157 self,
158 *,
159 client_secret: transcription_session_create_params.ClientSecret | NotGiven = NOT_GIVEN,
160 include: List[str] | NotGiven = NOT_GIVEN,
161 input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
162 input_audio_noise_reduction: transcription_session_create_params.InputAudioNoiseReduction
163 | NotGiven = NOT_GIVEN,
164 input_audio_transcription: transcription_session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
165 modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
166 turn_detection: transcription_session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
167 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
168 # The extra values given here take precedence over values defined on the client or passed to this method.
169 extra_headers: Headers | None = None,
170 extra_query: Query | None = None,
171 extra_body: Body | None = None,
172 timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
173 ) -> TranscriptionSession:
174 """
175 Create an ephemeral API token for use in client-side applications with the
176 Realtime API specifically for realtime transcriptions. Can be configured with
177 the same session parameters as the `transcription_session.update` client event.
178
179 It responds with a session object, plus a `client_secret` key which contains a
180 usable ephemeral API token that can be used to authenticate browser clients for
181 the Realtime API.
182
183 Args:
184 client_secret: Configuration options for the generated client secret.
185
          include:
              The set of items to include in the transcription. Currently available items are:

              - `item.input_audio_transcription.logprobs`

          input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
              `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
              (mono), and little-endian byte order.

          input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
              off. Noise reduction filters audio added to the input audio buffer before it is
              sent to VAD and the model. Filtering the audio can improve VAD and turn
              detection accuracy (reducing false positives) and model performance by improving
              perception of the input audio.

          input_audio_transcription: Configuration for input audio transcription. The client can optionally set the
              language and prompt for transcription; these offer additional guidance to the
              transcription service.

          modalities: The set of modalities the model can respond with. To disable audio, set this to
              ["text"].

          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
              set to `null` to turn off, in which case the client must manually trigger model
              response. Server VAD means that the model will detect the start and end of
              speech based on audio volume and respond at the end of user speech. Semantic VAD
              is more advanced and uses a turn detection model (in conjunction with VAD) to
              semantically estimate whether the user has finished speaking, then dynamically
              sets a timeout based on this probability. For example, if user audio trails off
              with "uhhm", the model will score a low probability of turn end and wait longer
              for the user to continue speaking. This can be useful for more natural
              conversations, but may have a higher latency.

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds
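
        A minimal usage sketch (assumes a configured `client = AsyncOpenAI()`;
        the parameter values shown are illustrative, not required):

            session = await client.beta.realtime.transcription_sessions.create(
                input_audio_format="pcm16",
                turn_detection={"type": "server_vad"},
            )
            token = session.client_secret.value  # ephemeral token for the browser client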
226 """
227 extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
228 return await self._post(
229 "/realtime/transcription_sessions",
230 body=await async_maybe_transform(
231 {
232 "client_secret": client_secret,
233 "include": include,
234 "input_audio_format": input_audio_format,
235 "input_audio_noise_reduction": input_audio_noise_reduction,
236 "input_audio_transcription": input_audio_transcription,
237 "modalities": modalities,
238 "turn_detection": turn_detection,
239 },
240 transcription_session_create_params.TranscriptionSessionCreateParams,
241 ),
242 options=make_request_options(
243 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
244 ),
245 cast_to=TranscriptionSession,
246 )
247
248
249class TranscriptionSessionsWithRawResponse:
250 def __init__(self, transcription_sessions: TranscriptionSessions) -> None:
251 self._transcription_sessions = transcription_sessions
252
253 self.create = _legacy_response.to_raw_response_wrapper(
254 transcription_sessions.create,
255 )
256
257
258class AsyncTranscriptionSessionsWithRawResponse:
259 def __init__(self, transcription_sessions: AsyncTranscriptionSessions) -> None:
260 self._transcription_sessions = transcription_sessions
261
262 self.create = _legacy_response.async_to_raw_response_wrapper(
263 transcription_sessions.create,
264 )
265
266
267class TranscriptionSessionsWithStreamingResponse:
268 def __init__(self, transcription_sessions: TranscriptionSessions) -> None:
269 self._transcription_sessions = transcription_sessions
270
271 self.create = to_streamed_response_wrapper(
272 transcription_sessions.create,
273 )
274
275
276class AsyncTranscriptionSessionsWithStreamingResponse:
277 def __init__(self, transcription_sessions: AsyncTranscriptionSessions) -> None:
278 self._transcription_sessions = transcription_sessions
279
280 self.create = async_to_streamed_response_wrapper(
281 transcription_sessions.create,
282 )