1# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
3from __future__ import annotations
4
5import logging
6from typing import TYPE_CHECKING, List, Union, Mapping, Optional, cast
7from typing_extensions import Literal, overload, assert_never
8
9import httpx
10
11from ... import _legacy_response
12from ..._types import (
13 Body,
14 Omit,
15 Query,
16 Headers,
17 NotGiven,
18 FileTypes,
19 SequenceNotStr,
20 omit,
21 not_given,
22)
23from ..._utils import extract_files, required_args, maybe_transform, deepcopy_minimal, async_maybe_transform
24from ..._compat import cached_property
25from ..._resource import SyncAPIResource, AsyncAPIResource
26from ..._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
27from ..._streaming import Stream, AsyncStream
28from ...types.audio import transcription_create_params
29from ..._base_client import make_request_options
30from ...types.audio_model import AudioModel
31from ...types.audio.transcription import Transcription
32from ...types.audio_response_format import AudioResponseFormat
33from ...types.audio.transcription_include import TranscriptionInclude
34from ...types.audio.transcription_verbose import TranscriptionVerbose
35from ...types.audio.transcription_diarized import TranscriptionDiarized
36from ...types.audio.transcription_stream_event import TranscriptionStreamEvent
37from ...types.audio.transcription_create_response import TranscriptionCreateResponse
38
39__all__ = ["Transcriptions", "AsyncTranscriptions"]
40
41log: logging.Logger = logging.getLogger("openai.audio.transcriptions")
42
43
44class Transcriptions(SyncAPIResource):
45 @cached_property
46 def with_raw_response(self) -> TranscriptionsWithRawResponse:
47 """
48 This property can be used as a prefix for any HTTP method call to return
49 the raw response object instead of the parsed content.
50
51 For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
52 """
53 return TranscriptionsWithRawResponse(self)
54
55 @cached_property
56 def with_streaming_response(self) -> TranscriptionsWithStreamingResponse:
57 """
58 An alternative to `.with_raw_response` that doesn't eagerly read the response body.
59
60 For more information, see https://www.github.com/openai/openai-python#with_streaming_response
61 """
62 return TranscriptionsWithStreamingResponse(self)
63
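# Usage sketch (illustrative, not generated by Stainless): accessing the raw or
# streamed HTTP response via the properties above. `client` is an already-configured
# `OpenAI` instance and `audio` an open binary file handle; both are placeholders.
#
#   raw = client.audio.transcriptions.with_raw_response.create(file=audio, model="whisper-1")
#   print(raw.headers.get("x-request-id"))
#   transcription = raw.parse()  # parsed Transcription object
#
#   with client.audio.transcriptions.with_streaming_response.create(
#       file=audio, model="whisper-1"
#   ) as response:
#       print(response.headers.get("x-request-id"))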
64 @overload
65 def create(
66 self,
67 *,
68 file: FileTypes,
69 model: Union[str, AudioModel],
70 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
71 include: List[TranscriptionInclude] | Omit = omit,
72 language: str | Omit = omit,
73 prompt: str | Omit = omit,
74 response_format: Union[Literal["json"], Omit] = omit,
75 stream: Optional[Literal[False]] | Omit = omit,
76 temperature: float | Omit = omit,
77 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
78 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
79 # The extra values given here take precedence over values defined on the client or passed to this method.
80 extra_headers: Headers | None = None,
81 extra_query: Query | None = None,
82 extra_body: Body | None = None,
83 timeout: float | httpx.Timeout | None | NotGiven = not_given,
84 ) -> Transcription:
85 """
86 Transcribes audio into the input language.
87
88 Args:
89 file:
90 The audio file object (not file name) to transcribe, in one of these formats:
91 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
92
93 model: ID of the model to use. The options are `gpt-4o-transcribe`,
94 `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
95 Whisper V2 model).
96
97 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
98 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
100 parameters manually. If unset, the audio is transcribed as a single block.
101
102 include: Additional information to include in the transcription response. `logprobs` will
103 return the log probabilities of the tokens in the response to understand the
104 model's confidence in the transcription. `logprobs` only works with
105 response_format set to `json` and only with the models `gpt-4o-transcribe` and
106 `gpt-4o-mini-transcribe`.
107
108 language: The language of the input audio. Supplying the input language in
109 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
110 format will improve accuracy and latency.
111
112 prompt: An optional text to guide the model's style or continue a previous audio
113 segment. The
114 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
115 should match the audio language.
116
117 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
118 `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
119 the only supported format is `json`.
120
121 stream: If set to true, the model response data will be streamed to the client as it is
122 generated using
123 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
124 See the
125 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
126 for more information.
127
128 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
129
130 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
131 output more random, while lower values like 0.2 will make it more focused and
132 deterministic. If set to 0, the model will use
133 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
134 automatically increase the temperature until certain thresholds are hit.
135
136 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
139 is no additional latency for segment timestamps, but generating word timestamps
140 incurs additional latency.
141
extra_headers: Send extra headers

extra_query: Add additional query parameters to the request

extra_body: Add additional JSON properties to the request

timeout: Override the client-level default timeout for this request, in seconds
"""
...
146
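# Usage sketch (illustrative, not generated by Stainless): a minimal non-streaming
# call matching the overload above. The `OpenAI` client construction and the file
# path are placeholders, not part of this module.
#
#   from openai import OpenAI
#
#   client = OpenAI()
#   with open("speech.mp3", "rb") as audio:
#       transcription = client.audio.transcriptions.create(file=audio, model="whisper-1")
#   print(transcription.text)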
147 @overload
148 def create(
149 self,
150 *,
151 file: FileTypes,
152 model: Union[str, AudioModel],
153 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
154 include: List[TranscriptionInclude] | Omit = omit,
155 response_format: Literal["verbose_json"],
156 language: str | Omit = omit,
157 prompt: str | Omit = omit,
158 temperature: float | Omit = omit,
159 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
160 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
161 # The extra values given here take precedence over values defined on the client or passed to this method.
162 extra_headers: Headers | None = None,
163 extra_query: Query | None = None,
164 extra_body: Body | None = None,
165 timeout: float | httpx.Timeout | None | NotGiven = not_given,
166 ) -> TranscriptionVerbose: ...
167
168 @overload
169 def create(
170 self,
171 *,
172 file: FileTypes,
173 model: Union[str, AudioModel],
174 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
175 response_format: Literal["text", "srt", "vtt"],
176 include: List[TranscriptionInclude] | Omit = omit,
177 language: str | Omit = omit,
178 prompt: str | Omit = omit,
179 temperature: float | Omit = omit,
180 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
181 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
182 # The extra values given here take precedence over values defined on the client or passed to this method.
183 extra_headers: Headers | None = None,
184 extra_query: Query | None = None,
185 extra_body: Body | None = None,
186 timeout: float | httpx.Timeout | None | NotGiven = not_given,
187 ) -> str: ...
188
189 @overload
190 def create(
191 self,
192 *,
193 file: FileTypes,
194 model: Union[str, AudioModel],
195 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
196 response_format: Literal["diarized_json"],
197 known_speaker_names: SequenceNotStr[str] | Omit = omit,
198 known_speaker_references: SequenceNotStr[str] | Omit = omit,
199 language: str | Omit = omit,
200 temperature: float | Omit = omit,
201 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
202 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
203 # The extra values given here take precedence over values defined on the client or passed to this method.
204 extra_headers: Headers | None = None,
205 extra_query: Query | None = None,
206 extra_body: Body | None = None,
207 timeout: float | httpx.Timeout | None | NotGiven = not_given,
208 ) -> TranscriptionDiarized: ...
209
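# Usage sketch (illustrative): requesting speaker-annotated output per the overload
# above. The data URLs and speaker names are placeholders; consult
# `TranscriptionDiarized` for the actual response shape.
#
#   diarized = client.audio.transcriptions.create(
#       file=audio,
#       model="gpt-4o-transcribe-diarize",
#       response_format="diarized_json",
#       known_speaker_names=["agent", "customer"],
#       known_speaker_references=["data:audio/wav;base64,...", "data:audio/wav;base64,..."],
#   )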
210 @overload
211 def create(
212 self,
213 *,
214 file: FileTypes,
215 model: Union[str, AudioModel],
216 stream: Literal[True],
217 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
218 include: List[TranscriptionInclude] | Omit = omit,
219 known_speaker_names: SequenceNotStr[str] | Omit = omit,
220 known_speaker_references: SequenceNotStr[str] | Omit = omit,
221 language: str | Omit = omit,
222 prompt: str | Omit = omit,
223 response_format: Union[AudioResponseFormat, Omit] = omit,
224 temperature: float | Omit = omit,
225 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
226 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
227 # The extra values given here take precedence over values defined on the client or passed to this method.
228 extra_headers: Headers | None = None,
229 extra_query: Query | None = None,
230 extra_body: Body | None = None,
231 timeout: float | httpx.Timeout | None | NotGiven = not_given,
232 ) -> Stream[TranscriptionStreamEvent]:
233 """
234 Transcribes audio into the input language.
235
236 Args:
237 file:
238 The audio file object (not file name) to transcribe, in one of these formats:
239 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
240
241 model: ID of the model to use. The options are `gpt-4o-transcribe`,
242 `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
243 Whisper V2 model), and `gpt-4o-transcribe-diarize`.
244
245 stream: If set to true, the model response data will be streamed to the client as it is
246 generated using
247 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
248 See the
249 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
250 for more information.
251
252 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
253
254 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
255 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
257 parameters manually. If unset, the audio is transcribed as a single block.
258 Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
259 seconds.
260
261 include: Additional information to include in the transcription response. `logprobs` will
262 return the log probabilities of the tokens in the response to understand the
263 model's confidence in the transcription. `logprobs` only works with
264 response_format set to `json` and only with the models `gpt-4o-transcribe` and
265 `gpt-4o-mini-transcribe`. This field is not supported when using
266 `gpt-4o-transcribe-diarize`.
267
268 known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
269 `known_speaker_references[]`. Each entry should be a short identifier (for
270 example `customer` or `agent`). Up to 4 speakers are supported.
271
272 known_speaker_references: Optional list of audio samples (as
273 [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
274 that contain known speaker references matching `known_speaker_names[]`. Each
275 sample must be between 2 and 10 seconds, and can use any of the same input audio
276 formats supported by `file`.
277
278 language: The language of the input audio. Supplying the input language in
279 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
280 format will improve accuracy and latency.
281
282 prompt: An optional text to guide the model's style or continue a previous audio
283 segment. The
284 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
285 should match the audio language. This field is not supported when using
286 `gpt-4o-transcribe-diarize`.
287
288 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
289 `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
290 `gpt-4o-mini-transcribe`, the only supported format is `json`. For
291 `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
292 `diarized_json`, with `diarized_json` required to receive speaker annotations.
293
294 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
295 output more random, while lower values like 0.2 will make it more focused and
296 deterministic. If set to 0, the model will use
297 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
298 automatically increase the temperature until certain thresholds are hit.
299
300 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
303 is no additional latency for segment timestamps, but generating word timestamps
304 incurs additional latency. This option is not available for
305 `gpt-4o-transcribe-diarize`.
306
307 extra_headers: Send extra headers
308
309 extra_query: Add additional query parameters to the request
310
311 extra_body: Add additional JSON properties to the request
312
313 timeout: Override the client-level default timeout for this request, in seconds
314 """
315 ...
316
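# Usage sketch (illustrative): with `stream=True` the call returns a
# `Stream[TranscriptionStreamEvent]` that yields events as the audio is processed.
# `client` and `audio` remain placeholders.
#
#   events = client.audio.transcriptions.create(
#       file=audio,
#       model="gpt-4o-mini-transcribe",
#       stream=True,
#   )
#   for event in events:
#       print(event.type)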
317 @overload
318 def create(
319 self,
320 *,
321 file: FileTypes,
322 model: Union[str, AudioModel],
323 stream: bool,
324 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
325 include: List[TranscriptionInclude] | Omit = omit,
326 known_speaker_names: SequenceNotStr[str] | Omit = omit,
327 known_speaker_references: SequenceNotStr[str] | Omit = omit,
328 language: str | Omit = omit,
329 prompt: str | Omit = omit,
330 response_format: Union[AudioResponseFormat, Omit] = omit,
331 temperature: float | Omit = omit,
332 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
333 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
334 # The extra values given here take precedence over values defined on the client or passed to this method.
335 extra_headers: Headers | None = None,
336 extra_query: Query | None = None,
337 extra_body: Body | None = None,
338 timeout: float | httpx.Timeout | None | NotGiven = not_given,
339 ) -> TranscriptionCreateResponse | Stream[TranscriptionStreamEvent]:
340 """
341 Transcribes audio into the input language.
342
343 Args:
344 file:
345 The audio file object (not file name) to transcribe, in one of these formats:
346 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
347
348 model: ID of the model to use. The options are `gpt-4o-transcribe`,
349 `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
350 Whisper V2 model), and `gpt-4o-transcribe-diarize`.
351
352 stream: If set to true, the model response data will be streamed to the client as it is
353 generated using
354 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
355 See the
356 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
357 for more information.
358
359 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
360
361 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
362 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
364 parameters manually. If unset, the audio is transcribed as a single block.
365 Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
366 seconds.
367
368 include: Additional information to include in the transcription response. `logprobs` will
369 return the log probabilities of the tokens in the response to understand the
370 model's confidence in the transcription. `logprobs` only works with
371 response_format set to `json` and only with the models `gpt-4o-transcribe` and
372 `gpt-4o-mini-transcribe`. This field is not supported when using
373 `gpt-4o-transcribe-diarize`.
374
375 known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
376 `known_speaker_references[]`. Each entry should be a short identifier (for
377 example `customer` or `agent`). Up to 4 speakers are supported.
378
379 known_speaker_references: Optional list of audio samples (as
380 [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
381 that contain known speaker references matching `known_speaker_names[]`. Each
382 sample must be between 2 and 10 seconds, and can use any of the same input audio
383 formats supported by `file`.
384
385 language: The language of the input audio. Supplying the input language in
386 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
387 format will improve accuracy and latency.
388
389 prompt: An optional text to guide the model's style or continue a previous audio
390 segment. The
391 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
392 should match the audio language. This field is not supported when using
393 `gpt-4o-transcribe-diarize`.
394
395 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
396 `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
397 `gpt-4o-mini-transcribe`, the only supported format is `json`. For
398 `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
399 `diarized_json`, with `diarized_json` required to receive speaker annotations.
400
401 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
402 output more random, while lower values like 0.2 will make it more focused and
403 deterministic. If set to 0, the model will use
404 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
405 automatically increase the temperature until certain thresholds are hit.
406
407 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
410 is no additional latency for segment timestamps, but generating word timestamps
411 incurs additional latency. This option is not available for
412 `gpt-4o-transcribe-diarize`.
413
414 extra_headers: Send extra headers
415
416 extra_query: Add additional query parameters to the request
417
418 extra_body: Add additional JSON properties to the request
419
420 timeout: Override the client-level default timeout for this request, in seconds
421 """
422 ...
423
424 @required_args(["file", "model"], ["file", "model", "stream"])
425 def create(
426 self,
427 *,
428 file: FileTypes,
429 model: Union[str, AudioModel],
430 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
431 include: List[TranscriptionInclude] | Omit = omit,
432 known_speaker_names: SequenceNotStr[str] | Omit = omit,
433 known_speaker_references: SequenceNotStr[str] | Omit = omit,
434 language: str | Omit = omit,
435 prompt: str | Omit = omit,
436 response_format: Union[AudioResponseFormat, Omit] = omit,
437 stream: Optional[Literal[False]] | Literal[True] | Omit = omit,
438 temperature: float | Omit = omit,
439 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
440 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
441 # The extra values given here take precedence over values defined on the client or passed to this method.
442 extra_headers: Headers | None = None,
443 extra_query: Query | None = None,
444 extra_body: Body | None = None,
445 timeout: float | httpx.Timeout | None | NotGiven = not_given,
446 ) -> str | Transcription | TranscriptionDiarized | TranscriptionVerbose | Stream[TranscriptionStreamEvent]:
447 body = deepcopy_minimal(
448 {
449 "file": file,
450 "model": model,
451 "chunking_strategy": chunking_strategy,
452 "include": include,
453 "known_speaker_names": known_speaker_names,
454 "known_speaker_references": known_speaker_references,
455 "language": language,
456 "prompt": prompt,
457 "response_format": response_format,
458 "stream": stream,
459 "temperature": temperature,
460 "timestamp_granularities": timestamp_granularities,
461 }
462 )
463 files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
464 # It should be noted that the actual Content-Type header that will be
465 # sent to the server will contain a `boundary` parameter, e.g.
466 # multipart/form-data; boundary=---abc--
467 extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
468 return self._post( # type: ignore[return-value]
469 "/audio/transcriptions",
470 body=maybe_transform(
471 body,
472 transcription_create_params.TranscriptionCreateParamsStreaming
473 if stream
474 else transcription_create_params.TranscriptionCreateParamsNonStreaming,
475 ),
476 files=files,
477 options=make_request_options(
478 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
479 ),
480 cast_to=_get_response_format_type(response_format),
481 stream=stream or False,
482 stream_cls=Stream[TranscriptionStreamEvent],
483 )
484
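# Usage sketch (illustrative): `response_format` determines the parsed return type
# (see `_get_response_format_type` at the end of this module). For example,
# `verbose_json` with word-level timestamps yields a `TranscriptionVerbose`:
#
#   verbose = client.audio.transcriptions.create(
#       file=audio,
#       model="whisper-1",
#       response_format="verbose_json",
#       timestamp_granularities=["word"],
#   )
#   print(verbose.duration, verbose.words)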
485
486class AsyncTranscriptions(AsyncAPIResource):
487 @cached_property
488 def with_raw_response(self) -> AsyncTranscriptionsWithRawResponse:
489 """
490 This property can be used as a prefix for any HTTP method call to return
491 the raw response object instead of the parsed content.
492
493 For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
494 """
495 return AsyncTranscriptionsWithRawResponse(self)
496
497 @cached_property
498 def with_streaming_response(self) -> AsyncTranscriptionsWithStreamingResponse:
499 """
500 An alternative to `.with_raw_response` that doesn't eagerly read the response body.
501
502 For more information, see https://www.github.com/openai/openai-python#with_streaming_response
503 """
504 return AsyncTranscriptionsWithStreamingResponse(self)
505
506 @overload
507 async def create(
508 self,
509 *,
510 file: FileTypes,
511 model: Union[str, AudioModel],
512 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
513 include: List[TranscriptionInclude] | Omit = omit,
514 known_speaker_names: SequenceNotStr[str] | Omit = omit,
515 known_speaker_references: SequenceNotStr[str] | Omit = omit,
516 language: str | Omit = omit,
517 prompt: str | Omit = omit,
518 response_format: Union[Literal["json"], Omit] = omit,
519 stream: Optional[Literal[False]] | Omit = omit,
520 temperature: float | Omit = omit,
521 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
522 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
523 # The extra values given here take precedence over values defined on the client or passed to this method.
524 extra_headers: Headers | None = None,
525 extra_query: Query | None = None,
526 extra_body: Body | None = None,
527 timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> Transcription:
529 """
530 Transcribes audio into the input language.
531
532 Args:
533 file:
534 The audio file object (not file name) to transcribe, in one of these formats:
535 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
536
537 model: ID of the model to use. The options are `gpt-4o-transcribe`,
538 `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
539 Whisper V2 model), and `gpt-4o-transcribe-diarize`.
540
541 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
542 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
544 parameters manually. If unset, the audio is transcribed as a single block.
545 Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
546 seconds.
547
548 include: Additional information to include in the transcription response. `logprobs` will
549 return the log probabilities of the tokens in the response to understand the
550 model's confidence in the transcription. `logprobs` only works with
551 response_format set to `json` and only with the models `gpt-4o-transcribe` and
552 `gpt-4o-mini-transcribe`. This field is not supported when using
553 `gpt-4o-transcribe-diarize`.
554
555 known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
556 `known_speaker_references[]`. Each entry should be a short identifier (for
557 example `customer` or `agent`). Up to 4 speakers are supported.
558
559 known_speaker_references: Optional list of audio samples (as
560 [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
561 that contain known speaker references matching `known_speaker_names[]`. Each
562 sample must be between 2 and 10 seconds, and can use any of the same input audio
563 formats supported by `file`.
564
565 language: The language of the input audio. Supplying the input language in
566 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
567 format will improve accuracy and latency.
568
569 prompt: An optional text to guide the model's style or continue a previous audio
570 segment. The
571 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
572 should match the audio language. This field is not supported when using
573 `gpt-4o-transcribe-diarize`.
574
575 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
576 `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
577 `gpt-4o-mini-transcribe`, the only supported format is `json`. For
578 `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
579 `diarized_json`, with `diarized_json` required to receive speaker annotations.
580
581 stream: If set to true, the model response data will be streamed to the client as it is
582 generated using
583 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
584 See the
585 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
586 for more information.
587
588 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
589
590 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
591 output more random, while lower values like 0.2 will make it more focused and
592 deterministic. If set to 0, the model will use
593 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
594 automatically increase the temperature until certain thresholds are hit.
595
596 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
599 is no additional latency for segment timestamps, but generating word timestamps
600 incurs additional latency. This option is not available for
601 `gpt-4o-transcribe-diarize`.
602
extra_headers: Send extra headers

extra_query: Add additional query parameters to the request

extra_body: Add additional JSON properties to the request

timeout: Override the client-level default timeout for this request, in seconds
"""
...
607
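# Usage sketch (illustrative, not generated by Stainless): the async resource mirrors
# the sync API. The `AsyncOpenAI` client and the file path are placeholders.
#
#   import asyncio
#   from openai import AsyncOpenAI
#
#   async def main() -> None:
#       client = AsyncOpenAI()
#       with open("speech.mp3", "rb") as audio:
#           transcription = await client.audio.transcriptions.create(
#               file=audio, model="whisper-1"
#           )
#       print(transcription.text)
#
#   asyncio.run(main())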
608 @overload
609 async def create(
610 self,
611 *,
612 file: FileTypes,
613 model: Union[str, AudioModel],
614 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
615 include: List[TranscriptionInclude] | Omit = omit,
616 response_format: Literal["verbose_json"],
617 language: str | Omit = omit,
618 prompt: str | Omit = omit,
619 temperature: float | Omit = omit,
620 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
621 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
622 # The extra values given here take precedence over values defined on the client or passed to this method.
623 extra_headers: Headers | None = None,
624 extra_query: Query | None = None,
625 extra_body: Body | None = None,
626 timeout: float | httpx.Timeout | None | NotGiven = not_given,
627 ) -> TranscriptionVerbose: ...
628
629 @overload
630 async def create(
631 self,
632 *,
633 file: FileTypes,
634 model: Union[str, AudioModel],
635 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
636 include: List[TranscriptionInclude] | Omit = omit,
637 response_format: Literal["text", "srt", "vtt"],
638 language: str | Omit = omit,
639 prompt: str | Omit = omit,
640 temperature: float | Omit = omit,
641 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
642 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
643 # The extra values given here take precedence over values defined on the client or passed to this method.
644 extra_headers: Headers | None = None,
645 extra_query: Query | None = None,
646 extra_body: Body | None = None,
647 timeout: float | httpx.Timeout | None | NotGiven = not_given,
648 ) -> str: ...
649
650 @overload
651 async def create(
652 self,
653 *,
654 file: FileTypes,
655 model: Union[str, AudioModel],
656 stream: Literal[True],
657 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
658 include: List[TranscriptionInclude] | Omit = omit,
659 known_speaker_names: SequenceNotStr[str] | Omit = omit,
660 known_speaker_references: SequenceNotStr[str] | Omit = omit,
661 language: str | Omit = omit,
662 prompt: str | Omit = omit,
663 response_format: Union[AudioResponseFormat, Omit] = omit,
664 temperature: float | Omit = omit,
665 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
666 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
667 # The extra values given here take precedence over values defined on the client or passed to this method.
668 extra_headers: Headers | None = None,
669 extra_query: Query | None = None,
670 extra_body: Body | None = None,
671 timeout: float | httpx.Timeout | None | NotGiven = not_given,
672 ) -> AsyncStream[TranscriptionStreamEvent]:
673 """
674 Transcribes audio into the input language.
675
676 Args:
677 file:
678 The audio file object (not file name) to transcribe, in one of these formats:
679 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
680
681 model: ID of the model to use. The options are `gpt-4o-transcribe`,
682 `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
683 Whisper V2 model), and `gpt-4o-transcribe-diarize`.
684
685 stream: If set to true, the model response data will be streamed to the client as it is
686 generated using
687 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
688 See the
689 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
690 for more information.
691
692 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
693
694 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
695 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
697 parameters manually. If unset, the audio is transcribed as a single block.
698 Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
699 seconds.
700
701 include: Additional information to include in the transcription response. `logprobs` will
702 return the log probabilities of the tokens in the response to understand the
703 model's confidence in the transcription. `logprobs` only works with
704 response_format set to `json` and only with the models `gpt-4o-transcribe` and
705 `gpt-4o-mini-transcribe`. This field is not supported when using
706 `gpt-4o-transcribe-diarize`.
707
708 known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
709 `known_speaker_references[]`. Each entry should be a short identifier (for
710 example `customer` or `agent`). Up to 4 speakers are supported.
711
712 known_speaker_references: Optional list of audio samples (as
713 [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
714 that contain known speaker references matching `known_speaker_names[]`. Each
715 sample must be between 2 and 10 seconds, and can use any of the same input audio
716 formats supported by `file`.
717
718 language: The language of the input audio. Supplying the input language in
719 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
720 format will improve accuracy and latency.
721
722 prompt: An optional text to guide the model's style or continue a previous audio
723 segment. The
724 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
725 should match the audio language. This field is not supported when using
726 `gpt-4o-transcribe-diarize`.
727
728 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
729 `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
730 `gpt-4o-mini-transcribe`, the only supported format is `json`. For
731 `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
732 `diarized_json`, with `diarized_json` required to receive speaker annotations.
733
734 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
735 output more random, while lower values like 0.2 will make it more focused and
736 deterministic. If set to 0, the model will use
737 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
738 automatically increase the temperature until certain thresholds are hit.
739
740 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
743 is no additional latency for segment timestamps, but generating word timestamps
744 incurs additional latency. This option is not available for
745 `gpt-4o-transcribe-diarize`.
746
747 extra_headers: Send extra headers
748
749 extra_query: Add additional query parameters to the request
750
751 extra_body: Add additional JSON properties to the request
752
753 timeout: Override the client-level default timeout for this request, in seconds
754 """
755 ...
756
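# Usage sketch (illustrative): async streaming per the overload above. The awaited
# call returns an `AsyncStream[TranscriptionStreamEvent]` consumed with `async for`.
#
#   events = await client.audio.transcriptions.create(
#       file=audio,
#       model="gpt-4o-mini-transcribe",
#       stream=True,
#   )
#   async for event in events:
#       print(event.type)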
757 @overload
758 async def create(
759 self,
760 *,
761 file: FileTypes,
762 model: Union[str, AudioModel],
763 stream: bool,
764 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
765 include: List[TranscriptionInclude] | Omit = omit,
766 known_speaker_names: SequenceNotStr[str] | Omit = omit,
767 known_speaker_references: SequenceNotStr[str] | Omit = omit,
768 language: str | Omit = omit,
769 prompt: str | Omit = omit,
770 response_format: Union[AudioResponseFormat, Omit] = omit,
771 temperature: float | Omit = omit,
772 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
773 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
774 # The extra values given here take precedence over values defined on the client or passed to this method.
775 extra_headers: Headers | None = None,
776 extra_query: Query | None = None,
777 extra_body: Body | None = None,
778 timeout: float | httpx.Timeout | None | NotGiven = not_given,
779 ) -> TranscriptionCreateResponse | AsyncStream[TranscriptionStreamEvent]:
780 """
781 Transcribes audio into the input language.
782
783 Args:
784 file:
785 The audio file object (not file name) to transcribe, in one of these formats:
786 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
787
788 model: ID of the model to use. The options are `gpt-4o-transcribe`,
789 `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
790 Whisper V2 model), and `gpt-4o-transcribe-diarize`.
791
792 stream: If set to true, the model response data will be streamed to the client as it is
793 generated using
794 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
795 See the
796 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
797 for more information.
798
799 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
800
801 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
802 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
804 parameters manually. If unset, the audio is transcribed as a single block.
805 Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
806 seconds.
807
808 include: Additional information to include in the transcription response. `logprobs` will
809 return the log probabilities of the tokens in the response to understand the
810 model's confidence in the transcription. `logprobs` only works with
811 response_format set to `json` and only with the models `gpt-4o-transcribe` and
812 `gpt-4o-mini-transcribe`. This field is not supported when using
813 `gpt-4o-transcribe-diarize`.
814
815 known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
816 `known_speaker_references[]`. Each entry should be a short identifier (for
817 example `customer` or `agent`). Up to 4 speakers are supported.
818
819 known_speaker_references: Optional list of audio samples (as
820 [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
821 that contain known speaker references matching `known_speaker_names[]`. Each
822 sample must be between 2 and 10 seconds, and can use any of the same input audio
823 formats supported by `file`.
824
825 language: The language of the input audio. Supplying the input language in
826 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
827 format will improve accuracy and latency.
828
829 prompt: An optional text to guide the model's style or continue a previous audio
830 segment. The
831 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
832 should match the audio language. This field is not supported when using
833 `gpt-4o-transcribe-diarize`.
834
835 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
836 `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
837 `gpt-4o-mini-transcribe`, the only supported format is `json`. For
838 `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
839 `diarized_json`, with `diarized_json` required to receive speaker annotations.
840
841 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
842 output more random, while lower values like 0.2 will make it more focused and
843 deterministic. If set to 0, the model will use
844 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
845 automatically increase the temperature until certain thresholds are hit.
846
847 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
850 is no additional latency for segment timestamps, but generating word timestamps
851 incurs additional latency. This option is not available for
852 `gpt-4o-transcribe-diarize`.
853
854 extra_headers: Send extra headers
855
856 extra_query: Add additional query parameters to the request
857
858 extra_body: Add additional JSON properties to the request
859
860 timeout: Override the client-level default timeout for this request, in seconds
861 """
862 ...
863
864 @required_args(["file", "model"], ["file", "model", "stream"])
865 async def create(
866 self,
867 *,
868 file: FileTypes,
869 model: Union[str, AudioModel],
870 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
871 include: List[TranscriptionInclude] | Omit = omit,
872 known_speaker_names: SequenceNotStr[str] | Omit = omit,
873 known_speaker_references: SequenceNotStr[str] | Omit = omit,
874 language: str | Omit = omit,
875 prompt: str | Omit = omit,
876 response_format: Union[AudioResponseFormat, Omit] = omit,
877 stream: Optional[Literal[False]] | Literal[True] | Omit = omit,
878 temperature: float | Omit = omit,
879 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
880 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
881 # The extra values given here take precedence over values defined on the client or passed to this method.
882 extra_headers: Headers | None = None,
883 extra_query: Query | None = None,
884 extra_body: Body | None = None,
885 timeout: float | httpx.Timeout | None | NotGiven = not_given,
886 ) -> Transcription | TranscriptionVerbose | TranscriptionDiarized | str | AsyncStream[TranscriptionStreamEvent]:
887 body = deepcopy_minimal(
888 {
889 "file": file,
890 "model": model,
891 "chunking_strategy": chunking_strategy,
892 "include": include,
893 "known_speaker_names": known_speaker_names,
894 "known_speaker_references": known_speaker_references,
895 "language": language,
896 "prompt": prompt,
897 "response_format": response_format,
898 "stream": stream,
899 "temperature": temperature,
900 "timestamp_granularities": timestamp_granularities,
901 }
902 )
903 files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
904 # It should be noted that the actual Content-Type header that will be
905 # sent to the server will contain a `boundary` parameter, e.g.
906 # multipart/form-data; boundary=---abc--
907 extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
908 return await self._post(
909 "/audio/transcriptions",
910 body=await async_maybe_transform(
911 body,
912 transcription_create_params.TranscriptionCreateParamsStreaming
913 if stream
914 else transcription_create_params.TranscriptionCreateParamsNonStreaming,
915 ),
916 files=files,
917 options=make_request_options(
918 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
919 ),
920 cast_to=_get_response_format_type(response_format),
921 stream=stream or False,
922 stream_cls=AsyncStream[TranscriptionStreamEvent],
923 )
924
925
926class TranscriptionsWithRawResponse:
927 def __init__(self, transcriptions: Transcriptions) -> None:
928 self._transcriptions = transcriptions
929
930 self.create = _legacy_response.to_raw_response_wrapper(
931 transcriptions.create,
932 )
933
934
935class AsyncTranscriptionsWithRawResponse:
936 def __init__(self, transcriptions: AsyncTranscriptions) -> None:
937 self._transcriptions = transcriptions
938
939 self.create = _legacy_response.async_to_raw_response_wrapper(
940 transcriptions.create,
941 )
942
943
944class TranscriptionsWithStreamingResponse:
945 def __init__(self, transcriptions: Transcriptions) -> None:
946 self._transcriptions = transcriptions
947
948 self.create = to_streamed_response_wrapper(
949 transcriptions.create,
950 )
951
952
953class AsyncTranscriptionsWithStreamingResponse:
954 def __init__(self, transcriptions: AsyncTranscriptions) -> None:
955 self._transcriptions = transcriptions
956
957 self.create = async_to_streamed_response_wrapper(
958 transcriptions.create,
959 )
960
961
962def _get_response_format_type(
963 response_format: AudioResponseFormat | Omit,
964) -> type[Transcription | TranscriptionVerbose | TranscriptionDiarized | str]:
965 if isinstance(response_format, Omit) or response_format is None: # pyright: ignore[reportUnnecessaryComparison]
966 return Transcription
967
968 if response_format == "json":
969 return Transcription
970 elif response_format == "verbose_json":
971 return TranscriptionVerbose
972 elif response_format == "diarized_json":
973 return TranscriptionDiarized
974 elif response_format == "srt" or response_format == "text" or response_format == "vtt":
975 return str
976 elif TYPE_CHECKING: # type: ignore[unreachable]
977 assert_never(response_format)
978 else:
log.warning("Unexpected audio response format: %s", response_format)
980 return Transcription
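# Illustrative mapping (not generated by Stainless): the helper above selects the
# parse target for each documented format; `text`, `srt`, and `vtt` responses are
# returned as plain strings rather than model objects.
#
#   "json"          -> Transcription
#   "verbose_json"  -> TranscriptionVerbose
#   "diarized_json" -> TranscriptionDiarized
#   "text" / "srt" / "vtt" -> str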