1# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
3from __future__ import annotations
4
5import logging
6from typing import TYPE_CHECKING, List, Union, Mapping, Optional, cast
7from typing_extensions import Literal, overload, assert_never
8
9import httpx
10
11from ... import _legacy_response
12from ..._types import (
13 Body,
14 Omit,
15 Query,
16 Headers,
17 NotGiven,
18 FileTypes,
19 SequenceNotStr,
20 omit,
21 not_given,
22)
23from ..._utils import extract_files, required_args, maybe_transform, deepcopy_minimal, async_maybe_transform
24from ..._compat import cached_property
25from ..._resource import SyncAPIResource, AsyncAPIResource
26from ..._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
27from ..._streaming import Stream, AsyncStream
28from ...types.audio import transcription_create_params
29from ..._base_client import make_request_options
30from ...types.audio_model import AudioModel
31from ...types.audio.transcription import Transcription
32from ...types.audio_response_format import AudioResponseFormat
33from ...types.audio.transcription_include import TranscriptionInclude
34from ...types.audio.transcription_verbose import TranscriptionVerbose
35from ...types.audio.transcription_diarized import TranscriptionDiarized
36from ...types.audio.transcription_stream_event import TranscriptionStreamEvent
37from ...types.audio.transcription_create_response import TranscriptionCreateResponse
38
39__all__ = ["Transcriptions", "AsyncTranscriptions"]
40
41log: logging.Logger = logging.getLogger("openai.audio.transcriptions")
42
43
44class Transcriptions(SyncAPIResource):
45 @cached_property
46 def with_raw_response(self) -> TranscriptionsWithRawResponse:
47 """
48 This property can be used as a prefix for any HTTP method call to return
49 the raw response object instead of the parsed content.
50
51 For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
52 """
53 return TranscriptionsWithRawResponse(self)
54
55 @cached_property
56 def with_streaming_response(self) -> TranscriptionsWithStreamingResponse:
57 """
58 An alternative to `.with_raw_response` that doesn't eagerly read the response body.
59
60 For more information, see https://www.github.com/openai/openai-python#with_streaming_response
61 """
62 return TranscriptionsWithStreamingResponse(self)
63
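# Usage sketch (illustrative, not generated by Stainless): accessing the raw or
# streamed HTTP response via the properties above. `client` is an already-configured
# `OpenAI` instance and `audio` an open binary file handle; both are placeholders.
#
#   raw = client.audio.transcriptions.with_raw_response.create(file=audio, model="whisper-1")
#   print(raw.headers.get("x-request-id"))
#   transcription = raw.parse()  # parsed Transcription object
#
#   with client.audio.transcriptions.with_streaming_response.create(
#       file=audio, model="whisper-1"
#   ) as response:
#       print(response.headers.get("x-request-id"))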
64 @overload
65 def create(
66 self,
67 *,
68 file: FileTypes,
69 model: Union[str, AudioModel],
70 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
71 include: List[TranscriptionInclude] | Omit = omit,
72 language: str | Omit = omit,
73 prompt: str | Omit = omit,
74 response_format: Union[Literal["json"], Omit] = omit,
75 stream: Optional[Literal[False]] | Omit = omit,
76 temperature: float | Omit = omit,
77 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
78 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
79 # The extra values given here take precedence over values defined on the client or passed to this method.
80 extra_headers: Headers | None = None,
81 extra_query: Query | None = None,
82 extra_body: Body | None = None,
83 timeout: float | httpx.Timeout | None | NotGiven = not_given,
84 ) -> Transcription:
85 """
86 Transcribes audio into the input language.
87
88 Args:
89 file:
90 The audio file object (not file name) to transcribe, in one of these formats:
91 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
92
93 model: ID of the model to use. The options are `gpt-4o-transcribe`,
94 `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
95 Whisper V2 model).
96
97 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
98 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
100 parameters manually. If unset, the audio is transcribed as a single block.
101
102 include: Additional information to include in the transcription response. `logprobs` will
103 return the log probabilities of the tokens in the response to understand the
104 model's confidence in the transcription. `logprobs` only works with
105 response_format set to `json` and only with the models `gpt-4o-transcribe` and
106 `gpt-4o-mini-transcribe`.
107
108 language: The language of the input audio. Supplying the input language in
109 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
110 format will improve accuracy and latency.
111
112 prompt: An optional text to guide the model's style or continue a previous audio
113 segment. The
114 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
115 should match the audio language.
116
117 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
118 `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
119 the only supported format is `json`.
120
121 stream: If set to true, the model response data will be streamed to the client as it is
122 generated using
123 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
124 See the
125 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
126 for more information.
127
128 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
129
130 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
131 output more random, while lower values like 0.2 will make it more focused and
132 deterministic. If set to 0, the model will use
133 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
134 automatically increase the temperature until certain thresholds are hit.
135
136 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
139 is no additional latency for segment timestamps, but generating word timestamps
140 incurs additional latency.
141
extra_headers: Send extra headers

extra_query: Add additional query parameters to the request

extra_body: Add additional JSON properties to the request

timeout: Override the client-level default timeout for this request, in seconds
"""
...
146
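# Usage sketch (illustrative, not generated by Stainless): a minimal non-streaming
# call matching the overload above. The `OpenAI` client construction and the file
# path are placeholders, not part of this module.
#
#   from openai import OpenAI
#
#   client = OpenAI()
#   with open("speech.mp3", "rb") as audio:
#       transcription = client.audio.transcriptions.create(file=audio, model="whisper-1")
#   print(transcription.text)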
147 @overload
148 def create(
149 self,
150 *,
151 file: FileTypes,
152 model: Union[str, AudioModel],
153 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
154 include: List[TranscriptionInclude] | Omit = omit,
155 response_format: Literal["verbose_json"],
156 language: str | Omit = omit,
157 prompt: str | Omit = omit,
158 temperature: float | Omit = omit,
159 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
160 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
161 # The extra values given here take precedence over values defined on the client or passed to this method.
162 extra_headers: Headers | None = None,
163 extra_query: Query | None = None,
164 extra_body: Body | None = None,
165 timeout: float | httpx.Timeout | None | NotGiven = not_given,
166 ) -> TranscriptionVerbose: ...
167
168 @overload
169 def create(
170 self,
171 *,
172 file: FileTypes,
173 model: Union[str, AudioModel],
174 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
175 response_format: Literal["text", "srt", "vtt"],
176 include: List[TranscriptionInclude] | Omit = omit,
177 language: str | Omit = omit,
178 prompt: str | Omit = omit,
179 temperature: float | Omit = omit,
180 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
181 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
182 # The extra values given here take precedence over values defined on the client or passed to this method.
183 extra_headers: Headers | None = None,
184 extra_query: Query | None = None,
185 extra_body: Body | None = None,
186 timeout: float | httpx.Timeout | None | NotGiven = not_given,
187 ) -> str: ...
188
189 @overload
190 def create(
191 self,
192 *,
193 file: FileTypes,
194 model: Union[str, AudioModel],
195 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
196 response_format: Literal["diarized_json"],
197 known_speaker_names: SequenceNotStr[str] | Omit = omit,
198 known_speaker_references: SequenceNotStr[str] | Omit = omit,
199 language: str | Omit = omit,
200 temperature: float | Omit = omit,
201 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
202 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
203 # The extra values given here take precedence over values defined on the client or passed to this method.
204 extra_headers: Headers | None = None,
205 extra_query: Query | None = None,
206 extra_body: Body | None = None,
207 timeout: float | httpx.Timeout | None | NotGiven = not_given,
208 ) -> TranscriptionDiarized: ...
209
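# Usage sketch (illustrative): requesting speaker-annotated output per the overload
# above. The data URLs and speaker names are placeholders; consult
# `TranscriptionDiarized` for the actual response shape.
#
#   diarized = client.audio.transcriptions.create(
#       file=audio,
#       model="gpt-4o-transcribe-diarize",
#       response_format="diarized_json",
#       known_speaker_names=["agent", "customer"],
#       known_speaker_references=["data:audio/wav;base64,...", "data:audio/wav;base64,..."],
#   )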
210 @overload
211 def create(
212 self,
213 *,
214 file: FileTypes,
215 model: Union[str, AudioModel],
216 stream: Literal[True],
217 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
218 include: List[TranscriptionInclude] | Omit = omit,
219 known_speaker_names: SequenceNotStr[str] | Omit = omit,
220 known_speaker_references: SequenceNotStr[str] | Omit = omit,
221 language: str | Omit = omit,
222 prompt: str | Omit = omit,
223 response_format: Union[AudioResponseFormat, Omit] = omit,
224 temperature: float | Omit = omit,
225 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
226 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
227 # The extra values given here take precedence over values defined on the client or passed to this method.
228 extra_headers: Headers | None = None,
229 extra_query: Query | None = None,
230 extra_body: Body | None = None,
231 timeout: float | httpx.Timeout | None | NotGiven = not_given,
232 ) -> Stream[TranscriptionStreamEvent]:
233 """
234 Transcribes audio into the input language.
235
236 Args:
237 file:
238 The audio file object (not file name) to transcribe, in one of these formats:
239 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
240
241 model: ID of the model to use. The options are `gpt-4o-transcribe`,
242 `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
243 Whisper V2 model), and `gpt-4o-transcribe-diarize`.
244
245 stream: If set to true, the model response data will be streamed to the client as it is
246 generated using
247 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
248 See the
249 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
250 for more information.
251
252 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
253
254 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
255 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
257 parameters manually. If unset, the audio is transcribed as a single block.
258 Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
259 seconds.
260
261 include: Additional information to include in the transcription response. `logprobs` will
262 return the log probabilities of the tokens in the response to understand the
263 model's confidence in the transcription. `logprobs` only works with
264 response_format set to `json` and only with the models `gpt-4o-transcribe` and
265 `gpt-4o-mini-transcribe`. This field is not supported when using
266 `gpt-4o-transcribe-diarize`.
267
268 known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
269 `known_speaker_references[]`. Each entry should be a short identifier (for
270 example `customer` or `agent`). Up to 4 speakers are supported.
271
272 known_speaker_references: Optional list of audio samples (as
273 [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
274 that contain known speaker references matching `known_speaker_names[]`. Each
275 sample must be between 2 and 10 seconds, and can use any of the same input audio
276 formats supported by `file`.
277
278 language: The language of the input audio. Supplying the input language in
279 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
280 format will improve accuracy and latency.
281
282 prompt: An optional text to guide the model's style or continue a previous audio
283 segment. The
284 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
285 should match the audio language. This field is not supported when using
286 `gpt-4o-transcribe-diarize`.
287
288 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
289 `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
290 `gpt-4o-mini-transcribe`, the only supported format is `json`. For
291 `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
292 `diarized_json`, with `diarized_json` required to receive speaker annotations.
293
294 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
295 output more random, while lower values like 0.2 will make it more focused and
296 deterministic. If set to 0, the model will use
297 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
298 automatically increase the temperature until certain thresholds are hit.
299
300 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
303 is no additional latency for segment timestamps, but generating word timestamps
304 incurs additional latency. This option is not available for
305 `gpt-4o-transcribe-diarize`.
306
307 extra_headers: Send extra headers
308
309 extra_query: Add additional query parameters to the request
310
311 extra_body: Add additional JSON properties to the request
312
313 timeout: Override the client-level default timeout for this request, in seconds
314 """
315 ...
316
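# Usage sketch (illustrative): with `stream=True` the call returns a
# `Stream[TranscriptionStreamEvent]` that yields events as the audio is processed.
# `client` and `audio` remain placeholders.
#
#   events = client.audio.transcriptions.create(
#       file=audio,
#       model="gpt-4o-mini-transcribe",
#       stream=True,
#   )
#   for event in events:
#       print(event.type)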
317 @overload
318 def create(
319 self,
320 *,
321 file: FileTypes,
322 model: Union[str, AudioModel],
323 stream: bool,
324 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
325 include: List[TranscriptionInclude] | Omit = omit,
326 known_speaker_names: SequenceNotStr[str] | Omit = omit,
327 known_speaker_references: SequenceNotStr[str] | Omit = omit,
328 language: str | Omit = omit,
329 prompt: str | Omit = omit,
330 response_format: Union[AudioResponseFormat, Omit] = omit,
331 temperature: float | Omit = omit,
332 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
333 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
334 # The extra values given here take precedence over values defined on the client or passed to this method.
335 extra_headers: Headers | None = None,
336 extra_query: Query | None = None,
337 extra_body: Body | None = None,
338 timeout: float | httpx.Timeout | None | NotGiven = not_given,
339 ) -> TranscriptionCreateResponse | Stream[TranscriptionStreamEvent]:
340 """
341 Transcribes audio into the input language.
342
343 Args:
344 file:
345 The audio file object (not file name) to transcribe, in one of these formats:
346 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
347
348 model: ID of the model to use. The options are `gpt-4o-transcribe`,
349 `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
350 Whisper V2 model), and `gpt-4o-transcribe-diarize`.
351
352 stream: If set to true, the model response data will be streamed to the client as it is
353 generated using
354 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
355 See the
356 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
357 for more information.
358
359 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
360
361 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
362 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
364 parameters manually. If unset, the audio is transcribed as a single block.
365 Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
366 seconds.
367
368 include: Additional information to include in the transcription response. `logprobs` will
369 return the log probabilities of the tokens in the response to understand the
370 model's confidence in the transcription. `logprobs` only works with
371 response_format set to `json` and only with the models `gpt-4o-transcribe` and
372 `gpt-4o-mini-transcribe`. This field is not supported when using
373 `gpt-4o-transcribe-diarize`.
374
375 known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
376 `known_speaker_references[]`. Each entry should be a short identifier (for
377 example `customer` or `agent`). Up to 4 speakers are supported.
378
379 known_speaker_references: Optional list of audio samples (as
380 [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
381 that contain known speaker references matching `known_speaker_names[]`. Each
382 sample must be between 2 and 10 seconds, and can use any of the same input audio
383 formats supported by `file`.
384
385 language: The language of the input audio. Supplying the input language in
386 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
387 format will improve accuracy and latency.
388
389 prompt: An optional text to guide the model's style or continue a previous audio
390 segment. The
391 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
392 should match the audio language. This field is not supported when using
393 `gpt-4o-transcribe-diarize`.
394
395 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
396 `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
397 `gpt-4o-mini-transcribe`, the only supported format is `json`. For
398 `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
399 `diarized_json`, with `diarized_json` required to receive speaker annotations.
400
401 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
402 output more random, while lower values like 0.2 will make it more focused and
403 deterministic. If set to 0, the model will use
404 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
405 automatically increase the temperature until certain thresholds are hit.
406
407 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
410 is no additional latency for segment timestamps, but generating word timestamps
411 incurs additional latency. This option is not available for
412 `gpt-4o-transcribe-diarize`.
413
414 extra_headers: Send extra headers
415
416 extra_query: Add additional query parameters to the request
417
418 extra_body: Add additional JSON properties to the request
419
420 timeout: Override the client-level default timeout for this request, in seconds
421 """
422 ...
423
424 @required_args(["file", "model"], ["file", "model", "stream"])
425 def create(
426 self,
427 *,
428 file: FileTypes,
429 model: Union[str, AudioModel],
430 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
431 include: List[TranscriptionInclude] | Omit = omit,
432 known_speaker_names: SequenceNotStr[str] | Omit = omit,
433 known_speaker_references: SequenceNotStr[str] | Omit = omit,
434 language: str | Omit = omit,
435 prompt: str | Omit = omit,
436 response_format: Union[AudioResponseFormat, Omit] = omit,
437 stream: Optional[Literal[False]] | Literal[True] | Omit = omit,
438 temperature: float | Omit = omit,
439 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
440 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
441 # The extra values given here take precedence over values defined on the client or passed to this method.
442 extra_headers: Headers | None = None,
443 extra_query: Query | None = None,
444 extra_body: Body | None = None,
445 timeout: float | httpx.Timeout | None | NotGiven = not_given,
446 ) -> str | Transcription | TranscriptionDiarized | TranscriptionVerbose | Stream[TranscriptionStreamEvent]:
447 body = deepcopy_minimal(
448 {
449 "file": file,
450 "model": model,
451 "chunking_strategy": chunking_strategy,
452 "include": include,
453 "known_speaker_names": known_speaker_names,
454 "known_speaker_references": known_speaker_references,
455 "language": language,
456 "prompt": prompt,
457 "response_format": response_format,
458 "stream": stream,
459 "temperature": temperature,
460 "timestamp_granularities": timestamp_granularities,
461 }
462 )
463 files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
464 # It should be noted that the actual Content-Type header that will be
465 # sent to the server will contain a `boundary` parameter, e.g.
466 # multipart/form-data; boundary=---abc--
467 extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
468 return self._post( # type: ignore[return-value]
469 "/audio/transcriptions",
470 body=maybe_transform(
471 body,
472 transcription_create_params.TranscriptionCreateParamsStreaming
473 if stream
474 else transcription_create_params.TranscriptionCreateParamsNonStreaming,
475 ),
476 files=files,
477 options=make_request_options(
478 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
479 ),
480 cast_to=_get_response_format_type(response_format),
481 stream=stream or False,
482 stream_cls=Stream[TranscriptionStreamEvent],
483 )
484
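# Usage sketch (illustrative): `response_format` determines the parsed return type
# (see `_get_response_format_type` at the end of this module). For example,
# `verbose_json` with word-level timestamps yields a `TranscriptionVerbose`:
#
#   verbose = client.audio.transcriptions.create(
#       file=audio,
#       model="whisper-1",
#       response_format="verbose_json",
#       timestamp_granularities=["word"],
#   )
#   print(verbose.duration, verbose.words)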
485
486class AsyncTranscriptions(AsyncAPIResource):
487 @cached_property
488 def with_raw_response(self) -> AsyncTranscriptionsWithRawResponse:
489 """
490 This property can be used as a prefix for any HTTP method call to return
491 the raw response object instead of the parsed content.
492
493 For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
494 """
495 return AsyncTranscriptionsWithRawResponse(self)
496
497 @cached_property
498 def with_streaming_response(self) -> AsyncTranscriptionsWithStreamingResponse:
499 """
500 An alternative to `.with_raw_response` that doesn't eagerly read the response body.
501
502 For more information, see https://www.github.com/openai/openai-python#with_streaming_response
503 """
504 return AsyncTranscriptionsWithStreamingResponse(self)
505
506 @overload
507 async def create(
508 self,
509 *,
510 file: FileTypes,
511 model: Union[str, AudioModel],
512 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
513 include: List[TranscriptionInclude] | Omit = omit,
514 known_speaker_names: SequenceNotStr[str] | Omit = omit,
515 known_speaker_references: SequenceNotStr[str] | Omit = omit,
516 language: str | Omit = omit,
517 prompt: str | Omit = omit,
518 response_format: Union[Literal["json"], Omit] = omit,
519 stream: Optional[Literal[False]] | Omit = omit,
520 temperature: float | Omit = omit,
521 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
522 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
523 # The extra values given here take precedence over values defined on the client or passed to this method.
524 extra_headers: Headers | None = None,
525 extra_query: Query | None = None,
526 extra_body: Body | None = None,
527 timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> Transcription:
529 """
530 Transcribes audio into the input language.
531
532 Args:
533 file:
534 The audio file object (not file name) to transcribe, in one of these formats:
535 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
536
537 model: ID of the model to use. The options are `gpt-4o-transcribe`,
538 `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
539 Whisper V2 model), and `gpt-4o-transcribe-diarize`.
540
541 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
542 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
544 parameters manually. If unset, the audio is transcribed as a single block.
545 Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
546 seconds.
547
548 include: Additional information to include in the transcription response. `logprobs` will
549 return the log probabilities of the tokens in the response to understand the
550 model's confidence in the transcription. `logprobs` only works with
551 response_format set to `json` and only with the models `gpt-4o-transcribe` and
552 `gpt-4o-mini-transcribe`. This field is not supported when using
553 `gpt-4o-transcribe-diarize`.
554
555 known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
556 `known_speaker_references[]`. Each entry should be a short identifier (for
557 example `customer` or `agent`). Up to 4 speakers are supported.
558
559 known_speaker_references: Optional list of audio samples (as
560 [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
561 that contain known speaker references matching `known_speaker_names[]`. Each
562 sample must be between 2 and 10 seconds, and can use any of the same input audio
563 formats supported by `file`.
564
565 language: The language of the input audio. Supplying the input language in
566 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
567 format will improve accuracy and latency.
568
569 prompt: An optional text to guide the model's style or continue a previous audio
570 segment. The
571 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
572 should match the audio language. This field is not supported when using
573 `gpt-4o-transcribe-diarize`.
574
575 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
576 `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
577 `gpt-4o-mini-transcribe`, the only supported format is `json`. For
578 `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
579 `diarized_json`, with `diarized_json` required to receive speaker annotations.
580
581 stream: If set to true, the model response data will be streamed to the client as it is
582 generated using
583 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
584 See the
585 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
586 for more information.
587
588 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
589
590 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
591 output more random, while lower values like 0.2 will make it more focused and
592 deterministic. If set to 0, the model will use
593 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
594 automatically increase the temperature until certain thresholds are hit.
595
596 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
599 is no additional latency for segment timestamps, but generating word timestamps
600 incurs additional latency. This option is not available for
601 `gpt-4o-transcribe-diarize`.
602
extra_headers: Send extra headers

extra_query: Add additional query parameters to the request

extra_body: Add additional JSON properties to the request

timeout: Override the client-level default timeout for this request, in seconds
"""
...
607
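# Usage sketch (illustrative, not generated by Stainless): the async resource mirrors
# the sync API. The `AsyncOpenAI` client and the file path are placeholders.
#
#   import asyncio
#   from openai import AsyncOpenAI
#
#   async def main() -> None:
#       client = AsyncOpenAI()
#       with open("speech.mp3", "rb") as audio:
#           transcription = await client.audio.transcriptions.create(
#               file=audio, model="whisper-1"
#           )
#       print(transcription.text)
#
#   asyncio.run(main())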
608 @overload
609 async def create(
610 self,
611 *,
612 file: FileTypes,
613 model: Union[str, AudioModel],
614 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
615 include: List[TranscriptionInclude] | Omit = omit,
616 response_format: Literal["verbose_json"],
617 language: str | Omit = omit,
618 prompt: str | Omit = omit,
619 temperature: float | Omit = omit,
620 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
621 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
622 # The extra values given here take precedence over values defined on the client or passed to this method.
623 extra_headers: Headers | None = None,
624 extra_query: Query | None = None,
625 extra_body: Body | None = None,
626 timeout: float | httpx.Timeout | None | NotGiven = not_given,
627 ) -> TranscriptionVerbose: ...
628
629 @overload
630 async def create(
631 self,
632 *,
633 file: FileTypes,
634 model: Union[str, AudioModel],
635 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
636 include: List[TranscriptionInclude] | Omit = omit,
637 response_format: Literal["text", "srt", "vtt"],
638 language: str | Omit = omit,
639 prompt: str | Omit = omit,
640 temperature: float | Omit = omit,
641 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
642 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
643 # The extra values given here take precedence over values defined on the client or passed to this method.
644 extra_headers: Headers | None = None,
645 extra_query: Query | None = None,
646 extra_body: Body | None = None,
647 timeout: float | httpx.Timeout | None | NotGiven = not_given,
648 ) -> str: ...
649
650 @overload
651 async def create(
652 self,
653 *,
654 file: FileTypes,
655 model: Union[str, AudioModel],
656 stream: Literal[True],
657 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
658 include: List[TranscriptionInclude] | Omit = omit,
659 known_speaker_names: SequenceNotStr[str] | Omit = omit,
660 known_speaker_references: SequenceNotStr[str] | Omit = omit,
661 language: str | Omit = omit,
662 prompt: str | Omit = omit,
663 response_format: Union[AudioResponseFormat, Omit] = omit,
664 temperature: float | Omit = omit,
665 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
666 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
667 # The extra values given here take precedence over values defined on the client or passed to this method.
668 extra_headers: Headers | None = None,
669 extra_query: Query | None = None,
670 extra_body: Body | None = None,
671 timeout: float | httpx.Timeout | None | NotGiven = not_given,
672 ) -> AsyncStream[TranscriptionStreamEvent]:
673 """
674 Transcribes audio into the input language.
675
676 Args:
677 file:
678 The audio file object (not file name) to transcribe, in one of these formats:
679 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
680
681 model: ID of the model to use. The options are `gpt-4o-transcribe`,
682 `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
683 Whisper V2 model), and `gpt-4o-transcribe-diarize`.
684
685 stream: If set to true, the model response data will be streamed to the client as it is
686 generated using
687 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
688 See the
689 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
690 for more information.
691
692 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
693
694 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
695 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
697 parameters manually. If unset, the audio is transcribed as a single block.
698 Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
699 seconds.
700
701 include: Additional information to include in the transcription response. `logprobs` will
702 return the log probabilities of the tokens in the response to understand the
703 model's confidence in the transcription. `logprobs` only works with
704 response_format set to `json` and only with the models `gpt-4o-transcribe` and
705 `gpt-4o-mini-transcribe`. This field is not supported when using
706 `gpt-4o-transcribe-diarize`.
707
708 known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
709 `known_speaker_references[]`. Each entry should be a short identifier (for
710 example `customer` or `agent`). Up to 4 speakers are supported.
711
712 known_speaker_references: Optional list of audio samples (as
713 [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
714 that contain known speaker references matching `known_speaker_names[]`. Each
715 sample must be between 2 and 10 seconds, and can use any of the same input audio
716 formats supported by `file`.
717
718 language: The language of the input audio. Supplying the input language in
719 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
720 format will improve accuracy and latency.
721
722 prompt: An optional text to guide the model's style or continue a previous audio
723 segment. The
724 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
725 should match the audio language. This field is not supported when using
726 `gpt-4o-transcribe-diarize`.
727
728 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
729 `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
730 `gpt-4o-mini-transcribe`, the only supported format is `json`. For
731 `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
732 `diarized_json`, with `diarized_json` required to receive speaker annotations.
733
734 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
735 output more random, while lower values like 0.2 will make it more focused and
736 deterministic. If set to 0, the model will use
737 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
738 automatically increase the temperature until certain thresholds are hit.
739
740 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
743 is no additional latency for segment timestamps, but generating word timestamps
744 incurs additional latency. This option is not available for
745 `gpt-4o-transcribe-diarize`.
746
747 extra_headers: Send extra headers
748
749 extra_query: Add additional query parameters to the request
750
751 extra_body: Add additional JSON properties to the request
752
753 timeout: Override the client-level default timeout for this request, in seconds
754 """
755 ...
756
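# Usage sketch (illustrative): async streaming per the overload above. The awaited
# call returns an `AsyncStream[TranscriptionStreamEvent]` consumed with `async for`.
#
#   events = await client.audio.transcriptions.create(
#       file=audio,
#       model="gpt-4o-mini-transcribe",
#       stream=True,
#   )
#   async for event in events:
#       print(event.type)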
757 @overload
758 async def create(
759 self,
760 *,
761 file: FileTypes,
762 model: Union[str, AudioModel],
763 stream: bool,
764 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
765 include: List[TranscriptionInclude] | Omit = omit,
766 known_speaker_names: SequenceNotStr[str] | Omit = omit,
767 known_speaker_references: SequenceNotStr[str] | Omit = omit,
768 language: str | Omit = omit,
769 prompt: str | Omit = omit,
770 response_format: Union[AudioResponseFormat, Omit] = omit,
771 temperature: float | Omit = omit,
772 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
773 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
774 # The extra values given here take precedence over values defined on the client or passed to this method.
775 extra_headers: Headers | None = None,
776 extra_query: Query | None = None,
777 extra_body: Body | None = None,
778 timeout: float | httpx.Timeout | None | NotGiven = not_given,
779 ) -> TranscriptionCreateResponse | AsyncStream[TranscriptionStreamEvent]:
780 """
781 Transcribes audio into the input language.
782
783 Args:
784 file:
785 The audio file object (not file name) to transcribe, in one of these formats:
786 flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
787
788 model: ID of the model to use. The options are `gpt-4o-transcribe`,
789 `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
790 Whisper V2 model), and `gpt-4o-transcribe-diarize`.
791
792 stream: If set to true, the model response data will be streamed to the client as it is
793 generated using
794 [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
795 See the
796 [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
797 for more information.
798
799 Note: Streaming is not supported for the `whisper-1` model and will be ignored.
800
801 chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
802 first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. A `server_vad` object can be provided to tweak VAD detection
804 parameters manually. If unset, the audio is transcribed as a single block.
805 Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
806 seconds.
807
808 include: Additional information to include in the transcription response. `logprobs` will
809 return the log probabilities of the tokens in the response to understand the
810 model's confidence in the transcription. `logprobs` only works with
811 response_format set to `json` and only with the models `gpt-4o-transcribe` and
812 `gpt-4o-mini-transcribe`. This field is not supported when using
813 `gpt-4o-transcribe-diarize`.
814
815 known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
816 `known_speaker_references[]`. Each entry should be a short identifier (for
817 example `customer` or `agent`). Up to 4 speakers are supported.
818
819 known_speaker_references: Optional list of audio samples (as
820 [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
821 that contain known speaker references matching `known_speaker_names[]`. Each
822 sample must be between 2 and 10 seconds, and can use any of the same input audio
823 formats supported by `file`.
824
825 language: The language of the input audio. Supplying the input language in
826 [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
827 format will improve accuracy and latency.
828
829 prompt: An optional text to guide the model's style or continue a previous audio
830 segment. The
831 [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
832 should match the audio language. This field is not supported when using
833 `gpt-4o-transcribe-diarize`.
834
835 response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
836 `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
837 `gpt-4o-mini-transcribe`, the only supported format is `json`. For
838 `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
839 `diarized_json`, with `diarized_json` required to receive speaker annotations.
840
841 temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
842 output more random, while lower values like 0.2 will make it more focused and
843 deterministic. If set to 0, the model will use
844 [log probability](https://en.wikipedia.org/wiki/Log_probability) to
845 automatically increase the temperature until certain thresholds are hit.
846
847 timestamp_granularities: The timestamp granularities to populate for this transcription.
`response_format` must be set to `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word` or `segment`. Note: There
850 is no additional latency for segment timestamps, but generating word timestamps
851 incurs additional latency. This option is not available for
852 `gpt-4o-transcribe-diarize`.
853
854 extra_headers: Send extra headers
855
856 extra_query: Add additional query parameters to the request
857
858 extra_body: Add additional JSON properties to the request
859
860 timeout: Override the client-level default timeout for this request, in seconds
861 """
862 ...
863
864 @required_args(["file", "model"], ["file", "model", "stream"])
865 async def create(
866 self,
867 *,
868 file: FileTypes,
869 model: Union[str, AudioModel],
870 chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
871 include: List[TranscriptionInclude] | Omit = omit,
872 known_speaker_names: SequenceNotStr[str] | Omit = omit,
873 known_speaker_references: SequenceNotStr[str] | Omit = omit,
874 language: str | Omit = omit,
875 prompt: str | Omit = omit,
876 response_format: Union[AudioResponseFormat, Omit] = omit,
877 stream: Optional[Literal[False]] | Literal[True] | Omit = omit,
878 temperature: float | Omit = omit,
879 timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
880 # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
881 # The extra values given here take precedence over values defined on the client or passed to this method.
882 extra_headers: Headers | None = None,
883 extra_query: Query | None = None,
884 extra_body: Body | None = None,
885 timeout: float | httpx.Timeout | None | NotGiven = not_given,
886 ) -> Transcription | TranscriptionVerbose | TranscriptionDiarized | str | AsyncStream[TranscriptionStreamEvent]:
887 body = deepcopy_minimal(
888 {
889 "file": file,
890 "model": model,
891 "chunking_strategy": chunking_strategy,
892 "include": include,
893 "known_speaker_names": known_speaker_names,
894 "known_speaker_references": known_speaker_references,
895 "language": language,
896 "prompt": prompt,
897 "response_format": response_format,
898 "stream": stream,
899 "temperature": temperature,
900 "timestamp_granularities": timestamp_granularities,
901 }
902 )
903 files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
904 # It should be noted that the actual Content-Type header that will be
905 # sent to the server will contain a `boundary` parameter, e.g.
906 # multipart/form-data; boundary=---abc--
907 extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
908 return await self._post(
909 "/audio/transcriptions",
910 body=await async_maybe_transform(
911 body,
912 transcription_create_params.TranscriptionCreateParamsStreaming
913 if stream
914 else transcription_create_params.TranscriptionCreateParamsNonStreaming,
915 ),
916 files=files,
917 options=make_request_options(
918 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
919 ),
920 cast_to=_get_response_format_type(response_format),
921 stream=stream or False,
922 stream_cls=AsyncStream[TranscriptionStreamEvent],
923 )
924
925
926class TranscriptionsWithRawResponse:
927 def __init__(self, transcriptions: Transcriptions) -> None:
928 self._transcriptions = transcriptions
929
930 self.create = _legacy_response.to_raw_response_wrapper(
931 transcriptions.create,
932 )
933
934
935class AsyncTranscriptionsWithRawResponse:
936 def __init__(self, transcriptions: AsyncTranscriptions) -> None:
937 self._transcriptions = transcriptions
938
939 self.create = _legacy_response.async_to_raw_response_wrapper(
940 transcriptions.create,
941 )
942
943
944class TranscriptionsWithStreamingResponse:
945 def __init__(self, transcriptions: Transcriptions) -> None:
946 self._transcriptions = transcriptions
947
948 self.create = to_streamed_response_wrapper(
949 transcriptions.create,
950 )
951
952
953class AsyncTranscriptionsWithStreamingResponse:
954 def __init__(self, transcriptions: AsyncTranscriptions) -> None:
955 self._transcriptions = transcriptions
956
957 self.create = async_to_streamed_response_wrapper(
958 transcriptions.create,
959 )
960
961
962def _get_response_format_type(
963 response_format: AudioResponseFormat | Omit,
964) -> type[Transcription | TranscriptionVerbose | TranscriptionDiarized | str]:
965 if isinstance(response_format, Omit) or response_format is None: # pyright: ignore[reportUnnecessaryComparison]
966 return Transcription
967
968 if response_format == "json":
969 return Transcription
970 elif response_format == "verbose_json":
971 return TranscriptionVerbose
972 elif response_format == "diarized_json":
973 return TranscriptionDiarized
974 elif response_format == "srt" or response_format == "text" or response_format == "vtt":
975 return str
976 elif TYPE_CHECKING: # type: ignore[unreachable]
977 assert_never(response_format)
978 else:
log.warning("Unexpected audio response format: %s", response_format)
980 return Transcription
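# Illustrative mapping (not generated by Stainless): the helper above selects the
# parse target for each documented format; `text`, `srt`, and `vtt` responses are
# returned as plain strings rather than model objects.
#
#   "json"          -> Transcription
#   "verbose_json"  -> TranscriptionVerbose
#   "diarized_json" -> TranscriptionDiarized
#   "text" / "srt" / "vtt" -> str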