Commit 25cbb74f
Changed files (21)
src/openai/resources/audio/transcriptions.py
@@ -9,8 +9,17 @@ from typing_extensions import Literal, overload, assert_never
import httpx
from ... import _legacy_response
-from ...types import AudioResponseFormat
-from ..._types import Body, Omit, Query, Headers, NotGiven, FileTypes, omit, not_given
+from ..._types import (
+ Body,
+ Omit,
+ Query,
+ Headers,
+ NotGiven,
+ FileTypes,
+ SequenceNotStr,
+ omit,
+ not_given,
+)
from ..._utils import extract_files, required_args, maybe_transform, deepcopy_minimal, async_maybe_transform
from ..._compat import cached_property
from ..._resource import SyncAPIResource, AsyncAPIResource
@@ -23,6 +32,7 @@ from ...types.audio.transcription import Transcription
from ...types.audio_response_format import AudioResponseFormat
from ...types.audio.transcription_include import TranscriptionInclude
from ...types.audio.transcription_verbose import TranscriptionVerbose
+from ...types.audio.transcription_diarized import TranscriptionDiarized
from ...types.audio.transcription_stream_event import TranscriptionStreamEvent
from ...types.audio.transcription_create_response import TranscriptionCreateResponse
@@ -93,6 +103,66 @@ class Transcriptions(SyncAPIResource):
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> TranscriptionVerbose: ...
+ model's confidence in the transcription. `logprobs` only works with
+ response_format set to `json` and only with the models `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
+
+ known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+ `known_speaker_references[]`. Each entry should be a short identifier (for
+ example `customer` or `agent`). Up to 4 speakers are supported.
+
+ known_speaker_references: Optional list of audio samples (as
+ [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+ that contain known speaker references matching `known_speaker_names[]`. Each
+ sample must be between 2 and 10 seconds, and can use any of the same input audio
+ formats supported by `file`.
+
+ language: The language of the input audio. Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+
+ prompt: An optional text to guide the model's style or continue a previous audio
+ segment. The
+ [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+ should match the audio language. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
+
+ response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
+ `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ `diarized_json`, with `diarized_json` required to receive speaker annotations.
+
+ stream: If set to true, the model response data will be streamed to the client as it is
+ generated using
+ [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+ See the
+ [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+ for more information.
+
+ Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+ temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
+ output more random, while lower values like 0.2 will make it more focused and
+ deterministic. If set to 0, the model will use
+ [log probability](https://en.wikipedia.org/wiki/Log_probability) to
+ automatically increase the temperature until certain thresholds are hit.
+
+ timestamp_granularities: The timestamp granularities to populate for this transcription.
+ `response_format` must be set `verbose_json` to use timestamp granularities.
+ Either or both of these options are supported: `word`, or `segment`. Note: There
+ is no additional latency for segment timestamps, but generating word timestamps
+ incurs additional latency. This option is not available for
+ `gpt-4o-transcribe-diarize`.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+ ) -> Transcription: ...
+
@overload
def create(
self,
@@ -114,6 +184,27 @@ class Transcriptions(SyncAPIResource):
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> str: ...
+ @overload
+ def create(
+ self,
+ *,
+ file: FileTypes,
+ model: Union[str, AudioModel],
+ chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
+ response_format: Literal["diarized_json"],
+ known_speaker_names: SequenceNotStr[str] | Omit = omit,
+ known_speaker_references: SequenceNotStr[str] | Omit = omit,
+ language: str | Omit = omit,
+ temperature: float | Omit = omit,
+ timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
+ ) -> TranscriptionDiarized: ...
+
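A minimal usage sketch of this new overload, with placeholder file names and reference clips (none of these values come from the commit). It combines the `diarized_json` response format with `known_speaker_names[]` / `known_speaker_references[]` and the `auto` chunking strategy required for inputs longer than 30 seconds:

```python
import base64

from openai import OpenAI

client = OpenAI()


def to_data_url(path: str) -> str:
    # known_speaker_references[] expects data URLs of short (2-10 second) reference clips.
    with open(path, "rb") as f:
        return "data:audio/wav;base64," + base64.b64encode(f.read()).decode()


with open("meeting.wav", "rb") as audio:
    transcript = client.audio.transcriptions.create(
        file=audio,
        model="gpt-4o-transcribe-diarize",
        response_format="diarized_json",  # required to receive speaker annotations
        chunking_strategy="auto",  # required for inputs longer than 30 seconds
        known_speaker_names=["agent", "customer"],
        known_speaker_references=[to_data_url("agent.wav"), to_data_url("customer.wav")],
    )

# The diarized_json overload types `transcript` as TranscriptionDiarized.
for segment in transcript.segments:
    print(f"[{segment.start:6.2f}-{segment.end:6.2f}] {segment.speaker}: {segment.text}")
```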
@overload
def create(
self,
@@ -123,6 +214,8 @@ class Transcriptions(SyncAPIResource):
stream: Literal[True],
chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
include: List[TranscriptionInclude] | Omit = omit,
+ known_speaker_names: SequenceNotStr[str] | Omit = omit,
+ known_speaker_references: SequenceNotStr[str] | Omit = omit,
language: str | Omit = omit,
prompt: str | Omit = omit,
response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -144,8 +237,8 @@ class Transcriptions(SyncAPIResource):
flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
model: ID of the model to use. The options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
- Whisper V2 model).
+ `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+ Whisper V2 model), and `gpt-4o-transcribe-diarize`.
stream: If set to true, the model response data will be streamed to the client as it is
generated using
@@ -160,12 +253,25 @@ class Transcriptions(SyncAPIResource):
first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. `server_vad` object can be provided to tweak VAD detection
parameters manually. If unset, the audio is transcribed as a single block.
+ Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+ seconds.
include: Additional information to include in the transcription response. `logprobs` will
return the log probabilities of the tokens in the response to understand the
model's confidence in the transcription. `logprobs` only works with
response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`.
+ `gpt-4o-mini-transcribe`. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
+
+ known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+ `known_speaker_references[]`. Each entry should be a short identifier (for
+ example `customer` or `agent`). Up to 4 speakers are supported.
+
+ known_speaker_references: Optional list of audio samples (as
+ [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+ that contain known speaker references matching `known_speaker_names[]`. Each
+ sample must be between 2 and 10 seconds, and can use any of the same input audio
+ formats supported by `file`.
language: The language of the input audio. Supplying the input language in
[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -174,11 +280,14 @@ class Transcriptions(SyncAPIResource):
prompt: An optional text to guide the model's style or continue a previous audio
segment. The
[prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
+ should match the audio language. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- the only supported format is `json`.
+ `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ `diarized_json`, with `diarized_json` required to receive speaker annotations.
temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
output more random, while lower values like 0.2 will make it more focused and
@@ -190,7 +299,8 @@ class Transcriptions(SyncAPIResource):
`response_format` must be set `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word`, or `segment`. Note: There
is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency.
+ incurs additional latency. This option is not available for
+ `gpt-4o-transcribe-diarize`.
extra_headers: Send extra headers
@@ -211,6 +321,8 @@ class Transcriptions(SyncAPIResource):
stream: bool,
chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
include: List[TranscriptionInclude] | Omit = omit,
+ known_speaker_names: SequenceNotStr[str] | Omit = omit,
+ known_speaker_references: SequenceNotStr[str] | Omit = omit,
language: str | Omit = omit,
prompt: str | Omit = omit,
response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -232,8 +344,8 @@ class Transcriptions(SyncAPIResource):
flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
model: ID of the model to use. The options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
- Whisper V2 model).
+ `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+ Whisper V2 model), and `gpt-4o-transcribe-diarize`.
stream: If set to true, the model response data will be streamed to the client as it is
generated using
@@ -248,12 +360,25 @@ class Transcriptions(SyncAPIResource):
first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. `server_vad` object can be provided to tweak VAD detection
parameters manually. If unset, the audio is transcribed as a single block.
+ Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+ seconds.
include: Additional information to include in the transcription response. `logprobs` will
return the log probabilities of the tokens in the response to understand the
model's confidence in the transcription. `logprobs` only works with
response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`.
+ `gpt-4o-mini-transcribe`. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
+
+ known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+ `known_speaker_references[]`. Each entry should be a short identifier (for
+ example `customer` or `agent`). Up to 4 speakers are supported.
+
+ known_speaker_references: Optional list of audio samples (as
+ [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+ that contain known speaker references matching `known_speaker_names[]`. Each
+ sample must be between 2 and 10 seconds, and can use any of the same input audio
+ formats supported by `file`.
language: The language of the input audio. Supplying the input language in
[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -262,11 +387,14 @@ class Transcriptions(SyncAPIResource):
prompt: An optional text to guide the model's style or continue a previous audio
segment. The
[prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
+ should match the audio language. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- the only supported format is `json`.
+ `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ `diarized_json`, with `diarized_json` required to receive speaker annotations.
temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
output more random, while lower values like 0.2 will make it more focused and
@@ -278,7 +406,8 @@ class Transcriptions(SyncAPIResource):
`response_format` must be set `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word`, or `segment`. Note: There
is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency.
+ incurs additional latency. This option is not available for
+ `gpt-4o-transcribe-diarize`.
extra_headers: Send extra headers
@@ -298,6 +427,8 @@ class Transcriptions(SyncAPIResource):
model: Union[str, AudioModel],
chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
include: List[TranscriptionInclude] | Omit = omit,
+ known_speaker_names: SequenceNotStr[str] | Omit = omit,
+ known_speaker_references: SequenceNotStr[str] | Omit = omit,
language: str | Omit = omit,
prompt: str | Omit = omit,
response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -310,13 +441,15 @@ class Transcriptions(SyncAPIResource):
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> str | Transcription | TranscriptionVerbose | Stream[TranscriptionStreamEvent]:
+ ) -> str | Transcription | TranscriptionDiarized | TranscriptionVerbose | Stream[TranscriptionStreamEvent]:
body = deepcopy_minimal(
{
"file": file,
"model": model,
"chunking_strategy": chunking_strategy,
"include": include,
+ "known_speaker_names": known_speaker_names,
+ "known_speaker_references": known_speaker_references,
"language": language,
"prompt": prompt,
"response_format": response_format,
@@ -376,6 +509,8 @@ class AsyncTranscriptions(AsyncAPIResource):
model: Union[str, AudioModel],
chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
include: List[TranscriptionInclude] | Omit = omit,
+ known_speaker_names: SequenceNotStr[str] | Omit = omit,
+ known_speaker_references: SequenceNotStr[str] | Omit = omit,
language: str | Omit = omit,
prompt: str | Omit = omit,
response_format: Union[Literal["json"], Omit] = omit,
@@ -398,19 +533,32 @@ class AsyncTranscriptions(AsyncAPIResource):
flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
model: ID of the model to use. The options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
- Whisper V2 model).
+ `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+ Whisper V2 model), and `gpt-4o-transcribe-diarize`.
chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. `server_vad` object can be provided to tweak VAD detection
parameters manually. If unset, the audio is transcribed as a single block.
+ Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+ seconds.
include: Additional information to include in the transcription response. `logprobs` will
return the log probabilities of the tokens in the response to understand the
model's confidence in the transcription. `logprobs` only works with
response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`.
+ `gpt-4o-mini-transcribe`. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
+
+ known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+ `known_speaker_references[]`. Each entry should be a short identifier (for
+ example `customer` or `agent`). Up to 4 speakers are supported.
+
+ known_speaker_references: Optional list of audio samples (as
+ [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+ that contain known speaker references matching `known_speaker_names[]`. Each
+ sample must be between 2 and 10 seconds, and can use any of the same input audio
+ formats supported by `file`.
language: The language of the input audio. Supplying the input language in
[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -419,11 +567,14 @@ class AsyncTranscriptions(AsyncAPIResource):
prompt: An optional text to guide the model's style or continue a previous audio
segment. The
[prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
+ should match the audio language. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- the only supported format is `json`.
+ `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ `diarized_json`, with `diarized_json` required to receive speaker annotations.
stream: If set to true, the model response data will be streamed to the client as it is
generated using
@@ -444,7 +595,8 @@ class AsyncTranscriptions(AsyncAPIResource):
`response_format` must be set `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word`, or `segment`. Note: There
is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency.
+ incurs additional latency. This option is not available for
+ `gpt-4o-transcribe-diarize`.
extra_headers: Send extra headers
@@ -502,6 +654,8 @@ class AsyncTranscriptions(AsyncAPIResource):
stream: Literal[True],
chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
include: List[TranscriptionInclude] | Omit = omit,
+ known_speaker_names: SequenceNotStr[str] | Omit = omit,
+ known_speaker_references: SequenceNotStr[str] | Omit = omit,
language: str | Omit = omit,
prompt: str | Omit = omit,
response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -523,8 +677,8 @@ class AsyncTranscriptions(AsyncAPIResource):
flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
model: ID of the model to use. The options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
- Whisper V2 model).
+ `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+ Whisper V2 model), and `gpt-4o-transcribe-diarize`.
stream: If set to true, the model response data will be streamed to the client as it is
generated using
@@ -539,12 +693,25 @@ class AsyncTranscriptions(AsyncAPIResource):
first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. `server_vad` object can be provided to tweak VAD detection
parameters manually. If unset, the audio is transcribed as a single block.
+ Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+ seconds.
include: Additional information to include in the transcription response. `logprobs` will
return the log probabilities of the tokens in the response to understand the
model's confidence in the transcription. `logprobs` only works with
response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`.
+ `gpt-4o-mini-transcribe`. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
+
+ known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+ `known_speaker_references[]`. Each entry should be a short identifier (for
+ example `customer` or `agent`). Up to 4 speakers are supported.
+
+ known_speaker_references: Optional list of audio samples (as
+ [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+ that contain known speaker references matching `known_speaker_names[]`. Each
+ sample must be between 2 and 10 seconds, and can use any of the same input audio
+ formats supported by `file`.
language: The language of the input audio. Supplying the input language in
[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -553,11 +720,14 @@ class AsyncTranscriptions(AsyncAPIResource):
prompt: An optional text to guide the model's style or continue a previous audio
segment. The
[prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
+ should match the audio language. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- the only supported format is `json`.
+ `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ `diarized_json`, with `diarized_json` required to receive speaker annotations.
temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
output more random, while lower values like 0.2 will make it more focused and
@@ -569,7 +739,8 @@ class AsyncTranscriptions(AsyncAPIResource):
`response_format` must be set `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word`, or `segment`. Note: There
is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency.
+ incurs additional latency. This option is not available for
+ `gpt-4o-transcribe-diarize`.
extra_headers: Send extra headers
@@ -590,6 +761,8 @@ class AsyncTranscriptions(AsyncAPIResource):
stream: bool,
chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
include: List[TranscriptionInclude] | Omit = omit,
+ known_speaker_names: SequenceNotStr[str] | Omit = omit,
+ known_speaker_references: SequenceNotStr[str] | Omit = omit,
language: str | Omit = omit,
prompt: str | Omit = omit,
response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -611,8 +784,8 @@ class AsyncTranscriptions(AsyncAPIResource):
flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
model: ID of the model to use. The options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
- Whisper V2 model).
+ `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+ Whisper V2 model), and `gpt-4o-transcribe-diarize`.
stream: If set to true, the model response data will be streamed to the client as it is
generated using
@@ -627,12 +800,25 @@ class AsyncTranscriptions(AsyncAPIResource):
first normalizes loudness and then uses voice activity detection (VAD) to choose
boundaries. `server_vad` object can be provided to tweak VAD detection
parameters manually. If unset, the audio is transcribed as a single block.
+ Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+ seconds.
include: Additional information to include in the transcription response. `logprobs` will
return the log probabilities of the tokens in the response to understand the
model's confidence in the transcription. `logprobs` only works with
response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`.
+ `gpt-4o-mini-transcribe`. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
+
+ known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+ `known_speaker_references[]`. Each entry should be a short identifier (for
+ example `customer` or `agent`). Up to 4 speakers are supported.
+
+ known_speaker_references: Optional list of audio samples (as
+ [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+ that contain known speaker references matching `known_speaker_names[]`. Each
+ sample must be between 2 and 10 seconds, and can use any of the same input audio
+ formats supported by `file`.
language: The language of the input audio. Supplying the input language in
[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -641,11 +827,14 @@ class AsyncTranscriptions(AsyncAPIResource):
prompt: An optional text to guide the model's style or continue a previous audio
segment. The
[prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
+ should match the audio language. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- the only supported format is `json`.
+ `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ `diarized_json`, with `diarized_json` required to receive speaker annotations.
temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
output more random, while lower values like 0.2 will make it more focused and
@@ -657,7 +846,8 @@ class AsyncTranscriptions(AsyncAPIResource):
`response_format` must be set `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word`, or `segment`. Note: There
is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency.
+ incurs additional latency. This option is not available for
+ `gpt-4o-transcribe-diarize`.
extra_headers: Send extra headers
@@ -677,6 +867,8 @@ class AsyncTranscriptions(AsyncAPIResource):
model: Union[str, AudioModel],
chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
include: List[TranscriptionInclude] | Omit = omit,
+ known_speaker_names: SequenceNotStr[str] | Omit = omit,
+ known_speaker_references: SequenceNotStr[str] | Omit = omit,
language: str | Omit = omit,
prompt: str | Omit = omit,
response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -689,13 +881,15 @@ class AsyncTranscriptions(AsyncAPIResource):
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> Transcription | TranscriptionVerbose | str | AsyncStream[TranscriptionStreamEvent]:
+ ) -> Transcription | TranscriptionVerbose | TranscriptionDiarized | str | AsyncStream[TranscriptionStreamEvent]:
body = deepcopy_minimal(
{
"file": file,
"model": model,
"chunking_strategy": chunking_strategy,
"include": include,
+ "known_speaker_names": known_speaker_names,
+ "known_speaker_references": known_speaker_references,
"language": language,
"prompt": prompt,
"response_format": response_format,
@@ -764,8 +958,8 @@ class AsyncTranscriptionsWithStreamingResponse:
def _get_response_format_type(
- response_format: Literal["json", "text", "srt", "verbose_json", "vtt"] | Omit,
-) -> type[Transcription | TranscriptionVerbose | str]:
+ response_format: AudioResponseFormat | Omit,
+) -> type[Transcription | TranscriptionVerbose | TranscriptionDiarized | str]:
if isinstance(response_format, Omit) or response_format is None: # pyright: ignore[reportUnnecessaryComparison]
return Transcription
@@ -773,6 +967,8 @@ def _get_response_format_type(
return Transcription
elif response_format == "verbose_json":
return TranscriptionVerbose
+ elif response_format == "diarized_json":
+ return TranscriptionDiarized
elif response_format == "srt" or response_format == "text" or response_format == "vtt":
return str
elif TYPE_CHECKING: # type: ignore[unreachable]
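For quick reference, the updated helper amounts to the following mapping (an illustration reconstructed from the branches above, not code from the commit):

```python
from openai.types.audio import Transcription, TranscriptionDiarized, TranscriptionVerbose

# Illustration only: which type the SDK now casts each response_format to.
FORMAT_TO_TYPE = {
    "json": Transcription,
    "verbose_json": TranscriptionVerbose,
    "diarized_json": TranscriptionDiarized,
    "text": str,
    "srt": str,
    "vtt": str,
}
```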
src/openai/resources/audio/translations.py
@@ -349,7 +349,7 @@ class AsyncTranslationsWithStreamingResponse:
def _get_response_format_type(
- response_format: Literal["json", "text", "srt", "verbose_json", "vtt"] | Omit,
+ response_format: AudioResponseFormat | Omit,
) -> type[Translation | TranslationVerbose | str]:
if isinstance(response_format, Omit) or response_format is None: # pyright: ignore[reportUnnecessaryComparison]
return Translation
@@ -360,8 +360,8 @@ def _get_response_format_type(
return TranslationVerbose
elif response_format == "srt" or response_format == "text" or response_format == "vtt":
return str
- elif TYPE_CHECKING: # type: ignore[unreachable]
+ elif TYPE_CHECKING and response_format != "diarized_json": # type: ignore[unreachable]
assert_never(response_format)
else:
- log.warn("Unexpected audio response format: %s", response_format)
- return Transcription
+ log.warning("Unexpected audio response format: %s", response_format)
+ return Translation
src/openai/resources/vector_stores/vector_stores.py
@@ -79,6 +79,7 @@ class VectorStores(SyncAPIResource):
self,
*,
chunking_strategy: FileChunkingStrategyParam | Omit = omit,
+ description: str | Omit = omit,
expires_after: vector_store_create_params.ExpiresAfter | Omit = omit,
file_ids: SequenceNotStr[str] | Omit = omit,
metadata: Optional[Metadata] | Omit = omit,
@@ -97,6 +98,9 @@ class VectorStores(SyncAPIResource):
chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto`
strategy. Only applicable if `file_ids` is non-empty.
+ description: A description for the vector store. Can be used to describe the vector store's
+ purpose.
+
expires_after: The expiration policy for a vector store.
file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
@@ -126,6 +130,7 @@ class VectorStores(SyncAPIResource):
body=maybe_transform(
{
"chunking_strategy": chunking_strategy,
+ "description": description,
"expires_after": expires_after,
"file_ids": file_ids,
"metadata": metadata,
@@ -424,6 +429,7 @@ class AsyncVectorStores(AsyncAPIResource):
self,
*,
chunking_strategy: FileChunkingStrategyParam | Omit = omit,
+ description: str | Omit = omit,
expires_after: vector_store_create_params.ExpiresAfter | Omit = omit,
file_ids: SequenceNotStr[str] | Omit = omit,
metadata: Optional[Metadata] | Omit = omit,
@@ -442,6 +448,9 @@ class AsyncVectorStores(AsyncAPIResource):
chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto`
strategy. Only applicable if `file_ids` is non-empty.
+ description: A description for the vector store. Can be used to describe the vector store's
+ purpose.
+
expires_after: The expiration policy for a vector store.
file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
@@ -471,6 +480,7 @@ class AsyncVectorStores(AsyncAPIResource):
body=await async_maybe_transform(
{
"chunking_strategy": chunking_strategy,
+ "description": description,
"expires_after": expires_after,
"file_ids": file_ids,
"metadata": metadata,
src/openai/types/audio/__init__.py
@@ -11,10 +11,13 @@ from .speech_create_params import SpeechCreateParams as SpeechCreateParams
from .transcription_include import TranscriptionInclude as TranscriptionInclude
from .transcription_segment import TranscriptionSegment as TranscriptionSegment
from .transcription_verbose import TranscriptionVerbose as TranscriptionVerbose
+from .transcription_diarized import TranscriptionDiarized as TranscriptionDiarized
from .translation_create_params import TranslationCreateParams as TranslationCreateParams
from .transcription_stream_event import TranscriptionStreamEvent as TranscriptionStreamEvent
from .transcription_create_params import TranscriptionCreateParams as TranscriptionCreateParams
from .translation_create_response import TranslationCreateResponse as TranslationCreateResponse
from .transcription_create_response import TranscriptionCreateResponse as TranscriptionCreateResponse
from .transcription_text_done_event import TranscriptionTextDoneEvent as TranscriptionTextDoneEvent
+from .transcription_diarized_segment import TranscriptionDiarizedSegment as TranscriptionDiarizedSegment
from .transcription_text_delta_event import TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent
+from .transcription_text_segment_event import TranscriptionTextSegmentEvent as TranscriptionTextSegmentEvent
src/openai/types/audio/transcription_create_params.py
@@ -5,7 +5,7 @@ from __future__ import annotations
from typing import List, Union, Optional
from typing_extensions import Literal, Required, TypeAlias, TypedDict
-from ..._types import FileTypes
+from ..._types import FileTypes, SequenceNotStr
from ..audio_model import AudioModel
from .transcription_include import TranscriptionInclude
from ..audio_response_format import AudioResponseFormat
@@ -29,8 +29,9 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
model: Required[Union[str, AudioModel]]
"""ID of the model to use.
- The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`
- (which is powered by our open source Whisper V2 model).
+ The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1`
+ (which is powered by our open source Whisper V2 model), and
+ `gpt-4o-transcribe-diarize`.
"""
chunking_strategy: Optional[ChunkingStrategy]
@@ -39,7 +40,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
When set to `"auto"`, the server first normalizes loudness and then uses voice
activity detection (VAD) to choose boundaries. `server_vad` object can be
provided to tweak VAD detection parameters manually. If unset, the audio is
- transcribed as a single block.
+ transcribed as a single block. Required when using `gpt-4o-transcribe-diarize`
+ for inputs longer than 30 seconds.
"""
include: List[TranscriptionInclude]
@@ -48,7 +50,24 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
return the log probabilities of the tokens in the response to understand the
model's confidence in the transcription. `logprobs` only works with
response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`.
+ `gpt-4o-mini-transcribe`. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
+ """
+
+ known_speaker_names: SequenceNotStr[str]
+ """
+ Optional list of speaker names that correspond to the audio samples provided in
+ `known_speaker_references[]`. Each entry should be a short identifier (for
+ example `customer` or `agent`). Up to 4 speakers are supported.
+ """
+
+ known_speaker_references: SequenceNotStr[str]
+ """
+ Optional list of audio samples (as
+ [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+ that contain known speaker references matching `known_speaker_names[]`. Each
+ sample must be between 2 and 10 seconds, and can use any of the same input audio
+ formats supported by `file`.
"""
language: str
@@ -64,14 +83,17 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
segment.
The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
+ should match the audio language. This field is not supported when using
+ `gpt-4o-transcribe-diarize`.
"""
response_format: AudioResponseFormat
"""
The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- the only supported format is `json`.
+ `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ `diarized_json`, with `diarized_json` required to receive speaker annotations.
"""
temperature: float
@@ -89,7 +111,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
`response_format` must be set `verbose_json` to use timestamp granularities.
Either or both of these options are supported: `word`, or `segment`. Note: There
is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency.
+ incurs additional latency. This option is not available for
+ `gpt-4o-transcribe-diarize`.
"""
src/openai/types/audio/transcription_create_response.py
@@ -5,7 +5,8 @@ from typing_extensions import TypeAlias
from .transcription import Transcription
from .transcription_verbose import TranscriptionVerbose
+from .transcription_diarized import TranscriptionDiarized
__all__ = ["TranscriptionCreateResponse"]
-TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionVerbose]
+TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionDiarized, TranscriptionVerbose]
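When the response format is only known at runtime, the widened union can be narrowed with `isinstance`; a brief sketch, reusing the `client` from the earlier transcription example:

```python
from openai.types import AudioResponseFormat
from openai.types.audio import TranscriptionDiarized

fmt: AudioResponseFormat = "diarized_json"  # imagine this arrives from configuration

with open("meeting.wav", "rb") as audio:
    response = client.audio.transcriptions.create(
        file=audio,
        model="gpt-4o-transcribe-diarize",
        response_format=fmt,
    )

if isinstance(response, TranscriptionDiarized):
    print(f"{len(response.segments)} diarized segments")
```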
src/openai/types/audio/transcription_diarized.py
@@ -0,0 +1,63 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Union, Optional
+from typing_extensions import Literal, Annotated, TypeAlias
+
+from ..._utils import PropertyInfo
+from ..._models import BaseModel
+from .transcription_diarized_segment import TranscriptionDiarizedSegment
+
+__all__ = ["TranscriptionDiarized", "Usage", "UsageTokens", "UsageTokensInputTokenDetails", "UsageDuration"]
+
+
+class UsageTokensInputTokenDetails(BaseModel):
+ audio_tokens: Optional[int] = None
+ """Number of audio tokens billed for this request."""
+
+ text_tokens: Optional[int] = None
+ """Number of text tokens billed for this request."""
+
+
+class UsageTokens(BaseModel):
+ input_tokens: int
+ """Number of input tokens billed for this request."""
+
+ output_tokens: int
+ """Number of output tokens generated."""
+
+ total_tokens: int
+ """Total number of tokens used (input + output)."""
+
+ type: Literal["tokens"]
+ """The type of the usage object. Always `tokens` for this variant."""
+
+ input_token_details: Optional[UsageTokensInputTokenDetails] = None
+ """Details about the input tokens billed for this request."""
+
+
+class UsageDuration(BaseModel):
+ seconds: float
+ """Duration of the input audio in seconds."""
+
+ type: Literal["duration"]
+ """The type of the usage object. Always `duration` for this variant."""
+
+
+Usage: TypeAlias = Annotated[Union[UsageTokens, UsageDuration], PropertyInfo(discriminator="type")]
+
+
+class TranscriptionDiarized(BaseModel):
+ duration: float
+ """Duration of the input audio in seconds."""
+
+ segments: List[TranscriptionDiarizedSegment]
+ """Segments of the transcript annotated with timestamps and speaker labels."""
+
+ task: Literal["transcribe"]
+ """The type of task that was run. Always `transcribe`."""
+
+ text: str
+ """The concatenated transcript text for the entire audio input."""
+
+ usage: Optional[Usage] = None
+ """Token or duration usage statistics for the request."""
src/openai/types/audio/transcription_diarized_segment.py
@@ -0,0 +1,32 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["TranscriptionDiarizedSegment"]
+
+
+class TranscriptionDiarizedSegment(BaseModel):
+ id: str
+ """Unique identifier for the segment."""
+
+ end: float
+ """End timestamp of the segment in seconds."""
+
+ speaker: str
+ """Speaker label for this segment.
+
+ When known speakers are provided, the label matches `known_speaker_names[]`.
+ Otherwise speakers are labeled sequentially using capital letters (`A`, `B`,
+ ...).
+ """
+
+ start: float
+ """Start timestamp of the segment in seconds."""
+
+ text: str
+ """Transcript text for this segment."""
+
+ type: Literal["transcript.text.segment"]
+ """The type of the segment. Always `transcript.text.segment`."""
src/openai/types/audio/transcription_stream_event.py
@@ -6,9 +6,11 @@ from typing_extensions import Annotated, TypeAlias
from ..._utils import PropertyInfo
from .transcription_text_done_event import TranscriptionTextDoneEvent
from .transcription_text_delta_event import TranscriptionTextDeltaEvent
+from .transcription_text_segment_event import TranscriptionTextSegmentEvent
__all__ = ["TranscriptionStreamEvent"]
TranscriptionStreamEvent: TypeAlias = Annotated[
- Union[TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent], PropertyInfo(discriminator="type")
+ Union[TranscriptionTextSegmentEvent, TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent],
+ PropertyInfo(discriminator="type"),
]
src/openai/types/audio/transcription_text_delta_event.py
@@ -33,3 +33,9 @@ class TranscriptionTextDeltaEvent(BaseModel):
[create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
with the `include[]` parameter set to `logprobs`.
"""
+
+ segment_id: Optional[str] = None
+ """Identifier of the diarized segment that this delta belongs to.
+
+ Only present when using `gpt-4o-transcribe-diarize`.
+ """
src/openai/types/audio/transcription_text_segment_event.py
@@ -0,0 +1,27 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["TranscriptionTextSegmentEvent"]
+
+
+class TranscriptionTextSegmentEvent(BaseModel):
+ id: str
+ """Unique identifier for the segment."""
+
+ end: float
+ """End timestamp of the segment in seconds."""
+
+ speaker: str
+ """Speaker label for this segment."""
+
+ start: float
+ """Start timestamp of the segment in seconds."""
+
+ text: str
+ """Transcript text for this segment."""
+
+ type: Literal["transcript.text.segment"]
+ """The type of the event. Always `transcript.text.segment`."""
src/openai/types/realtime/audio_transcription.py
@@ -17,13 +17,14 @@ class AudioTranscription(BaseModel):
format will improve accuracy and latency.
"""
- model: Optional[Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"]] = (
- None
- )
+ model: Optional[
+ Literal["whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe", "gpt-4o-transcribe-diarize"]
+ ] = None
"""The model to use for transcription.
- Current options are `whisper-1`, `gpt-4o-transcribe-latest`,
- `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+ Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`,
+ and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need
+ diarization with speaker labels.
"""
prompt: Optional[str] = None
@@ -31,6 +32,6 @@ class AudioTranscription(BaseModel):
An optional text to guide the model's style or continue a previous audio
segment. For `whisper-1`, the
[prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- "expect words related to technology".
+ For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+ prompt is a free text string, for example "expect words related to technology".
"""
src/openai/types/realtime/audio_transcription_param.py
@@ -16,11 +16,12 @@ class AudioTranscriptionParam(TypedDict, total=False):
format will improve accuracy and latency.
"""
- model: Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"]
+ model: Literal["whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe", "gpt-4o-transcribe-diarize"]
"""The model to use for transcription.
- Current options are `whisper-1`, `gpt-4o-transcribe-latest`,
- `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+ Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`,
+ and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need
+ diarization with speaker labels.
"""
prompt: str
@@ -28,6 +29,6 @@ class AudioTranscriptionParam(TypedDict, total=False):
An optional text to guide the model's style or continue a previous audio
segment. For `whisper-1`, the
[prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- "expect words related to technology".
+ For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+ prompt is a free text string, for example "expect words related to technology".
"""
src/openai/types/audio_model.py
@@ -4,4 +4,4 @@ from typing_extensions import Literal, TypeAlias
__all__ = ["AudioModel"]
-AudioModel: TypeAlias = Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]
+AudioModel: TypeAlias = Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe", "gpt-4o-transcribe-diarize"]
src/openai/types/audio_response_format.py
@@ -4,4 +4,4 @@ from typing_extensions import Literal, TypeAlias
__all__ = ["AudioResponseFormat"]
-AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"]
+AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt", "diarized_json"]
src/openai/types/vector_store_create_params.py
@@ -20,6 +20,12 @@ class VectorStoreCreateParams(TypedDict, total=False):
non-empty.
"""
+ description: str
+ """A description for the vector store.
+
+ Can be used to describe the vector store's purpose.
+ """
+
expires_after: ExpiresAfter
"""The expiration policy for a vector store."""
tests/api_resources/audio/test_transcriptions.py
@@ -32,6 +32,8 @@ class TestTranscriptions:
model="gpt-4o-transcribe",
chunking_strategy="auto",
include=["logprobs"],
+ known_speaker_names=["string"],
+ known_speaker_references=["string"],
language="language",
prompt="prompt",
response_format="json",
@@ -84,6 +86,8 @@ class TestTranscriptions:
stream=True,
chunking_strategy="auto",
include=["logprobs"],
+ known_speaker_names=["string"],
+ known_speaker_references=["string"],
language="language",
prompt="prompt",
response_format="json",
@@ -140,6 +144,8 @@ class TestAsyncTranscriptions:
model="gpt-4o-transcribe",
chunking_strategy="auto",
include=["logprobs"],
+ known_speaker_names=["string"],
+ known_speaker_references=["string"],
language="language",
prompt="prompt",
response_format="json",
@@ -192,6 +198,8 @@ class TestAsyncTranscriptions:
stream=True,
chunking_strategy="auto",
include=["logprobs"],
+ known_speaker_names=["string"],
+ known_speaker_references=["string"],
language="language",
prompt="prompt",
response_format="json",
tests/api_resources/test_vector_stores.py
@@ -31,6 +31,7 @@ class TestVectorStores:
def test_method_create_with_all_params(self, client: OpenAI) -> None:
vector_store = client.vector_stores.create(
chunking_strategy={"type": "auto"},
+ description="description",
expires_after={
"anchor": "last_active_at",
"days": 1,
@@ -299,6 +300,7 @@ class TestAsyncVectorStores:
async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None:
vector_store = await async_client.vector_stores.create(
chunking_strategy={"type": "auto"},
+ description="description",
expires_after={
"anchor": "last_active_at",
"days": 1,
tests/lib/test_audio.py
@@ -44,7 +44,8 @@ def test_translation_create_overloads_in_sync(sync: bool, client: OpenAI, async_
elif is_literal_type(typ):
overload_response_formats.update(get_args(typ))
- src_response_formats: set[str] = set(get_args(AudioResponseFormat))
+ # 'diarized_json' applies only to transcriptions, not translations.
+ src_response_formats: set[str] = set(get_args(AudioResponseFormat)) - {"diarized_json"}
diff = src_response_formats.difference(overload_response_formats)
assert len(diff) == 0, f"some response format options don't have overloads"
@@ -57,18 +58,27 @@ def test_transcription_create_overloads_in_sync(sync: bool, client: OpenAI, asyn
overload_response_formats: set[str] = set()
for i, overload in enumerate(typing_extensions.get_overloads(fn)):
- assert_signatures_in_sync(
- fn,
- overload,
- exclude_params={"response_format", "stream"},
- description=f" for overload {i}",
- )
-
sig = inspect.signature(overload)
typ = evaluate_forwardref(
sig.parameters["response_format"].annotation,
globalns=sys.modules[fn.__module__].__dict__,
)
+
+ exclude_params = {"response_format", "stream"}
+ # known_speaker_names and known_speaker_references are only supported by diarized_json
+ if not (is_literal_type(typ) and set(get_args(typ)) == {"diarized_json"}):
+ exclude_params.update({"known_speaker_names", "known_speaker_references"})
+
+ # diarized_json does not support these parameters
+ if is_literal_type(typ) and set(get_args(typ)) == {"diarized_json"}:
+ exclude_params.update({"include", "prompt", "timestamp_granularities"})
+
+ assert_signatures_in_sync(
+ fn,
+ overload,
+ exclude_params=exclude_params,
+ description=f" for overload {i}",
+ )
if is_union_type(typ):
for arg in get_args(typ):
if not is_literal_type(arg):
.stats.yml
@@ -1,4 +1,4 @@
configured_endpoints: 136
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-11d308a9ef78ad01aa11c880a084a3982276800d7994db3f454aa515474977d7.yml
-openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1
-config_hash: f0940d0906846178759ef7128e4cb98e
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml
+openapi_spec_hash: fdc03ed84a65a31b80da909255e53924
+config_hash: 03b48e9b8c7231a902403210dbd7dfa0
api.md
@@ -171,11 +171,14 @@ Types:
```python
from openai.types.audio import (
Transcription,
+ TranscriptionDiarized,
+ TranscriptionDiarizedSegment,
TranscriptionInclude,
TranscriptionSegment,
TranscriptionStreamEvent,
TranscriptionTextDeltaEvent,
TranscriptionTextDoneEvent,
+ TranscriptionTextSegmentEvent,
TranscriptionVerbose,
TranscriptionWord,
TranscriptionCreateResponse,