Commit 25cbb74f

stainless-app[bot] <142633134+stainless-app[bot]@users.noreply.github.com>
2025-10-16 22:55:31
feat(api): Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint
Parent: 8cdfd06
src/openai/resources/audio/transcriptions.py
@@ -9,8 +9,17 @@ from typing_extensions import Literal, overload, assert_never
 import httpx
 
 from ... import _legacy_response
-from ...types import AudioResponseFormat
-from ..._types import Body, Omit, Query, Headers, NotGiven, FileTypes, omit, not_given
+from ..._types import (
+    Body,
+    Omit,
+    Query,
+    Headers,
+    NotGiven,
+    FileTypes,
+    SequenceNotStr,
+    omit,
+    not_given,
+)
 from ..._utils import extract_files, required_args, maybe_transform, deepcopy_minimal, async_maybe_transform
 from ..._compat import cached_property
 from ..._resource import SyncAPIResource, AsyncAPIResource
@@ -23,6 +32,7 @@ from ...types.audio.transcription import Transcription
 from ...types.audio_response_format import AudioResponseFormat
 from ...types.audio.transcription_include import TranscriptionInclude
 from ...types.audio.transcription_verbose import TranscriptionVerbose
+from ...types.audio.transcription_diarized import TranscriptionDiarized
 from ...types.audio.transcription_stream_event import TranscriptionStreamEvent
 from ...types.audio.transcription_create_response import TranscriptionCreateResponse
 
@@ -93,6 +103,66 @@ class Transcriptions(SyncAPIResource):
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> TranscriptionVerbose: ...
 
+              model's confidence in the transcription. `logprobs` only works with
+              response_format set to `json` and only with the models `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
+
+          known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+              `known_speaker_references[]`. Each entry should be a short identifier (for
+              example `customer` or `agent`). Up to 4 speakers are supported.
+
+          known_speaker_references: Optional list of audio samples (as
+              [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+              that contain known speaker references matching `known_speaker_names[]`. Each
+              sample must be between 2 and 10 seconds, and can use any of the same input audio
+              formats supported by `file`.
+
+          language: The language of the input audio. Supplying the input language in
+              [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+              format will improve accuracy and latency.
+
+          prompt: An optional text to guide the model's style or continue a previous audio
+              segment. The
+              [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+              should match the audio language. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
+
+          response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
+              `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+              `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+              `diarized_json`, with `diarized_json` required to receive speaker annotations.
+
+          stream: If set to true, the model response data will be streamed to the client as it is
+              generated using
+              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+              See the
+              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+              for more information.
+
+              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+          temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
+              output more random, while lower values like 0.2 will make it more focused and
+              deterministic. If set to 0, the model will use
+              [log probability](https://en.wikipedia.org/wiki/Log_probability) to
+              automatically increase the temperature until certain thresholds are hit.
+
+          timestamp_granularities: The timestamp granularities to populate for this transcription.
+              `response_format` must be set `verbose_json` to use timestamp granularities.
+              Either or both of these options are supported: `word`, or `segment`. Note: There
+              is no additional latency for segment timestamps, but generating word timestamps
+              incurs additional latency. This option is not available for
+              `gpt-4o-transcribe-diarize`.
+
+          extra_headers: Send extra headers
+
+          extra_query: Add additional query parameters to the request
+
+          extra_body: Add additional JSON properties to the request
+    ) -> Transcription: ...
+
     @overload
     def create(
         self,
@@ -114,6 +184,27 @@ class Transcriptions(SyncAPIResource):
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
     ) -> str: ...
 
+    @overload
+    def create(
+        self,
+        *,
+        file: FileTypes,
+        model: Union[str, AudioModel],
+        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
+        response_format: Literal["diarized_json"],
+        known_speaker_names: SequenceNotStr[str] | Omit = omit,
+        known_speaker_references: SequenceNotStr[str] | Omit = omit,
+        language: str | Omit = omit,
+        temperature: float | Omit = omit,
+        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
+        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+        # The extra values given here take precedence over values defined on the client or passed to this method.
+        extra_headers: Headers | None = None,
+        extra_query: Query | None = None,
+        extra_body: Body | None = None,
+        timeout: float | httpx.Timeout | None | NotGiven = not_given,
+    ) -> TranscriptionDiarized: ...
+
     @overload
     def create(
         self,
@@ -123,6 +214,8 @@ class Transcriptions(SyncAPIResource):
         stream: Literal[True],
         chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
         include: List[TranscriptionInclude] | Omit = omit,
+        known_speaker_names: SequenceNotStr[str] | Omit = omit,
+        known_speaker_references: SequenceNotStr[str] | Omit = omit,
         language: str | Omit = omit,
         prompt: str | Omit = omit,
         response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -144,8 +237,8 @@ class Transcriptions(SyncAPIResource):
               flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
 
           model: ID of the model to use. The options are `gpt-4o-transcribe`,
-              `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
-              Whisper V2 model).
+              `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+              Whisper V2 model), and `gpt-4o-transcribe-diarize`.
 
           stream: If set to true, the model response data will be streamed to the client as it is
               generated using
@@ -160,12 +253,25 @@ class Transcriptions(SyncAPIResource):
               first normalizes loudness and then uses voice activity detection (VAD) to choose
               boundaries. `server_vad` object can be provided to tweak VAD detection
               parameters manually. If unset, the audio is transcribed as a single block.
+              Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+              seconds.
 
           include: Additional information to include in the transcription response. `logprobs` will
               return the log probabilities of the tokens in the response to understand the
               model's confidence in the transcription. `logprobs` only works with
               response_format set to `json` and only with the models `gpt-4o-transcribe` and
-              `gpt-4o-mini-transcribe`.
+              `gpt-4o-mini-transcribe`. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
+
+          known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+              `known_speaker_references[]`. Each entry should be a short identifier (for
+              example `customer` or `agent`). Up to 4 speakers are supported.
+
+          known_speaker_references: Optional list of audio samples (as
+              [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+              that contain known speaker references matching `known_speaker_names[]`. Each
+              sample must be between 2 and 10 seconds, and can use any of the same input audio
+              formats supported by `file`.
 
           language: The language of the input audio. Supplying the input language in
               [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -174,11 +280,14 @@ class Transcriptions(SyncAPIResource):
           prompt: An optional text to guide the model's style or continue a previous audio
               segment. The
               [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-              should match the audio language.
+              should match the audio language. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
 
           response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
-              `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
-              the only supported format is `json`.
+              `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+              `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+              `diarized_json`, with `diarized_json` required to receive speaker annotations.
 
           temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
               output more random, while lower values like 0.2 will make it more focused and
@@ -190,7 +299,8 @@ class Transcriptions(SyncAPIResource):
               `response_format` must be set `verbose_json` to use timestamp granularities.
               Either or both of these options are supported: `word`, or `segment`. Note: There
               is no additional latency for segment timestamps, but generating word timestamps
-              incurs additional latency.
+              incurs additional latency. This option is not available for
+              `gpt-4o-transcribe-diarize`.
 
           extra_headers: Send extra headers
 
@@ -211,6 +321,8 @@ class Transcriptions(SyncAPIResource):
         stream: bool,
         chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
         include: List[TranscriptionInclude] | Omit = omit,
+        known_speaker_names: SequenceNotStr[str] | Omit = omit,
+        known_speaker_references: SequenceNotStr[str] | Omit = omit,
         language: str | Omit = omit,
         prompt: str | Omit = omit,
         response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -232,8 +344,8 @@ class Transcriptions(SyncAPIResource):
               flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
 
           model: ID of the model to use. The options are `gpt-4o-transcribe`,
-              `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
-              Whisper V2 model).
+              `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+              Whisper V2 model), and `gpt-4o-transcribe-diarize`.
 
           stream: If set to true, the model response data will be streamed to the client as it is
               generated using
@@ -248,12 +360,25 @@ class Transcriptions(SyncAPIResource):
               first normalizes loudness and then uses voice activity detection (VAD) to choose
               boundaries. `server_vad` object can be provided to tweak VAD detection
               parameters manually. If unset, the audio is transcribed as a single block.
+              Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+              seconds.
 
           include: Additional information to include in the transcription response. `logprobs` will
               return the log probabilities of the tokens in the response to understand the
               model's confidence in the transcription. `logprobs` only works with
               response_format set to `json` and only with the models `gpt-4o-transcribe` and
-              `gpt-4o-mini-transcribe`.
+              `gpt-4o-mini-transcribe`. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
+
+          known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+              `known_speaker_references[]`. Each entry should be a short identifier (for
+              example `customer` or `agent`). Up to 4 speakers are supported.
+
+          known_speaker_references: Optional list of audio samples (as
+              [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+              that contain known speaker references matching `known_speaker_names[]`. Each
+              sample must be between 2 and 10 seconds, and can use any of the same input audio
+              formats supported by `file`.
 
           language: The language of the input audio. Supplying the input language in
               [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -262,11 +387,14 @@ class Transcriptions(SyncAPIResource):
           prompt: An optional text to guide the model's style or continue a previous audio
               segment. The
               [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-              should match the audio language.
+              should match the audio language. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
 
           response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
-              `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
-              the only supported format is `json`.
+              `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+              `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+              `diarized_json`, with `diarized_json` required to receive speaker annotations.
 
           temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
               output more random, while lower values like 0.2 will make it more focused and
@@ -278,7 +406,8 @@ class Transcriptions(SyncAPIResource):
               `response_format` must be set `verbose_json` to use timestamp granularities.
               Either or both of these options are supported: `word`, or `segment`. Note: There
               is no additional latency for segment timestamps, but generating word timestamps
-              incurs additional latency.
+              incurs additional latency. This option is not available for
+              `gpt-4o-transcribe-diarize`.
 
           extra_headers: Send extra headers
 
@@ -298,6 +427,8 @@ class Transcriptions(SyncAPIResource):
         model: Union[str, AudioModel],
         chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
         include: List[TranscriptionInclude] | Omit = omit,
+        known_speaker_names: SequenceNotStr[str] | Omit = omit,
+        known_speaker_references: SequenceNotStr[str] | Omit = omit,
         language: str | Omit = omit,
         prompt: str | Omit = omit,
         response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -310,13 +441,15 @@ class Transcriptions(SyncAPIResource):
         extra_query: Query | None = None,
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> str | Transcription | TranscriptionVerbose | Stream[TranscriptionStreamEvent]:
+    ) -> str | Transcription | TranscriptionDiarized | TranscriptionVerbose | Stream[TranscriptionStreamEvent]:
         body = deepcopy_minimal(
             {
                 "file": file,
                 "model": model,
                 "chunking_strategy": chunking_strategy,
                 "include": include,
+                "known_speaker_names": known_speaker_names,
+                "known_speaker_references": known_speaker_references,
                 "language": language,
                 "prompt": prompt,
                 "response_format": response_format,
@@ -376,6 +509,8 @@ class AsyncTranscriptions(AsyncAPIResource):
         model: Union[str, AudioModel],
         chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
         include: List[TranscriptionInclude] | Omit = omit,
+        known_speaker_names: SequenceNotStr[str] | Omit = omit,
+        known_speaker_references: SequenceNotStr[str] | Omit = omit,
         language: str | Omit = omit,
         prompt: str | Omit = omit,
         response_format: Union[Literal["json"], Omit] = omit,
@@ -398,19 +533,32 @@ class AsyncTranscriptions(AsyncAPIResource):
               flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
 
           model: ID of the model to use. The options are `gpt-4o-transcribe`,
-              `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
-              Whisper V2 model).
+              `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+              Whisper V2 model), and `gpt-4o-transcribe-diarize`.
 
           chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
               first normalizes loudness and then uses voice activity detection (VAD) to choose
               boundaries. `server_vad` object can be provided to tweak VAD detection
               parameters manually. If unset, the audio is transcribed as a single block.
+              Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+              seconds.
 
           include: Additional information to include in the transcription response. `logprobs` will
               return the log probabilities of the tokens in the response to understand the
               model's confidence in the transcription. `logprobs` only works with
               response_format set to `json` and only with the models `gpt-4o-transcribe` and
-              `gpt-4o-mini-transcribe`.
+              `gpt-4o-mini-transcribe`. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
+
+          known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+              `known_speaker_references[]`. Each entry should be a short identifier (for
+              example `customer` or `agent`). Up to 4 speakers are supported.
+
+          known_speaker_references: Optional list of audio samples (as
+              [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+              that contain known speaker references matching `known_speaker_names[]`. Each
+              sample must be between 2 and 10 seconds, and can use any of the same input audio
+              formats supported by `file`.
 
           language: The language of the input audio. Supplying the input language in
               [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -419,11 +567,14 @@ class AsyncTranscriptions(AsyncAPIResource):
           prompt: An optional text to guide the model's style or continue a previous audio
               segment. The
               [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-              should match the audio language.
+              should match the audio language. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
 
           response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
-              `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
-              the only supported format is `json`.
+              `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+              `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+              `diarized_json`, with `diarized_json` required to receive speaker annotations.
 
           stream: If set to true, the model response data will be streamed to the client as it is
               generated using
@@ -444,7 +595,8 @@ class AsyncTranscriptions(AsyncAPIResource):
               `response_format` must be set `verbose_json` to use timestamp granularities.
               Either or both of these options are supported: `word`, or `segment`. Note: There
               is no additional latency for segment timestamps, but generating word timestamps
-              incurs additional latency.
+              incurs additional latency. This option is not available for
+              `gpt-4o-transcribe-diarize`.
 
           extra_headers: Send extra headers
 
@@ -502,6 +654,8 @@ class AsyncTranscriptions(AsyncAPIResource):
         stream: Literal[True],
         chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
         include: List[TranscriptionInclude] | Omit = omit,
+        known_speaker_names: SequenceNotStr[str] | Omit = omit,
+        known_speaker_references: SequenceNotStr[str] | Omit = omit,
         language: str | Omit = omit,
         prompt: str | Omit = omit,
         response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -523,8 +677,8 @@ class AsyncTranscriptions(AsyncAPIResource):
               flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
 
           model: ID of the model to use. The options are `gpt-4o-transcribe`,
-              `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
-              Whisper V2 model).
+              `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+              Whisper V2 model), and `gpt-4o-transcribe-diarize`.
 
           stream: If set to true, the model response data will be streamed to the client as it is
               generated using
@@ -539,12 +693,25 @@ class AsyncTranscriptions(AsyncAPIResource):
               first normalizes loudness and then uses voice activity detection (VAD) to choose
               boundaries. `server_vad` object can be provided to tweak VAD detection
               parameters manually. If unset, the audio is transcribed as a single block.
+              Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+              seconds.
 
           include: Additional information to include in the transcription response. `logprobs` will
               return the log probabilities of the tokens in the response to understand the
               model's confidence in the transcription. `logprobs` only works with
               response_format set to `json` and only with the models `gpt-4o-transcribe` and
-              `gpt-4o-mini-transcribe`.
+              `gpt-4o-mini-transcribe`. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
+
+          known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+              `known_speaker_references[]`. Each entry should be a short identifier (for
+              example `customer` or `agent`). Up to 4 speakers are supported.
+
+          known_speaker_references: Optional list of audio samples (as
+              [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+              that contain known speaker references matching `known_speaker_names[]`. Each
+              sample must be between 2 and 10 seconds, and can use any of the same input audio
+              formats supported by `file`.
 
           language: The language of the input audio. Supplying the input language in
               [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -553,11 +720,14 @@ class AsyncTranscriptions(AsyncAPIResource):
           prompt: An optional text to guide the model's style or continue a previous audio
               segment. The
               [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-              should match the audio language.
+              should match the audio language. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
 
           response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
-              `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
-              the only supported format is `json`.
+              `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+              `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+              `diarized_json`, with `diarized_json` required to receive speaker annotations.
 
           temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
               output more random, while lower values like 0.2 will make it more focused and
@@ -569,7 +739,8 @@ class AsyncTranscriptions(AsyncAPIResource):
               `response_format` must be set `verbose_json` to use timestamp granularities.
               Either or both of these options are supported: `word`, or `segment`. Note: There
               is no additional latency for segment timestamps, but generating word timestamps
-              incurs additional latency.
+              incurs additional latency. This option is not available for
+              `gpt-4o-transcribe-diarize`.
 
           extra_headers: Send extra headers
 
@@ -590,6 +761,8 @@ class AsyncTranscriptions(AsyncAPIResource):
         stream: bool,
         chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
         include: List[TranscriptionInclude] | Omit = omit,
+        known_speaker_names: SequenceNotStr[str] | Omit = omit,
+        known_speaker_references: SequenceNotStr[str] | Omit = omit,
         language: str | Omit = omit,
         prompt: str | Omit = omit,
         response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -611,8 +784,8 @@ class AsyncTranscriptions(AsyncAPIResource):
               flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
 
           model: ID of the model to use. The options are `gpt-4o-transcribe`,
-              `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
-              Whisper V2 model).
+              `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+              Whisper V2 model), and `gpt-4o-transcribe-diarize`.
 
           stream: If set to true, the model response data will be streamed to the client as it is
               generated using
@@ -627,12 +800,25 @@ class AsyncTranscriptions(AsyncAPIResource):
               first normalizes loudness and then uses voice activity detection (VAD) to choose
               boundaries. `server_vad` object can be provided to tweak VAD detection
               parameters manually. If unset, the audio is transcribed as a single block.
+              Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+              seconds.
 
           include: Additional information to include in the transcription response. `logprobs` will
               return the log probabilities of the tokens in the response to understand the
               model's confidence in the transcription. `logprobs` only works with
               response_format set to `json` and only with the models `gpt-4o-transcribe` and
-              `gpt-4o-mini-transcribe`.
+              `gpt-4o-mini-transcribe`. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
+
+          known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
+              `known_speaker_references[]`. Each entry should be a short identifier (for
+              example `customer` or `agent`). Up to 4 speakers are supported.
+
+          known_speaker_references: Optional list of audio samples (as
+              [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+              that contain known speaker references matching `known_speaker_names[]`. Each
+              sample must be between 2 and 10 seconds, and can use any of the same input audio
+              formats supported by `file`.
 
           language: The language of the input audio. Supplying the input language in
               [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -641,11 +827,14 @@ class AsyncTranscriptions(AsyncAPIResource):
           prompt: An optional text to guide the model's style or continue a previous audio
               segment. The
               [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-              should match the audio language.
+              should match the audio language. This field is not supported when using
+              `gpt-4o-transcribe-diarize`.
 
           response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
-              `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
-              the only supported format is `json`.
+              `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+              `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+              `diarized_json`, with `diarized_json` required to receive speaker annotations.
 
           temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
               output more random, while lower values like 0.2 will make it more focused and
@@ -657,7 +846,8 @@ class AsyncTranscriptions(AsyncAPIResource):
               `response_format` must be set `verbose_json` to use timestamp granularities.
               Either or both of these options are supported: `word`, or `segment`. Note: There
               is no additional latency for segment timestamps, but generating word timestamps
-              incurs additional latency.
+              incurs additional latency. This option is not available for
+              `gpt-4o-transcribe-diarize`.
 
           extra_headers: Send extra headers
 
@@ -677,6 +867,8 @@ class AsyncTranscriptions(AsyncAPIResource):
         model: Union[str, AudioModel],
         chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
         include: List[TranscriptionInclude] | Omit = omit,
+        known_speaker_names: SequenceNotStr[str] | Omit = omit,
+        known_speaker_references: SequenceNotStr[str] | Omit = omit,
         language: str | Omit = omit,
         prompt: str | Omit = omit,
         response_format: Union[AudioResponseFormat, Omit] = omit,
@@ -689,13 +881,15 @@ class AsyncTranscriptions(AsyncAPIResource):
         extra_query: Query | None = None,
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = not_given,
-    ) -> Transcription | TranscriptionVerbose | str | AsyncStream[TranscriptionStreamEvent]:
+    ) -> Transcription | TranscriptionVerbose | TranscriptionDiarized | str | AsyncStream[TranscriptionStreamEvent]:
         body = deepcopy_minimal(
             {
                 "file": file,
                 "model": model,
                 "chunking_strategy": chunking_strategy,
                 "include": include,
+                "known_speaker_names": known_speaker_names,
+                "known_speaker_references": known_speaker_references,
                 "language": language,
                 "prompt": prompt,
                 "response_format": response_format,
@@ -764,8 +958,8 @@ class AsyncTranscriptionsWithStreamingResponse:
 
 
 def _get_response_format_type(
-    response_format: Literal["json", "text", "srt", "verbose_json", "vtt"] | Omit,
-) -> type[Transcription | TranscriptionVerbose | str]:
+    response_format: AudioResponseFormat | Omit,
+) -> type[Transcription | TranscriptionVerbose | TranscriptionDiarized | str]:
     if isinstance(response_format, Omit) or response_format is None:  # pyright: ignore[reportUnnecessaryComparison]
         return Transcription
 
@@ -773,6 +967,8 @@ def _get_response_format_type(
         return Transcription
     elif response_format == "verbose_json":
         return TranscriptionVerbose
+    elif response_format == "diarized_json":
+        return TranscriptionDiarized
     elif response_format == "srt" or response_format == "text" or response_format == "vtt":
         return str
     elif TYPE_CHECKING:  # type: ignore[unreachable]
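
The new diarization surface above could be exercised roughly as follows — a minimal sketch, assuming a configured `OpenAI` client with `OPENAI_API_KEY` in the environment and two short local reference clips; the file names, speaker labels, and the `to_data_url` helper are illustrative, not part of this commit:

```python
import base64

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def to_data_url(path: str, mime: str = "audio/wav") -> str:
    # known_speaker_references[] expects data URLs of 2-10 second reference clips.
    with open(path, "rb") as f:
        return f"data:{mime};base64," + base64.b64encode(f.read()).decode()


with open("meeting.wav", "rb") as audio:
    transcript = client.audio.transcriptions.create(
        file=audio,
        model="gpt-4o-transcribe-diarize",
        response_format="diarized_json",  # required to receive speaker annotations
        chunking_strategy="auto",  # required for inputs longer than 30 seconds
        known_speaker_names=["agent", "customer"],
        known_speaker_references=[to_data_url("agent.wav"), to_data_url("customer.wav")],
    )

for segment in transcript.segments:
    print(f"[{segment.speaker}] {segment.start:.1f}-{segment.end:.1f}s: {segment.text}")
```

Because the call matches the new `diarized_json` overload, `transcript` is typed as `TranscriptionDiarized` and `transcript.segments` as a list of `TranscriptionDiarizedSegment`.
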
src/openai/resources/audio/translations.py
@@ -349,7 +349,7 @@ class AsyncTranslationsWithStreamingResponse:
 
 
 def _get_response_format_type(
-    response_format: Literal["json", "text", "srt", "verbose_json", "vtt"] | Omit,
+    response_format: AudioResponseFormat | Omit,
 ) -> type[Translation | TranslationVerbose | str]:
     if isinstance(response_format, Omit) or response_format is None:  # pyright: ignore[reportUnnecessaryComparison]
         return Translation
@@ -360,8 +360,8 @@ def _get_response_format_type(
         return TranslationVerbose
     elif response_format == "srt" or response_format == "text" or response_format == "vtt":
         return str
-    elif TYPE_CHECKING:  # type: ignore[unreachable]
+    elif TYPE_CHECKING and response_format != "diarized_json":  # type: ignore[unreachable]
         assert_never(response_format)
     else:
-        log.warn("Unexpected audio response format: %s", response_format)
-        return Transcription
+        log.warning("Unexpected audio response format: %s", response_format)
+        return Translation
src/openai/resources/vector_stores/vector_stores.py
@@ -79,6 +79,7 @@ class VectorStores(SyncAPIResource):
         self,
         *,
         chunking_strategy: FileChunkingStrategyParam | Omit = omit,
+        description: str | Omit = omit,
         expires_after: vector_store_create_params.ExpiresAfter | Omit = omit,
         file_ids: SequenceNotStr[str] | Omit = omit,
         metadata: Optional[Metadata] | Omit = omit,
@@ -97,6 +98,9 @@ class VectorStores(SyncAPIResource):
           chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto`
               strategy. Only applicable if `file_ids` is non-empty.
 
+          description: A description for the vector store. Can be used to describe the vector store's
+              purpose.
+
           expires_after: The expiration policy for a vector store.
 
           file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
@@ -126,6 +130,7 @@ class VectorStores(SyncAPIResource):
             body=maybe_transform(
                 {
                     "chunking_strategy": chunking_strategy,
+                    "description": description,
                     "expires_after": expires_after,
                     "file_ids": file_ids,
                     "metadata": metadata,
@@ -424,6 +429,7 @@ class AsyncVectorStores(AsyncAPIResource):
         self,
         *,
         chunking_strategy: FileChunkingStrategyParam | Omit = omit,
+        description: str | Omit = omit,
         expires_after: vector_store_create_params.ExpiresAfter | Omit = omit,
         file_ids: SequenceNotStr[str] | Omit = omit,
         metadata: Optional[Metadata] | Omit = omit,
@@ -442,6 +448,9 @@ class AsyncVectorStores(AsyncAPIResource):
           chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto`
               strategy. Only applicable if `file_ids` is non-empty.
 
+          description: A description for the vector store. Can be used to describe the vector store's
+              purpose.
+
           expires_after: The expiration policy for a vector store.
 
           file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
@@ -471,6 +480,7 @@ class AsyncVectorStores(AsyncAPIResource):
             body=await async_maybe_transform(
                 {
                     "chunking_strategy": chunking_strategy,
+                    "description": description,
                     "expires_after": expires_after,
                     "file_ids": file_ids,
                     "metadata": metadata,
src/openai/types/audio/__init__.py
@@ -11,10 +11,13 @@ from .speech_create_params import SpeechCreateParams as SpeechCreateParams
 from .transcription_include import TranscriptionInclude as TranscriptionInclude
 from .transcription_segment import TranscriptionSegment as TranscriptionSegment
 from .transcription_verbose import TranscriptionVerbose as TranscriptionVerbose
+from .transcription_diarized import TranscriptionDiarized as TranscriptionDiarized
 from .translation_create_params import TranslationCreateParams as TranslationCreateParams
 from .transcription_stream_event import TranscriptionStreamEvent as TranscriptionStreamEvent
 from .transcription_create_params import TranscriptionCreateParams as TranscriptionCreateParams
 from .translation_create_response import TranslationCreateResponse as TranslationCreateResponse
 from .transcription_create_response import TranscriptionCreateResponse as TranscriptionCreateResponse
 from .transcription_text_done_event import TranscriptionTextDoneEvent as TranscriptionTextDoneEvent
+from .transcription_diarized_segment import TranscriptionDiarizedSegment as TranscriptionDiarizedSegment
 from .transcription_text_delta_event import TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent
+from .transcription_text_segment_event import TranscriptionTextSegmentEvent as TranscriptionTextSegmentEvent
src/openai/types/audio/transcription_create_params.py
@@ -5,7 +5,7 @@ from __future__ import annotations
 from typing import List, Union, Optional
 from typing_extensions import Literal, Required, TypeAlias, TypedDict
 
-from ..._types import FileTypes
+from ..._types import FileTypes, SequenceNotStr
 from ..audio_model import AudioModel
 from .transcription_include import TranscriptionInclude
 from ..audio_response_format import AudioResponseFormat
@@ -29,8 +29,9 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
     model: Required[Union[str, AudioModel]]
     """ID of the model to use.
 
-    The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`
-    (which is powered by our open source Whisper V2 model).
+    The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1`
+    (which is powered by our open source Whisper V2 model), and
+    `gpt-4o-transcribe-diarize`.
     """
 
     chunking_strategy: Optional[ChunkingStrategy]
@@ -39,7 +40,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
     When set to `"auto"`, the server first normalizes loudness and then uses voice
     activity detection (VAD) to choose boundaries. `server_vad` object can be
     provided to tweak VAD detection parameters manually. If unset, the audio is
-    transcribed as a single block.
+    transcribed as a single block. Required when using `gpt-4o-transcribe-diarize`
+    for inputs longer than 30 seconds.
     """
 
     include: List[TranscriptionInclude]
@@ -48,7 +50,24 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
     return the log probabilities of the tokens in the response to understand the
     model's confidence in the transcription. `logprobs` only works with
     response_format set to `json` and only with the models `gpt-4o-transcribe` and
-    `gpt-4o-mini-transcribe`.
+    `gpt-4o-mini-transcribe`. This field is not supported when using
+    `gpt-4o-transcribe-diarize`.
+    """
+
+    known_speaker_names: SequenceNotStr[str]
+    """
+    Optional list of speaker names that correspond to the audio samples provided in
+    `known_speaker_references[]`. Each entry should be a short identifier (for
+    example `customer` or `agent`). Up to 4 speakers are supported.
+    """
+
+    known_speaker_references: SequenceNotStr[str]
+    """
+    Optional list of audio samples (as
+    [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+    that contain known speaker references matching `known_speaker_names[]`. Each
+    sample must be between 2 and 10 seconds, and can use any of the same input audio
+    formats supported by `file`.
     """
 
     language: str
@@ -64,14 +83,17 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
     segment.
 
     The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-    should match the audio language.
+    should match the audio language. This field is not supported when using
+    `gpt-4o-transcribe-diarize`.
     """
 
     response_format: AudioResponseFormat
     """
     The format of the output, in one of these options: `json`, `text`, `srt`,
-    `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
-    the only supported format is `json`.
+    `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+    `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+    `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+    `diarized_json`, with `diarized_json` required to receive speaker annotations.
     """
 
     temperature: float
@@ -89,7 +111,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
     `response_format` must be set `verbose_json` to use timestamp granularities.
     Either or both of these options are supported: `word`, or `segment`. Note: There
     is no additional latency for segment timestamps, but generating word timestamps
-    incurs additional latency.
+    incurs additional latency. This option is not available for
+    `gpt-4o-transcribe-diarize`.
     """
 
 
src/openai/types/audio/transcription_create_response.py
@@ -5,7 +5,8 @@ from typing_extensions import TypeAlias
 
 from .transcription import Transcription
 from .transcription_verbose import TranscriptionVerbose
+from .transcription_diarized import TranscriptionDiarized
 
 __all__ = ["TranscriptionCreateResponse"]
 
-TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionVerbose]
+TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionDiarized, TranscriptionVerbose]
src/openai/types/audio/transcription_diarized.py
@@ -0,0 +1,63 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Union, Optional
+from typing_extensions import Literal, Annotated, TypeAlias
+
+from ..._utils import PropertyInfo
+from ..._models import BaseModel
+from .transcription_diarized_segment import TranscriptionDiarizedSegment
+
+__all__ = ["TranscriptionDiarized", "Usage", "UsageTokens", "UsageTokensInputTokenDetails", "UsageDuration"]
+
+
+class UsageTokensInputTokenDetails(BaseModel):
+    audio_tokens: Optional[int] = None
+    """Number of audio tokens billed for this request."""
+
+    text_tokens: Optional[int] = None
+    """Number of text tokens billed for this request."""
+
+
+class UsageTokens(BaseModel):
+    input_tokens: int
+    """Number of input tokens billed for this request."""
+
+    output_tokens: int
+    """Number of output tokens generated."""
+
+    total_tokens: int
+    """Total number of tokens used (input + output)."""
+
+    type: Literal["tokens"]
+    """The type of the usage object. Always `tokens` for this variant."""
+
+    input_token_details: Optional[UsageTokensInputTokenDetails] = None
+    """Details about the input tokens billed for this request."""
+
+
+class UsageDuration(BaseModel):
+    seconds: float
+    """Duration of the input audio in seconds."""
+
+    type: Literal["duration"]
+    """The type of the usage object. Always `duration` for this variant."""
+
+
+Usage: TypeAlias = Annotated[Union[UsageTokens, UsageDuration], PropertyInfo(discriminator="type")]
+
+
+class TranscriptionDiarized(BaseModel):
+    duration: float
+    """Duration of the input audio in seconds."""
+
+    segments: List[TranscriptionDiarizedSegment]
+    """Segments of the transcript annotated with timestamps and speaker labels."""
+
+    task: Literal["transcribe"]
+    """The type of task that was run. Always `transcribe`."""
+
+    text: str
+    """The concatenated transcript text for the entire audio input."""
+
+    usage: Optional[Usage] = None
+    """Token or duration usage statistics for the request."""
src/openai/types/audio/transcription_diarized_segment.py
@@ -0,0 +1,32 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["TranscriptionDiarizedSegment"]
+
+
+class TranscriptionDiarizedSegment(BaseModel):
+    id: str
+    """Unique identifier for the segment."""
+
+    end: float
+    """End timestamp of the segment in seconds."""
+
+    speaker: str
+    """Speaker label for this segment.
+
+    When known speakers are provided, the label matches `known_speaker_names[]`.
+    Otherwise speakers are labeled sequentially using capital letters (`A`, `B`,
+    ...).
+    """
+
+    start: float
+    """Start timestamp of the segment in seconds."""
+
+    text: str
+    """Transcript text for this segment."""
+
+    type: Literal["transcript.text.segment"]
+    """The type of the segment. Always `transcript.text.segment`."""
src/openai/types/audio/transcription_stream_event.py
@@ -6,9 +6,11 @@ from typing_extensions import Annotated, TypeAlias
 from ..._utils import PropertyInfo
 from .transcription_text_done_event import TranscriptionTextDoneEvent
 from .transcription_text_delta_event import TranscriptionTextDeltaEvent
+from .transcription_text_segment_event import TranscriptionTextSegmentEvent
 
 __all__ = ["TranscriptionStreamEvent"]
 
 TranscriptionStreamEvent: TypeAlias = Annotated[
-    Union[TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent], PropertyInfo(discriminator="type")
+    Union[TranscriptionTextSegmentEvent, TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent],
+    PropertyInfo(discriminator="type"),
 ]
src/openai/types/audio/transcription_text_delta_event.py
@@ -33,3 +33,9 @@ class TranscriptionTextDeltaEvent(BaseModel):
     [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
     with the `include[]` parameter set to `logprobs`.
     """
+
+    segment_id: Optional[str] = None
+    """Identifier of the diarized segment that this delta belongs to.
+
+    Only present when using `gpt-4o-transcribe-diarize`.
+    """
src/openai/types/audio/transcription_text_segment_event.py
@@ -0,0 +1,27 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["TranscriptionTextSegmentEvent"]
+
+
+class TranscriptionTextSegmentEvent(BaseModel):
+    id: str
+    """Unique identifier for the segment."""
+
+    end: float
+    """End timestamp of the segment in seconds."""
+
+    speaker: str
+    """Speaker label for this segment."""
+
+    start: float
+    """Start timestamp of the segment in seconds."""
+
+    text: str
+    """Transcript text for this segment."""
+
+    type: Literal["transcript.text.segment"]
+    """The type of the event. Always `transcript.text.segment`."""
src/openai/types/realtime/audio_transcription.py
@@ -17,13 +17,14 @@ class AudioTranscription(BaseModel):
     format will improve accuracy and latency.
     """
 
-    model: Optional[Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"]] = (
-        None
-    )
+    model: Optional[
+        Literal["whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe", "gpt-4o-transcribe-diarize"]
+    ] = None
     """The model to use for transcription.
 
-    Current options are `whisper-1`, `gpt-4o-transcribe-latest`,
-    `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+    Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`,
+    and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need
+    diarization with speaker labels.
     """
 
     prompt: Optional[str] = None
@@ -31,6 +32,6 @@ class AudioTranscription(BaseModel):
     An optional text to guide the model's style or continue a previous audio
     segment. For `whisper-1`, the
     [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
-    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
-    "expect words related to technology".
+    For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+    prompt is a free text string, for example "expect words related to technology".
     """
src/openai/types/realtime/audio_transcription_param.py
@@ -16,11 +16,12 @@ class AudioTranscriptionParam(TypedDict, total=False):
     format will improve accuracy and latency.
     """
 
-    model: Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"]
+    model: Literal["whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe", "gpt-4o-transcribe-diarize"]
     """The model to use for transcription.
 
-    Current options are `whisper-1`, `gpt-4o-transcribe-latest`,
-    `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+    Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`,
+    and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need
+    diarization with speaker labels.
     """
 
     prompt: str
@@ -28,6 +29,6 @@ class AudioTranscriptionParam(TypedDict, total=False):
     An optional text to guide the model's style or continue a previous audio
     segment. For `whisper-1`, the
     [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
-    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
-    "expect words related to technology".
+    For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+    prompt is a free text string, for example "expect words related to technology".
     """
src/openai/types/audio_model.py
@@ -4,4 +4,4 @@ from typing_extensions import Literal, TypeAlias
 
 __all__ = ["AudioModel"]
 
-AudioModel: TypeAlias = Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]
+AudioModel: TypeAlias = Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe", "gpt-4o-transcribe-diarize"]
src/openai/types/audio_response_format.py
@@ -4,4 +4,4 @@ from typing_extensions import Literal, TypeAlias
 
 __all__ = ["AudioResponseFormat"]
 
-AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"]
+AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt", "diarized_json"]
src/openai/types/vector_store_create_params.py
@@ -20,6 +20,12 @@ class VectorStoreCreateParams(TypedDict, total=False):
     non-empty.
     """
 
+    description: str
+    """A description for the vector store.
+
+    Can be used to describe the vector store's purpose.
+    """
+
     expires_after: ExpiresAfter
     """The expiration policy for a vector store."""
 
tests/api_resources/audio/test_transcriptions.py
@@ -32,6 +32,8 @@ class TestTranscriptions:
             model="gpt-4o-transcribe",
             chunking_strategy="auto",
             include=["logprobs"],
+            known_speaker_names=["string"],
+            known_speaker_references=["string"],
             language="language",
             prompt="prompt",
             response_format="json",
@@ -84,6 +86,8 @@ class TestTranscriptions:
             stream=True,
             chunking_strategy="auto",
             include=["logprobs"],
+            known_speaker_names=["string"],
+            known_speaker_references=["string"],
             language="language",
             prompt="prompt",
             response_format="json",
@@ -140,6 +144,8 @@ class TestAsyncTranscriptions:
             model="gpt-4o-transcribe",
             chunking_strategy="auto",
             include=["logprobs"],
+            known_speaker_names=["string"],
+            known_speaker_references=["string"],
             language="language",
             prompt="prompt",
             response_format="json",
@@ -192,6 +198,8 @@ class TestAsyncTranscriptions:
             stream=True,
             chunking_strategy="auto",
             include=["logprobs"],
+            known_speaker_names=["string"],
+            known_speaker_references=["string"],
             language="language",
             prompt="prompt",
             response_format="json",
tests/api_resources/test_vector_stores.py
@@ -31,6 +31,7 @@ class TestVectorStores:
     def test_method_create_with_all_params(self, client: OpenAI) -> None:
         vector_store = client.vector_stores.create(
             chunking_strategy={"type": "auto"},
+            description="description",
             expires_after={
                 "anchor": "last_active_at",
                 "days": 1,
@@ -299,6 +300,7 @@ class TestAsyncVectorStores:
     async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None:
         vector_store = await async_client.vector_stores.create(
             chunking_strategy={"type": "auto"},
+            description="description",
             expires_after={
                 "anchor": "last_active_at",
                 "days": 1,
tests/lib/test_audio.py
@@ -44,7 +44,8 @@ def test_translation_create_overloads_in_sync(sync: bool, client: OpenAI, async_
         elif is_literal_type(typ):
             overload_response_formats.update(get_args(typ))
 
-    src_response_formats: set[str] = set(get_args(AudioResponseFormat))
+    # 'diarized_json' applies only to transcriptions, not translations.
+    src_response_formats: set[str] = set(get_args(AudioResponseFormat)) - {"diarized_json"}
     diff = src_response_formats.difference(overload_response_formats)
     assert len(diff) == 0, f"some response format options don't have overloads"
 
@@ -57,18 +58,27 @@ def test_transcription_create_overloads_in_sync(sync: bool, client: OpenAI, asyn
     overload_response_formats: set[str] = set()
 
     for i, overload in enumerate(typing_extensions.get_overloads(fn)):
-        assert_signatures_in_sync(
-            fn,
-            overload,
-            exclude_params={"response_format", "stream"},
-            description=f" for overload {i}",
-        )
-
         sig = inspect.signature(overload)
         typ = evaluate_forwardref(
             sig.parameters["response_format"].annotation,
             globalns=sys.modules[fn.__module__].__dict__,
         )
+
+        exclude_params = {"response_format", "stream"}
+        # known_speaker_names and known_speaker_references are only supported by diarized_json
+        if not (is_literal_type(typ) and set(get_args(typ)) == {"diarized_json"}):
+            exclude_params.update({"known_speaker_names", "known_speaker_references"})
+
+        # diarized_json does not support these parameters
+        if is_literal_type(typ) and set(get_args(typ)) == {"diarized_json"}:
+            exclude_params.update({"include", "prompt", "timestamp_granularities"})
+
+        assert_signatures_in_sync(
+            fn,
+            overload,
+            exclude_params=exclude_params,
+            description=f" for overload {i}",
+        )
         if is_union_type(typ):
             for arg in get_args(typ):
                 if not is_literal_type(arg):
.stats.yml
@@ -1,4 +1,4 @@
 configured_endpoints: 136
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-11d308a9ef78ad01aa11c880a084a3982276800d7994db3f454aa515474977d7.yml
-openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1
-config_hash: f0940d0906846178759ef7128e4cb98e
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml
+openapi_spec_hash: fdc03ed84a65a31b80da909255e53924
+config_hash: 03b48e9b8c7231a902403210dbd7dfa0
api.md
@@ -171,11 +171,14 @@ Types:
 ```python
 from openai.types.audio import (
     Transcription,
+    TranscriptionDiarized,
+    TranscriptionDiarizedSegment,
     TranscriptionInclude,
     TranscriptionSegment,
     TranscriptionStreamEvent,
     TranscriptionTextDeltaEvent,
     TranscriptionTextDoneEvent,
+    TranscriptionTextSegmentEvent,
     TranscriptionVerbose,
     TranscriptionWord,
     TranscriptionCreateResponse,