Commit 2b4bc759

stainless-app[bot] <142633134+stainless-app[bot]@users.noreply.github.com>
2025-03-21 00:25:10
feat(api): new models for TTS, STT, + new audio features for Realtime (#2232)
1 parent e9f971a
src/openai/resources/audio/speech.py
@@ -54,6 +54,7 @@ class Speech(SyncAPIResource):
         input: str,
         model: Union[str, SpeechModel],
         voice: Literal["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"],
+        instructions: str | NotGiven = NOT_GIVEN,
         response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | NotGiven = NOT_GIVEN,
         speed: float | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
@@ -71,13 +72,16 @@ class Speech(SyncAPIResource):
 
           model:
               One of the available [TTS models](https://platform.openai.com/docs/models#tts):
-              `tts-1` or `tts-1-hd`
+              `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`.
 
           voice: The voice to use when generating the audio. Supported voices are `alloy`, `ash`,
               `coral`, `echo`, `fable`, `onyx`, `nova`, `sage` and `shimmer`. Previews of the
               voices are available in the
               [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
 
+          instructions: Control the voice of your generated audio with additional instructions. Does not
+              work with `tts-1` or `tts-1-hd`.
+
           response_format: The format of the returned audio. Supported formats are `mp3`, `opus`, `aac`, `flac`,
               `wav`, and `pcm`.
 
@@ -100,6 +104,7 @@ class Speech(SyncAPIResource):
                     "input": input,
                     "model": model,
                     "voice": voice,
+                    "instructions": instructions,
                     "response_format": response_format,
                     "speed": speed,
                 },
@@ -138,6 +143,7 @@ class AsyncSpeech(AsyncAPIResource):
         input: str,
         model: Union[str, SpeechModel],
         voice: Literal["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"],
+        instructions: str | NotGiven = NOT_GIVEN,
         response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | NotGiven = NOT_GIVEN,
         speed: float | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
@@ -155,13 +161,16 @@ class AsyncSpeech(AsyncAPIResource):
 
           model:
               One of the available [TTS models](https://platform.openai.com/docs/models#tts):
-              `tts-1` or `tts-1-hd`
+              `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`.
 
           voice: The voice to use when generating the audio. Supported voices are `alloy`, `ash`,
               `coral`, `echo`, `fable`, `onyx`, `nova`, `sage` and `shimmer`. Previews of the
               voices are available in the
               [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
 
+          instructions: Control the voice of your generated audio with additional instructions. Does not
+              work with `tts-1` or `tts-1-hd`.
+
           response_format: The format of the returned audio. Supported formats are `mp3`, `opus`, `aac`, `flac`,
               `wav`, and `pcm`.
 
@@ -184,6 +193,7 @@ class AsyncSpeech(AsyncAPIResource):
                     "input": input,
                     "model": model,
                     "voice": voice,
+                    "instructions": instructions,
                     "response_format": response_format,
                     "speed": speed,
                 },
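
For context, a minimal sketch of how the new `instructions` parameter might be used with the `gpt-4o-mini-tts` model added in this commit; the voice, input text, instruction text, and output path are illustrative, and the example assumes an `OPENAI_API_KEY` in the environment.

    # Illustrative sketch, not part of this diff.
    from openai import OpenAI

    client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

    response = client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice="coral",
        input="Thanks for calling. How can I help you today?",
        # Steers delivery; per the docstring it does not work with `tts-1` / `tts-1-hd`.
        instructions="Speak in a calm, friendly customer-support tone.",
    )
    with open("greeting.mp3", "wb") as f:  # illustrative output path
        f.write(response.content)
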
src/openai/resources/audio/transcriptions.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, List, Union, Mapping, cast
+from typing import TYPE_CHECKING, List, Union, Mapping, Optional, cast
 from typing_extensions import Literal, overload, assert_never
 
 import httpx
@@ -13,6 +13,7 @@ from ...types import AudioResponseFormat
 from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven, FileTypes
 from ..._utils import (
     extract_files,
+    required_args,
     maybe_transform,
     deepcopy_minimal,
     async_maybe_transform,
@@ -20,12 +21,16 @@ from ..._utils import (
 from ..._compat import cached_property
 from ..._resource import SyncAPIResource, AsyncAPIResource
 from ..._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
+from ..._streaming import Stream, AsyncStream
 from ...types.audio import transcription_create_params
 from ..._base_client import make_request_options
 from ...types.audio_model import AudioModel
 from ...types.audio.transcription import Transcription
 from ...types.audio_response_format import AudioResponseFormat
+from ...types.audio.transcription_include import TranscriptionInclude
 from ...types.audio.transcription_verbose import TranscriptionVerbose
+from ...types.audio.transcription_stream_event import TranscriptionStreamEvent
+from ...types.audio.transcription_create_response import TranscriptionCreateResponse
 
 __all__ = ["Transcriptions", "AsyncTranscriptions"]
 
@@ -58,6 +63,7 @@ class Transcriptions(SyncAPIResource):
         *,
         file: FileTypes,
         model: Union[str, AudioModel],
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         response_format: Union[Literal["json"], NotGiven] = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
@@ -77,6 +83,7 @@ class Transcriptions(SyncAPIResource):
         *,
         file: FileTypes,
         model: Union[str, AudioModel],
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         response_format: Literal["verbose_json"],
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
@@ -97,6 +104,7 @@ class Transcriptions(SyncAPIResource):
         file: FileTypes,
         model: Union[str, AudioModel],
         response_format: Literal["text", "srt", "vtt"],
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
         temperature: float | NotGiven = NOT_GIVEN,
@@ -109,11 +117,96 @@ class Transcriptions(SyncAPIResource):
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
     ) -> str: ...
 
+    @overload
+    def create(
+        self,
+        *,
+        file: FileTypes,
+        model: Union[str, AudioModel],
+        stream: Literal[True],
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
+        language: str | NotGiven = NOT_GIVEN,
+        prompt: str | NotGiven = NOT_GIVEN,
+        response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
+        temperature: float | NotGiven = NOT_GIVEN,
+        timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN,
+        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+        # The extra values given here take precedence over values defined on the client or passed to this method.
+        extra_headers: Headers | None = None,
+        extra_query: Query | None = None,
+        extra_body: Body | None = None,
+        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+    ) -> Stream[TranscriptionStreamEvent]:
+        """
+        Transcribes audio into the input language.
+
+        Args:
+          file:
+              The audio file object (not file name) to transcribe, in one of these formats:
+              flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+
+          model: ID of the model to use. The options are `gpt-4o-transcribe`,
+              `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+              Whisper V2 model).
+
+          stream: If set to true, the model response data will be streamed to the client as it is
+              generated using
+              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+              See the
+              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+              for more information.
+
+              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+          include: Additional information to include in the transcription response. `logprobs` will
+              return the log probabilities of the tokens in the response to understand the
+              model's confidence in the transcription. `logprobs` only works with
+              response_format set to `json` and only with the models `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`.
+
+          language: The language of the input audio. Supplying the input language in
+              [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+              format will improve accuracy and latency.
+
+          prompt: An optional text to guide the model's style or continue a previous audio
+              segment. The
+              [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+              should match the audio language.
+
+          response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
+              `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+              the only supported format is `json`.
+
+          temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
+              output more random, while lower values like 0.2 will make it more focused and
+              deterministic. If set to 0, the model will use
+              [log probability](https://en.wikipedia.org/wiki/Log_probability) to
+              automatically increase the temperature until certain thresholds are hit.
+
+          timestamp_granularities: The timestamp granularities to populate for this transcription.
+              `response_format` must be set to `verbose_json` to use timestamp granularities.
+              Either or both of these options are supported: `word` or `segment`. Note: There
+              is no additional latency for segment timestamps, but generating word timestamps
+              incurs additional latency.
+
+          extra_headers: Send extra headers
+
+          extra_query: Add additional query parameters to the request
+
+          extra_body: Add additional JSON properties to the request
+
+          timeout: Override the client-level default timeout for this request, in seconds
+        """
+        ...
+
+    @overload
     def create(
         self,
         *,
         file: FileTypes,
         model: Union[str, AudioModel],
+        stream: bool,
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
         response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
@@ -125,7 +218,7 @@ class Transcriptions(SyncAPIResource):
         extra_query: Query | None = None,
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
-    ) -> Transcription | TranscriptionVerbose | str:
+    ) -> TranscriptionCreateResponse | Stream[TranscriptionStreamEvent]:
         """
         Transcribes audio into the input language.
 
@@ -134,8 +227,24 @@ class Transcriptions(SyncAPIResource):
               The audio file object (not file name) to transcribe, in one of these formats:
               flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
 
-          model: ID of the model to use. Only `whisper-1` (which is powered by our open source
-              Whisper V2 model) is currently available.
+          model: ID of the model to use. The options are `gpt-4o-transcribe`,
+              `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+              Whisper V2 model).
+
+          stream: If set to true, the model response data will be streamed to the client as it is
+              generated using
+              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+              See the
+              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+              for more information.
+
+              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+          include: Additional information to include in the transcription response. `logprobs` will
+              return the log probabilities of the tokens in the response to understand the
+              model's confidence in the transcription. `logprobs` only works with
+              response_format set to `json` and only with the models `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`.
 
           language: The language of the input audio. Supplying the input language in
               [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -147,7 +256,8 @@ class Transcriptions(SyncAPIResource):
               should match the audio language.
 
           response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
-              `verbose_json`, or `vtt`.
+              `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+              the only supported format is `json`.
 
           temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
               output more random, while lower values like 0.2 will make it more focused and
@@ -169,13 +279,37 @@ class Transcriptions(SyncAPIResource):
 
           timeout: Override the client-level default timeout for this request, in seconds
         """
+        ...
+
+    @required_args(["file", "model"], ["file", "model", "stream"])
+    def create(
+        self,
+        *,
+        file: FileTypes,
+        model: Union[str, AudioModel],
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
+        language: str | NotGiven = NOT_GIVEN,
+        prompt: str | NotGiven = NOT_GIVEN,
+        response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
+        stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,
+        temperature: float | NotGiven = NOT_GIVEN,
+        timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN,
+        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+        # The extra values given here take precedence over values defined on the client or passed to this method.
+        extra_headers: Headers | None = None,
+        extra_query: Query | None = None,
+        extra_body: Body | None = None,
+        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+    ) -> str | Transcription | TranscriptionVerbose | Stream[TranscriptionStreamEvent]:
         body = deepcopy_minimal(
             {
                 "file": file,
                 "model": model,
+                "include": include,
                 "language": language,
                 "prompt": prompt,
                 "response_format": response_format,
+                "stream": stream,
                 "temperature": temperature,
                 "timestamp_granularities": timestamp_granularities,
             }
@@ -193,6 +327,8 @@ class Transcriptions(SyncAPIResource):
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
             cast_to=_get_response_format_type(response_format),
+            stream=stream or False,
+            stream_cls=Stream[TranscriptionStreamEvent],
         )
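
Taken together, these overloads allow transcriptions to be streamed. A rough sketch of the sync call, assuming an illustrative audio file name; with `stream=True` the method returns a `Stream[TranscriptionStreamEvent]` that can be iterated directly:

    # Illustrative sketch, not part of this diff.
    from openai import OpenAI

    client = OpenAI()

    with open("meeting.m4a", "rb") as audio_file:  # illustrative file name
        stream = client.audio.transcriptions.create(
            model="gpt-4o-transcribe",
            file=audio_file,
            stream=True,  # not supported by `whisper-1`; ignored there per the docstring
        )
        for event in stream:
            print(event)  # inspect each server-sent transcription event as it arrives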
 
 
@@ -226,6 +362,7 @@ class AsyncTranscriptions(AsyncAPIResource):
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
         temperature: float | NotGiven = NOT_GIVEN,
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
@@ -241,6 +378,7 @@ class AsyncTranscriptions(AsyncAPIResource):
         *,
         file: FileTypes,
         model: Union[str, AudioModel],
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         response_format: Literal["verbose_json"],
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
@@ -260,6 +398,7 @@ class AsyncTranscriptions(AsyncAPIResource):
         *,
         file: FileTypes,
         model: Union[str, AudioModel],
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         response_format: Literal["text", "srt", "vtt"],
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
@@ -273,11 +412,96 @@ class AsyncTranscriptions(AsyncAPIResource):
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
     ) -> str: ...
 
+    @overload
+    async def create(
+        self,
+        *,
+        file: FileTypes,
+        model: Union[str, AudioModel],
+        stream: Literal[True],
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
+        language: str | NotGiven = NOT_GIVEN,
+        prompt: str | NotGiven = NOT_GIVEN,
+        response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
+        temperature: float | NotGiven = NOT_GIVEN,
+        timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN,
+        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+        # The extra values given here take precedence over values defined on the client or passed to this method.
+        extra_headers: Headers | None = None,
+        extra_query: Query | None = None,
+        extra_body: Body | None = None,
+        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+    ) -> AsyncStream[TranscriptionStreamEvent]:
+        """
+        Transcribes audio into the input language.
+
+        Args:
+          file:
+              The audio file object (not file name) to transcribe, in one of these formats:
+              flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+
+          model: ID of the model to use. The options are `gpt-4o-transcribe`,
+              `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+              Whisper V2 model).
+
+          stream: If set to true, the model response data will be streamed to the client as it is
+              generated using
+              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+              See the
+              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+              for more information.
+
+              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+          include: Additional information to include in the transcription response. `logprobs` will
+              return the log probabilities of the tokens in the response to understand the
+              model's confidence in the transcription. `logprobs` only works with
+              response_format set to `json` and only with the models `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`.
+
+          language: The language of the input audio. Supplying the input language in
+              [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+              format will improve accuracy and latency.
+
+          prompt: An optional text to guide the model's style or continue a previous audio
+              segment. The
+              [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+              should match the audio language.
+
+          response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
+              `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+              the only supported format is `json`.
+
+          temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
+              output more random, while lower values like 0.2 will make it more focused and
+              deterministic. If set to 0, the model will use
+              [log probability](https://en.wikipedia.org/wiki/Log_probability) to
+              automatically increase the temperature until certain thresholds are hit.
+
+          timestamp_granularities: The timestamp granularities to populate for this transcription.
+              `response_format` must be set to `verbose_json` to use timestamp granularities.
+              Either or both of these options are supported: `word` or `segment`. Note: There
+              is no additional latency for segment timestamps, but generating word timestamps
+              incurs additional latency.
+
+          extra_headers: Send extra headers
+
+          extra_query: Add additional query parameters to the request
+
+          extra_body: Add additional JSON properties to the request
+
+          timeout: Override the client-level default timeout for this request, in seconds
+        """
+        ...
+
+    @overload
     async def create(
         self,
         *,
         file: FileTypes,
         model: Union[str, AudioModel],
+        stream: bool,
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
         response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
@@ -289,7 +513,7 @@ class AsyncTranscriptions(AsyncAPIResource):
         extra_query: Query | None = None,
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
-    ) -> Transcription | TranscriptionVerbose | str:
+    ) -> TranscriptionCreateResponse | AsyncStream[TranscriptionStreamEvent]:
         """
         Transcribes audio into the input language.
 
@@ -298,8 +522,24 @@ class AsyncTranscriptions(AsyncAPIResource):
               The audio file object (not file name) to transcribe, in one of these formats:
               flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
 
-          model: ID of the model to use. Only `whisper-1` (which is powered by our open source
-              Whisper V2 model) is currently available.
+          model: ID of the model to use. The options are `gpt-4o-transcribe`,
+              `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+              Whisper V2 model).
+
+          stream: If set to true, the model response data will be streamed to the client as it is
+              generated using
+              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+              See the
+              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+              for more information.
+
+              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+          include: Additional information to include in the transcription response. `logprobs` will
+              return the log probabilities of the tokens in the response to understand the
+              model's confidence in the transcription. `logprobs` only works with
+              response_format set to `json` and only with the models `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`.
 
           language: The language of the input audio. Supplying the input language in
               [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -311,7 +551,8 @@ class AsyncTranscriptions(AsyncAPIResource):
               should match the audio language.
 
           response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
-              `verbose_json`, or `vtt`.
+              `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+              the only supported format is `json`.
 
           temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
               output more random, while lower values like 0.2 will make it more focused and
@@ -333,13 +574,37 @@ class AsyncTranscriptions(AsyncAPIResource):
 
           timeout: Override the client-level default timeout for this request, in seconds
         """
+        ...
+
+    @required_args(["file", "model"], ["file", "model", "stream"])
+    async def create(
+        self,
+        *,
+        file: FileTypes,
+        model: Union[str, AudioModel],
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
+        language: str | NotGiven = NOT_GIVEN,
+        prompt: str | NotGiven = NOT_GIVEN,
+        response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
+        stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,
+        temperature: float | NotGiven = NOT_GIVEN,
+        timestamp_granularities: List[Literal["word", "segment"]] | NotGiven = NOT_GIVEN,
+        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+        # The extra values given here take precedence over values defined on the client or passed to this method.
+        extra_headers: Headers | None = None,
+        extra_query: Query | None = None,
+        extra_body: Body | None = None,
+        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+    ) -> Transcription | TranscriptionVerbose | str | AsyncStream[TranscriptionStreamEvent]:
         body = deepcopy_minimal(
             {
                 "file": file,
                 "model": model,
+                "include": include,
                 "language": language,
                 "prompt": prompt,
                 "response_format": response_format,
+                "stream": stream,
                 "temperature": temperature,
                 "timestamp_granularities": timestamp_granularities,
             }
@@ -357,6 +622,8 @@ class AsyncTranscriptions(AsyncAPIResource):
                 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
             ),
             cast_to=_get_response_format_type(response_format),
+            stream=stream or False,
+            stream_cls=AsyncStream[TranscriptionStreamEvent],
         )
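
The async resource mirrors the sync one. Below is a sketch of a non-streaming async call that requests token log probabilities; per the docstrings, `logprobs` requires `response_format` set to `json` and one of the `gpt-4o-*-transcribe` models, and the file name here is illustrative.

    # Illustrative sketch, not part of this diff.
    import asyncio

    from openai import AsyncOpenAI


    async def main() -> None:
        client = AsyncOpenAI()
        with open("meeting.m4a", "rb") as audio_file:  # illustrative file name
            transcription = await client.audio.transcriptions.create(
                model="gpt-4o-mini-transcribe",
                file=audio_file,
                response_format="json",
                include=["logprobs"],
            )
        print(transcription.text)


    asyncio.run(main())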
 
 
src/openai/resources/audio/translations.py
@@ -9,7 +9,6 @@ from typing_extensions import Literal, overload, assert_never
 import httpx
 
 from ... import _legacy_response
-from ...types import AudioResponseFormat
 from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven, FileTypes
 from ..._utils import (
     extract_files,
@@ -109,7 +108,7 @@ class Translations(SyncAPIResource):
         file: FileTypes,
         model: Union[str, AudioModel],
         prompt: str | NotGiven = NOT_GIVEN,
-        response_format: Union[AudioResponseFormat, NotGiven] = NOT_GIVEN,
+        response_format: Union[Literal["json", "text", "srt", "verbose_json", "vtt"], NotGiven] = NOT_GIVEN,
         temperature: float | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
src/openai/resources/beta/realtime/__init__.py
@@ -16,6 +16,14 @@ from .sessions import (
     SessionsWithStreamingResponse,
     AsyncSessionsWithStreamingResponse,
 )
+from .transcription_sessions import (
+    TranscriptionSessions,
+    AsyncTranscriptionSessions,
+    TranscriptionSessionsWithRawResponse,
+    AsyncTranscriptionSessionsWithRawResponse,
+    TranscriptionSessionsWithStreamingResponse,
+    AsyncTranscriptionSessionsWithStreamingResponse,
+)
 
 __all__ = [
     "Sessions",
@@ -24,6 +32,12 @@ __all__ = [
     "AsyncSessionsWithRawResponse",
     "SessionsWithStreamingResponse",
     "AsyncSessionsWithStreamingResponse",
+    "TranscriptionSessions",
+    "AsyncTranscriptionSessions",
+    "TranscriptionSessionsWithRawResponse",
+    "AsyncTranscriptionSessionsWithRawResponse",
+    "TranscriptionSessionsWithStreamingResponse",
+    "AsyncTranscriptionSessionsWithStreamingResponse",
     "Realtime",
     "AsyncRealtime",
     "RealtimeWithRawResponse",
src/openai/resources/beta/realtime/realtime.py
@@ -32,7 +32,19 @@ from ...._models import construct_type_unchecked
 from ...._resource import SyncAPIResource, AsyncAPIResource
 from ...._exceptions import OpenAIError
 from ...._base_client import _merge_mappings
-from ....types.beta.realtime import session_update_event_param, response_create_event_param
+from ....types.beta.realtime import (
+    session_update_event_param,
+    response_create_event_param,
+    transcription_session_update_param,
+)
+from .transcription_sessions import (
+    TranscriptionSessions,
+    AsyncTranscriptionSessions,
+    TranscriptionSessionsWithRawResponse,
+    AsyncTranscriptionSessionsWithRawResponse,
+    TranscriptionSessionsWithStreamingResponse,
+    AsyncTranscriptionSessionsWithStreamingResponse,
+)
 from ....types.websocket_connection_options import WebsocketConnectionOptions
 from ....types.beta.realtime.realtime_client_event import RealtimeClientEvent
 from ....types.beta.realtime.realtime_server_event import RealtimeServerEvent
@@ -55,6 +67,10 @@ class Realtime(SyncAPIResource):
     def sessions(self) -> Sessions:
         return Sessions(self._client)
 
+    @cached_property
+    def transcription_sessions(self) -> TranscriptionSessions:
+        return TranscriptionSessions(self._client)
+
     @cached_property
     def with_raw_response(self) -> RealtimeWithRawResponse:
         """
@@ -107,6 +123,10 @@ class AsyncRealtime(AsyncAPIResource):
     def sessions(self) -> AsyncSessions:
         return AsyncSessions(self._client)
 
+    @cached_property
+    def transcription_sessions(self) -> AsyncTranscriptionSessions:
+        return AsyncTranscriptionSessions(self._client)
+
     @cached_property
     def with_raw_response(self) -> AsyncRealtimeWithRawResponse:
         """
@@ -162,6 +182,10 @@ class RealtimeWithRawResponse:
     def sessions(self) -> SessionsWithRawResponse:
         return SessionsWithRawResponse(self._realtime.sessions)
 
+    @cached_property
+    def transcription_sessions(self) -> TranscriptionSessionsWithRawResponse:
+        return TranscriptionSessionsWithRawResponse(self._realtime.transcription_sessions)
+
 
 class AsyncRealtimeWithRawResponse:
     def __init__(self, realtime: AsyncRealtime) -> None:
@@ -171,6 +195,10 @@ class AsyncRealtimeWithRawResponse:
     def sessions(self) -> AsyncSessionsWithRawResponse:
         return AsyncSessionsWithRawResponse(self._realtime.sessions)
 
+    @cached_property
+    def transcription_sessions(self) -> AsyncTranscriptionSessionsWithRawResponse:
+        return AsyncTranscriptionSessionsWithRawResponse(self._realtime.transcription_sessions)
+
 
 class RealtimeWithStreamingResponse:
     def __init__(self, realtime: Realtime) -> None:
@@ -180,6 +208,10 @@ class RealtimeWithStreamingResponse:
     def sessions(self) -> SessionsWithStreamingResponse:
         return SessionsWithStreamingResponse(self._realtime.sessions)
 
+    @cached_property
+    def transcription_sessions(self) -> TranscriptionSessionsWithStreamingResponse:
+        return TranscriptionSessionsWithStreamingResponse(self._realtime.transcription_sessions)
+
 
 class AsyncRealtimeWithStreamingResponse:
     def __init__(self, realtime: AsyncRealtime) -> None:
@@ -189,14 +221,19 @@ class AsyncRealtimeWithStreamingResponse:
     def sessions(self) -> AsyncSessionsWithStreamingResponse:
         return AsyncSessionsWithStreamingResponse(self._realtime.sessions)
 
+    @cached_property
+    def transcription_sessions(self) -> AsyncTranscriptionSessionsWithStreamingResponse:
+        return AsyncTranscriptionSessionsWithStreamingResponse(self._realtime.transcription_sessions)
+
 
 class AsyncRealtimeConnection:
     """Represents a live websocket connection to the Realtime API"""
 
     session: AsyncRealtimeSessionResource
     response: AsyncRealtimeResponseResource
-    conversation: AsyncRealtimeConversationResource
     input_audio_buffer: AsyncRealtimeInputAudioBufferResource
+    conversation: AsyncRealtimeConversationResource
+    transcription_session: AsyncRealtimeTranscriptionSessionResource
 
     _connection: AsyncWebsocketConnection
 
@@ -205,8 +242,9 @@ class AsyncRealtimeConnection:
 
         self.session = AsyncRealtimeSessionResource(self)
         self.response = AsyncRealtimeResponseResource(self)
-        self.conversation = AsyncRealtimeConversationResource(self)
         self.input_audio_buffer = AsyncRealtimeInputAudioBufferResource(self)
+        self.conversation = AsyncRealtimeConversationResource(self)
+        self.transcription_session = AsyncRealtimeTranscriptionSessionResource(self)
 
     async def __aiter__(self) -> AsyncIterator[RealtimeServerEvent]:
         """
@@ -377,8 +415,9 @@ class RealtimeConnection:
 
     session: RealtimeSessionResource
     response: RealtimeResponseResource
-    conversation: RealtimeConversationResource
     input_audio_buffer: RealtimeInputAudioBufferResource
+    conversation: RealtimeConversationResource
+    transcription_session: RealtimeTranscriptionSessionResource
 
     _connection: WebsocketConnection
 
@@ -387,8 +426,9 @@ class RealtimeConnection:
 
         self.session = RealtimeSessionResource(self)
         self.response = RealtimeResponseResource(self)
-        self.conversation = RealtimeConversationResource(self)
         self.input_audio_buffer = RealtimeInputAudioBufferResource(self)
+        self.conversation = RealtimeConversationResource(self)
+        self.transcription_session = RealtimeTranscriptionSessionResource(self)
 
     def __iter__(self) -> Iterator[RealtimeServerEvent]:
         """
@@ -582,20 +622,6 @@ class RealtimeSessionResource(BaseRealtimeConnectionResource):
 
 
 class RealtimeResponseResource(BaseRealtimeConnectionResource):
-    def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None:
-        """Send this event to cancel an in-progress response.
-
-        The server will respond
-        with a `response.cancelled` event or an error if there is no response to
-        cancel.
-        """
-        self._connection.send(
-            cast(
-                RealtimeClientEventParam,
-                strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}),
-            )
-        )
-
     def create(
         self,
         *,
@@ -626,6 +652,70 @@ class RealtimeResponseResource(BaseRealtimeConnectionResource):
             )
         )
 
+    def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None:
+        """Send this event to cancel an in-progress response.
+
+        The server will respond
+        with a `response.cancelled` event or an error if there is no response to
+        cancel.
+        """
+        self._connection.send(
+            cast(
+                RealtimeClientEventParam,
+                strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}),
+            )
+        )
+
+
+class RealtimeInputAudioBufferResource(BaseRealtimeConnectionResource):
+    def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
+        """Send this event to clear the audio bytes in the buffer.
+
+        The server will
+        respond with an `input_audio_buffer.cleared` event.
+        """
+        self._connection.send(
+            cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id}))
+        )
+
+    def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
+        """
+        Send this event to commit the user input audio buffer, which will create a
+        new user message item in the conversation. This event will produce an error
+        if the input audio buffer is empty. When in Server VAD mode, the client does
+        not need to send this event, the server will commit the audio buffer
+        automatically.
+
+        Committing the input audio buffer will trigger input audio transcription
+        (if enabled in session configuration), but it will not create a response
+        from the model. The server will respond with an `input_audio_buffer.committed`
+        event.
+        """
+        self._connection.send(
+            cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id}))
+        )
+
+    def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
+        """Send this event to append audio bytes to the input audio buffer.
+
+        The audio
+        buffer is temporary storage you can write to and later commit. In Server VAD
+        mode, the audio buffer is used to detect speech and the server will decide
+        when to commit. When Server VAD is disabled, you must commit the audio buffer
+        manually.
+
+        The client may choose how much audio to place in each event, up to a maximum
+        of 15 MiB; for example, streaming smaller chunks from the client may allow the
+        VAD to be more responsive. Unlike most other client events, the server will
+        not send a confirmation response to this event.
+        """
+        self._connection.send(
+            cast(
+                RealtimeClientEventParam,
+                strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}),
+            )
+        )
+
 
 class RealtimeConversationResource(BaseRealtimeConnectionResource):
     @cached_property
@@ -711,53 +801,30 @@ class RealtimeConversationItemResource(BaseRealtimeConnectionResource):
             )
         )
 
-
-class RealtimeInputAudioBufferResource(BaseRealtimeConnectionResource):
-    def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
-        """Send this event to clear the audio bytes in the buffer.
-
-        The server will
-        respond with an `input_audio_buffer.cleared` event.
+    def retrieve(self, *, item_id: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
         """
-        self._connection.send(
-            cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id}))
-        )
-
-    def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
-        """
-        Send this event to commit the user input audio buffer, which will create a
-        new user message item in the conversation. This event will produce an error
-        if the input audio buffer is empty. When in Server VAD mode, the client does
-        not need to send this event, the server will commit the audio buffer
-        automatically.
-
-        Committing the input audio buffer will trigger input audio transcription
-        (if enabled in session configuration), but it will not create a response
-        from the model. The server will respond with an `input_audio_buffer.committed`
-        event.
+        Send this event when you want to retrieve the server's representation of a specific item in the conversation history. This is useful, for example, to inspect user audio after noise cancellation and VAD.
+        The server will respond with a `conversation.item.retrieved` event,
+        unless the item does not exist in the conversation history, in which case the
+        server will respond with an error.
         """
         self._connection.send(
-            cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id}))
+            cast(
+                RealtimeClientEventParam,
+                strip_not_given({"type": "conversation.item.retrieve", "item_id": item_id, "event_id": event_id}),
+            )
         )
 
-    def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
-        """Send this event to append audio bytes to the input audio buffer.
 
-        The audio
-        buffer is temporary storage you can write to and later commit. In Server VAD
-        mode, the audio buffer is used to detect speech and the server will decide
-        when to commit. When Server VAD is disabled, you must commit the audio buffer
-        manually.
-
-        The client may choose how much audio to place in each event up to a maximum
-        of 15 MiB, for example streaming smaller chunks from the client may allow the
-        VAD to be more responsive. Unlike made other client events, the server will
-        not send a confirmation response to this event.
-        """
+class RealtimeTranscriptionSessionResource(BaseRealtimeConnectionResource):
+    def update(
+        self, *, session: transcription_session_update_param.Session, event_id: str | NotGiven = NOT_GIVEN
+    ) -> None:
+        """Send this event to update a transcription session."""
         self._connection.send(
             cast(
                 RealtimeClientEventParam,
-                strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}),
+                strip_not_given({"type": "transcription_session.update", "session": session, "event_id": event_id}),
             )
         )
 
@@ -792,20 +859,6 @@ class AsyncRealtimeSessionResource(BaseAsyncRealtimeConnectionResource):
 
 
 class AsyncRealtimeResponseResource(BaseAsyncRealtimeConnectionResource):
-    async def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None:
-        """Send this event to cancel an in-progress response.
-
-        The server will respond
-        with a `response.cancelled` event or an error if there is no response to
-        cancel.
-        """
-        await self._connection.send(
-            cast(
-                RealtimeClientEventParam,
-                strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}),
-            )
-        )
-
     async def create(
         self,
         *,
@@ -836,6 +889,70 @@ class AsyncRealtimeResponseResource(BaseAsyncRealtimeConnectionResource):
             )
         )
 
+    async def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | NotGiven = NOT_GIVEN) -> None:
+        """Send this event to cancel an in-progress response.
+
+        The server will respond
+        with a `response.cancelled` event or an error if there is no response to
+        cancel.
+        """
+        await self._connection.send(
+            cast(
+                RealtimeClientEventParam,
+                strip_not_given({"type": "response.cancel", "event_id": event_id, "response_id": response_id}),
+            )
+        )
+
+
+class AsyncRealtimeInputAudioBufferResource(BaseAsyncRealtimeConnectionResource):
+    async def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
+        """Send this event to clear the audio bytes in the buffer.
+
+        The server will
+        respond with an `input_audio_buffer.cleared` event.
+        """
+        await self._connection.send(
+            cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id}))
+        )
+
+    async def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
+        """
+        Send this event to commit the user input audio buffer, which will create a
+        new user message item in the conversation. This event will produce an error
+        if the input audio buffer is empty. When in Server VAD mode, the client does
+        not need to send this event, the server will commit the audio buffer
+        automatically.
+
+        Committing the input audio buffer will trigger input audio transcription
+        (if enabled in session configuration), but it will not create a response
+        from the model. The server will respond with an `input_audio_buffer.committed`
+        event.
+        """
+        await self._connection.send(
+            cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id}))
+        )
+
+    async def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
+        """Send this event to append audio bytes to the input audio buffer.
+
+        The audio
+        buffer is temporary storage you can write to and later commit. In Server VAD
+        mode, the audio buffer is used to detect speech and the server will decide
+        when to commit. When Server VAD is disabled, you must commit the audio buffer
+        manually.
+
+        The client may choose how much audio to place in each event, up to a maximum
+        of 15 MiB; for example, streaming smaller chunks from the client may allow the
+        VAD to be more responsive. Unlike most other client events, the server will
+        not send a confirmation response to this event.
+        """
+        await self._connection.send(
+            cast(
+                RealtimeClientEventParam,
+                strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}),
+            )
+        )
+
 
 class AsyncRealtimeConversationResource(BaseAsyncRealtimeConnectionResource):
     @cached_property
@@ -921,52 +1038,29 @@ class AsyncRealtimeConversationItemResource(BaseAsyncRealtimeConnectionResource)
             )
         )
 
-
-class AsyncRealtimeInputAudioBufferResource(BaseAsyncRealtimeConnectionResource):
-    async def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
-        """Send this event to clear the audio bytes in the buffer.
-
-        The server will
-        respond with an `input_audio_buffer.cleared` event.
-        """
-        await self._connection.send(
-            cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.clear", "event_id": event_id}))
-        )
-
-    async def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
+    async def retrieve(self, *, item_id: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
         """
-        Send this event to commit the user input audio buffer, which will create a
-        new user message item in the conversation. This event will produce an error
-        if the input audio buffer is empty. When in Server VAD mode, the client does
-        not need to send this event, the server will commit the audio buffer
-        automatically.
-
-        Committing the input audio buffer will trigger input audio transcription
-        (if enabled in session configuration), but it will not create a response
-        from the model. The server will respond with an `input_audio_buffer.committed`
-        event.
+        Send this event when you want to retrieve the server's representation of a specific item in the conversation history. This is useful, for example, to inspect user audio after noise cancellation and VAD.
+        The server will respond with a `conversation.item.retrieved` event,
+        unless the item does not exist in the conversation history, in which case the
+        server will respond with an error.
         """
         await self._connection.send(
-            cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id}))
+            cast(
+                RealtimeClientEventParam,
+                strip_not_given({"type": "conversation.item.retrieve", "item_id": item_id, "event_id": event_id}),
+            )
         )
 
-    async def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None:
-        """Send this event to append audio bytes to the input audio buffer.
-
-        The audio
-        buffer is temporary storage you can write to and later commit. In Server VAD
-        mode, the audio buffer is used to detect speech and the server will decide
-        when to commit. When Server VAD is disabled, you must commit the audio buffer
-        manually.
 
-        The client may choose how much audio to place in each event up to a maximum
-        of 15 MiB, for example streaming smaller chunks from the client may allow the
-        VAD to be more responsive. Unlike made other client events, the server will
-        not send a confirmation response to this event.
-        """
+class AsyncRealtimeTranscriptionSessionResource(BaseAsyncRealtimeConnectionResource):
+    async def update(
+        self, *, session: transcription_session_update_param.Session, event_id: str | NotGiven = NOT_GIVEN
+    ) -> None:
+        """Send this event to update a transcription session."""
         await self._connection.send(
             cast(
                 RealtimeClientEventParam,
-                strip_not_given({"type": "input_audio_buffer.append", "audio": audio, "event_id": event_id}),
+                strip_not_given({"type": "transcription_session.update", "session": session, "event_id": event_id}),
             )
         )
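
Inside an open realtime connection, the new resources are used like the existing ones. A sketch, assuming the existing `connect()` helper and the `conversation.item` accessor; the model name, session payload shape, and item id are illustrative assumptions:

    # Illustrative sketch, not part of this diff.
    from openai import OpenAI

    client = OpenAI()

    with client.beta.realtime.connect(model="gpt-4o-realtime-preview") as connection:
        # Reconfigure the transcription session; the payload shape here is assumed.
        connection.transcription_session.update(
            session={"input_audio_transcription": {"model": "gpt-4o-transcribe"}},
        )
        # Ask the server for its representation of a previously created item.
        connection.conversation.item.retrieve(item_id="item_123")  # illustrative id
        for event in connection:
            if event.type == "conversation.item.retrieved":
                print(event)
                break
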
src/openai/resources/beta/realtime/sessions.py
@@ -47,6 +47,7 @@ class Sessions(SyncAPIResource):
         self,
         *,
         input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
+        input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN,
         input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
         instructions: str | NotGiven = NOT_GIVEN,
         max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN,
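
The new `input_audio_noise_reduction` parameter, together with the expanded `turn_detection` options documented below, can be exercised roughly as in this sketch; the noise-reduction and semantic-VAD payload shapes and the model name are assumptions, not taken from this diff:

    # Illustrative sketch, not part of this diff.
    from openai import OpenAI

    client = OpenAI()

    session = client.beta.realtime.sessions.create(
        model="gpt-4o-realtime-preview",  # illustrative model name
        input_audio_noise_reduction={"type": "near_field"},  # assumed payload shape
        turn_detection={"type": "semantic_vad", "eagerness": "auto"},  # assumed payload shape
    )
    print(session.client_secret)  # the create response carries an ephemeral client secret
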
@@ -86,14 +87,20 @@ class Sessions(SyncAPIResource):
               `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
               (mono), and little-endian byte order.
 
+          input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
+              off. Noise reduction filters audio added to the input audio buffer before it is
+              sent to VAD and the model. Filtering the audio can improve VAD and turn
+              detection accuracy (reducing false positives) and model performance by improving
+              perception of the input audio.
+
           input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to
               `null` to turn off once on. Input audio transcription is not native to the
               model, since the model consumes audio directly. Transcription runs
               asynchronously through
-              [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
-              and should be treated as rough guidance rather than the representation
-              understood by the model. The client can optionally set the language and prompt
-              for transcription, these fields will be passed to the Whisper API.
+              [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+              and should be treated as guidance of input audio content rather than precisely
+              what the model heard. The client can optionally set the language and prompt for
+              transcription; these offer additional guidance to the transcription service.
 
           instructions: The default system instructions (i.e. system message) prepended to model calls.
               This field allows the client to guide the model on desired responses. The model
@@ -119,16 +126,24 @@ class Sessions(SyncAPIResource):
           output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
               For `pcm16`, output audio is sampled at a rate of 24kHz.
 
-          temperature: Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+          temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
+              temperature of 0.8 is highly recommended for best performance.
 
           tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify
               a function.
 
           tools: Tools (functions) available to the model.
 
-          turn_detection: Configuration for turn detection. Can be set to `null` to turn off. Server VAD
-              means that the model will detect the start and end of speech based on audio
-              volume and respond at the end of user speech.
+          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+              set to `null` to turn off, in which case the client must manually trigger model
+              response. Server VAD means that the model will detect the start and end of
+              speech based on audio volume and respond at the end of user speech. Semantic VAD
+              is more advanced and uses a turn detection model (in conjunction with VAD) to
+              semantically estimate whether the user has finished speaking, then dynamically
+              sets a timeout based on this probability. For example, if user audio trails off
+              with "uhhm", the model will score a low probability of turn end and wait longer
+              for the user to continue speaking. This can be useful for more natural
+              conversations, but may have a higher latency.
 
           voice: The voice the model uses to respond. Voice cannot be changed during the session
               once the model has responded with audio at least once. Current voice options are
@@ -148,6 +163,7 @@ class Sessions(SyncAPIResource):
             body=maybe_transform(
                 {
                     "input_audio_format": input_audio_format,
+                    "input_audio_noise_reduction": input_audio_noise_reduction,
                     "input_audio_transcription": input_audio_transcription,
                     "instructions": instructions,
                     "max_response_output_tokens": max_response_output_tokens,
@@ -193,6 +209,7 @@ class AsyncSessions(AsyncAPIResource):
         self,
         *,
         input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
+        input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN,
         input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
         instructions: str | NotGiven = NOT_GIVEN,
         max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN,
@@ -232,14 +249,20 @@ class AsyncSessions(AsyncAPIResource):
               `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
               (mono), and little-endian byte order.
 
+          input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
+              off. Noise reduction filters audio added to the input audio buffer before it is
+              sent to VAD and the model. Filtering the audio can improve VAD and turn
+              detection accuracy (reducing false positives) and model performance by improving
+              perception of the input audio.
+
           input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to
               `null` to turn off once on. Input audio transcription is not native to the
               model, since the model consumes audio directly. Transcription runs
               asynchronously through
-              [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
-              and should be treated as rough guidance rather than the representation
-              understood by the model. The client can optionally set the language and prompt
-              for transcription, these fields will be passed to the Whisper API.
+              [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+              and should be treated as guidance of input audio content rather than precisely
+              what the model heard. The client can optionally set the language and prompt for
+              transcription; these offer additional guidance to the transcription service.
 
           instructions: The default system instructions (i.e. system message) prepended to model calls.
               This field allows the client to guide the model on desired responses. The model
@@ -265,16 +288,24 @@ class AsyncSessions(AsyncAPIResource):
           output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
               For `pcm16`, output audio is sampled at a rate of 24kHz.
 
-          temperature: Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+          temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models, a
+              temperature of 0.8 is highly recommended for best performance.
 
           tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify
               a function.
 
           tools: Tools (functions) available to the model.
 
-          turn_detection: Configuration for turn detection. Can be set to `null` to turn off. Server VAD
-              means that the model will detect the start and end of speech based on audio
-              volume and respond at the end of user speech.
+          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+              set to `null` to turn off, in which case the client must manually trigger model
+              response. Server VAD means that the model will detect the start and end of
+              speech based on audio volume and respond at the end of user speech. Semantic VAD
+              is more advanced and uses a turn detection model (in conjunction with VAD) to
+              semantically estimate whether the user has finished speaking, then dynamically
+              sets a timeout based on this probability. For example, if user audio trails off
+              with "uhhm", the model will score a low probability of turn end and wait longer
+              for the user to continue speaking. This can be useful for more natural
+              conversations, but may have a higher latency.
 
           voice: The voice the model uses to respond. Voice cannot be changed during the session
               once the model has responded with audio at least once. Current voice options are
@@ -294,6 +325,7 @@ class AsyncSessions(AsyncAPIResource):
             body=await async_maybe_transform(
                 {
                     "input_audio_format": input_audio_format,
+                    "input_audio_noise_reduction": input_audio_noise_reduction,
                     "input_audio_transcription": input_audio_transcription,
                     "instructions": instructions,
                     "max_response_output_tokens": max_response_output_tokens,
src/openai/resources/beta/realtime/transcription_sessions.py
@@ -0,0 +1,277 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import List
+from typing_extensions import Literal
+
+import httpx
+
+from .... import _legacy_response
+from ...._types import NOT_GIVEN, Body, Query, Headers, NotGiven
+from ...._utils import (
+    maybe_transform,
+    async_maybe_transform,
+)
+from ...._compat import cached_property
+from ...._resource import SyncAPIResource, AsyncAPIResource
+from ...._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
+from ...._base_client import make_request_options
+from ....types.beta.realtime import transcription_session_create_params
+from ....types.beta.realtime.transcription_session import TranscriptionSession
+
+__all__ = ["TranscriptionSessions", "AsyncTranscriptionSessions"]
+
+
+class TranscriptionSessions(SyncAPIResource):
+    @cached_property
+    def with_raw_response(self) -> TranscriptionSessionsWithRawResponse:
+        """
+        This property can be used as a prefix for any HTTP method call to return
+        the raw response object instead of the parsed content.
+
+        For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
+        """
+        return TranscriptionSessionsWithRawResponse(self)
+
+    @cached_property
+    def with_streaming_response(self) -> TranscriptionSessionsWithStreamingResponse:
+        """
+        An alternative to `.with_raw_response` that doesn't eagerly read the response body.
+
+        For more information, see https://www.github.com/openai/openai-python#with_streaming_response
+        """
+        return TranscriptionSessionsWithStreamingResponse(self)
+
+    def create(
+        self,
+        *,
+        include: List[str] | NotGiven = NOT_GIVEN,
+        input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
+        input_audio_noise_reduction: transcription_session_create_params.InputAudioNoiseReduction
+        | NotGiven = NOT_GIVEN,
+        input_audio_transcription: transcription_session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
+        modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
+        turn_detection: transcription_session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
+        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+        # The extra values given here take precedence over values defined on the client or passed to this method.
+        extra_headers: Headers | None = None,
+        extra_query: Query | None = None,
+        extra_body: Body | None = None,
+        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+    ) -> TranscriptionSession:
+        """
+        Create an ephemeral API token for use in client-side applications with the
+        Realtime API specifically for realtime transcriptions. Can be configured with
+        the same session parameters as the `transcription_session.update` client event.
+
+        It responds with a session object, plus a `client_secret` key which contains a
+        usable ephemeral API token for authenticating browser clients with the Realtime
+        API.
+
+        Args:
+          include:
+              The set of items to include in the transcription. Currently available items are:
+
+              - `item.input_audio_transcription.logprobs`
+
+          input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
+              `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
+              (mono), and little-endian byte order.
+
+          input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
+              off. Noise reduction filters audio added to the input audio buffer before it is
+              sent to VAD and the model. Filtering the audio can improve VAD and turn
+              detection accuracy (reducing false positives) and model performance by improving
+              perception of the input audio.
+
+          input_audio_transcription: Configuration for input audio transcription. The client can optionally set the
+              language and prompt for transcription; these offer additional guidance to the
+              transcription service.
+
+          modalities: The set of modalities the model can respond with. To disable audio, set this to
+              ["text"].
+
+          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+              set to `null` to turn off, in which case the client must manually trigger model
+              response. Server VAD means that the model will detect the start and end of
+              speech based on audio volume and respond at the end of user speech. Semantic VAD
+              is more advanced and uses a turn detection model (in conjunction with VAD) to
+              semantically estimate whether the user has finished speaking, then dynamically
+              sets a timeout based on this probability. For example, if user audio trails off
+              with "uhhm", the model will score a low probability of turn end and wait longer
+              for the user to continue speaking. This can be useful for more natural
+              conversations, but may have a higher latency.
+
+          extra_headers: Send extra headers
+
+          extra_query: Add additional query parameters to the request
+
+          extra_body: Add additional JSON properties to the request
+
+          timeout: Override the client-level default timeout for this request, in seconds
+        """
+        extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
+        return self._post(
+            "/realtime/transcription_sessions",
+            body=maybe_transform(
+                {
+                    "include": include,
+                    "input_audio_format": input_audio_format,
+                    "input_audio_noise_reduction": input_audio_noise_reduction,
+                    "input_audio_transcription": input_audio_transcription,
+                    "modalities": modalities,
+                    "turn_detection": turn_detection,
+                },
+                transcription_session_create_params.TranscriptionSessionCreateParams,
+            ),
+            options=make_request_options(
+                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+            ),
+            cast_to=TranscriptionSession,
+        )
+
+
+class AsyncTranscriptionSessions(AsyncAPIResource):
+    @cached_property
+    def with_raw_response(self) -> AsyncTranscriptionSessionsWithRawResponse:
+        """
+        This property can be used as a prefix for any HTTP method call to return
+        the raw response object instead of the parsed content.
+
+        For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
+        """
+        return AsyncTranscriptionSessionsWithRawResponse(self)
+
+    @cached_property
+    def with_streaming_response(self) -> AsyncTranscriptionSessionsWithStreamingResponse:
+        """
+        An alternative to `.with_raw_response` that doesn't eagerly read the response body.
+
+        For more information, see https://www.github.com/openai/openai-python#with_streaming_response
+        """
+        return AsyncTranscriptionSessionsWithStreamingResponse(self)
+
+    async def create(
+        self,
+        *,
+        include: List[str] | NotGiven = NOT_GIVEN,
+        input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
+        input_audio_noise_reduction: transcription_session_create_params.InputAudioNoiseReduction
+        | NotGiven = NOT_GIVEN,
+        input_audio_transcription: transcription_session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
+        modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
+        turn_detection: transcription_session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
+        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+        # The extra values given here take precedence over values defined on the client or passed to this method.
+        extra_headers: Headers | None = None,
+        extra_query: Query | None = None,
+        extra_body: Body | None = None,
+        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+    ) -> TranscriptionSession:
+        """
+        Create an ephemeral API token for use in client-side applications with the
+        Realtime API specifically for realtime transcriptions. Can be configured with
+        the same session parameters as the `transcription_session.update` client event.
+
+        It responds with a session object, plus a `client_secret` key which contains a
+        usable ephemeral API token for authenticating browser clients with the Realtime
+        API.
+
+        Args:
+          include:
+              The set of items to include in the transcription. Currently available items are:
+
+              - `item.input_audio_transcription.logprobs`
+
+          input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
+              `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
+              (mono), and little-endian byte order.
+
+          input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
+              off. Noise reduction filters audio added to the input audio buffer before it is
+              sent to VAD and the model. Filtering the audio can improve VAD and turn
+              detection accuracy (reducing false positives) and model performance by improving
+              perception of the input audio.
+
+          input_audio_transcription: Configuration for input audio transcription. The client can optionally set the
+              language and prompt for transcription; these offer additional guidance to the
+              transcription service.
+
+          modalities: The set of modalities the model can respond with. To disable audio, set this to
+              ["text"].
+
+          turn_detection: Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+              set to `null` to turn off, in which case the client must manually trigger model
+              response. Server VAD means that the model will detect the start and end of
+              speech based on audio volume and respond at the end of user speech. Semantic VAD
+              is more advanced and uses a turn detection model (in conjunction with VAD) to
+              semantically estimate whether the user has finished speaking, then dynamically
+              sets a timeout based on this probability. For example, if user audio trails off
+              with "uhhm", the model will score a low probability of turn end and wait longer
+              for the user to continue speaking. This can be useful for more natural
+              conversations, but may have a higher latency.
+
+          extra_headers: Send extra headers
+
+          extra_query: Add additional query parameters to the request
+
+          extra_body: Add additional JSON properties to the request
+
+          timeout: Override the client-level default timeout for this request, in seconds
+        """
+        extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
+        return await self._post(
+            "/realtime/transcription_sessions",
+            body=await async_maybe_transform(
+                {
+                    "include": include,
+                    "input_audio_format": input_audio_format,
+                    "input_audio_noise_reduction": input_audio_noise_reduction,
+                    "input_audio_transcription": input_audio_transcription,
+                    "modalities": modalities,
+                    "turn_detection": turn_detection,
+                },
+                transcription_session_create_params.TranscriptionSessionCreateParams,
+            ),
+            options=make_request_options(
+                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+            ),
+            cast_to=TranscriptionSession,
+        )
+
+
+class TranscriptionSessionsWithRawResponse:
+    def __init__(self, transcription_sessions: TranscriptionSessions) -> None:
+        self._transcription_sessions = transcription_sessions
+
+        self.create = _legacy_response.to_raw_response_wrapper(
+            transcription_sessions.create,
+        )
+
+
+class AsyncTranscriptionSessionsWithRawResponse:
+    def __init__(self, transcription_sessions: AsyncTranscriptionSessions) -> None:
+        self._transcription_sessions = transcription_sessions
+
+        self.create = _legacy_response.async_to_raw_response_wrapper(
+            transcription_sessions.create,
+        )
+
+
+class TranscriptionSessionsWithStreamingResponse:
+    def __init__(self, transcription_sessions: TranscriptionSessions) -> None:
+        self._transcription_sessions = transcription_sessions
+
+        self.create = to_streamed_response_wrapper(
+            transcription_sessions.create,
+        )
+
+
+class AsyncTranscriptionSessionsWithStreamingResponse:
+    def __init__(self, transcription_sessions: AsyncTranscriptionSessions) -> None:
+        self._transcription_sessions = transcription_sessions
+
+        self.create = async_to_streamed_response_wrapper(
+            transcription_sessions.create,
+        )
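A hedged sketch of the new resource in use, assuming it is wired up as client.beta.realtime.transcription_sessions; the transcription model and include item mirror the docstrings above.

from openai import OpenAI

client = OpenAI()

# Sketch only: create an ephemeral realtime transcription session with the
# new noise reduction and logprobs include options.
transcription_session = client.beta.realtime.transcription_sessions.create(
    input_audio_format="pcm16",
    input_audio_noise_reduction={"type": "far_field"},
    input_audio_transcription={
        "model": "gpt-4o-mini-transcribe",
        "language": "en",
        "prompt": "expect words related to technology",
    },
    include=["item.input_audio_transcription.logprobs"],
    turn_detection={"type": "server_vad", "silence_duration_ms": 500},
)
print(transcription_session.client_secret)  # per the docstring, usable from a browser client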
src/openai/types/audio/__init__.py
@@ -8,9 +8,13 @@ from .transcription import Transcription as Transcription
 from .transcription_word import TranscriptionWord as TranscriptionWord
 from .translation_verbose import TranslationVerbose as TranslationVerbose
 from .speech_create_params import SpeechCreateParams as SpeechCreateParams
+from .transcription_include import TranscriptionInclude as TranscriptionInclude
 from .transcription_segment import TranscriptionSegment as TranscriptionSegment
 from .transcription_verbose import TranscriptionVerbose as TranscriptionVerbose
 from .translation_create_params import TranslationCreateParams as TranslationCreateParams
+from .transcription_stream_event import TranscriptionStreamEvent as TranscriptionStreamEvent
 from .transcription_create_params import TranscriptionCreateParams as TranscriptionCreateParams
 from .translation_create_response import TranslationCreateResponse as TranslationCreateResponse
 from .transcription_create_response import TranscriptionCreateResponse as TranscriptionCreateResponse
+from .transcription_text_done_event import TranscriptionTextDoneEvent as TranscriptionTextDoneEvent
+from .transcription_text_delta_event import TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent
src/openai/types/audio/speech_create_params.py
@@ -17,7 +17,7 @@ class SpeechCreateParams(TypedDict, total=False):
     model: Required[Union[str, SpeechModel]]
     """
     One of the available [TTS models](https://platform.openai.com/docs/models#tts):
-    `tts-1` or `tts-1-hd`
+    `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`.
     """
 
     voice: Required[Literal["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"]]
@@ -28,6 +28,12 @@ class SpeechCreateParams(TypedDict, total=False):
     [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
     """
 
+    instructions: str
+    """Control the voice of your generated audio with additional instructions.
+
+    Does not work with `tts-1` or `tts-1-hd`.
+    """
+
     response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]
     """The format to audio in.
 
src/openai/types/audio/speech_model.py
@@ -4,4 +4,4 @@ from typing_extensions import Literal, TypeAlias
 
 __all__ = ["SpeechModel"]
 
-SpeechModel: TypeAlias = Literal["tts-1", "tts-1-hd"]
+SpeechModel: TypeAlias = Literal["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
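A minimal sketch pairing the new `gpt-4o-mini-tts` literal with the `instructions` parameter added in this commit; the voice, text, and output path are placeholders.

from pathlib import Path

from openai import OpenAI

client = OpenAI()

# Sketch only: `instructions` steers delivery for gpt-4o-mini-tts and is not
# supported by tts-1 / tts-1-hd.
with client.audio.speech.with_streaming_response.create(
    model="gpt-4o-mini-tts",
    voice="coral",
    input="The weather tomorrow looks sunny with a light breeze.",
    instructions="Speak in a warm, upbeat tone at a relaxed pace.",
    response_format="mp3",
) as response:
    response.stream_to_file(Path("speech.mp3"))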
src/openai/types/audio/transcription.py
@@ -1,11 +1,30 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
+from typing import List, Optional
 
 from ..._models import BaseModel
 
-__all__ = ["Transcription"]
+__all__ = ["Transcription", "Logprob"]
+
+
+class Logprob(BaseModel):
+    token: Optional[str] = None
+    """The token in the transcription."""
+
+    bytes: Optional[List[float]] = None
+    """The bytes of the token."""
+
+    logprob: Optional[float] = None
+    """The log probability of the token."""
 
 
 class Transcription(BaseModel):
     text: str
     """The transcribed text."""
+
+    logprobs: Optional[List[Logprob]] = None
+    """The log probabilities of the tokens in the transcription.
+
+    Only returned with the models `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`
+    if `logprobs` is added to the `include` array.
+    """
src/openai/types/audio/transcription_create_params.py
@@ -2,17 +2,22 @@
 
 from __future__ import annotations
 
-from typing import List, Union
+from typing import List, Union, Optional
 from typing_extensions import Literal, Required, TypedDict
 
 from ..._types import FileTypes
 from ..audio_model import AudioModel
+from .transcription_include import TranscriptionInclude
 from ..audio_response_format import AudioResponseFormat
 
-__all__ = ["TranscriptionCreateParams"]
+__all__ = [
+    "TranscriptionCreateParamsBase",
+    "TranscriptionCreateParamsNonStreaming",
+    "TranscriptionCreateParamsStreaming",
+]
 
 
-class TranscriptionCreateParams(TypedDict, total=False):
+class TranscriptionCreateParamsBase(TypedDict, total=False):
     file: Required[FileTypes]
     """
     The audio file object (not file name) to transcribe, in one of these formats:
@@ -22,8 +27,17 @@ class TranscriptionCreateParams(TypedDict, total=False):
     model: Required[Union[str, AudioModel]]
     """ID of the model to use.
 
-    Only `whisper-1` (which is powered by our open source Whisper V2 model) is
-    currently available.
+    The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`
+    (which is powered by our open source Whisper V2 model).
+    """
+
+    include: List[TranscriptionInclude]
+    """Additional information to include in the transcription response.
+
+    `logprobs` will return the log probabilities of the tokens in the response to
+    understand the model's confidence in the transcription. `logprobs` only works
+    with `response_format` set to `json` and only with the models `gpt-4o-transcribe`
+    and `gpt-4o-mini-transcribe`.
     """
 
     language: str
@@ -45,7 +59,8 @@ class TranscriptionCreateParams(TypedDict, total=False):
     response_format: AudioResponseFormat
     """
     The format of the output, in one of these options: `json`, `text`, `srt`,
-    `verbose_json`, or `vtt`.
+    `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+    the only supported format is `json`.
     """
 
     temperature: float
@@ -65,3 +80,34 @@ class TranscriptionCreateParams(TypedDict, total=False):
     is no additional latency for segment timestamps, but generating word timestamps
     incurs additional latency.
     """
+
+
+class TranscriptionCreateParamsNonStreaming(TranscriptionCreateParamsBase, total=False):
+    stream: Optional[Literal[False]]
+    """
+    If set to true, the model response data will be streamed to the client as it is
+    generated using
+    [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+    See the
+    [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+    for more information.
+
+    Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+    """
+
+
+class TranscriptionCreateParamsStreaming(TranscriptionCreateParamsBase):
+    stream: Required[Literal[True]]
+    """
+    If set to true, the model response data will be streamed to the client as it is
+    generated using
+    [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+    See the
+    [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+    for more information.
+
+    Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+    """
+
+
+TranscriptionCreateParams = Union[TranscriptionCreateParamsNonStreaming, TranscriptionCreateParamsStreaming]
src/openai/types/audio/transcription_include.py
@@ -0,0 +1,7 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing_extensions import Literal, TypeAlias
+
+__all__ = ["TranscriptionInclude"]
+
+TranscriptionInclude: TypeAlias = Literal["logprobs"]
src/openai/types/audio/transcription_stream_event.py
@@ -0,0 +1,14 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Union
+from typing_extensions import Annotated, TypeAlias
+
+from ..._utils import PropertyInfo
+from .transcription_text_done_event import TranscriptionTextDoneEvent
+from .transcription_text_delta_event import TranscriptionTextDeltaEvent
+
+__all__ = ["TranscriptionStreamEvent"]
+
+TranscriptionStreamEvent: TypeAlias = Annotated[
+    Union[TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent], PropertyInfo(discriminator="type")
+]
src/openai/types/audio/transcription_text_delta_event.py
@@ -0,0 +1,35 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["TranscriptionTextDeltaEvent", "Logprob"]
+
+
+class Logprob(BaseModel):
+    token: Optional[str] = None
+    """The token that was used to generate the log probability."""
+
+    bytes: Optional[List[object]] = None
+    """The bytes that were used to generate the log probability."""
+
+    logprob: Optional[float] = None
+    """The log probability of the token."""
+
+
+class TranscriptionTextDeltaEvent(BaseModel):
+    delta: str
+    """The text delta that was additionally transcribed."""
+
+    type: Literal["transcript.text.delta"]
+    """The type of the event. Always `transcript.text.delta`."""
+
+    logprobs: Optional[List[Logprob]] = None
+    """The log probabilities of the delta.
+
+    Only included if you
+    [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+    with the `include[]` parameter set to `logprobs`.
+    """
src/openai/types/audio/transcription_text_done_event.py
@@ -0,0 +1,35 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["TranscriptionTextDoneEvent", "Logprob"]
+
+
+class Logprob(BaseModel):
+    token: Optional[str] = None
+    """The token that was used to generate the log probability."""
+
+    bytes: Optional[List[object]] = None
+    """The bytes that were used to generate the log probability."""
+
+    logprob: Optional[float] = None
+    """The log probability of the token."""
+
+
+class TranscriptionTextDoneEvent(BaseModel):
+    text: str
+    """The text that was transcribed."""
+
+    type: Literal["transcript.text.done"]
+    """The type of the event. Always `transcript.text.done`."""
+
+    logprobs: Optional[List[Logprob]] = None
+    """The log probabilities of the individual tokens in the transcription.
+
+    Only included if you
+    [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+    with the `include[]` parameter set to `logprobs`.
+    """
src/openai/types/audio/translation_create_params.py
@@ -3,11 +3,10 @@
 from __future__ import annotations
 
 from typing import Union
-from typing_extensions import Required, TypedDict
+from typing_extensions import Literal, Required, TypedDict
 
 from ..._types import FileTypes
 from ..audio_model import AudioModel
-from ..audio_response_format import AudioResponseFormat
 
 __all__ = ["TranslationCreateParams"]
 
@@ -34,7 +33,7 @@ class TranslationCreateParams(TypedDict, total=False):
     should be in English.
     """
 
-    response_format: AudioResponseFormat
+    response_format: Literal["json", "text", "srt", "verbose_json", "vtt"]
     """
     The format of the output, in one of these options: `json`, `text`, `srt`,
     `verbose_json`, or `vtt`.
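For contrast with transcriptions, translations keep the plain literal response formats listed above; a minimal hedged call with a placeholder file name:

from openai import OpenAI

client = OpenAI()

# Sketch only: translations still run on whisper-1 with json/text/srt/verbose_json/vtt output.
with open("interview_es.mp3", "rb") as audio_file:
    translation = client.audio.translations.create(
        model="whisper-1",
        file=audio_file,
        response_format="json",
    )

print(translation.text)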
src/openai/types/beta/realtime/__init__.py
@@ -15,6 +15,7 @@ from .response_create_event import ResponseCreateEvent as ResponseCreateEvent
 from .session_create_params import SessionCreateParams as SessionCreateParams
 from .session_created_event import SessionCreatedEvent as SessionCreatedEvent
 from .session_updated_event import SessionUpdatedEvent as SessionUpdatedEvent
+from .transcription_session import TranscriptionSession as TranscriptionSession
 from .response_created_event import ResponseCreatedEvent as ResponseCreatedEvent
 from .conversation_item_param import ConversationItemParam as ConversationItemParam
 from .realtime_connect_params import RealtimeConnectParams as RealtimeConnectParams
@@ -32,6 +33,7 @@ from .session_update_event_param import SessionUpdateEventParam as SessionUpdate
 from .realtime_client_event_param import RealtimeClientEventParam as RealtimeClientEventParam
 from .response_cancel_event_param import ResponseCancelEventParam as ResponseCancelEventParam
 from .response_create_event_param import ResponseCreateEventParam as ResponseCreateEventParam
+from .transcription_session_update import TranscriptionSessionUpdate as TranscriptionSessionUpdate
 from .conversation_item_create_event import ConversationItemCreateEvent as ConversationItemCreateEvent
 from .conversation_item_delete_event import ConversationItemDeleteEvent as ConversationItemDeleteEvent
 from .input_audio_buffer_clear_event import InputAudioBufferClearEvent as InputAudioBufferClearEvent
@@ -41,6 +43,7 @@ from .conversation_item_deleted_event import ConversationItemDeletedEvent as Con
 from .input_audio_buffer_append_event import InputAudioBufferAppendEvent as InputAudioBufferAppendEvent
 from .input_audio_buffer_commit_event import InputAudioBufferCommitEvent as InputAudioBufferCommitEvent
 from .response_output_item_done_event import ResponseOutputItemDoneEvent as ResponseOutputItemDoneEvent
+from .conversation_item_retrieve_event import ConversationItemRetrieveEvent as ConversationItemRetrieveEvent
 from .conversation_item_truncate_event import ConversationItemTruncateEvent as ConversationItemTruncateEvent
 from .conversation_item_with_reference import ConversationItemWithReference as ConversationItemWithReference
 from .input_audio_buffer_cleared_event import InputAudioBufferClearedEvent as InputAudioBufferClearedEvent
@@ -49,6 +52,9 @@ from .response_output_item_added_event import ResponseOutputItemAddedEvent as Re
 from .conversation_item_truncated_event import ConversationItemTruncatedEvent as ConversationItemTruncatedEvent
 from .response_content_part_added_event import ResponseContentPartAddedEvent as ResponseContentPartAddedEvent
 from .input_audio_buffer_committed_event import InputAudioBufferCommittedEvent as InputAudioBufferCommittedEvent
+from .transcription_session_update_param import TranscriptionSessionUpdateParam as TranscriptionSessionUpdateParam
+from .transcription_session_create_params import TranscriptionSessionCreateParams as TranscriptionSessionCreateParams
+from .transcription_session_updated_event import TranscriptionSessionUpdatedEvent as TranscriptionSessionUpdatedEvent
 from .conversation_item_create_event_param import ConversationItemCreateEventParam as ConversationItemCreateEventParam
 from .conversation_item_delete_event_param import ConversationItemDeleteEventParam as ConversationItemDeleteEventParam
 from .input_audio_buffer_clear_event_param import InputAudioBufferClearEventParam as InputAudioBufferClearEventParam
@@ -58,6 +64,9 @@ from .input_audio_buffer_commit_event_param import InputAudioBufferCommitEventPa
 from .response_audio_transcript_delta_event import (
     ResponseAudioTranscriptDeltaEvent as ResponseAudioTranscriptDeltaEvent,
 )
+from .conversation_item_retrieve_event_param import (
+    ConversationItemRetrieveEventParam as ConversationItemRetrieveEventParam,
+)
 from .conversation_item_truncate_event_param import (
     ConversationItemTruncateEventParam as ConversationItemTruncateEventParam,
 )
@@ -76,6 +85,9 @@ from .response_function_call_arguments_done_event import (
 from .response_function_call_arguments_delta_event import (
     ResponseFunctionCallArgumentsDeltaEvent as ResponseFunctionCallArgumentsDeltaEvent,
 )
+from .conversation_item_input_audio_transcription_delta_event import (
+    ConversationItemInputAudioTranscriptionDeltaEvent as ConversationItemInputAudioTranscriptionDeltaEvent,
+)
 from .conversation_item_input_audio_transcription_failed_event import (
     ConversationItemInputAudioTranscriptionFailedEvent as ConversationItemInputAudioTranscriptionFailedEvent,
 )
src/openai/types/beta/realtime/conversation_item_input_audio_transcription_completed_event.py
@@ -1,10 +1,22 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
+from typing import List, Optional
 from typing_extensions import Literal
 
 from ...._models import BaseModel
 
-__all__ = ["ConversationItemInputAudioTranscriptionCompletedEvent"]
+__all__ = ["ConversationItemInputAudioTranscriptionCompletedEvent", "Logprob"]
+
+
+class Logprob(BaseModel):
+    token: str
+    """The token that was used to generate the log probability."""
+
+    bytes: List[int]
+    """The bytes that were used to generate the log probability."""
+
+    logprob: float
+    """The log probability of the token."""
 
 
 class ConversationItemInputAudioTranscriptionCompletedEvent(BaseModel):
@@ -24,3 +36,6 @@ class ConversationItemInputAudioTranscriptionCompletedEvent(BaseModel):
     """
     The event type, must be `conversation.item.input_audio_transcription.completed`.
     """
+
+    logprobs: Optional[List[Logprob]] = None
+    """The log probabilities of the transcription."""
src/openai/types/beta/realtime/conversation_item_input_audio_transcription_delta_event.py
@@ -0,0 +1,39 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Optional
+from typing_extensions import Literal
+
+from ...._models import BaseModel
+
+__all__ = ["ConversationItemInputAudioTranscriptionDeltaEvent", "Logprob"]
+
+
+class Logprob(BaseModel):
+    token: str
+    """The token that was used to generate the log probability."""
+
+    bytes: List[int]
+    """The bytes that were used to generate the log probability."""
+
+    logprob: float
+    """The log probability of the token."""
+
+
+class ConversationItemInputAudioTranscriptionDeltaEvent(BaseModel):
+    event_id: str
+    """The unique ID of the server event."""
+
+    item_id: str
+    """The ID of the item."""
+
+    type: Literal["conversation.item.input_audio_transcription.delta"]
+    """The event type, must be `conversation.item.input_audio_transcription.delta`."""
+
+    content_index: Optional[int] = None
+    """The index of the content part in the item's content array."""
+
+    delta: Optional[str] = None
+    """The text delta."""
+
+    logprobs: Optional[List[Logprob]] = None
+    """The log probabilities of the transcription."""
src/openai/types/beta/realtime/conversation_item_retrieve_event.py
@@ -0,0 +1,19 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Optional
+from typing_extensions import Literal
+
+from ...._models import BaseModel
+
+__all__ = ["ConversationItemRetrieveEvent"]
+
+
+class ConversationItemRetrieveEvent(BaseModel):
+    item_id: str
+    """The ID of the item to retrieve."""
+
+    type: Literal["conversation.item.retrieve"]
+    """The event type, must be `conversation.item.retrieve`."""
+
+    event_id: Optional[str] = None
+    """Optional client-generated ID used to identify this event."""
src/openai/types/beta/realtime/conversation_item_retrieve_event_param.py
@@ -0,0 +1,18 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing_extensions import Literal, Required, TypedDict
+
+__all__ = ["ConversationItemRetrieveEventParam"]
+
+
+class ConversationItemRetrieveEventParam(TypedDict, total=False):
+    item_id: Required[str]
+    """The ID of the item to retrieve."""
+
+    type: Required[Literal["conversation.item.retrieve"]]
+    """The event type, must be `conversation.item.retrieve`."""
+
+    event_id: str
+    """Optional client-generated ID used to identify this event."""
src/openai/types/beta/realtime/realtime_client_event.py
@@ -7,26 +7,30 @@ from ...._utils import PropertyInfo
 from .session_update_event import SessionUpdateEvent
 from .response_cancel_event import ResponseCancelEvent
 from .response_create_event import ResponseCreateEvent
+from .transcription_session_update import TranscriptionSessionUpdate
 from .conversation_item_create_event import ConversationItemCreateEvent
 from .conversation_item_delete_event import ConversationItemDeleteEvent
 from .input_audio_buffer_clear_event import InputAudioBufferClearEvent
 from .input_audio_buffer_append_event import InputAudioBufferAppendEvent
 from .input_audio_buffer_commit_event import InputAudioBufferCommitEvent
+from .conversation_item_retrieve_event import ConversationItemRetrieveEvent
 from .conversation_item_truncate_event import ConversationItemTruncateEvent
 
 __all__ = ["RealtimeClientEvent"]
 
 RealtimeClientEvent: TypeAlias = Annotated[
     Union[
-        SessionUpdateEvent,
-        InputAudioBufferAppendEvent,
-        InputAudioBufferCommitEvent,
-        InputAudioBufferClearEvent,
         ConversationItemCreateEvent,
-        ConversationItemTruncateEvent,
         ConversationItemDeleteEvent,
-        ResponseCreateEvent,
+        ConversationItemRetrieveEvent,
+        ConversationItemTruncateEvent,
+        InputAudioBufferAppendEvent,
+        InputAudioBufferClearEvent,
+        InputAudioBufferCommitEvent,
         ResponseCancelEvent,
+        ResponseCreateEvent,
+        SessionUpdateEvent,
+        TranscriptionSessionUpdate,
     ],
     PropertyInfo(discriminator="type"),
 ]
src/openai/types/beta/realtime/realtime_client_event_param.py
@@ -8,23 +8,27 @@ from typing_extensions import TypeAlias
 from .session_update_event_param import SessionUpdateEventParam
 from .response_cancel_event_param import ResponseCancelEventParam
 from .response_create_event_param import ResponseCreateEventParam
+from .transcription_session_update_param import TranscriptionSessionUpdateParam
 from .conversation_item_create_event_param import ConversationItemCreateEventParam
 from .conversation_item_delete_event_param import ConversationItemDeleteEventParam
 from .input_audio_buffer_clear_event_param import InputAudioBufferClearEventParam
 from .input_audio_buffer_append_event_param import InputAudioBufferAppendEventParam
 from .input_audio_buffer_commit_event_param import InputAudioBufferCommitEventParam
+from .conversation_item_retrieve_event_param import ConversationItemRetrieveEventParam
 from .conversation_item_truncate_event_param import ConversationItemTruncateEventParam
 
 __all__ = ["RealtimeClientEventParam"]
 
 RealtimeClientEventParam: TypeAlias = Union[
-    SessionUpdateEventParam,
-    InputAudioBufferAppendEventParam,
-    InputAudioBufferCommitEventParam,
-    InputAudioBufferClearEventParam,
     ConversationItemCreateEventParam,
-    ConversationItemTruncateEventParam,
     ConversationItemDeleteEventParam,
-    ResponseCreateEventParam,
+    ConversationItemRetrieveEventParam,
+    ConversationItemTruncateEventParam,
+    InputAudioBufferAppendEventParam,
+    InputAudioBufferClearEventParam,
+    InputAudioBufferCommitEventParam,
     ResponseCancelEventParam,
+    ResponseCreateEventParam,
+    SessionUpdateEventParam,
+    TranscriptionSessionUpdateParam,
 ]
src/openai/types/beta/realtime/realtime_server_event.py
@@ -1,10 +1,12 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
 from typing import Union
-from typing_extensions import Annotated, TypeAlias
+from typing_extensions import Literal, Annotated, TypeAlias
 
 from ...._utils import PropertyInfo
+from ...._models import BaseModel
 from .error_event import ErrorEvent
+from .conversation_item import ConversationItem
 from .response_done_event import ResponseDoneEvent
 from .session_created_event import SessionCreatedEvent
 from .session_updated_event import SessionUpdatedEvent
@@ -24,49 +26,66 @@ from .response_output_item_added_event import ResponseOutputItemAddedEvent
 from .conversation_item_truncated_event import ConversationItemTruncatedEvent
 from .response_content_part_added_event import ResponseContentPartAddedEvent
 from .input_audio_buffer_committed_event import InputAudioBufferCommittedEvent
+from .transcription_session_updated_event import TranscriptionSessionUpdatedEvent
 from .response_audio_transcript_done_event import ResponseAudioTranscriptDoneEvent
 from .response_audio_transcript_delta_event import ResponseAudioTranscriptDeltaEvent
 from .input_audio_buffer_speech_started_event import InputAudioBufferSpeechStartedEvent
 from .input_audio_buffer_speech_stopped_event import InputAudioBufferSpeechStoppedEvent
 from .response_function_call_arguments_done_event import ResponseFunctionCallArgumentsDoneEvent
 from .response_function_call_arguments_delta_event import ResponseFunctionCallArgumentsDeltaEvent
+from .conversation_item_input_audio_transcription_delta_event import ConversationItemInputAudioTranscriptionDeltaEvent
 from .conversation_item_input_audio_transcription_failed_event import ConversationItemInputAudioTranscriptionFailedEvent
 from .conversation_item_input_audio_transcription_completed_event import (
     ConversationItemInputAudioTranscriptionCompletedEvent,
 )
 
-__all__ = ["RealtimeServerEvent"]
+__all__ = ["RealtimeServerEvent", "ConversationItemRetrieved"]
+
+
+class ConversationItemRetrieved(BaseModel):
+    event_id: str
+    """The unique ID of the server event."""
+
+    item: ConversationItem
+    """The item to add to the conversation."""
+
+    type: Literal["conversation.item.retrieved"]
+    """The event type, must be `conversation.item.retrieved`."""
+
 
 RealtimeServerEvent: TypeAlias = Annotated[
     Union[
-        ErrorEvent,
-        SessionCreatedEvent,
-        SessionUpdatedEvent,
         ConversationCreatedEvent,
-        InputAudioBufferCommittedEvent,
-        InputAudioBufferClearedEvent,
-        InputAudioBufferSpeechStartedEvent,
-        InputAudioBufferSpeechStoppedEvent,
         ConversationItemCreatedEvent,
+        ConversationItemDeletedEvent,
         ConversationItemInputAudioTranscriptionCompletedEvent,
+        ConversationItemInputAudioTranscriptionDeltaEvent,
         ConversationItemInputAudioTranscriptionFailedEvent,
+        ConversationItemRetrieved,
         ConversationItemTruncatedEvent,
-        ConversationItemDeletedEvent,
+        ErrorEvent,
+        InputAudioBufferClearedEvent,
+        InputAudioBufferCommittedEvent,
+        InputAudioBufferSpeechStartedEvent,
+        InputAudioBufferSpeechStoppedEvent,
+        RateLimitsUpdatedEvent,
+        ResponseAudioDeltaEvent,
+        ResponseAudioDoneEvent,
+        ResponseAudioTranscriptDeltaEvent,
+        ResponseAudioTranscriptDoneEvent,
+        ResponseContentPartAddedEvent,
+        ResponseContentPartDoneEvent,
         ResponseCreatedEvent,
         ResponseDoneEvent,
+        ResponseFunctionCallArgumentsDeltaEvent,
+        ResponseFunctionCallArgumentsDoneEvent,
         ResponseOutputItemAddedEvent,
         ResponseOutputItemDoneEvent,
-        ResponseContentPartAddedEvent,
-        ResponseContentPartDoneEvent,
         ResponseTextDeltaEvent,
         ResponseTextDoneEvent,
-        ResponseAudioTranscriptDeltaEvent,
-        ResponseAudioTranscriptDoneEvent,
-        ResponseAudioDeltaEvent,
-        ResponseAudioDoneEvent,
-        ResponseFunctionCallArgumentsDeltaEvent,
-        ResponseFunctionCallArgumentsDoneEvent,
-        RateLimitsUpdatedEvent,
+        SessionCreatedEvent,
+        SessionUpdatedEvent,
+        TranscriptionSessionUpdatedEvent,
     ],
     PropertyInfo(discriminator="type"),
 ]
src/openai/types/beta/realtime/session.py
@@ -5,14 +5,40 @@ from typing_extensions import Literal
 
 from ...._models import BaseModel
 
-__all__ = ["Session", "InputAudioTranscription", "Tool", "TurnDetection"]
+__all__ = ["Session", "InputAudioNoiseReduction", "InputAudioTranscription", "Tool", "TurnDetection"]
+
+
+class InputAudioNoiseReduction(BaseModel):
+    type: Optional[Literal["near_field", "far_field"]] = None
+    """Type of noise reduction.
+
+    `near_field` is for close-talking microphones such as headphones; `far_field` is
+    for far-field microphones such as laptop or conference room microphones.
+    """
 
 
 class InputAudioTranscription(BaseModel):
+    language: Optional[str] = None
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+    format will improve accuracy and latency.
+    """
+
     model: Optional[str] = None
     """
-    The model to use for transcription, `whisper-1` is the only currently supported
-    model.
+    The model to use for transcription, current options are `gpt-4o-transcribe`,
+    `gpt-4o-mini-transcribe`, and `whisper-1`.
+    """
+
+    prompt: Optional[str] = None
+    """
+    An optional text to guide the model's style or continue a previous audio
+    segment. For `whisper-1`, the
+    [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+    "expect words related to technology".
     """
 
 
@@ -35,46 +61,56 @@ class Tool(BaseModel):
 
 class TurnDetection(BaseModel):
     create_response: Optional[bool] = None
-    """Whether or not to automatically generate a response when a VAD stop event
+    """
+    Whether or not to automatically generate a response when a VAD stop event
     occurs.
+    """
+
+    eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
+    """Used only for `semantic_vad` mode.
 
-    `true` by default.
+    The eagerness of the model to respond. `low` will wait longer for the user to
+    continue speaking; `high` will respond more quickly. `auto` is the default and
+    is equivalent to `medium`.
     """
 
     interrupt_response: Optional[bool] = None
     """
     Whether or not to automatically interrupt any ongoing response with output to
     the default conversation (i.e. `conversation` of `auto`) when a VAD start event
-    occurs. `true` by default.
+    occurs.
     """
 
     prefix_padding_ms: Optional[int] = None
-    """Amount of audio to include before the VAD detected speech (in milliseconds).
+    """Used only for `server_vad` mode.
 
+    Amount of audio to include before the VAD detected speech (in milliseconds).
     Defaults to 300ms.
     """
 
     silence_duration_ms: Optional[int] = None
-    """Duration of silence to detect speech stop (in milliseconds).
+    """Used only for `server_vad` mode.
 
-    Defaults to 500ms. With shorter values the model will respond more quickly, but
-    may jump in on short pauses from the user.
+    Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+    With shorter values the model will respond more quickly, but may jump in on
+    short pauses from the user.
     """
 
     threshold: Optional[float] = None
-    """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+    """Used only for `server_vad` mode.
 
-    A higher threshold will require louder audio to activate the model, and thus
-    might perform better in noisy environments.
+    Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
+    threshold will require louder audio to activate the model, and thus might
+    perform better in noisy environments.
     """
 
-    type: Optional[Literal["server_vad"]] = None
-    """Type of turn detection, only `server_vad` is currently supported."""
+    type: Optional[Literal["server_vad", "semantic_vad"]] = None
+    """Type of turn detection."""
 
 
 class Session(BaseModel):
     id: Optional[str] = None
-    """Unique identifier for the session object."""
+    """Unique identifier for the session that looks like `sess_1234567890abcdef`."""
 
     input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
     """The format of input audio.
@@ -84,13 +120,25 @@ class Session(BaseModel):
     byte order.
     """
 
+    input_audio_noise_reduction: Optional[InputAudioNoiseReduction] = None
+    """Configuration for input audio noise reduction.
+
+    This can be set to `null` to turn off. Noise reduction filters audio added to
+    the input audio buffer before it is sent to VAD and the model. Filtering the
+    audio can improve VAD and turn detection accuracy (reducing false positives) and
+    model performance by improving perception of the input audio.
+    """
+
     input_audio_transcription: Optional[InputAudioTranscription] = None
     """
     Configuration for input audio transcription, defaults to off and can be set to
     `null` to turn off once on. Input audio transcription is not native to the
     model, since the model consumes audio directly. Transcription runs
-    asynchronously through Whisper and should be treated as rough guidance rather
-    than the representation understood by the model.
+    asynchronously through
+    [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+    and should be treated as guidance of input audio content rather than precisely
+    what the model heard. The client can optionally set the language and prompt for
+    transcription; these offer additional guidance to the transcription service.
     """
 
     instructions: Optional[str] = None
@@ -122,16 +170,14 @@ class Session(BaseModel):
     To disable audio, set this to ["text"].
     """
 
-    model: Union[
-        str,
+    model: Optional[
         Literal[
             "gpt-4o-realtime-preview",
             "gpt-4o-realtime-preview-2024-10-01",
             "gpt-4o-realtime-preview-2024-12-17",
             "gpt-4o-mini-realtime-preview",
             "gpt-4o-mini-realtime-preview-2024-12-17",
-        ],
-        None,
+        ]
     ] = None
     """The Realtime model used for this session."""
 
@@ -143,7 +189,11 @@ class Session(BaseModel):
     """
 
     temperature: Optional[float] = None
-    """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
+    """Sampling temperature for the model, limited to [0.6, 1.2].
+
+    For audio models, a temperature of 0.8 is highly recommended for best
+    performance.
+    """
 
     tool_choice: Optional[str] = None
     """How the model chooses tools.
@@ -155,11 +205,17 @@ class Session(BaseModel):
     """Tools (functions) available to the model."""
 
     turn_detection: Optional[TurnDetection] = None
-    """Configuration for turn detection.
-
-    Can be set to `null` to turn off. Server VAD means that the model will detect
-    the start and end of speech based on audio volume and respond at the end of user
-    speech.
+    """Configuration for turn detection, ether Server VAD or Semantic VAD.
+
+    This can be set to `null` to turn off, in which case the client must manually
+    trigger model response. Server VAD means that the model will detect the start
+    and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+    with VAD) to semantically estimate whether the user has finished speaking, then
+    dynamically sets a timeout based on this probability. For example, if user audio
+    trails off with "uhhm", the model will score a low probability of turn end and
+    wait longer for the user to continue speaking. This can be useful for more
+    natural conversations, but may have a higher latency.
     """
 
     voice: Optional[Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]] = None
src/openai/types/beta/realtime/session_create_params.py
@@ -5,7 +5,7 @@ from __future__ import annotations
 from typing import List, Union, Iterable
 from typing_extensions import Literal, TypedDict
 
-__all__ = ["SessionCreateParams", "InputAudioTranscription", "Tool", "TurnDetection"]
+__all__ = ["SessionCreateParams", "InputAudioNoiseReduction", "InputAudioTranscription", "Tool", "TurnDetection"]
 
 
 class SessionCreateParams(TypedDict, total=False):
@@ -17,16 +17,25 @@ class SessionCreateParams(TypedDict, total=False):
     byte order.
     """
 
+    input_audio_noise_reduction: InputAudioNoiseReduction
+    """Configuration for input audio noise reduction.
+
+    This can be set to `null` to turn off. Noise reduction filters audio added to
+    the input audio buffer before it is sent to VAD and the model. Filtering the
+    audio can improve VAD and turn detection accuracy (reducing false positives) and
+    model performance by improving perception of the input audio.
+    """
+
     input_audio_transcription: InputAudioTranscription
     """
     Configuration for input audio transcription, defaults to off and can be set to
     `null` to turn off once on. Input audio transcription is not native to the
     model, since the model consumes audio directly. Transcription runs
     asynchronously through
-    [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
-    and should be treated as rough guidance rather than the representation
-    understood by the model. The client can optionally set the language and prompt
-    for transcription, these fields will be passed to the Whisper API.
+    [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+    and should be treated as guidance of input audio content rather than precisely
+    what the model heard. The client can optionally set the language and prompt for
+    transcription; these offer additional guidance to the transcription service.
     """
 
     instructions: str
@@ -75,7 +84,11 @@ class SessionCreateParams(TypedDict, total=False):
     """
 
     temperature: float
-    """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
+    """Sampling temperature for the model, limited to [0.6, 1.2].
+
+    For audio models, a temperature of 0.8 is highly recommended for best
+    performance.
+    """
 
     tool_choice: str
     """How the model chooses tools.
@@ -87,11 +100,17 @@ class SessionCreateParams(TypedDict, total=False):
     """Tools (functions) available to the model."""
 
     turn_detection: TurnDetection
-    """Configuration for turn detection.
+    """Configuration for turn detection, ether Server VAD or Semantic VAD.
 
-    Can be set to `null` to turn off. Server VAD means that the model will detect
-    the start and end of speech based on audio volume and respond at the end of user
-    speech.
+    This can be set to `null` to turn off, in which case the client must manually
+    trigger model response. Server VAD means that the model will detect the start
+    and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+    with VAD) to semantically estimate whether the user has finished speaking, then
+    dynamically sets a timeout based on this probability. For example, if user audio
+    trails off with "uhhm", the model will score a low probability of turn end and
+    wait longer for the user to continue speaking. This can be useful for more
+    natural conversations, but may have a higher latency.
     """
 
     voice: Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]
@@ -103,6 +122,15 @@ class SessionCreateParams(TypedDict, total=False):
     """
 
 
+class InputAudioNoiseReduction(TypedDict, total=False):
+    type: Literal["near_field", "far_field"]
+    """Type of noise reduction.
+
+    `near_field` is for close-talking microphones such as headphones, `far_field` is
+    for far-field microphones such as laptop or conference room microphones.
+    """
+
+
 class InputAudioTranscription(TypedDict, total=False):
     language: str
     """The language of the input audio.
@@ -114,16 +142,17 @@ class InputAudioTranscription(TypedDict, total=False):
 
     model: str
     """
-    The model to use for transcription, `whisper-1` is the only currently supported
-    model.
+    The model to use for transcription, current options are `gpt-4o-transcribe`,
+    `gpt-4o-mini-transcribe`, and `whisper-1`.
     """
 
     prompt: str
-    """An optional text to guide the model's style or continue a previous audio
-    segment.
-
-    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-    should match the audio language.
+    """
+    An optional text to guide the model's style or continue a previous audio
+    segment. For `whisper-1`, the
+    [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+    "expect words related to technology".
     """
 
 
@@ -146,38 +175,48 @@ class Tool(TypedDict, total=False):
 
 class TurnDetection(TypedDict, total=False):
     create_response: bool
-    """Whether or not to automatically generate a response when a VAD stop event
+    """
+    Whether or not to automatically generate a response when a VAD stop event
     occurs.
+    """
+
+    eagerness: Literal["low", "medium", "high", "auto"]
+    """Used only for `semantic_vad` mode.
 
-    `true` by default.
+    The eagerness of the model to respond. `low` will wait longer for the user to
+    continue speaking, while `high` will respond more quickly. `auto` is the default and
+    is equivalent to `medium`.
     """
 
     interrupt_response: bool
     """
     Whether or not to automatically interrupt any ongoing response with output to
     the default conversation (i.e. `conversation` of `auto`) when a VAD start event
-    occurs. `true` by default.
+    occurs.
     """
 
     prefix_padding_ms: int
-    """Amount of audio to include before the VAD detected speech (in milliseconds).
+    """Used only for `server_vad` mode.
 
+    Amount of audio to include before the VAD detected speech (in milliseconds).
     Defaults to 300ms.
     """
 
     silence_duration_ms: int
-    """Duration of silence to detect speech stop (in milliseconds).
+    """Used only for `server_vad` mode.
 
-    Defaults to 500ms. With shorter values the model will respond more quickly, but
-    may jump in on short pauses from the user.
+    Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+    With shorter values the model will respond more quickly, but may jump in on
+    short pauses from the user.
     """
 
     threshold: float
-    """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+    """Used only for `server_vad` mode.
 
-    A higher threshold will require louder audio to activate the model, and thus
-    might perform better in noisy environments.
+    Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
+    threshold will require louder audio to activate the model, and thus might
+    perform better in noisy environments.
     """
 
-    type: str
-    """Type of turn detection, only `server_vad` is currently supported."""
+    type: Literal["server_vad", "semantic_vad"]
+    """Type of turn detection."""
src/openai/types/beta/realtime/session_update_event.py
@@ -5,7 +5,23 @@ from typing_extensions import Literal
 
 from ...._models import BaseModel
 
-__all__ = ["SessionUpdateEvent", "Session", "SessionInputAudioTranscription", "SessionTool", "SessionTurnDetection"]
+__all__ = [
+    "SessionUpdateEvent",
+    "Session",
+    "SessionInputAudioNoiseReduction",
+    "SessionInputAudioTranscription",
+    "SessionTool",
+    "SessionTurnDetection",
+]
+
+
+class SessionInputAudioNoiseReduction(BaseModel):
+    type: Optional[Literal["near_field", "far_field"]] = None
+    """Type of noise reduction.
+
+    `near_field` is for close-talking microphones such as headphones, `far_field` is
+    for far-field microphones such as laptop or conference room microphones.
+    """
 
 
 class SessionInputAudioTranscription(BaseModel):
@@ -19,16 +35,17 @@ class SessionInputAudioTranscription(BaseModel):
 
     model: Optional[str] = None
     """
-    The model to use for transcription, `whisper-1` is the only currently supported
-    model.
+    The model to use for transcription, current options are `gpt-4o-transcribe`,
+    `gpt-4o-mini-transcribe`, and `whisper-1`.
     """
 
     prompt: Optional[str] = None
-    """An optional text to guide the model's style or continue a previous audio
-    segment.
-
-    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-    should match the audio language.
+    """
+    An optional text to guide the model's style or continue a previous audio
+    segment. For `whisper-1`, the
+    [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+    "expect words related to technology".
     """
 
 
@@ -51,41 +68,51 @@ class SessionTool(BaseModel):
 
 class SessionTurnDetection(BaseModel):
     create_response: Optional[bool] = None
-    """Whether or not to automatically generate a response when a VAD stop event
+    """
+    Whether or not to automatically generate a response when a VAD stop event
     occurs.
+    """
 
-    `true` by default.
+    eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
+    """Used only for `semantic_vad` mode.
+
+    The eagerness of the model to respond. `low` will wait longer for the user to
+    continue speaking, while `high` will respond more quickly. `auto` is the default and
+    is equivalent to `medium`.
     """
 
     interrupt_response: Optional[bool] = None
     """
     Whether or not to automatically interrupt any ongoing response with output to
     the default conversation (i.e. `conversation` of `auto`) when a VAD start event
-    occurs. `true` by default.
+    occurs.
     """
 
     prefix_padding_ms: Optional[int] = None
-    """Amount of audio to include before the VAD detected speech (in milliseconds).
+    """Used only for `server_vad` mode.
 
+    Amount of audio to include before the VAD detected speech (in milliseconds).
     Defaults to 300ms.
     """
 
     silence_duration_ms: Optional[int] = None
-    """Duration of silence to detect speech stop (in milliseconds).
+    """Used only for `server_vad` mode.
 
-    Defaults to 500ms. With shorter values the model will respond more quickly, but
-    may jump in on short pauses from the user.
+    Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+    With shorter values the model will respond more quickly, but may jump in on
+    short pauses from the user.
     """
 
     threshold: Optional[float] = None
-    """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+    """Used only for `server_vad` mode.
 
-    A higher threshold will require louder audio to activate the model, and thus
-    might perform better in noisy environments.
+    Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
+    threshold will require louder audio to activate the model, and thus might
+    perform better in noisy environments.
     """
 
-    type: Optional[str] = None
-    """Type of turn detection, only `server_vad` is currently supported."""
+    type: Optional[Literal["server_vad", "semantic_vad"]] = None
+    """Type of turn detection."""
 
 
 class Session(BaseModel):
@@ -97,16 +124,25 @@ class Session(BaseModel):
     byte order.
     """
 
+    input_audio_noise_reduction: Optional[SessionInputAudioNoiseReduction] = None
+    """Configuration for input audio noise reduction.
+
+    This can be set to `null` to turn off. Noise reduction filters audio added to
+    the input audio buffer before it is sent to VAD and the model. Filtering the
+    audio can improve VAD and turn detection accuracy (reducing false positives) and
+    model performance by improving perception of the input audio.
+    """
+
     input_audio_transcription: Optional[SessionInputAudioTranscription] = None
     """
     Configuration for input audio transcription, defaults to off and can be set to
     `null` to turn off once on. Input audio transcription is not native to the
     model, since the model consumes audio directly. Transcription runs
     asynchronously through
-    [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
-    and should be treated as rough guidance rather than the representation
-    understood by the model. The client can optionally set the language and prompt
-    for transcription, these fields will be passed to the Whisper API.
+    [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+    and should be treated as guidance of input audio content rather than precisely
+    what the model heard. The client can optionally set the language and prompt for
+    transcription; these offer additional guidance to the transcription service.
     """
 
     instructions: Optional[str] = None
@@ -157,7 +193,11 @@ class Session(BaseModel):
     """
 
     temperature: Optional[float] = None
-    """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
+    """Sampling temperature for the model, limited to [0.6, 1.2].
+
+    For audio models, a temperature of 0.8 is highly recommended for best
+    performance.
+    """
 
     tool_choice: Optional[str] = None
     """How the model chooses tools.
@@ -169,11 +209,17 @@ class Session(BaseModel):
     """Tools (functions) available to the model."""
 
     turn_detection: Optional[SessionTurnDetection] = None
-    """Configuration for turn detection.
-
-    Can be set to `null` to turn off. Server VAD means that the model will detect
-    the start and end of speech based on audio volume and respond at the end of user
-    speech.
+    """Configuration for turn detection, ether Server VAD or Semantic VAD.
+
+    This can be set to `null` to turn off, in which case the client must manually
+    trigger a model response. Server VAD means that the model will detect the start
+    and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+    with VAD) to semantically estimate whether the user has finished speaking, then
+    dynamically sets a timeout based on this probability. For example, if user audio
+    trails off with "uhhm", the model will score a low probability of turn end and
+    wait longer for the user to continue speaking. This can be useful for more
+    natural conversations, but may have a higher latency.
     """
 
     voice: Optional[Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]] = None
src/openai/types/beta/realtime/session_update_event_param.py
@@ -8,12 +8,22 @@ from typing_extensions import Literal, Required, TypedDict
 __all__ = [
     "SessionUpdateEventParam",
     "Session",
+    "SessionInputAudioNoiseReduction",
     "SessionInputAudioTranscription",
     "SessionTool",
     "SessionTurnDetection",
 ]
 
 
+class SessionInputAudioNoiseReduction(TypedDict, total=False):
+    type: Literal["near_field", "far_field"]
+    """Type of noise reduction.
+
+    `near_field` is for close-talking microphones such as headphones, `far_field` is
+    for far-field microphones such as laptop or conference room microphones.
+    """
+
+
 class SessionInputAudioTranscription(TypedDict, total=False):
     language: str
     """The language of the input audio.
@@ -25,16 +35,17 @@ class SessionInputAudioTranscription(TypedDict, total=False):
 
     model: str
     """
-    The model to use for transcription, `whisper-1` is the only currently supported
-    model.
+    The model to use for transcription, current options are `gpt-4o-transcribe`,
+    `gpt-4o-mini-transcribe`, and `whisper-1`.
     """
 
     prompt: str
-    """An optional text to guide the model's style or continue a previous audio
-    segment.
-
-    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-    should match the audio language.
+    """
+    An optional text to guide the model's style or continue a previous audio
+    segment. For `whisper-1`, the
+    [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+    "expect words related to technology".
     """
 
 
@@ -57,41 +68,51 @@ class SessionTool(TypedDict, total=False):
 
 class SessionTurnDetection(TypedDict, total=False):
     create_response: bool
-    """Whether or not to automatically generate a response when a VAD stop event
+    """
+    Whether or not to automatically generate a response when a VAD stop event
     occurs.
+    """
 
-    `true` by default.
+    eagerness: Literal["low", "medium", "high", "auto"]
+    """Used only for `semantic_vad` mode.
+
+    The eagerness of the model to respond. `low` will wait longer for the user to
+    continue speaking, while `high` will respond more quickly. `auto` is the default and
+    is equivalent to `medium`.
     """
 
     interrupt_response: bool
     """
     Whether or not to automatically interrupt any ongoing response with output to
     the default conversation (i.e. `conversation` of `auto`) when a VAD start event
-    occurs. `true` by default.
+    occurs.
     """
 
     prefix_padding_ms: int
-    """Amount of audio to include before the VAD detected speech (in milliseconds).
+    """Used only for `server_vad` mode.
 
+    Amount of audio to include before the VAD detected speech (in milliseconds).
     Defaults to 300ms.
     """
 
     silence_duration_ms: int
-    """Duration of silence to detect speech stop (in milliseconds).
+    """Used only for `server_vad` mode.
 
-    Defaults to 500ms. With shorter values the model will respond more quickly, but
-    may jump in on short pauses from the user.
+    Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+    With shorter values the model will respond more quickly, but may jump in on
+    short pauses from the user.
     """
 
     threshold: float
-    """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+    """Used only for `server_vad` mode.
 
-    A higher threshold will require louder audio to activate the model, and thus
-    might perform better in noisy environments.
+    Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
+    threshold will require louder audio to activate the model, and thus might
+    perform better in noisy environments.
     """
 
-    type: str
-    """Type of turn detection, only `server_vad` is currently supported."""
+    type: Literal["server_vad", "semantic_vad"]
+    """Type of turn detection."""
 
 
 class Session(TypedDict, total=False):
@@ -103,16 +124,25 @@ class Session(TypedDict, total=False):
     byte order.
     """
 
+    input_audio_noise_reduction: SessionInputAudioNoiseReduction
+    """Configuration for input audio noise reduction.
+
+    This can be set to `null` to turn off. Noise reduction filters audio added to
+    the input audio buffer before it is sent to VAD and the model. Filtering the
+    audio can improve VAD and turn detection accuracy (reducing false positives) and
+    model performance by improving perception of the input audio.
+    """
+
     input_audio_transcription: SessionInputAudioTranscription
     """
     Configuration for input audio transcription, defaults to off and can be set to
     `null` to turn off once on. Input audio transcription is not native to the
     model, since the model consumes audio directly. Transcription runs
     asynchronously through
-    [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
-    and should be treated as rough guidance rather than the representation
-    understood by the model. The client can optionally set the language and prompt
-    for transcription, these fields will be passed to the Whisper API.
+    [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+    and should be treated as guidance of input audio content rather than precisely
+    what the model heard. The client can optionally set the language and prompt for
+    transcription; these offer additional guidance to the transcription service.
     """
 
     instructions: str
@@ -161,7 +191,11 @@ class Session(TypedDict, total=False):
     """
 
     temperature: float
-    """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
+    """Sampling temperature for the model, limited to [0.6, 1.2].
+
+    For audio models, a temperature of 0.8 is highly recommended for best
+    performance.
+    """
 
     tool_choice: str
     """How the model chooses tools.
@@ -173,11 +207,17 @@ class Session(TypedDict, total=False):
     """Tools (functions) available to the model."""
 
     turn_detection: SessionTurnDetection
-    """Configuration for turn detection.
-
-    Can be set to `null` to turn off. Server VAD means that the model will detect
-    the start and end of speech based on audio volume and respond at the end of user
-    speech.
+    """Configuration for turn detection, ether Server VAD or Semantic VAD.
+
+    This can be set to `null` to turn off, in which case the client must manually
+    trigger a model response. Server VAD means that the model will detect the start
+    and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+    with VAD) to semantically estimate whether the user has finished speaking, then
+    dynamically sets a timeout based on this probability. For example, if user audio
+    trails off with "uhhm", the model will score a low probability of turn end and
+    wait longer for the user to continue speaking. This can be useful for more
+    natural conversations, but may have a higher latency.
     """
 
     voice: Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]
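For reference, a `session.update` client event built from the params above could look like the following. This is only a hedged sketch of the payload shape; sending it over an open Realtime WebSocket connection is assumed and not shown here.

```python
from openai.types.beta.realtime.session_update_event_param import SessionUpdateEventParam

event: SessionUpdateEventParam = {
    "type": "session.update",
    "session": {
        "input_audio_noise_reduction": {"type": "far_field"},
        "input_audio_transcription": {
            "model": "gpt-4o-mini-transcribe",
            "prompt": "expect words related to technology",
        },
        "turn_detection": {
            "type": "semantic_vad",
            "eagerness": "auto",
            "create_response": True,
            "interrupt_response": True,
        },
    },
}
```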
src/openai/types/beta/realtime/transcription_session.py
@@ -0,0 +1,100 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Optional
+from typing_extensions import Literal
+
+from ...._models import BaseModel
+
+__all__ = ["TranscriptionSession", "ClientSecret", "InputAudioTranscription", "TurnDetection"]
+
+
+class ClientSecret(BaseModel):
+    expires_at: int
+    """Timestamp for when the token expires.
+
+    Currently, all tokens expire after one minute.
+    """
+
+    value: str
+    """
+    Ephemeral key usable in client environments to authenticate connections to the
+    Realtime API. Use this in client-side environments rather than a standard API
+    token, which should only be used server-side.
+    """
+
+
+class InputAudioTranscription(BaseModel):
+    language: Optional[str] = None
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+    format will improve accuracy and latency.
+    """
+
+    model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None
+    """The model to use for transcription.
+
+    Can be `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, or `whisper-1`.
+    """
+
+    prompt: Optional[str] = None
+    """An optional text to guide the model's style or continue a previous audio
+    segment.
+
+    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+    should match the audio language.
+    """
+
+
+class TurnDetection(BaseModel):
+    prefix_padding_ms: Optional[int] = None
+    """Amount of audio to include before the VAD detected speech (in milliseconds).
+
+    Defaults to 300ms.
+    """
+
+    silence_duration_ms: Optional[int] = None
+    """Duration of silence to detect speech stop (in milliseconds).
+
+    Defaults to 500ms. With shorter values the model will respond more quickly, but
+    may jump in on short pauses from the user.
+    """
+
+    threshold: Optional[float] = None
+    """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+
+    A higher threshold will require louder audio to activate the model, and thus
+    might perform better in noisy environments.
+    """
+
+    type: Optional[str] = None
+    """Type of turn detection, only `server_vad` is currently supported."""
+
+
+class TranscriptionSession(BaseModel):
+    client_secret: ClientSecret
+    """Ephemeral key returned by the API.
+
+    Only present when the session is created on the server via REST API.
+    """
+
+    input_audio_format: Optional[str] = None
+    """The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`."""
+
+    input_audio_transcription: Optional[InputAudioTranscription] = None
+    """Configuration of the transcription model."""
+
+    modalities: Optional[List[Literal["text", "audio"]]] = None
+    """The set of modalities the model can respond with.
+
+    To disable audio, set this to ["text"].
+    """
+
+    turn_detection: Optional[TurnDetection] = None
+    """Configuration for turn detection.
+
+    Can be set to `null` to turn off. Server VAD means that the model will detect
+    the start and end of speech based on audio volume and respond at the end of user
+    speech.
+    """
src/openai/types/beta/realtime/transcription_session_create_params.py
@@ -0,0 +1,143 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import List
+from typing_extensions import Literal, TypedDict
+
+__all__ = ["TranscriptionSessionCreateParams", "InputAudioNoiseReduction", "InputAudioTranscription", "TurnDetection"]
+
+
+class TranscriptionSessionCreateParams(TypedDict, total=False):
+    include: List[str]
+    """The set of items to include in the transcription. Current available items are:
+
+    - `item.input_audio_transcription.logprobs`
+    """
+
+    input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"]
+    """The format of input audio.
+
+    Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
+    be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
+    byte order.
+    """
+
+    input_audio_noise_reduction: InputAudioNoiseReduction
+    """Configuration for input audio noise reduction.
+
+    This can be set to `null` to turn off. Noise reduction filters audio added to
+    the input audio buffer before it is sent to VAD and the model. Filtering the
+    audio can improve VAD and turn detection accuracy (reducing false positives) and
+    model performance by improving perception of the input audio.
+    """
+
+    input_audio_transcription: InputAudioTranscription
+    """Configuration for input audio transcription.
+
+    The client can optionally set the language and prompt for transcription; these
+    offer additional guidance to the transcription service.
+    """
+
+    modalities: List[Literal["text", "audio"]]
+    """The set of modalities the model can respond with.
+
+    To disable audio, set this to ["text"].
+    """
+
+    turn_detection: TurnDetection
+    """Configuration for turn detection, ether Server VAD or Semantic VAD.
+
+    This can be set to `null` to turn off, in which case the client must manually
+    trigger a model response. Server VAD means that the model will detect the start
+    and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+    with VAD) to semantically estimate whether the user has finished speaking, then
+    dynamically sets a timeout based on this probability. For example, if user audio
+    trails off with "uhhm", the model will score a low probability of turn end and
+    wait longer for the user to continue speaking. This can be useful for more
+    natural conversations, but may have a higher latency.
+    """
+
+
+class InputAudioNoiseReduction(TypedDict, total=False):
+    type: Literal["near_field", "far_field"]
+    """Type of noise reduction.
+
+    `near_field` is for close-talking microphones such as headphones, `far_field` is
+    for far-field microphones such as laptop or conference room microphones.
+    """
+
+
+class InputAudioTranscription(TypedDict, total=False):
+    language: str
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+    format will improve accuracy and latency.
+    """
+
+    model: Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]
+    """
+    The model to use for transcription, current options are `gpt-4o-transcribe`,
+    `gpt-4o-mini-transcribe`, and `whisper-1`.
+    """
+
+    prompt: str
+    """
+    An optional text to guide the model's style or continue a previous audio
+    segment. For `whisper-1`, the
+    [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+    "expect words related to technology".
+    """
+
+
+class TurnDetection(TypedDict, total=False):
+    create_response: bool
+    """
+    Whether or not to automatically generate a response when a VAD stop event
+    occurs.
+    """
+
+    eagerness: Literal["low", "medium", "high", "auto"]
+    """Used only for `semantic_vad` mode.
+
+    The eagerness of the model to respond. `low` will wait longer for the user to
+    continue speaking, while `high` will respond more quickly. `auto` is the default and
+    is equivalent to `medium`.
+    """
+
+    interrupt_response: bool
+    """
+    Whether or not to automatically interrupt any ongoing response with output to
+    the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+    occurs.
+    """
+
+    prefix_padding_ms: int
+    """Used only for `server_vad` mode.
+
+    Amount of audio to include before the VAD detected speech (in milliseconds).
+    Defaults to 300ms.
+    """
+
+    silence_duration_ms: int
+    """Used only for `server_vad` mode.
+
+    Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+    With shorter values the model will respond more quickly, but may jump in on
+    short pauses from the user.
+    """
+
+    threshold: float
+    """Used only for `server_vad` mode.
+
+    Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
+    threshold will require louder audio to activate the model, and thus might
+    perform better in noisy environments.
+    """
+
+    type: Literal["server_vad", "semantic_vad"]
+    """Type of turn detection."""
src/openai/types/beta/realtime/transcription_session_update.py
@@ -0,0 +1,160 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Optional
+from typing_extensions import Literal
+
+from ...._models import BaseModel
+
+__all__ = [
+    "TranscriptionSessionUpdate",
+    "Session",
+    "SessionInputAudioNoiseReduction",
+    "SessionInputAudioTranscription",
+    "SessionTurnDetection",
+]
+
+
+class SessionInputAudioNoiseReduction(BaseModel):
+    type: Optional[Literal["near_field", "far_field"]] = None
+    """Type of noise reduction.
+
+    `near_field` is for close-talking microphones such as headphones, `far_field` is
+    for far-field microphones such as laptop or conference room microphones.
+    """
+
+
+class SessionInputAudioTranscription(BaseModel):
+    language: Optional[str] = None
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+    format will improve accuracy and latency.
+    """
+
+    model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None
+    """
+    The model to use for transcription, current options are `gpt-4o-transcribe`,
+    `gpt-4o-mini-transcribe`, and `whisper-1`.
+    """
+
+    prompt: Optional[str] = None
+    """
+    An optional text to guide the model's style or continue a previous audio
+    segment. For `whisper-1`, the
+    [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+    "expect words related to technology".
+    """
+
+
+class SessionTurnDetection(BaseModel):
+    create_response: Optional[bool] = None
+    """
+    Whether or not to automatically generate a response when a VAD stop event
+    occurs.
+    """
+
+    eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
+    """Used only for `semantic_vad` mode.
+
+    The eagerness of the model to respond. `low` will wait longer for the user to
+    continue speaking, `high` will respond more quickly. `auto` is the default and
+    is equivalent to `medium`.
+    """
+
+    interrupt_response: Optional[bool] = None
+    """
+    Whether or not to automatically interrupt any ongoing response with output to
+    the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+    occurs.
+    """
+
+    prefix_padding_ms: Optional[int] = None
+    """Used only for `server_vad` mode.
+
+    Amount of audio to include before the VAD detected speech (in milliseconds).
+    Defaults to 300ms.
+    """
+
+    silence_duration_ms: Optional[int] = None
+    """Used only for `server_vad` mode.
+
+    Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+    With shorter values the model will respond more quickly, but may jump in on
+    short pauses from the user.
+    """
+
+    threshold: Optional[float] = None
+    """Used only for `server_vad` mode.
+
+    Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
+    threshold will require louder audio to activate the model, and thus might
+    perform better in noisy environments.
+    """
+
+    type: Optional[Literal["server_vad", "semantic_vad"]] = None
+    """Type of turn detection."""
+
+
+class Session(BaseModel):
+    include: Optional[List[str]] = None
+    """The set of items to include in the transcription. Current available items are:
+
+    - `item.input_audio_transcription.logprobs`
+    """
+
+    input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
+    """The format of input audio.
+
+    Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
+    be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
+    byte order.
+    """
+
+    input_audio_noise_reduction: Optional[SessionInputAudioNoiseReduction] = None
+    """Configuration for input audio noise reduction.
+
+    This can be set to `null` to turn off. Noise reduction filters audio added to
+    the input audio buffer before it is sent to VAD and the model. Filtering the
+    audio can improve VAD and turn detection accuracy (reducing false positives) and
+    model performance by improving perception of the input audio.
+    """
+
+    input_audio_transcription: Optional[SessionInputAudioTranscription] = None
+    """Configuration for input audio transcription.
+
+    The client can optionally set the language and prompt for transcription; these
+    offer additional guidance to the transcription service.
+    """
+
+    modalities: Optional[List[Literal["text", "audio"]]] = None
+    """The set of modalities the model can respond with.
+
+    To disable audio, set this to ["text"].
+    """
+
+    turn_detection: Optional[SessionTurnDetection] = None
+    """Configuration for turn detection, ether Server VAD or Semantic VAD.
+
+    This can be set to `null` to turn off, in which case the client must manually
+    trigger a model response. Server VAD means that the model will detect the start
+    and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+    with VAD) to semantically estimate whether the user has finished speaking, then
+    dynamically sets a timeout based on this probability. For example, if user audio
+    trails off with "uhhm", the model will score a low probability of turn end and
+    wait longer for the user to continue speaking. This can be useful for more
+    natural conversations, but may have a higher latency.
+    """
+
+
+class TranscriptionSessionUpdate(BaseModel):
+    session: Session
+    """Realtime transcription session object configuration."""
+
+    type: Literal["transcription_session.update"]
+    """The event type, must be `transcription_session.update`."""
+
+    event_id: Optional[str] = None
+    """Optional client-generated ID used to identify this event."""
src/openai/types/beta/realtime/transcription_session_update_param.py
@@ -0,0 +1,160 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import List
+from typing_extensions import Literal, Required, TypedDict
+
+__all__ = [
+    "TranscriptionSessionUpdateParam",
+    "Session",
+    "SessionInputAudioNoiseReduction",
+    "SessionInputAudioTranscription",
+    "SessionTurnDetection",
+]
+
+
+class SessionInputAudioNoiseReduction(TypedDict, total=False):
+    type: Literal["near_field", "far_field"]
+    """Type of noise reduction.
+
+    `near_field` is for close-talking microphones such as headphones, `far_field` is
+    for far-field microphones such as laptop or conference room microphones.
+    """
+
+
+class SessionInputAudioTranscription(TypedDict, total=False):
+    language: str
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+    format will improve accuracy and latency.
+    """
+
+    model: Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]
+    """
+    The model to use for transcription, current options are `gpt-4o-transcribe`,
+    `gpt-4o-mini-transcribe`, and `whisper-1`.
+    """
+
+    prompt: str
+    """
+    An optional text to guide the model's style or continue a previous audio
+    segment. For `whisper-1`, the
+    [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+    "expect words related to technology".
+    """
+
+
+class SessionTurnDetection(TypedDict, total=False):
+    create_response: bool
+    """
+    Whether or not to automatically generate a response when a VAD stop event
+    occurs.
+    """
+
+    eagerness: Literal["low", "medium", "high", "auto"]
+    """Used only for `semantic_vad` mode.
+
+    The eagerness of the model to respond. `low` will wait longer for the user to
+    continue speaking, while `high` will respond more quickly. `auto` is the default and
+    is equivalent to `medium`.
+    """
+
+    interrupt_response: bool
+    """
+    Whether or not to automatically interrupt any ongoing response with output to
+    the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+    occurs.
+    """
+
+    prefix_padding_ms: int
+    """Used only for `server_vad` mode.
+
+    Amount of audio to include before the VAD detected speech (in milliseconds).
+    Defaults to 300ms.
+    """
+
+    silence_duration_ms: int
+    """Used only for `server_vad` mode.
+
+    Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+    With shorter values the model will respond more quickly, but may jump in on
+    short pauses from the user.
+    """
+
+    threshold: float
+    """Used only for `server_vad` mode.
+
+    Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
+    threshold will require louder audio to activate the model, and thus might
+    perform better in noisy environments.
+    """
+
+    type: Literal["server_vad", "semantic_vad"]
+    """Type of turn detection."""
+
+
+class Session(TypedDict, total=False):
+    include: List[str]
+    """The set of items to include in the transcription. Current available items are:
+
+    - `item.input_audio_transcription.logprobs`
+    """
+
+    input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"]
+    """The format of input audio.
+
+    Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
+    be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
+    byte order.
+    """
+
+    input_audio_noise_reduction: SessionInputAudioNoiseReduction
+    """Configuration for input audio noise reduction.
+
+    This can be set to `null` to turn off. Noise reduction filters audio added to
+    the input audio buffer before it is sent to VAD and the model. Filtering the
+    audio can improve VAD and turn detection accuracy (reducing false positives) and
+    model performance by improving perception of the input audio.
+    """
+
+    input_audio_transcription: SessionInputAudioTranscription
+    """Configuration for input audio transcription.
+
+    The client can optionally set the language and prompt for transcription; these
+    offer additional guidance to the transcription service.
+    """
+
+    modalities: List[Literal["text", "audio"]]
+    """The set of modalities the model can respond with.
+
+    To disable audio, set this to ["text"].
+    """
+
+    turn_detection: SessionTurnDetection
+    """Configuration for turn detection, ether Server VAD or Semantic VAD.
+
+    This can be set to `null` to turn off, in which case the client must manually
+    trigger a model response. Server VAD means that the model will detect the start
+    and end of speech based on audio volume and respond at the end of user speech.
+    Semantic VAD is more advanced and uses a turn detection model (in conjunction
+    with VAD) to semantically estimate whether the user has finished speaking, then
+    dynamically sets a timeout based on this probability. For example, if user audio
+    trails off with "uhhm", the model will score a low probability of turn end and
+    wait longer for the user to continue speaking. This can be useful for more
+    natural conversations, but may have a higher latency.
+    """
+
+
+class TranscriptionSessionUpdateParam(TypedDict, total=False):
+    session: Required[Session]
+    """Realtime transcription session object configuration."""
+
+    type: Required[Literal["transcription_session.update"]]
+    """The event type, must be `transcription_session.update`."""
+
+    event_id: str
+    """Optional client-generated ID used to identify this event."""
src/openai/types/beta/realtime/transcription_session_updated_event.py
@@ -0,0 +1,24 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing_extensions import Literal
+
+from ...._models import BaseModel
+from .transcription_session import TranscriptionSession
+
+__all__ = ["TranscriptionSessionUpdatedEvent"]
+
+
+class TranscriptionSessionUpdatedEvent(BaseModel):
+    event_id: str
+    """The unique ID of the server event."""
+
+    session: TranscriptionSession
+    """A new Realtime transcription session configuration.
+
+    When a session is created on the server via REST API, the session object also
+    contains an ephemeral key. Default TTL for keys is one minute. This property is
+    not present when a session is updated via the WebSocket API.
+    """
+
+    type: Literal["transcription_session.updated"]
+    """The event type, must be `transcription_session.updated`."""
src/openai/types/audio_model.py
@@ -4,4 +4,4 @@ from typing_extensions import Literal, TypeAlias
 
 __all__ = ["AudioModel"]
 
-AudioModel: TypeAlias = Literal["whisper-1"]
+AudioModel: TypeAlias = Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]
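With `AudioModel` widened, the new transcription models can be passed to `client.audio.transcriptions.create`, including the streaming overload exercised in the tests below. A sketch assuming a local `speech.wav` file:

```python
from openai import OpenAI

client = OpenAI()

with open("speech.wav", "rb") as f:
    stream = client.audio.transcriptions.create(
        file=f,
        model="gpt-4o-mini-transcribe",
        stream=True,
    )
    # Streamed events include TranscriptionTextDeltaEvent / TranscriptionTextDoneEvent
    # (see the api.md additions later in this diff).
    for event in stream:
        print(event)
```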
tests/api_resources/audio/test_speech.py
@@ -41,6 +41,7 @@ class TestSpeech:
             input="string",
             model="string",
             voice="alloy",
+            instructions="instructions",
             response_format="mp3",
             speed=0.25,
         )
@@ -104,6 +105,7 @@ class TestAsyncSpeech:
             input="string",
             model="string",
             voice="alloy",
+            instructions="instructions",
             response_format="mp3",
             speed=0.25,
         )
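The new `instructions` parameter covered by these speech tests pairs with the newer TTS models; a hedged usage sketch, where the model name and output path are assumptions rather than values taken from this diff:

```python
from openai import OpenAI

client = OpenAI()

with client.audio.speech.with_streaming_response.create(
    model="gpt-4o-mini-tts",
    voice="alloy",
    input="Thanks for calling, how can I help you today?",
    instructions="Speak warmly and at a relaxed pace.",
) as response:
    # Write the synthesized audio to disk as it streams back.
    response.stream_to_file("greeting.mp3")
```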
tests/api_resources/audio/test_transcriptions.py
@@ -18,31 +18,33 @@ class TestTranscriptions:
     parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"])
 
     @parametrize
-    def test_method_create(self, client: OpenAI) -> None:
+    def test_method_create_overload_1(self, client: OpenAI) -> None:
         transcription = client.audio.transcriptions.create(
             file=b"raw file contents",
-            model="whisper-1",
+            model="gpt-4o-transcribe",
         )
         assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
 
     @parametrize
-    def test_method_create_with_all_params(self, client: OpenAI) -> None:
+    def test_method_create_with_all_params_overload_1(self, client: OpenAI) -> None:
         transcription = client.audio.transcriptions.create(
             file=b"raw file contents",
-            model="whisper-1",
-            language="string",
-            prompt="string",
+            model="gpt-4o-transcribe",
+            include=["logprobs"],
+            language="language",
+            prompt="prompt",
             response_format="json",
+            stream=False,
             temperature=0,
             timestamp_granularities=["word"],
         )
         assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
 
     @parametrize
-    def test_raw_response_create(self, client: OpenAI) -> None:
+    def test_raw_response_create_overload_1(self, client: OpenAI) -> None:
         response = client.audio.transcriptions.with_raw_response.create(
             file=b"raw file contents",
-            model="whisper-1",
+            model="gpt-4o-transcribe",
         )
 
         assert response.is_closed is True
@@ -51,10 +53,10 @@ class TestTranscriptions:
         assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
 
     @parametrize
-    def test_streaming_response_create(self, client: OpenAI) -> None:
+    def test_streaming_response_create_overload_1(self, client: OpenAI) -> None:
         with client.audio.transcriptions.with_streaming_response.create(
             file=b"raw file contents",
-            model="whisper-1",
+            model="gpt-4o-transcribe",
         ) as response:
             assert not response.is_closed
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
@@ -64,36 +66,89 @@ class TestTranscriptions:
 
         assert cast(Any, response.is_closed) is True
 
+    @parametrize
+    def test_method_create_overload_2(self, client: OpenAI) -> None:
+        transcription_stream = client.audio.transcriptions.create(
+            file=b"raw file contents",
+            model="gpt-4o-transcribe",
+            stream=True,
+        )
+        transcription_stream.response.close()
+
+    @parametrize
+    def test_method_create_with_all_params_overload_2(self, client: OpenAI) -> None:
+        transcription_stream = client.audio.transcriptions.create(
+            file=b"raw file contents",
+            model="gpt-4o-transcribe",
+            stream=True,
+            include=["logprobs"],
+            language="language",
+            prompt="prompt",
+            response_format="json",
+            temperature=0,
+            timestamp_granularities=["word"],
+        )
+        transcription_stream.response.close()
+
+    @parametrize
+    def test_raw_response_create_overload_2(self, client: OpenAI) -> None:
+        response = client.audio.transcriptions.with_raw_response.create(
+            file=b"raw file contents",
+            model="gpt-4o-transcribe",
+            stream=True,
+        )
+
+        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+        stream = response.parse()
+        stream.close()
+
+    @parametrize
+    def test_streaming_response_create_overload_2(self, client: OpenAI) -> None:
+        with client.audio.transcriptions.with_streaming_response.create(
+            file=b"raw file contents",
+            model="gpt-4o-transcribe",
+            stream=True,
+        ) as response:
+            assert not response.is_closed
+            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+            stream = response.parse()
+            stream.close()
+
+        assert cast(Any, response.is_closed) is True
+
 
 class TestAsyncTranscriptions:
     parametrize = pytest.mark.parametrize("async_client", [False, True], indirect=True, ids=["loose", "strict"])
 
     @parametrize
-    async def test_method_create(self, async_client: AsyncOpenAI) -> None:
+    async def test_method_create_overload_1(self, async_client: AsyncOpenAI) -> None:
         transcription = await async_client.audio.transcriptions.create(
             file=b"raw file contents",
-            model="whisper-1",
+            model="gpt-4o-transcribe",
         )
         assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
 
     @parametrize
-    async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None:
+    async def test_method_create_with_all_params_overload_1(self, async_client: AsyncOpenAI) -> None:
         transcription = await async_client.audio.transcriptions.create(
             file=b"raw file contents",
-            model="whisper-1",
-            language="string",
-            prompt="string",
+            model="gpt-4o-transcribe",
+            include=["logprobs"],
+            language="language",
+            prompt="prompt",
             response_format="json",
+            stream=False,
             temperature=0,
             timestamp_granularities=["word"],
         )
         assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
 
     @parametrize
-    async def test_raw_response_create(self, async_client: AsyncOpenAI) -> None:
+    async def test_raw_response_create_overload_1(self, async_client: AsyncOpenAI) -> None:
         response = await async_client.audio.transcriptions.with_raw_response.create(
             file=b"raw file contents",
-            model="whisper-1",
+            model="gpt-4o-transcribe",
         )
 
         assert response.is_closed is True
@@ -102,10 +157,10 @@ class TestAsyncTranscriptions:
         assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
 
     @parametrize
-    async def test_streaming_response_create(self, async_client: AsyncOpenAI) -> None:
+    async def test_streaming_response_create_overload_1(self, async_client: AsyncOpenAI) -> None:
         async with async_client.audio.transcriptions.with_streaming_response.create(
             file=b"raw file contents",
-            model="whisper-1",
+            model="gpt-4o-transcribe",
         ) as response:
             assert not response.is_closed
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
@@ -114,3 +169,54 @@ class TestAsyncTranscriptions:
             assert_matches_type(TranscriptionCreateResponse, transcription, path=["response"])
 
         assert cast(Any, response.is_closed) is True
+
+    @parametrize
+    async def test_method_create_overload_2(self, async_client: AsyncOpenAI) -> None:
+        transcription_stream = await async_client.audio.transcriptions.create(
+            file=b"raw file contents",
+            model="gpt-4o-transcribe",
+            stream=True,
+        )
+        await transcription_stream.response.aclose()
+
+    @parametrize
+    async def test_method_create_with_all_params_overload_2(self, async_client: AsyncOpenAI) -> None:
+        transcription_stream = await async_client.audio.transcriptions.create(
+            file=b"raw file contents",
+            model="gpt-4o-transcribe",
+            stream=True,
+            include=["logprobs"],
+            language="language",
+            prompt="prompt",
+            response_format="json",
+            temperature=0,
+            timestamp_granularities=["word"],
+        )
+        await transcription_stream.response.aclose()
+
+    @parametrize
+    async def test_raw_response_create_overload_2(self, async_client: AsyncOpenAI) -> None:
+        response = await async_client.audio.transcriptions.with_raw_response.create(
+            file=b"raw file contents",
+            model="gpt-4o-transcribe",
+            stream=True,
+        )
+
+        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+        stream = response.parse()
+        await stream.close()
+
+    @parametrize
+    async def test_streaming_response_create_overload_2(self, async_client: AsyncOpenAI) -> None:
+        async with async_client.audio.transcriptions.with_streaming_response.create(
+            file=b"raw file contents",
+            model="gpt-4o-transcribe",
+            stream=True,
+        ) as response:
+            assert not response.is_closed
+            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+            stream = await response.parse()
+            await stream.close()
+
+        assert cast(Any, response.is_closed) is True
tests/api_resources/beta/realtime/test_sessions.py
@@ -26,6 +26,7 @@ class TestSessions:
     def test_method_create_with_all_params(self, client: OpenAI) -> None:
         session = client.beta.realtime.sessions.create(
             input_audio_format="pcm16",
+            input_audio_noise_reduction={"type": "near_field"},
             input_audio_transcription={
                 "language": "language",
                 "model": "model",
@@ -48,11 +49,12 @@ class TestSessions:
             ],
             turn_detection={
                 "create_response": True,
+                "eagerness": "low",
                 "interrupt_response": True,
                 "prefix_padding_ms": 0,
                 "silence_duration_ms": 0,
                 "threshold": 0,
-                "type": "type",
+                "type": "server_vad",
             },
             voice="alloy",
         )
@@ -91,6 +93,7 @@ class TestAsyncSessions:
     async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None:
         session = await async_client.beta.realtime.sessions.create(
             input_audio_format="pcm16",
+            input_audio_noise_reduction={"type": "near_field"},
             input_audio_transcription={
                 "language": "language",
                 "model": "model",
@@ -113,11 +116,12 @@ class TestAsyncSessions:
             ],
             turn_detection={
                 "create_response": True,
+                "eagerness": "low",
                 "interrupt_response": True,
                 "prefix_padding_ms": 0,
                 "silence_duration_ms": 0,
                 "threshold": 0,
-                "type": "type",
+                "type": "server_vad",
             },
             voice="alloy",
         )
tests/api_resources/beta/realtime/test_transcription_sessions.py
@@ -0,0 +1,120 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+import os
+from typing import Any, cast
+
+import pytest
+
+from openai import OpenAI, AsyncOpenAI
+from tests.utils import assert_matches_type
+from openai.types.beta.realtime import TranscriptionSession
+
+base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010")
+
+
+class TestTranscriptionSessions:
+    parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"])
+
+    @parametrize
+    def test_method_create(self, client: OpenAI) -> None:
+        transcription_session = client.beta.realtime.transcription_sessions.create()
+        assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+    @parametrize
+    def test_method_create_with_all_params(self, client: OpenAI) -> None:
+        transcription_session = client.beta.realtime.transcription_sessions.create(
+            include=["string"],
+            input_audio_format="pcm16",
+            input_audio_noise_reduction={"type": "near_field"},
+            input_audio_transcription={
+                "language": "language",
+                "model": "gpt-4o-transcribe",
+                "prompt": "prompt",
+            },
+            modalities=["text"],
+            turn_detection={
+                "create_response": True,
+                "eagerness": "low",
+                "interrupt_response": True,
+                "prefix_padding_ms": 0,
+                "silence_duration_ms": 0,
+                "threshold": 0,
+                "type": "server_vad",
+            },
+        )
+        assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+    @parametrize
+    def test_raw_response_create(self, client: OpenAI) -> None:
+        response = client.beta.realtime.transcription_sessions.with_raw_response.create()
+
+        assert response.is_closed is True
+        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+        transcription_session = response.parse()
+        assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+    @parametrize
+    def test_streaming_response_create(self, client: OpenAI) -> None:
+        with client.beta.realtime.transcription_sessions.with_streaming_response.create() as response:
+            assert not response.is_closed
+            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+            transcription_session = response.parse()
+            assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+        assert cast(Any, response.is_closed) is True
+
+
+class TestAsyncTranscriptionSessions:
+    parametrize = pytest.mark.parametrize("async_client", [False, True], indirect=True, ids=["loose", "strict"])
+
+    @parametrize
+    async def test_method_create(self, async_client: AsyncOpenAI) -> None:
+        transcription_session = await async_client.beta.realtime.transcription_sessions.create()
+        assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+    @parametrize
+    async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None:
+        transcription_session = await async_client.beta.realtime.transcription_sessions.create(
+            include=["string"],
+            input_audio_format="pcm16",
+            input_audio_noise_reduction={"type": "near_field"},
+            input_audio_transcription={
+                "language": "language",
+                "model": "gpt-4o-transcribe",
+                "prompt": "prompt",
+            },
+            modalities=["text"],
+            turn_detection={
+                "create_response": True,
+                "eagerness": "low",
+                "interrupt_response": True,
+                "prefix_padding_ms": 0,
+                "silence_duration_ms": 0,
+                "threshold": 0,
+                "type": "server_vad",
+            },
+        )
+        assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+    @parametrize
+    async def test_raw_response_create(self, async_client: AsyncOpenAI) -> None:
+        response = await async_client.beta.realtime.transcription_sessions.with_raw_response.create()
+
+        assert response.is_closed is True
+        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+        transcription_session = response.parse()
+        assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+    @parametrize
+    async def test_streaming_response_create(self, async_client: AsyncOpenAI) -> None:
+        async with async_client.beta.realtime.transcription_sessions.with_streaming_response.create() as response:
+            assert not response.is_closed
+            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+            transcription_session = await response.parse()
+            assert_matches_type(TranscriptionSession, transcription_session, path=["response"])
+
+        assert cast(Any, response.is_closed) is True
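For orientation, a minimal sketch of calling the new endpoint that these generated tests exercise; every argument shown is optional and the values are illustrative only, mirroring the test parameters above:

```python
from openai import OpenAI

client = OpenAI()

# Create an ephemeral realtime transcription session. The parameters mirror
# the generated tests above; all of them are optional.
session = client.beta.realtime.transcription_sessions.create(
    input_audio_format="pcm16",
    input_audio_noise_reduction={"type": "near_field"},
    input_audio_transcription={
        "model": "gpt-4o-transcribe",
        "language": "en",
        "prompt": "Expect technical vocabulary.",
    },
    turn_detection={"type": "server_vad", "silence_duration_ms": 500},
)
# The exact response fields are defined by the TranscriptionSession model
# (e.g. an ephemeral client secret for browser use).
print(session)
```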
tests/lib/test_audio.py
@@ -26,7 +26,7 @@ def test_translation_create_overloads_in_sync(sync: bool, client: OpenAI, async_
         assert_signatures_in_sync(
             fn,
             overload,
-            exclude_params={"response_format"},
+            exclude_params={"response_format", "stream"},
             description=f" for overload {i}",
         )
 
@@ -60,7 +60,7 @@ def test_transcription_create_overloads_in_sync(sync: bool, client: OpenAI, asyn
         assert_signatures_in_sync(
             fn,
             overload,
-            exclude_params={"response_format"},
+            exclude_params={"response_format", "stream"},
             description=f" for overload {i}",
         )
 
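The added `stream` exclusion reflects that `transcriptions.create` now takes a `stream` argument with typed overloads that the signature-sync helper cannot line up automatically. A hedged sketch of how streaming transcription is used with the new models (the `transcript.text.*` event type strings follow the documented streaming events and are an assumption here):

```python
from openai import OpenAI

client = OpenAI()

# Stream transcription text as it is produced. Assumes the documented
# `transcript.text.delta` / `transcript.text.done` event types.
with open("speech.mp3", "rb") as audio_file:
    stream = client.audio.transcriptions.create(
        model="gpt-4o-transcribe",
        file=audio_file,
        stream=True,
    )
    for event in stream:
        if event.type == "transcript.text.delta":
            print(event.delta, end="", flush=True)
        elif event.type == "transcript.text.done":
            print()
```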
.stats.yml
@@ -1,2 +1,2 @@
-configured_endpoints: 81
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-b26121d5df6eb5d3032a45a267473798b15fcfec76dd44a3256cf1238be05fa4.yml
+configured_endpoints: 82
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c22f59c66aec7914b6ee653d3098d1c1c8c16c180d2a158e819c8ddbf476f74b.yml
api.md
@@ -151,7 +151,11 @@ Types:
 ```python
 from openai.types.audio import (
     Transcription,
+    TranscriptionInclude,
     TranscriptionSegment,
+    TranscriptionStreamEvent,
+    TranscriptionTextDeltaEvent,
+    TranscriptionTextDoneEvent,
     TranscriptionVerbose,
     TranscriptionWord,
     TranscriptionCreateResponse,
@@ -338,7 +342,9 @@ from openai.types.beta.realtime import (
     ConversationItemDeleteEvent,
     ConversationItemDeletedEvent,
     ConversationItemInputAudioTranscriptionCompletedEvent,
+    ConversationItemInputAudioTranscriptionDeltaEvent,
     ConversationItemInputAudioTranscriptionFailedEvent,
+    ConversationItemRetrieveEvent,
     ConversationItemTruncateEvent,
     ConversationItemTruncatedEvent,
     ConversationItemWithReference,
@@ -375,6 +381,8 @@ from openai.types.beta.realtime import (
     SessionCreatedEvent,
     SessionUpdateEvent,
     SessionUpdatedEvent,
+    TranscriptionSessionUpdate,
+    TranscriptionSessionUpdatedEvent,
 )
 ```
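The new `ConversationItemInputAudioTranscriptionDeltaEvent` and `TranscriptionSessionUpdate`/`TranscriptionSessionUpdatedEvent` types surface incremental input-audio transcripts over a realtime connection. A rough sketch, assuming the existing `client.beta.realtime.connect()` helper and the `conversation.item.input_audio_transcription.delta` event name; treat the model name and event handling as illustrative:

```python
from openai import OpenAI

client = OpenAI()

# Listen for incremental input-audio transcription deltas on a realtime
# connection (audio append/commit calls omitted for brevity).
with client.beta.realtime.connect(model="gpt-4o-realtime-preview") as connection:
    connection.session.update(
        session={"input_audio_transcription": {"model": "gpt-4o-transcribe"}}
    )
    for event in connection:
        if event.type == "conversation.item.input_audio_transcription.delta":
            print(event.delta, end="", flush=True)
        elif event.type == "conversation.item.input_audio_transcription.completed":
            print()
            break
```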
 
@@ -390,6 +398,18 @@ Methods:
 
 - <code title="post /realtime/sessions">client.beta.realtime.sessions.<a href="./src/openai/resources/beta/realtime/sessions.py">create</a>(\*\*<a href="src/openai/types/beta/realtime/session_create_params.py">params</a>) -> <a href="./src/openai/types/beta/realtime/session_create_response.py">SessionCreateResponse</a></code>
 
+### TranscriptionSessions
+
+Types:
+
+```python
+from openai.types.beta.realtime import TranscriptionSession
+```
+
+Methods:
+
+- <code title="post /realtime/transcription_sessions">client.beta.realtime.transcription_sessions.<a href="./src/openai/resources/beta/realtime/transcription_sessions.py">create</a>(\*\*<a href="src/openai/types/beta/realtime/transcription_session_create_params.py">params</a>) -> <a href="./src/openai/types/beta/realtime/transcription_session.py">TranscriptionSession</a></code>
+
 ## Assistants
 
 Types: