Commit 0296375f
Changed files (89): src/openai/resources/realtime, src/openai/types/realtime, tests/api_resources/realtime
src/openai/resources/realtime/client_secrets.py
@@ -50,11 +50,13 @@ class ClientSecrets(SyncAPIResource):
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> ClientSecretCreateResponse:
"""
- Create a Realtime session and client secret for either realtime or
- transcription.
+ Create a Realtime client secret with an associated session configuration.
Args:
- expires_after: Configuration for the ephemeral token expiration.
+ expires_after: Configuration for the client secret expiration. Expiration refers to the time
+ after which a client secret will no longer be valid for creating sessions. The
+ session itself may continue after that time once started. A secret can be used
+ to create multiple sessions until it expires.
session: Session configuration to use for the client secret. Choose either a realtime
session or a transcription session.
@@ -116,11 +118,13 @@ class AsyncClientSecrets(AsyncAPIResource):
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> ClientSecretCreateResponse:
"""
- Create a Realtime session and client secret for either realtime or
- transcription.
+ Create a Realtime client secret with an associated session configuration.
Args:
- expires_after: Configuration for the ephemeral token expiration.
+ expires_after: Configuration for the client secret expiration. Expiration refers to the time
+ after which a client secret will no longer be valid for creating sessions. The
+ session itself may continue after that time once started. A secret can be used
+ to create multiple sessions until it expires.
session: Session configuration to use for the client secret. Choose either a realtime
session or a transcription session.
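For reference, a minimal sketch of the updated endpoint as called from the SDK; the model name, session shape, and response fields (`value`, `expires_at`) are illustrative assumptions, not part of this diff:

from openai import OpenAI

client = OpenAI()

# Mint a client secret valid for 10 minutes from creation; the same secret can
# create multiple sessions until it expires, and expiration does not end
# sessions that are already running.
secret = client.realtime.client_secrets.create(
    expires_after={"anchor": "created_at", "seconds": 600},
    session={"type": "realtime", "model": "gpt-realtime"},  # assumed GA model name
)
print(secret.value, secret.expires_at)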
src/openai/resources/realtime/realtime.py
@@ -32,16 +32,13 @@ from .client_secrets import (
ClientSecretsWithStreamingResponse,
AsyncClientSecretsWithStreamingResponse,
)
-from ...types.realtime import response_create_event_param
+from ...types.realtime import session_update_event_param, transcription_session_update_param
from ...types.websocket_connection_options import WebsocketConnectionOptions
from ...types.realtime.realtime_client_event import RealtimeClientEvent
from ...types.realtime.realtime_server_event import RealtimeServerEvent
from ...types.realtime.conversation_item_param import ConversationItemParam
from ...types.realtime.realtime_client_event_param import RealtimeClientEventParam
-from ...types.realtime.realtime_session_create_request_param import RealtimeSessionCreateRequestParam
-from ...types.realtime.realtime_transcription_session_create_request_param import (
- RealtimeTranscriptionSessionCreateRequestParam,
-)
+from ...types.realtime.realtime_response_create_params_param import RealtimeResponseCreateParamsParam
if TYPE_CHECKING:
from websockets.sync.client import ClientConnection as WebsocketConnection
@@ -564,18 +561,18 @@ class BaseRealtimeConnectionResource:
class RealtimeSessionResource(BaseRealtimeConnectionResource):
- def update(self, *, session: RealtimeSessionCreateRequestParam, event_id: str | NotGiven = NOT_GIVEN) -> None:
+ def update(self, *, session: session_update_event_param.Session, event_id: str | NotGiven = NOT_GIVEN) -> None:
"""
- Send this event to update the session’s default configuration.
- The client may send this event at any time to update any field,
- except for `voice`. However, note that once a session has been
- initialized with a particular `model`, it can’t be changed to
- another model using `session.update`.
+ Send this event to update the session’s configuration.
+ The client may send this event at any time to update any field
+ except for `voice` and `model`. `voice` can be updated only if there have been no other
+ audio outputs yet.
When the server receives a `session.update`, it will respond
with a `session.updated` event showing the full, effective configuration.
- Only the fields that are present are updated. To clear a field like
- `instructions`, pass an empty string.
+ Only the fields that are present in the `session.update` are updated. To clear a field like
+ `instructions`, pass an empty string. To clear a field like `tools`, pass an empty array.
+ To clear a field like `turn_detection`, pass `null`.
"""
self._connection.send(
cast(
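A minimal sketch of the new `session.update` surface, assuming `client.realtime.connect(...)` yields a connection exposing this resource; the nested `audio.input` shape and the model name are assumptions for illustration:

from openai import OpenAI

client = OpenAI()

with client.realtime.connect(model="gpt-realtime") as connection:
    connection.session.update(
        session={
            "type": "realtime",
            "instructions": "You are a terse assistant.",
            "tools": [],  # an empty array clears `tools`
            "audio": {"input": {"turn_detection": None}},  # null clears `turn_detection`
        }
    )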
@@ -590,7 +587,7 @@ class RealtimeResponseResource(BaseRealtimeConnectionResource):
self,
*,
event_id: str | NotGiven = NOT_GIVEN,
- response: response_create_event_param.Response | NotGiven = NOT_GIVEN,
+ response: RealtimeResponseCreateParamsParam | NotGiven = NOT_GIVEN,
) -> None:
"""
This event instructs the server to create a Response, which means triggering
@@ -599,15 +596,25 @@ class RealtimeResponseResource(BaseRealtimeConnectionResource):
A Response will include at least one Item, and may have two, in which case
the second will be a function call. These Items will be appended to the
- conversation history.
+ conversation history by default.
The server will respond with a `response.created` event, events for Items
and content created, and finally a `response.done` event to indicate the
Response is complete.
The `response.create` event includes inference configuration like
- `instructions`, and `temperature`. These fields will override the Session's
+ `instructions` and `tools`. If these are set, they will override the Session's
configuration for this Response only.
+
+ Responses can be created out-of-band of the default Conversation, meaning that they can
+ have arbitrary input, and it's possible to disable writing the output to the Conversation.
+ Only one Response can write to the default Conversation at a time, but otherwise multiple
+ Responses can be created in parallel. The `metadata` field is a good way to disambiguate
+ multiple simultaneous Responses.
+
+ Clients can set `conversation` to `none` to create a Response that does not write to the default
+ Conversation. Arbitrary input can be provided with the `input` field, which is an array accepting
+ raw Items and references to existing Items.
"""
self._connection.send(
cast(
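An out-of-band `response.create` sketch, assuming an open `connection` as in the session.update example above; the metadata key and item id are hypothetical:

# `connection` is an open realtime connection (see the session.update sketch above).
connection.response.create(
    response={
        "conversation": "none",              # do not write to the default Conversation
        "metadata": {"purpose": "summary"},  # helps disambiguate parallel responses
        "instructions": "Summarize the conversation so far.",
        "input": [
            {"type": "item_reference", "id": "item_abc123"},  # hypothetical existing item
        ],
    }
)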
@@ -621,7 +628,9 @@ class RealtimeResponseResource(BaseRealtimeConnectionResource):
The server will respond
with a `response.done` event with a status of `response.status=cancelled`. If
- there is no response to cancel, the server will respond with an error.
+ there is no response to cancel, the server will respond with an error. It's safe
+ to call `response.cancel` even if no response is in progress; an error will be
+ returned and the session will remain unaffected.
"""
self._connection.send(
cast(
@@ -644,16 +653,9 @@ class RealtimeInputAudioBufferResource(BaseRealtimeConnectionResource):
def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
"""
- Send this event to commit the user input audio buffer, which will create a
- new user message item in the conversation. This event will produce an error
- if the input audio buffer is empty. When in Server VAD mode, the client does
- not need to send this event, the server will commit the audio buffer
- automatically.
+ Send this event to commit the user input audio buffer, which will create a new user message item in the conversation. This event will produce an error if the input audio buffer is empty. When in Server VAD mode, the client does not need to send this event, the server will commit the audio buffer automatically.
- Committing the input audio buffer will trigger input audio transcription
- (if enabled in session configuration), but it will not create a response
- from the model. The server will respond with an `input_audio_buffer.committed`
- event.
+ Committing the input audio buffer will trigger input audio transcription (if enabled in session configuration), but it will not create a response from the model. The server will respond with an `input_audio_buffer.committed` event.
"""
self._connection.send(
cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id}))
@@ -663,14 +665,17 @@ class RealtimeInputAudioBufferResource(BaseRealtimeConnectionResource):
"""Send this event to append audio bytes to the input audio buffer.
The audio
- buffer is temporary storage you can write to and later commit. In Server VAD
- mode, the audio buffer is used to detect speech and the server will decide
+ buffer is temporary storage you can write to and later commit. A "commit" will create a new
+ user message item in the conversation history from the buffer content and clear the buffer.
+ Input audio transcription (if enabled) will be generated when the buffer is committed.
+
+ If VAD is enabled the audio buffer is used to detect speech and the server will decide
when to commit. When Server VAD is disabled, you must commit the audio buffer
- manually.
+ manually. Input audio noise reduction operates on writes to the audio buffer.
The client may choose how much audio to place in each event up to a maximum
of 15 MiB, for example streaming smaller chunks from the client may allow the
- VAD to be more responsive. Unlike made other client events, the server will
+ VAD to be more responsive. Unlike most other client events, the server will
not send a confirmation response to this event.
"""
self._connection.send(
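When Server VAD is off, the manual append/commit flow looks roughly like this; `pcm_chunks` is a placeholder for your own 24kHz PCM16 audio source and `connection` is an open realtime connection:

import base64

# Each append carries base64-encoded audio; the commit creates the user message
# item from the buffer (and triggers transcription, if enabled) and clears it.
for chunk in pcm_chunks:
    connection.input_audio_buffer.append(audio=base64.b64encode(chunk).decode("ascii"))
connection.input_audio_buffer.commit()
connection.response.create()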
@@ -797,7 +802,7 @@ class RealtimeOutputAudioBufferResource(BaseRealtimeConnectionResource):
class RealtimeTranscriptionSessionResource(BaseRealtimeConnectionResource):
def update(
- self, *, session: RealtimeTranscriptionSessionCreateRequestParam, event_id: str | NotGiven = NOT_GIVEN
+ self, *, session: transcription_session_update_param.Session, event_id: str | NotGiven = NOT_GIVEN
) -> None:
"""Send this event to update a transcription session."""
self._connection.send(
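A hedged sketch of the narrower transcription-session update; the nesting under `audio.input` mirrors the new types in this commit but should be checked against the generated `Session` shape:

# `connection` is an open realtime transcription connection.
connection.transcription_session.update(
    session={
        "audio": {
            "input": {
                "transcription": {"model": "gpt-4o-mini-transcribe", "language": "en"},
                "noise_reduction": {"type": "near_field"},
            }
        }
    }
)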
@@ -814,18 +819,20 @@ class BaseAsyncRealtimeConnectionResource:
class AsyncRealtimeSessionResource(BaseAsyncRealtimeConnectionResource):
- async def update(self, *, session: RealtimeSessionCreateRequestParam, event_id: str | NotGiven = NOT_GIVEN) -> None:
+ async def update(
+ self, *, session: session_update_event_param.Session, event_id: str | NotGiven = NOT_GIVEN
+ ) -> None:
"""
- Send this event to update the session’s default configuration.
- The client may send this event at any time to update any field,
- except for `voice`. However, note that once a session has been
- initialized with a particular `model`, it can’t be changed to
- another model using `session.update`.
+ Send this event to update the session’s configuration.
+ The client may send this event at any time to update any field
+ except for `voice` and `model`. `voice` can be updated only if there have been no other
+ audio outputs yet.
When the server receives a `session.update`, it will respond
with a `session.updated` event showing the full, effective configuration.
- Only the fields that are present are updated. To clear a field like
- `instructions`, pass an empty string.
+ Only the fields that are present in the `session.update` are updated. To clear a field like
+ `instructions`, pass an empty string. To clear a field like `tools`, pass an empty array.
+ To clear a field like `turn_detection`, pass `null`.
"""
await self._connection.send(
cast(
@@ -840,7 +847,7 @@ class AsyncRealtimeResponseResource(BaseAsyncRealtimeConnectionResource):
self,
*,
event_id: str | NotGiven = NOT_GIVEN,
- response: response_create_event_param.Response | NotGiven = NOT_GIVEN,
+ response: RealtimeResponseCreateParamsParam | NotGiven = NOT_GIVEN,
) -> None:
"""
This event instructs the server to create a Response, which means triggering
@@ -849,15 +856,25 @@ class AsyncRealtimeResponseResource(BaseAsyncRealtimeConnectionResource):
A Response will include at least one Item, and may have two, in which case
the second will be a function call. These Items will be appended to the
- conversation history.
+ conversation history by default.
The server will respond with a `response.created` event, events for Items
and content created, and finally a `response.done` event to indicate the
Response is complete.
The `response.create` event includes inference configuration like
- `instructions`, and `temperature`. These fields will override the Session's
+ `instructions` and `tools`. If these are set, they will override the Session's
configuration for this Response only.
+
+ Responses can be created out-of-band of the default Conversation, meaning that they can
+ have arbitrary input, and it's possible to disable writing the output to the Conversation.
+ Only one Response can write to the default Conversation at a time, but otherwise multiple
+ Responses can be created in parallel. The `metadata` field is a good way to disambiguate
+ multiple simultaneous Responses.
+
+ Clients can set `conversation` to `none` to create a Response that does not write to the default
+ Conversation. Arbitrary input can be provided with the `input` field, which is an array accepting
+ raw Items and references to existing Items.
"""
await self._connection.send(
cast(
@@ -871,7 +888,9 @@ class AsyncRealtimeResponseResource(BaseAsyncRealtimeConnectionResource):
The server will respond
with a `response.done` event with a status of `response.status=cancelled`. If
- there is no response to cancel, the server will respond with an error.
+ there is no response to cancel, the server will respond with an error. It's safe
+ to call `response.cancel` even if no response is in progress; an error will be
+ returned and the session will remain unaffected.
"""
await self._connection.send(
cast(
@@ -894,16 +913,9 @@ class AsyncRealtimeInputAudioBufferResource(BaseAsyncRealtimeConnectionResource)
async def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None:
"""
- Send this event to commit the user input audio buffer, which will create a
- new user message item in the conversation. This event will produce an error
- if the input audio buffer is empty. When in Server VAD mode, the client does
- not need to send this event, the server will commit the audio buffer
- automatically.
+ Send this event to commit the user input audio buffer, which will create a new user message item in the conversation. This event will produce an error if the input audio buffer is empty. When in Server VAD mode, the client does not need to send this event, the server will commit the audio buffer automatically.
- Committing the input audio buffer will trigger input audio transcription
- (if enabled in session configuration), but it will not create a response
- from the model. The server will respond with an `input_audio_buffer.committed`
- event.
+ Committing the input audio buffer will trigger input audio transcription (if enabled in session configuration), but it will not create a response from the model. The server will respond with an `input_audio_buffer.committed` event.
"""
await self._connection.send(
cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id}))
@@ -913,14 +925,17 @@ class AsyncRealtimeInputAudioBufferResource(BaseAsyncRealtimeConnectionResource)
"""Send this event to append audio bytes to the input audio buffer.
The audio
- buffer is temporary storage you can write to and later commit. In Server VAD
- mode, the audio buffer is used to detect speech and the server will decide
+ buffer is temporary storage you can write to and later commit. A "commit" will create a new
+ user message item in the conversation history from the buffer content and clear the buffer.
+ Input audio transcription (if enabled) will be generated when the buffer is committed.
+
+ If VAD is enabled the audio buffer is used to detect speech and the server will decide
when to commit. When Server VAD is disabled, you must commit the audio buffer
- manually.
+ manually. Input audio noise reduction operates on writes to the audio buffer.
The client may choose how much audio to place in each event up to a maximum
of 15 MiB, for example streaming smaller chunks from the client may allow the
- VAD to be more responsive. Unlike made other client events, the server will
+ VAD to be more responsive. Unlike most other client events, the server will
not send a confirmation response to this event.
"""
await self._connection.send(
@@ -1047,7 +1062,7 @@ class AsyncRealtimeOutputAudioBufferResource(BaseAsyncRealtimeConnectionResource
class AsyncRealtimeTranscriptionSessionResource(BaseAsyncRealtimeConnectionResource):
async def update(
- self, *, session: RealtimeTranscriptionSessionCreateRequestParam, event_id: str | NotGiven = NOT_GIVEN
+ self, *, session: transcription_session_update_param.Session, event_id: str | NotGiven = NOT_GIVEN
) -> None:
"""Send this event to update a transcription session."""
await self._connection.send(
src/openai/types/realtime/__init__.py
@@ -2,13 +2,16 @@
from __future__ import annotations
+from .models import Models as Models
+from .models_param import ModelsParam as ModelsParam
from .realtime_error import RealtimeError as RealtimeError
-from .realtime_session import RealtimeSession as RealtimeSession
from .conversation_item import ConversationItem as ConversationItem
from .realtime_response import RealtimeResponse as RealtimeResponse
+from .audio_transcription import AudioTranscription as AudioTranscription
from .log_prob_properties import LogProbProperties as LogProbProperties
from .realtime_truncation import RealtimeTruncation as RealtimeTruncation
from .response_done_event import ResponseDoneEvent as ResponseDoneEvent
+from .noise_reduction_type import NoiseReductionType as NoiseReductionType
from .realtime_error_event import RealtimeErrorEvent as RealtimeErrorEvent
from .session_update_event import SessionUpdateEvent as SessionUpdateEvent
from .mcp_list_tools_failed import McpListToolsFailed as McpListToolsFailed
@@ -21,6 +24,7 @@ from .response_create_event import ResponseCreateEvent as ResponseCreateEvent
from .session_created_event import SessionCreatedEvent as SessionCreatedEvent
from .session_updated_event import SessionUpdatedEvent as SessionUpdatedEvent
from .conversation_item_done import ConversationItemDone as ConversationItemDone
+from .realtime_audio_formats import RealtimeAudioFormats as RealtimeAudioFormats
from .realtime_mcp_tool_call import RealtimeMcpToolCall as RealtimeMcpToolCall
from .realtime_mcphttp_error import RealtimeMcphttpError as RealtimeMcphttpError
from .response_created_event import ResponseCreatedEvent as ResponseCreatedEvent
@@ -34,6 +38,7 @@ from .mcp_list_tools_completed import McpListToolsCompleted as McpListToolsCompl
from .realtime_response_status import RealtimeResponseStatus as RealtimeResponseStatus
from .response_mcp_call_failed import ResponseMcpCallFailed as ResponseMcpCallFailed
from .response_text_done_event import ResponseTextDoneEvent as ResponseTextDoneEvent
+from .audio_transcription_param import AudioTranscriptionParam as AudioTranscriptionParam
from .rate_limits_updated_event import RateLimitsUpdatedEvent as RateLimitsUpdatedEvent
from .realtime_truncation_param import RealtimeTruncationParam as RealtimeTruncationParam
from .response_audio_done_event import ResponseAudioDoneEvent as ResponseAudioDoneEvent
@@ -43,6 +48,7 @@ from .mcp_list_tools_in_progress import McpListToolsInProgress as McpListToolsIn
from .response_audio_delta_event import ResponseAudioDeltaEvent as ResponseAudioDeltaEvent
from .session_update_event_param import SessionUpdateEventParam as SessionUpdateEventParam
from .client_secret_create_params import ClientSecretCreateParams as ClientSecretCreateParams
+from .realtime_audio_config_input import RealtimeAudioConfigInput as RealtimeAudioConfigInput
from .realtime_audio_config_param import RealtimeAudioConfigParam as RealtimeAudioConfigParam
from .realtime_client_event_param import RealtimeClientEventParam as RealtimeClientEventParam
from .realtime_mcp_protocol_error import RealtimeMcpProtocolError as RealtimeMcpProtocolError
@@ -52,11 +58,12 @@ from .realtime_tools_config_union import RealtimeToolsConfigUnion as RealtimeToo
from .response_cancel_event_param import ResponseCancelEventParam as ResponseCancelEventParam
from .response_create_event_param import ResponseCreateEventParam as ResponseCreateEventParam
from .response_mcp_call_completed import ResponseMcpCallCompleted as ResponseMcpCallCompleted
+from .realtime_audio_config_output import RealtimeAudioConfigOutput as RealtimeAudioConfigOutput
+from .realtime_audio_formats_param import RealtimeAudioFormatsParam as RealtimeAudioFormatsParam
from .realtime_mcp_tool_call_param import RealtimeMcpToolCallParam as RealtimeMcpToolCallParam
from .realtime_mcphttp_error_param import RealtimeMcphttpErrorParam as RealtimeMcphttpErrorParam
from .transcription_session_update import TranscriptionSessionUpdate as TranscriptionSessionUpdate
from .client_secret_create_response import ClientSecretCreateResponse as ClientSecretCreateResponse
-from .realtime_client_secret_config import RealtimeClientSecretConfig as RealtimeClientSecretConfig
from .realtime_mcp_approval_request import RealtimeMcpApprovalRequest as RealtimeMcpApprovalRequest
from .realtime_mcp_list_tools_param import RealtimeMcpListToolsParam as RealtimeMcpListToolsParam
from .realtime_tracing_config_param import RealtimeTracingConfigParam as RealtimeTracingConfigParam
@@ -66,11 +73,13 @@ from .conversation_item_create_event import ConversationItemCreateEvent as Conve
from .conversation_item_delete_event import ConversationItemDeleteEvent as ConversationItemDeleteEvent
from .input_audio_buffer_clear_event import InputAudioBufferClearEvent as InputAudioBufferClearEvent
from .realtime_mcp_approval_response import RealtimeMcpApprovalResponse as RealtimeMcpApprovalResponse
+from .realtime_session_client_secret import RealtimeSessionClientSecret as RealtimeSessionClientSecret
from .conversation_item_created_event import ConversationItemCreatedEvent as ConversationItemCreatedEvent
from .conversation_item_deleted_event import ConversationItemDeletedEvent as ConversationItemDeletedEvent
from .input_audio_buffer_append_event import InputAudioBufferAppendEvent as InputAudioBufferAppendEvent
from .input_audio_buffer_commit_event import InputAudioBufferCommitEvent as InputAudioBufferCommitEvent
from .output_audio_buffer_clear_event import OutputAudioBufferClearEvent as OutputAudioBufferClearEvent
+from .realtime_response_create_params import RealtimeResponseCreateParams as RealtimeResponseCreateParams
from .realtime_session_create_request import RealtimeSessionCreateRequest as RealtimeSessionCreateRequest
from .response_output_item_done_event import ResponseOutputItemDoneEvent as ResponseOutputItemDoneEvent
from .conversation_item_retrieve_event import ConversationItemRetrieveEvent as ConversationItemRetrieveEvent
@@ -81,26 +90,37 @@ from .response_content_part_done_event import ResponseContentPartDoneEvent as Re
from .response_mcp_call_arguments_done import ResponseMcpCallArgumentsDone as ResponseMcpCallArgumentsDone
from .response_output_item_added_event import ResponseOutputItemAddedEvent as ResponseOutputItemAddedEvent
from .conversation_item_truncated_event import ConversationItemTruncatedEvent as ConversationItemTruncatedEvent
+from .realtime_audio_config_input_param import RealtimeAudioConfigInputParam as RealtimeAudioConfigInputParam
from .realtime_mcp_protocol_error_param import RealtimeMcpProtocolErrorParam as RealtimeMcpProtocolErrorParam
from .realtime_mcp_tool_execution_error import RealtimeMcpToolExecutionError as RealtimeMcpToolExecutionError
+from .realtime_response_create_mcp_tool import RealtimeResponseCreateMcpTool as RealtimeResponseCreateMcpTool
from .realtime_tool_choice_config_param import RealtimeToolChoiceConfigParam as RealtimeToolChoiceConfigParam
from .realtime_tools_config_union_param import RealtimeToolsConfigUnionParam as RealtimeToolsConfigUnionParam
from .response_content_part_added_event import ResponseContentPartAddedEvent as ResponseContentPartAddedEvent
from .response_mcp_call_arguments_delta import ResponseMcpCallArgumentsDelta as ResponseMcpCallArgumentsDelta
from .input_audio_buffer_committed_event import InputAudioBufferCommittedEvent as InputAudioBufferCommittedEvent
+from .realtime_audio_config_output_param import RealtimeAudioConfigOutputParam as RealtimeAudioConfigOutputParam
from .transcription_session_update_param import TranscriptionSessionUpdateParam as TranscriptionSessionUpdateParam
-from .realtime_client_secret_config_param import RealtimeClientSecretConfigParam as RealtimeClientSecretConfigParam
+from .realtime_audio_input_turn_detection import RealtimeAudioInputTurnDetection as RealtimeAudioInputTurnDetection
from .realtime_mcp_approval_request_param import RealtimeMcpApprovalRequestParam as RealtimeMcpApprovalRequestParam
+from .realtime_truncation_retention_ratio import RealtimeTruncationRetentionRatio as RealtimeTruncationRetentionRatio
from .transcription_session_updated_event import TranscriptionSessionUpdatedEvent as TranscriptionSessionUpdatedEvent
from .conversation_item_create_event_param import ConversationItemCreateEventParam as ConversationItemCreateEventParam
from .conversation_item_delete_event_param import ConversationItemDeleteEventParam as ConversationItemDeleteEventParam
from .input_audio_buffer_clear_event_param import InputAudioBufferClearEventParam as InputAudioBufferClearEventParam
from .input_audio_buffer_timeout_triggered import InputAudioBufferTimeoutTriggered as InputAudioBufferTimeoutTriggered
from .realtime_mcp_approval_response_param import RealtimeMcpApprovalResponseParam as RealtimeMcpApprovalResponseParam
+from .realtime_transcription_session_audio import RealtimeTranscriptionSessionAudio as RealtimeTranscriptionSessionAudio
from .response_audio_transcript_done_event import ResponseAudioTranscriptDoneEvent as ResponseAudioTranscriptDoneEvent
from .input_audio_buffer_append_event_param import InputAudioBufferAppendEventParam as InputAudioBufferAppendEventParam
from .input_audio_buffer_commit_event_param import InputAudioBufferCommitEventParam as InputAudioBufferCommitEventParam
from .output_audio_buffer_clear_event_param import OutputAudioBufferClearEventParam as OutputAudioBufferClearEventParam
+from .realtime_response_create_audio_output import (
+ RealtimeResponseCreateAudioOutput as RealtimeResponseCreateAudioOutput,
+)
+from .realtime_response_create_params_param import (
+ RealtimeResponseCreateParamsParam as RealtimeResponseCreateParamsParam,
+)
from .realtime_session_create_request_param import (
RealtimeSessionCreateRequestParam as RealtimeSessionCreateRequestParam,
)
@@ -125,12 +145,30 @@ from .realtime_conversation_item_user_message import (
from .realtime_mcp_tool_execution_error_param import (
RealtimeMcpToolExecutionErrorParam as RealtimeMcpToolExecutionErrorParam,
)
+from .realtime_response_create_mcp_tool_param import (
+ RealtimeResponseCreateMcpToolParam as RealtimeResponseCreateMcpToolParam,
+)
from .realtime_conversation_item_function_call import (
RealtimeConversationItemFunctionCall as RealtimeConversationItemFunctionCall,
)
+from .realtime_audio_input_turn_detection_param import (
+ RealtimeAudioInputTurnDetectionParam as RealtimeAudioInputTurnDetectionParam,
+)
from .realtime_conversation_item_system_message import (
RealtimeConversationItemSystemMessage as RealtimeConversationItemSystemMessage,
)
+from .realtime_truncation_retention_ratio_param import (
+ RealtimeTruncationRetentionRatioParam as RealtimeTruncationRetentionRatioParam,
+)
+from .realtime_transcription_session_audio_input import (
+ RealtimeTranscriptionSessionAudioInput as RealtimeTranscriptionSessionAudioInput,
+)
+from .realtime_transcription_session_audio_param import (
+ RealtimeTranscriptionSessionAudioParam as RealtimeTranscriptionSessionAudioParam,
+)
+from .realtime_response_create_audio_output_param import (
+ RealtimeResponseCreateAudioOutputParam as RealtimeResponseCreateAudioOutputParam,
+)
from .realtime_response_usage_input_token_details import (
RealtimeResponseUsageInputTokenDetails as RealtimeResponseUsageInputTokenDetails,
)
@@ -143,6 +181,9 @@ from .realtime_conversation_item_assistant_message import (
from .realtime_response_usage_output_token_details import (
RealtimeResponseUsageOutputTokenDetails as RealtimeResponseUsageOutputTokenDetails,
)
+from .realtime_transcription_session_client_secret import (
+ RealtimeTranscriptionSessionClientSecret as RealtimeTranscriptionSessionClientSecret,
+)
from .response_function_call_arguments_delta_event import (
ResponseFunctionCallArgumentsDeltaEvent as ResponseFunctionCallArgumentsDeltaEvent,
)
@@ -152,15 +193,24 @@ from .realtime_conversation_item_user_message_param import (
from .realtime_transcription_session_create_request import (
RealtimeTranscriptionSessionCreateRequest as RealtimeTranscriptionSessionCreateRequest,
)
+from .realtime_transcription_session_turn_detection import (
+ RealtimeTranscriptionSessionTurnDetection as RealtimeTranscriptionSessionTurnDetection,
+)
from .realtime_conversation_item_function_call_param import (
RealtimeConversationItemFunctionCallParam as RealtimeConversationItemFunctionCallParam,
)
+from .realtime_transcription_session_create_response import (
+ RealtimeTranscriptionSessionCreateResponse as RealtimeTranscriptionSessionCreateResponse,
+)
from .realtime_conversation_item_function_call_output import (
RealtimeConversationItemFunctionCallOutput as RealtimeConversationItemFunctionCallOutput,
)
from .realtime_conversation_item_system_message_param import (
RealtimeConversationItemSystemMessageParam as RealtimeConversationItemSystemMessageParam,
)
+from .realtime_transcription_session_audio_input_param import (
+ RealtimeTranscriptionSessionAudioInputParam as RealtimeTranscriptionSessionAudioInputParam,
+)
from .realtime_conversation_item_assistant_message_param import (
RealtimeConversationItemAssistantMessageParam as RealtimeConversationItemAssistantMessageParam,
)
@@ -179,6 +229,15 @@ from .conversation_item_input_audio_transcription_delta_event import (
from .conversation_item_input_audio_transcription_failed_event import (
ConversationItemInputAudioTranscriptionFailedEvent as ConversationItemInputAudioTranscriptionFailedEvent,
)
+from .realtime_transcription_session_input_audio_transcription import (
+ RealtimeTranscriptionSessionInputAudioTranscription as RealtimeTranscriptionSessionInputAudioTranscription,
+)
+from .realtime_transcription_session_audio_input_turn_detection import (
+ RealtimeTranscriptionSessionAudioInputTurnDetection as RealtimeTranscriptionSessionAudioInputTurnDetection,
+)
from .conversation_item_input_audio_transcription_completed_event import (
ConversationItemInputAudioTranscriptionCompletedEvent as ConversationItemInputAudioTranscriptionCompletedEvent,
)
+from .realtime_transcription_session_audio_input_turn_detection_param import (
+ RealtimeTranscriptionSessionAudioInputTurnDetectionParam as RealtimeTranscriptionSessionAudioInputTurnDetectionParam,
+)
src/openai/types/realtime/audio_transcription.py
@@ -0,0 +1,36 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["AudioTranscription"]
+
+
+class AudioTranscription(BaseModel):
+ language: Optional[str] = None
+ """The language of the input audio.
+
+ Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+ """
+
+ model: Optional[Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"]] = (
+ None
+ )
+ """The model to use for transcription.
+
+ Current options are `whisper-1`, `gpt-4o-transcribe-latest`,
+ `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+ """
+
+ prompt: Optional[str] = None
+ """
+ An optional text to guide the model's style or continue a previous audio
+ segment. For `whisper-1`, the
+ [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ "expect words related to technology".
+ """
src/openai/types/realtime/audio_transcription_param.py
@@ -0,0 +1,33 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing_extensions import Literal, TypedDict
+
+__all__ = ["AudioTranscriptionParam"]
+
+
+class AudioTranscriptionParam(TypedDict, total=False):
+ language: str
+ """The language of the input audio.
+
+ Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+ """
+
+ model: Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"]
+ """The model to use for transcription.
+
+ Current options are `whisper-1`, `gpt-4o-transcribe-latest`,
+ `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+ """
+
+ prompt: str
+ """
+ An optional text to guide the model's style or continue a previous audio
+ segment. For `whisper-1`, the
+ [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ "expect words related to technology".
+ """
src/openai/types/realtime/client_secret_create_params.py
@@ -13,7 +13,12 @@ __all__ = ["ClientSecretCreateParams", "ExpiresAfter", "Session"]
class ClientSecretCreateParams(TypedDict, total=False):
expires_after: ExpiresAfter
- """Configuration for the ephemeral token expiration."""
+ """Configuration for the client secret expiration.
+
+ Expiration refers to the time after which a client secret will no longer be
+ valid for creating sessions. The session itself may continue after that time
+ once started. A secret can be used to create multiple sessions until it expires.
+ """
session: Session
"""Session configuration to use for the client secret.
@@ -24,15 +29,17 @@ class ClientSecretCreateParams(TypedDict, total=False):
class ExpiresAfter(TypedDict, total=False):
anchor: Literal["created_at"]
- """The anchor point for the ephemeral token expiration.
-
- Only `created_at` is currently supported.
+ """
+ The anchor point for the client secret expiration, meaning that `seconds` will
+ be added to the `created_at` time of the client secret to produce an expiration
+ timestamp. Only `created_at` is currently supported.
"""
seconds: int
"""The number of seconds from the anchor point to the expiration.
- Select a value between `10` and `7200`.
+ Select a value between `10` and `7200` (2 hours). This defaults to 600 seconds
+ (10 minutes) if not specified.
"""
src/openai/types/realtime/client_secret_create_response.py
@@ -1,102 +1,15 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import List, Union, Optional
-from typing_extensions import Literal, TypeAlias
+from typing import Union
+from typing_extensions import TypeAlias
from ..._models import BaseModel
from .realtime_session_create_response import RealtimeSessionCreateResponse
+from .realtime_transcription_session_create_response import RealtimeTranscriptionSessionCreateResponse
-__all__ = [
- "ClientSecretCreateResponse",
- "Session",
- "SessionRealtimeTranscriptionSessionCreateResponse",
- "SessionRealtimeTranscriptionSessionCreateResponseAudio",
- "SessionRealtimeTranscriptionSessionCreateResponseAudioInput",
- "SessionRealtimeTranscriptionSessionCreateResponseAudioInputNoiseReduction",
- "SessionRealtimeTranscriptionSessionCreateResponseAudioInputTranscription",
- "SessionRealtimeTranscriptionSessionCreateResponseAudioInputTurnDetection",
-]
+__all__ = ["ClientSecretCreateResponse", "Session"]
-
-class SessionRealtimeTranscriptionSessionCreateResponseAudioInputNoiseReduction(BaseModel):
- type: Optional[Literal["near_field", "far_field"]] = None
-
-
-class SessionRealtimeTranscriptionSessionCreateResponseAudioInputTranscription(BaseModel):
- language: Optional[str] = None
- """The language of the input audio.
-
- Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- """
-
- model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None
- """The model to use for transcription.
-
- Can be `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, or `whisper-1`.
- """
-
- prompt: Optional[str] = None
- """An optional text to guide the model's style or continue a previous audio
- segment.
-
- The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
- """
-
-
-class SessionRealtimeTranscriptionSessionCreateResponseAudioInputTurnDetection(BaseModel):
- prefix_padding_ms: Optional[int] = None
-
- silence_duration_ms: Optional[int] = None
-
- threshold: Optional[float] = None
-
- type: Optional[str] = None
- """Type of turn detection, only `server_vad` is currently supported."""
-
-
-class SessionRealtimeTranscriptionSessionCreateResponseAudioInput(BaseModel):
- format: Optional[str] = None
- """The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`."""
-
- noise_reduction: Optional[SessionRealtimeTranscriptionSessionCreateResponseAudioInputNoiseReduction] = None
- """Configuration for input audio noise reduction."""
-
- transcription: Optional[SessionRealtimeTranscriptionSessionCreateResponseAudioInputTranscription] = None
- """Configuration of the transcription model."""
-
- turn_detection: Optional[SessionRealtimeTranscriptionSessionCreateResponseAudioInputTurnDetection] = None
- """Configuration for turn detection."""
-
-
-class SessionRealtimeTranscriptionSessionCreateResponseAudio(BaseModel):
- input: Optional[SessionRealtimeTranscriptionSessionCreateResponseAudioInput] = None
-
-
-class SessionRealtimeTranscriptionSessionCreateResponse(BaseModel):
- id: Optional[str] = None
- """Unique identifier for the session that looks like `sess_1234567890abcdef`."""
-
- audio: Optional[SessionRealtimeTranscriptionSessionCreateResponseAudio] = None
- """Configuration for input audio for the session."""
-
- expires_at: Optional[int] = None
- """Expiration timestamp for the session, in seconds since epoch."""
-
- include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None
- """Additional fields to include in server outputs.
-
- - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- transcription.
- """
-
- object: Optional[str] = None
- """The object type. Always `realtime.transcription_session`."""
-
-
-Session: TypeAlias = Union[RealtimeSessionCreateResponse, SessionRealtimeTranscriptionSessionCreateResponse]
+Session: TypeAlias = Union[RealtimeSessionCreateResponse, RealtimeTranscriptionSessionCreateResponse]
class ClientSecretCreateResponse(BaseModel):
src/openai/types/realtime/conversation_item_input_audio_transcription_completed_event.py
@@ -59,7 +59,7 @@ class ConversationItemInputAudioTranscriptionCompletedEvent(BaseModel):
"""The unique ID of the server event."""
item_id: str
- """The ID of the user message item containing the audio."""
+ """The ID of the item containing the audio that is being transcribed."""
transcript: str
"""The transcribed text."""
@@ -70,7 +70,10 @@ class ConversationItemInputAudioTranscriptionCompletedEvent(BaseModel):
"""
usage: Usage
- """Usage statistics for the transcription."""
+ """
+ Usage statistics for the transcription; this is billed according to the ASR
+ model's pricing rather than the realtime model's pricing.
+ """
logprobs: Optional[List[LogProbProperties]] = None
"""The log probabilities of the transcription."""
src/openai/types/realtime/conversation_item_input_audio_transcription_delta_event.py
@@ -14,7 +14,7 @@ class ConversationItemInputAudioTranscriptionDeltaEvent(BaseModel):
"""The unique ID of the server event."""
item_id: str
- """The ID of the item."""
+ """The ID of the item containing the audio that is being transcribed."""
type: Literal["conversation.item.input_audio_transcription.delta"]
"""The event type, must be `conversation.item.input_audio_transcription.delta`."""
@@ -26,4 +26,11 @@ class ConversationItemInputAudioTranscriptionDeltaEvent(BaseModel):
"""The text delta."""
logprobs: Optional[List[LogProbProperties]] = None
- """The log probabilities of the transcription."""
+ """The log probabilities of the transcription.
+
+ These can be enabled by configuring the session with
+ `"include": ["item.input_audio_transcription.logprobs"]`. Each entry in the
+ array corresponds to a log probability of which token would be selected for
+ this chunk of transcription. This can help identify whether there were
+ multiple valid options for a given chunk of transcription.
+ """
src/openai/types/realtime/conversation_item_truncate_event.py
@@ -17,7 +17,7 @@ class ConversationItemTruncateEvent(BaseModel):
"""
content_index: int
- """The index of the content part to truncate. Set this to 0."""
+ """The index of the content part to truncate. Set this to `0`."""
item_id: str
"""The ID of the assistant message item to truncate.
src/openai/types/realtime/conversation_item_truncate_event_param.py
@@ -16,7 +16,7 @@ class ConversationItemTruncateEventParam(TypedDict, total=False):
"""
content_index: Required[int]
- """The index of the content part to truncate. Set this to 0."""
+ """The index of the content part to truncate. Set this to `0`."""
item_id: Required[str]
"""The ID of the assistant message item to truncate.
src/openai/types/realtime/models.py
@@ -0,0 +1,25 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["Models"]
+
+
+class Models(BaseModel):
+ description: Optional[str] = None
+ """
+ The description of the function, including guidance on when and how to call it,
+ and guidance about what to tell the user when calling (if anything).
+ """
+
+ name: Optional[str] = None
+ """The name of the function."""
+
+ parameters: Optional[object] = None
+ """Parameters of the function in JSON Schema."""
+
+ type: Optional[Literal["function"]] = None
+ """The type of the tool, i.e. `function`."""
src/openai/types/realtime/models_param.py
@@ -0,0 +1,24 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing_extensions import Literal, TypedDict
+
+__all__ = ["ModelsParam"]
+
+
+class ModelsParam(TypedDict, total=False):
+ description: str
+ """
+ The description of the function, including guidance on when and how to call it,
+ and guidance about what to tell the user when calling (if anything).
+ """
+
+ name: str
+ """The name of the function."""
+
+ parameters: object
+ """Parameters of the function in JSON Schema."""
+
+ type: Literal["function"]
+ """The type of the tool, i.e. `function`."""
src/openai/types/realtime/noise_reduction_type.py
@@ -0,0 +1,7 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing_extensions import Literal, TypeAlias
+
+__all__ = ["NoiseReductionType"]
+
+NoiseReductionType: TypeAlias = Literal["near_field", "far_field"]
src/openai/types/realtime/realtime_audio_config.py
@@ -1,184 +1,15 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import Union, Optional
-from typing_extensions import Literal
+from typing import Optional
from ..._models import BaseModel
+from .realtime_audio_config_input import RealtimeAudioConfigInput
+from .realtime_audio_config_output import RealtimeAudioConfigOutput
-__all__ = ["RealtimeAudioConfig", "Input", "InputNoiseReduction", "InputTranscription", "InputTurnDetection", "Output"]
-
-
-class InputNoiseReduction(BaseModel):
- type: Optional[Literal["near_field", "far_field"]] = None
- """Type of noise reduction.
-
- `near_field` is for close-talking microphones such as headphones, `far_field` is
- for far-field microphones such as laptop or conference room microphones.
- """
-
-
-class InputTranscription(BaseModel):
- language: Optional[str] = None
- """The language of the input audio.
-
- Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- """
-
- model: Optional[
- Literal[
- "whisper-1",
- "gpt-4o-transcribe-latest",
- "gpt-4o-mini-transcribe",
- "gpt-4o-transcribe",
- "gpt-4o-transcribe-diarize",
- ]
- ] = None
- """The model to use for transcription.
-
- Current options are `whisper-1`, `gpt-4o-transcribe-latest`,
- `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
- """
-
- prompt: Optional[str] = None
- """
- An optional text to guide the model's style or continue a previous audio
- segment. For `whisper-1`, the
- [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- "expect words related to technology".
- """
-
-
-class InputTurnDetection(BaseModel):
- create_response: Optional[bool] = None
- """
- Whether or not to automatically generate a response when a VAD stop event
- occurs.
- """
-
- eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
- """Used only for `semantic_vad` mode.
-
- The eagerness of the model to respond. `low` will wait longer for the user to
- continue speaking, `high` will respond more quickly. `auto` is the default and
- is equivalent to `medium`.
- """
-
- idle_timeout_ms: Optional[int] = None
- """
- Optional idle timeout after which turn detection will auto-timeout when no
- additional audio is received.
- """
-
- interrupt_response: Optional[bool] = None
- """
- Whether or not to automatically interrupt any ongoing response with output to
- the default conversation (i.e. `conversation` of `auto`) when a VAD start event
- occurs.
- """
-
- prefix_padding_ms: Optional[int] = None
- """Used only for `server_vad` mode.
-
- Amount of audio to include before the VAD detected speech (in milliseconds).
- Defaults to 300ms.
- """
-
- silence_duration_ms: Optional[int] = None
- """Used only for `server_vad` mode.
-
- Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
- With shorter values the model will respond more quickly, but may jump in on
- short pauses from the user.
- """
-
- threshold: Optional[float] = None
- """Used only for `server_vad` mode.
-
- Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
- threshold will require louder audio to activate the model, and thus might
- perform better in noisy environments.
- """
-
- type: Optional[Literal["server_vad", "semantic_vad"]] = None
- """Type of turn detection."""
-
-
-class Input(BaseModel):
- format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
- """The format of input audio.
-
- Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
- be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
- byte order.
- """
-
- noise_reduction: Optional[InputNoiseReduction] = None
- """Configuration for input audio noise reduction.
-
- This can be set to `null` to turn off. Noise reduction filters audio added to
- the input audio buffer before it is sent to VAD and the model. Filtering the
- audio can improve VAD and turn detection accuracy (reducing false positives) and
- model performance by improving perception of the input audio.
- """
-
- transcription: Optional[InputTranscription] = None
- """
- Configuration for input audio transcription, defaults to off and can be set to
- `null` to turn off once on. Input audio transcription is not native to the
- model, since the model consumes audio directly. Transcription runs
- asynchronously through
- [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- and should be treated as guidance of input audio content rather than precisely
- what the model heard. The client can optionally set the language and prompt for
- transcription, these offer additional guidance to the transcription service.
- """
-
- turn_detection: Optional[InputTurnDetection] = None
- """Configuration for turn detection, ether Server VAD or Semantic VAD.
-
- This can be set to `null` to turn off, in which case the client must manually
- trigger model response. Server VAD means that the model will detect the start
- and end of speech based on audio volume and respond at the end of user speech.
- Semantic VAD is more advanced and uses a turn detection model (in conjunction
- with VAD) to semantically estimate whether the user has finished speaking, then
- dynamically sets a timeout based on this probability. For example, if user audio
- trails off with "uhhm", the model will score a low probability of turn end and
- wait longer for the user to continue speaking. This can be useful for more
- natural conversations, but may have a higher latency.
- """
-
-
-class Output(BaseModel):
- format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
- """The format of output audio.
-
- Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, output audio is
- sampled at a rate of 24kHz.
- """
-
- speed: Optional[float] = None
- """The speed of the model's spoken response.
-
- 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
- This value can only be changed in between model turns, not while a response is
- in progress.
- """
-
- voice: Union[
- str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
- ] = None
- """The voice the model uses to respond.
-
- Voice cannot be changed during the session once the model has responded with
- audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
- `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`.
- """
+__all__ = ["RealtimeAudioConfig"]
class RealtimeAudioConfig(BaseModel):
- input: Optional[Input] = None
+ input: Optional[RealtimeAudioConfigInput] = None
- output: Optional[Output] = None
+ output: Optional[RealtimeAudioConfigOutput] = None
src/openai/types/realtime/realtime_audio_config_input.py
@@ -0,0 +1,60 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Optional
+
+from ..._models import BaseModel
+from .audio_transcription import AudioTranscription
+from .noise_reduction_type import NoiseReductionType
+from .realtime_audio_formats import RealtimeAudioFormats
+from .realtime_audio_input_turn_detection import RealtimeAudioInputTurnDetection
+
+__all__ = ["RealtimeAudioConfigInput", "NoiseReduction"]
+
+
+class NoiseReduction(BaseModel):
+ type: Optional[NoiseReductionType] = None
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
+
+
+class RealtimeAudioConfigInput(BaseModel):
+ format: Optional[RealtimeAudioFormats] = None
+ """The format of the input audio."""
+
+ noise_reduction: Optional[NoiseReduction] = None
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
+ transcription: Optional[AudioTranscription] = None
+ """
+ Configuration for input audio transcription, defaults to off and can be set to
+ `null` to turn off once on. Input audio transcription is not native to the
+ model, since the model consumes audio directly. Transcription runs
+ asynchronously through
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
+ """
+
+ turn_detection: Optional[RealtimeAudioInputTurnDetection] = None
+ """Configuration for turn detection, ether Server VAD or Semantic VAD.
+
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+ Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
+ """
src/openai/types/realtime/realtime_audio_config_input_param.py
@@ -0,0 +1,61 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing_extensions import TypedDict
+
+from .noise_reduction_type import NoiseReductionType
+from .audio_transcription_param import AudioTranscriptionParam
+from .realtime_audio_formats_param import RealtimeAudioFormatsParam
+from .realtime_audio_input_turn_detection_param import RealtimeAudioInputTurnDetectionParam
+
+__all__ = ["RealtimeAudioConfigInputParam", "NoiseReduction"]
+
+
+class NoiseReduction(TypedDict, total=False):
+ type: NoiseReductionType
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
+
+
+class RealtimeAudioConfigInputParam(TypedDict, total=False):
+ format: RealtimeAudioFormatsParam
+ """The format of the input audio."""
+
+ noise_reduction: NoiseReduction
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
+ transcription: AudioTranscriptionParam
+ """
+ Configuration for input audio transcription, defaults to off and can be set to
+ `null` to turn off once on. Input audio transcription is not native to the
+ model, since the model consumes audio directly. Transcription runs
+ asynchronously through
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
+ """
+
+ turn_detection: RealtimeAudioInputTurnDetectionParam
+ """Configuration for turn detection, ether Server VAD or Semantic VAD.
+
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+ Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
+ """
src/openai/types/realtime/realtime_audio_config_output.py
@@ -0,0 +1,36 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Union, Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+from .realtime_audio_formats import RealtimeAudioFormats
+
+__all__ = ["RealtimeAudioConfigOutput"]
+
+
+class RealtimeAudioConfigOutput(BaseModel):
+ format: Optional[RealtimeAudioFormats] = None
+ """The format of the output audio."""
+
+ speed: Optional[float] = None
+ """
+ The speed of the model's spoken response as a multiple of the original speed.
+ 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
+ This value can only be changed in between model turns, not while a response is
+ in progress.
+
+ This parameter is a post-processing adjustment to the audio after it is
+    generated; it's also possible to prompt the model to speak faster or slower.
+ """
+
+ voice: Union[
+ str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
+ ] = None
+ """The voice the model uses to respond.
+
+ Voice cannot be changed during the session once the model has responded with
+ audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
+ `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend
+ `marin` and `cedar` for best quality.
+ """
src/openai/types/realtime/realtime_audio_config_output_param.py
@@ -0,0 +1,35 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import Union
+from typing_extensions import Literal, TypedDict
+
+from .realtime_audio_formats_param import RealtimeAudioFormatsParam
+
+__all__ = ["RealtimeAudioConfigOutputParam"]
+
+
+class RealtimeAudioConfigOutputParam(TypedDict, total=False):
+ format: RealtimeAudioFormatsParam
+ """The format of the output audio."""
+
+ speed: float
+ """
+ The speed of the model's spoken response as a multiple of the original speed.
+ 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
+ This value can only be changed in between model turns, not while a response is
+ in progress.
+
+ This parameter is a post-processing adjustment to the audio after it is
+    generated; it's also possible to prompt the model to speak faster or slower.
+ """
+
+ voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
+ """The voice the model uses to respond.
+
+ Voice cannot be changed during the session once the model has responded with
+ audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
+ `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend
+ `marin` and `cedar` for best quality.
+ """
src/openai/types/realtime/realtime_audio_config_param.py
@@ -2,186 +2,15 @@
from __future__ import annotations
-from typing import Union, Optional
-from typing_extensions import Literal, TypedDict
+from typing_extensions import TypedDict
-__all__ = [
- "RealtimeAudioConfigParam",
- "Input",
- "InputNoiseReduction",
- "InputTranscription",
- "InputTurnDetection",
- "Output",
-]
+from .realtime_audio_config_input_param import RealtimeAudioConfigInputParam
+from .realtime_audio_config_output_param import RealtimeAudioConfigOutputParam
-
-class InputNoiseReduction(TypedDict, total=False):
- type: Literal["near_field", "far_field"]
- """Type of noise reduction.
-
- `near_field` is for close-talking microphones such as headphones, `far_field` is
- for far-field microphones such as laptop or conference room microphones.
- """
-
-
-class InputTranscription(TypedDict, total=False):
- language: str
- """The language of the input audio.
-
- Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- """
-
- model: Literal[
- "whisper-1",
- "gpt-4o-transcribe-latest",
- "gpt-4o-mini-transcribe",
- "gpt-4o-transcribe",
- "gpt-4o-transcribe-diarize",
- ]
- """The model to use for transcription.
-
- Current options are `whisper-1`, `gpt-4o-transcribe-latest`,
- `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
- """
-
- prompt: str
- """
- An optional text to guide the model's style or continue a previous audio
- segment. For `whisper-1`, the
- [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- "expect words related to technology".
- """
-
-
-class InputTurnDetection(TypedDict, total=False):
- create_response: bool
- """
- Whether or not to automatically generate a response when a VAD stop event
- occurs.
- """
-
- eagerness: Literal["low", "medium", "high", "auto"]
- """Used only for `semantic_vad` mode.
-
- The eagerness of the model to respond. `low` will wait longer for the user to
- continue speaking, `high` will respond more quickly. `auto` is the default and
- is equivalent to `medium`.
- """
-
- idle_timeout_ms: Optional[int]
- """
- Optional idle timeout after which turn detection will auto-timeout when no
- additional audio is received.
- """
-
- interrupt_response: bool
- """
- Whether or not to automatically interrupt any ongoing response with output to
- the default conversation (i.e. `conversation` of `auto`) when a VAD start event
- occurs.
- """
-
- prefix_padding_ms: int
- """Used only for `server_vad` mode.
-
- Amount of audio to include before the VAD detected speech (in milliseconds).
- Defaults to 300ms.
- """
-
- silence_duration_ms: int
- """Used only for `server_vad` mode.
-
- Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
- With shorter values the model will respond more quickly, but may jump in on
- short pauses from the user.
- """
-
- threshold: float
- """Used only for `server_vad` mode.
-
- Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
- threshold will require louder audio to activate the model, and thus might
- perform better in noisy environments.
- """
-
- type: Literal["server_vad", "semantic_vad"]
- """Type of turn detection."""
-
-
-class Input(TypedDict, total=False):
- format: Literal["pcm16", "g711_ulaw", "g711_alaw"]
- """The format of input audio.
-
- Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
- be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
- byte order.
- """
-
- noise_reduction: InputNoiseReduction
- """Configuration for input audio noise reduction.
-
- This can be set to `null` to turn off. Noise reduction filters audio added to
- the input audio buffer before it is sent to VAD and the model. Filtering the
- audio can improve VAD and turn detection accuracy (reducing false positives) and
- model performance by improving perception of the input audio.
- """
-
- transcription: InputTranscription
- """
- Configuration for input audio transcription, defaults to off and can be set to
- `null` to turn off once on. Input audio transcription is not native to the
- model, since the model consumes audio directly. Transcription runs
- asynchronously through
- [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- and should be treated as guidance of input audio content rather than precisely
- what the model heard. The client can optionally set the language and prompt for
- transcription, these offer additional guidance to the transcription service.
- """
-
- turn_detection: InputTurnDetection
- """Configuration for turn detection, ether Server VAD or Semantic VAD.
-
- This can be set to `null` to turn off, in which case the client must manually
- trigger model response. Server VAD means that the model will detect the start
- and end of speech based on audio volume and respond at the end of user speech.
- Semantic VAD is more advanced and uses a turn detection model (in conjunction
- with VAD) to semantically estimate whether the user has finished speaking, then
- dynamically sets a timeout based on this probability. For example, if user audio
- trails off with "uhhm", the model will score a low probability of turn end and
- wait longer for the user to continue speaking. This can be useful for more
- natural conversations, but may have a higher latency.
- """
-
-
-class Output(TypedDict, total=False):
- format: Literal["pcm16", "g711_ulaw", "g711_alaw"]
- """The format of output audio.
-
- Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, output audio is
- sampled at a rate of 24kHz.
- """
-
- speed: float
- """The speed of the model's spoken response.
-
- 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
- This value can only be changed in between model turns, not while a response is
- in progress.
- """
-
- voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
- """The voice the model uses to respond.
-
- Voice cannot be changed during the session once the model has responded with
- audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
- `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`.
- """
+__all__ = ["RealtimeAudioConfigParam"]
class RealtimeAudioConfigParam(TypedDict, total=False):
- input: Input
+ input: RealtimeAudioConfigInputParam
- output: Output
+ output: RealtimeAudioConfigOutputParam
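With the wrapper now delegating to the two new param types, a combined audio config is simply the two shapes nested under input and output (sketch, with minimal illustrative values).

# Sketch: RealtimeAudioConfigParam nests the two new param shapes under input/output.
audio_config = {
    "input": {"format": {"type": "audio/pcm", "rate": 24000}},
    "output": {"voice": "marin", "speed": 1.0},
}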
src/openai/types/realtime/realtime_audio_formats.py
@@ -0,0 +1,30 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Union, Optional
+from typing_extensions import Literal, Annotated, TypeAlias
+
+from ..._utils import PropertyInfo
+from ..._models import BaseModel
+
+__all__ = ["RealtimeAudioFormats", "AudioPCM", "AudioPCMU", "AudioPCMA"]
+
+
+class AudioPCM(BaseModel):
+ rate: Optional[Literal[24000]] = None
+ """The sample rate of the audio. Always `24000`."""
+
+ type: Optional[Literal["audio/pcm"]] = None
+ """The audio format. Always `audio/pcm`."""
+
+
+class AudioPCMU(BaseModel):
+ type: Optional[Literal["audio/pcmu"]] = None
+ """The audio format. Always `audio/pcmu`."""
+
+
+class AudioPCMA(BaseModel):
+ type: Optional[Literal["audio/pcma"]] = None
+ """The audio format. Always `audio/pcma`."""
+
+
+RealtimeAudioFormats: TypeAlias = Annotated[Union[AudioPCM, AudioPCMU, AudioPCMA], PropertyInfo(discriminator="type")]
src/openai/types/realtime/realtime_audio_formats_param.py
@@ -0,0 +1,29 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import Union
+from typing_extensions import Literal, TypeAlias, TypedDict
+
+__all__ = ["RealtimeAudioFormatsParam", "AudioPCM", "AudioPCMU", "AudioPCMA"]
+
+
+class AudioPCM(TypedDict, total=False):
+ rate: Literal[24000]
+ """The sample rate of the audio. Always `24000`."""
+
+ type: Literal["audio/pcm"]
+ """The audio format. Always `audio/pcm`."""
+
+
+class AudioPCMU(TypedDict, total=False):
+ type: Literal["audio/pcmu"]
+ """The audio format. Always `audio/pcmu`."""
+
+
+class AudioPCMA(TypedDict, total=False):
+ type: Literal["audio/pcma"]
+ """The audio format. Always `audio/pcma`."""
+
+
+RealtimeAudioFormatsParam: TypeAlias = Union[AudioPCM, AudioPCMU, AudioPCMA]
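The three param variants are distinguished purely by their `type` value (the server-side model above is the discriminated-union counterpart). As dicts they look like this (sketch):

# Sketch: the three RealtimeAudioFormatsParam variants, discriminated by "type".
pcm_format = {"type": "audio/pcm", "rate": 24000}  # 16-bit PCM at 24kHz
pcmu_format = {"type": "audio/pcmu"}               # G.711 mu-law
pcma_format = {"type": "audio/pcma"}               # G.711 A-law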
src/openai/types/realtime/realtime_audio_input_turn_detection.py
@@ -0,0 +1,64 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["RealtimeAudioInputTurnDetection"]
+
+
+class RealtimeAudioInputTurnDetection(BaseModel):
+ create_response: Optional[bool] = None
+ """
+ Whether or not to automatically generate a response when a VAD stop event
+ occurs.
+ """
+
+ eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
+ """Used only for `semantic_vad` mode.
+
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s,
+ 4s, and 2s respectively.
+ """
+
+ idle_timeout_ms: Optional[int] = None
+ """
+ Optional idle timeout after which turn detection will auto-timeout when no
+ additional audio is received.
+ """
+
+ interrupt_response: Optional[bool] = None
+ """
+ Whether or not to automatically interrupt any ongoing response with output to
+ the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ occurs.
+ """
+
+ prefix_padding_ms: Optional[int] = None
+ """Used only for `server_vad` mode.
+
+ Amount of audio to include before the VAD detected speech (in milliseconds).
+ Defaults to 300ms.
+ """
+
+ silence_duration_ms: Optional[int] = None
+ """Used only for `server_vad` mode.
+
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
+ """
+
+ threshold: Optional[float] = None
+ """Used only for `server_vad` mode.
+
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
+ """
+
+ type: Optional[Literal["server_vad", "semantic_vad"]] = None
+ """Type of turn detection."""
src/openai/types/realtime/realtime_audio_input_turn_detection_param.py
@@ -0,0 +1,64 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import Optional
+from typing_extensions import Literal, TypedDict
+
+__all__ = ["RealtimeAudioInputTurnDetectionParam"]
+
+
+class RealtimeAudioInputTurnDetectionParam(TypedDict, total=False):
+ create_response: bool
+ """
+ Whether or not to automatically generate a response when a VAD stop event
+ occurs.
+ """
+
+ eagerness: Literal["low", "medium", "high", "auto"]
+ """Used only for `semantic_vad` mode.
+
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s,
+ 4s, and 2s respectively.
+ """
+
+ idle_timeout_ms: Optional[int]
+ """
+ Optional idle timeout after which turn detection will auto-timeout when no
+ additional audio is received.
+ """
+
+ interrupt_response: bool
+ """
+ Whether or not to automatically interrupt any ongoing response with output to
+ the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ occurs.
+ """
+
+ prefix_padding_ms: int
+ """Used only for `server_vad` mode.
+
+ Amount of audio to include before the VAD detected speech (in milliseconds).
+ Defaults to 300ms.
+ """
+
+ silence_duration_ms: int
+ """Used only for `server_vad` mode.
+
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
+ """
+
+ threshold: float
+ """Used only for `server_vad` mode.
+
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
+ """
+
+ type: Literal["server_vad", "semantic_vad"]
+ """Type of turn detection."""
src/openai/types/realtime/realtime_client_secret_config.py
@@ -1,27 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Optional
-from typing_extensions import Literal
-
-from ..._models import BaseModel
-
-__all__ = ["RealtimeClientSecretConfig", "ExpiresAfter"]
-
-
-class ExpiresAfter(BaseModel):
- anchor: Literal["created_at"]
- """The anchor point for the ephemeral token expiration.
-
- Only `created_at` is currently supported.
- """
-
- seconds: Optional[int] = None
- """The number of seconds from the anchor point to the expiration.
-
- Select a value between `10` and `7200`.
- """
-
-
-class RealtimeClientSecretConfig(BaseModel):
- expires_after: Optional[ExpiresAfter] = None
- """Configuration for the ephemeral token expiration."""
src/openai/types/realtime/realtime_client_secret_config_param.py
@@ -1,26 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing_extensions import Literal, Required, TypedDict
-
-__all__ = ["RealtimeClientSecretConfigParam", "ExpiresAfter"]
-
-
-class ExpiresAfter(TypedDict, total=False):
- anchor: Required[Literal["created_at"]]
- """The anchor point for the ephemeral token expiration.
-
- Only `created_at` is currently supported.
- """
-
- seconds: int
- """The number of seconds from the anchor point to the expiration.
-
- Select a value between `10` and `7200`.
- """
-
-
-class RealtimeClientSecretConfigParam(TypedDict, total=False):
- expires_after: ExpiresAfter
- """Configuration for the ephemeral token expiration."""
src/openai/types/realtime/realtime_conversation_item_assistant_message.py
@@ -9,11 +9,27 @@ __all__ = ["RealtimeConversationItemAssistantMessage", "Content"]
class Content(BaseModel):
+ audio: Optional[str] = None
+ """
+    Base64-encoded audio bytes; these will be parsed as the format specified in the
+ session output audio type configuration. This defaults to PCM 16-bit 24kHz mono
+ if not specified.
+ """
+
text: Optional[str] = None
"""The text content."""
- type: Optional[Literal["text"]] = None
- """The content type. Always `text` for assistant messages."""
+ transcript: Optional[str] = None
+ """
+    The transcript of the audio content; this will always be present if the output
+ type is `audio`.
+ """
+
+ type: Optional[Literal["output_text", "output_audio"]] = None
+ """
+ The content type, `output_text` or `output_audio` depending on the session
+ `output_modalities` configuration.
+ """
class RealtimeConversationItemAssistantMessage(BaseModel):
@@ -27,10 +43,16 @@ class RealtimeConversationItemAssistantMessage(BaseModel):
"""The type of the item. Always `message`."""
id: Optional[str] = None
- """The unique ID of the item."""
+ """The unique ID of the item.
+
+ This may be provided by the client or generated by the server.
+ """
object: Optional[Literal["realtime.item"]] = None
- """Identifier for the API object being returned - always `realtime.item`."""
+ """Identifier for the API object being returned - always `realtime.item`.
+
+ Optional when creating a new item.
+ """
status: Optional[Literal["completed", "incomplete", "in_progress"]] = None
"""The status of the item. Has no effect on the conversation."""
src/openai/types/realtime/realtime_conversation_item_assistant_message_param.py
@@ -9,11 +9,27 @@ __all__ = ["RealtimeConversationItemAssistantMessageParam", "Content"]
class Content(TypedDict, total=False):
+ audio: str
+ """
+    Base64-encoded audio bytes; these will be parsed as the format specified in the
+ session output audio type configuration. This defaults to PCM 16-bit 24kHz mono
+ if not specified.
+ """
+
text: str
"""The text content."""
- type: Literal["text"]
- """The content type. Always `text` for assistant messages."""
+ transcript: str
+ """
+    The transcript of the audio content; this will always be present if the output
+ type is `audio`.
+ """
+
+ type: Literal["output_text", "output_audio"]
+ """
+ The content type, `output_text` or `output_audio` depending on the session
+ `output_modalities` configuration.
+ """
class RealtimeConversationItemAssistantMessageParam(TypedDict, total=False):
@@ -27,10 +43,16 @@ class RealtimeConversationItemAssistantMessageParam(TypedDict, total=False):
"""The type of the item. Always `message`."""
id: str
- """The unique ID of the item."""
+ """The unique ID of the item.
+
+ This may be provided by the client or generated by the server.
+ """
object: Literal["realtime.item"]
- """Identifier for the API object being returned - always `realtime.item`."""
+ """Identifier for the API object being returned - always `realtime.item`.
+
+ Optional when creating a new item.
+ """
status: Literal["completed", "incomplete", "in_progress"]
"""The status of the item. Has no effect on the conversation."""
src/openai/types/realtime/realtime_conversation_item_function_call.py
@@ -10,7 +10,11 @@ __all__ = ["RealtimeConversationItemFunctionCall"]
class RealtimeConversationItemFunctionCall(BaseModel):
arguments: str
- """The arguments of the function call."""
+ """The arguments of the function call.
+
+ This is a JSON-encoded string representing the arguments passed to the function,
+ for example `{"arg1": "value1", "arg2": 42}`.
+ """
name: str
"""The name of the function being called."""
@@ -19,13 +23,19 @@ class RealtimeConversationItemFunctionCall(BaseModel):
"""The type of the item. Always `function_call`."""
id: Optional[str] = None
- """The unique ID of the item."""
+ """The unique ID of the item.
+
+ This may be provided by the client or generated by the server.
+ """
call_id: Optional[str] = None
"""The ID of the function call."""
object: Optional[Literal["realtime.item"]] = None
- """Identifier for the API object being returned - always `realtime.item`."""
+ """Identifier for the API object being returned - always `realtime.item`.
+
+ Optional when creating a new item.
+ """
status: Optional[Literal["completed", "incomplete", "in_progress"]] = None
"""The status of the item. Has no effect on the conversation."""
src/openai/types/realtime/realtime_conversation_item_function_call_output.py
@@ -13,16 +13,25 @@ class RealtimeConversationItemFunctionCallOutput(BaseModel):
"""The ID of the function call this output is for."""
output: str
- """The output of the function call."""
+ """
+    The output of the function call; this is free text and can contain any
+ information or simply be empty.
+ """
type: Literal["function_call_output"]
"""The type of the item. Always `function_call_output`."""
id: Optional[str] = None
- """The unique ID of the item."""
+ """The unique ID of the item.
+
+ This may be provided by the client or generated by the server.
+ """
object: Optional[Literal["realtime.item"]] = None
- """Identifier for the API object being returned - always `realtime.item`."""
+ """Identifier for the API object being returned - always `realtime.item`.
+
+ Optional when creating a new item.
+ """
status: Optional[Literal["completed", "incomplete", "in_progress"]] = None
"""The status of the item. Has no effect on the conversation."""
src/openai/types/realtime/realtime_conversation_item_function_call_output_param.py
@@ -12,16 +12,25 @@ class RealtimeConversationItemFunctionCallOutputParam(TypedDict, total=False):
"""The ID of the function call this output is for."""
output: Required[str]
- """The output of the function call."""
+ """
+    The output of the function call; this is free text and can contain any
+ information or simply be empty.
+ """
type: Required[Literal["function_call_output"]]
"""The type of the item. Always `function_call_output`."""
id: str
- """The unique ID of the item."""
+ """The unique ID of the item.
+
+ This may be provided by the client or generated by the server.
+ """
object: Literal["realtime.item"]
- """Identifier for the API object being returned - always `realtime.item`."""
+ """Identifier for the API object being returned - always `realtime.item`.
+
+ Optional when creating a new item.
+ """
status: Literal["completed", "incomplete", "in_progress"]
"""The status of the item. Has no effect on the conversation."""
src/openai/types/realtime/realtime_conversation_item_function_call_param.py
@@ -9,7 +9,11 @@ __all__ = ["RealtimeConversationItemFunctionCallParam"]
class RealtimeConversationItemFunctionCallParam(TypedDict, total=False):
arguments: Required[str]
- """The arguments of the function call."""
+ """The arguments of the function call.
+
+ This is a JSON-encoded string representing the arguments passed to the function,
+ for example `{"arg1": "value1", "arg2": 42}`.
+ """
name: Required[str]
"""The name of the function being called."""
@@ -18,13 +22,19 @@ class RealtimeConversationItemFunctionCallParam(TypedDict, total=False):
"""The type of the item. Always `function_call`."""
id: str
- """The unique ID of the item."""
+ """The unique ID of the item.
+
+ This may be provided by the client or generated by the server.
+ """
call_id: str
"""The ID of the function call."""
object: Literal["realtime.item"]
- """Identifier for the API object being returned - always `realtime.item`."""
+ """Identifier for the API object being returned - always `realtime.item`.
+
+ Optional when creating a new item.
+ """
status: Literal["completed", "incomplete", "in_progress"]
"""The status of the item. Has no effect on the conversation."""
src/openai/types/realtime/realtime_conversation_item_system_message.py
@@ -27,10 +27,16 @@ class RealtimeConversationItemSystemMessage(BaseModel):
"""The type of the item. Always `message`."""
id: Optional[str] = None
- """The unique ID of the item."""
+ """The unique ID of the item.
+
+ This may be provided by the client or generated by the server.
+ """
object: Optional[Literal["realtime.item"]] = None
- """Identifier for the API object being returned - always `realtime.item`."""
+ """Identifier for the API object being returned - always `realtime.item`.
+
+ Optional when creating a new item.
+ """
status: Optional[Literal["completed", "incomplete", "in_progress"]] = None
"""The status of the item. Has no effect on the conversation."""
src/openai/types/realtime/realtime_conversation_item_system_message_param.py
@@ -27,10 +27,16 @@ class RealtimeConversationItemSystemMessageParam(TypedDict, total=False):
"""The type of the item. Always `message`."""
id: str
- """The unique ID of the item."""
+ """The unique ID of the item.
+
+ This may be provided by the client or generated by the server.
+ """
object: Literal["realtime.item"]
- """Identifier for the API object being returned - always `realtime.item`."""
+ """Identifier for the API object being returned - always `realtime.item`.
+
+ Optional when creating a new item.
+ """
status: Literal["completed", "incomplete", "in_progress"]
"""The status of the item. Has no effect on the conversation."""
src/openai/types/realtime/realtime_conversation_item_user_message.py
@@ -10,16 +10,37 @@ __all__ = ["RealtimeConversationItemUserMessage", "Content"]
class Content(BaseModel):
audio: Optional[str] = None
- """Base64-encoded audio bytes (for `input_audio`)."""
+ """
+    Base64-encoded audio bytes (for `input_audio`); these will be parsed as the
+ format specified in the session input audio type configuration. This defaults to
+ PCM 16-bit 24kHz mono if not specified.
+ """
+
+ detail: Optional[Literal["auto", "low", "high"]] = None
+ """The detail level of the image (for `input_image`).
+
+ `auto` will default to `high`.
+ """
+
+ image_url: Optional[str] = None
+ """Base64-encoded image bytes (for `input_image`) as a data URI.
+
+ For example `data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...`. Supported
+ formats are PNG and JPEG.
+ """
text: Optional[str] = None
"""The text content (for `input_text`)."""
transcript: Optional[str] = None
- """Transcript of the audio (for `input_audio`)."""
+ """Transcript of the audio (for `input_audio`).
- type: Optional[Literal["input_text", "input_audio"]] = None
- """The content type (`input_text` or `input_audio`)."""
+ This is not sent to the model, but will be attached to the message item for
+ reference.
+ """
+
+ type: Optional[Literal["input_text", "input_audio", "input_image"]] = None
+ """The content type (`input_text`, `input_audio`, or `input_image`)."""
class RealtimeConversationItemUserMessage(BaseModel):
@@ -33,10 +54,16 @@ class RealtimeConversationItemUserMessage(BaseModel):
"""The type of the item. Always `message`."""
id: Optional[str] = None
- """The unique ID of the item."""
+ """The unique ID of the item.
+
+ This may be provided by the client or generated by the server.
+ """
object: Optional[Literal["realtime.item"]] = None
- """Identifier for the API object being returned - always `realtime.item`."""
+ """Identifier for the API object being returned - always `realtime.item`.
+
+ Optional when creating a new item.
+ """
status: Optional[Literal["completed", "incomplete", "in_progress"]] = None
"""The status of the item. Has no effect on the conversation."""
src/openai/types/realtime/realtime_conversation_item_user_message_param.py
@@ -10,16 +10,37 @@ __all__ = ["RealtimeConversationItemUserMessageParam", "Content"]
class Content(TypedDict, total=False):
audio: str
- """Base64-encoded audio bytes (for `input_audio`)."""
+ """
+    Base64-encoded audio bytes (for `input_audio`); these will be parsed as the
+ format specified in the session input audio type configuration. This defaults to
+ PCM 16-bit 24kHz mono if not specified.
+ """
+
+ detail: Literal["auto", "low", "high"]
+ """The detail level of the image (for `input_image`).
+
+ `auto` will default to `high`.
+ """
+
+ image_url: str
+ """Base64-encoded image bytes (for `input_image`) as a data URI.
+
+ For example `data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...`. Supported
+ formats are PNG and JPEG.
+ """
text: str
"""The text content (for `input_text`)."""
transcript: str
- """Transcript of the audio (for `input_audio`)."""
+ """Transcript of the audio (for `input_audio`).
- type: Literal["input_text", "input_audio"]
- """The content type (`input_text` or `input_audio`)."""
+ This is not sent to the model, but will be attached to the message item for
+ reference.
+ """
+
+ type: Literal["input_text", "input_audio", "input_image"]
+ """The content type (`input_text`, `input_audio`, or `input_image`)."""
class RealtimeConversationItemUserMessageParam(TypedDict, total=False):
@@ -33,10 +54,16 @@ class RealtimeConversationItemUserMessageParam(TypedDict, total=False):
"""The type of the item. Always `message`."""
id: str
- """The unique ID of the item."""
+ """The unique ID of the item.
+
+ This may be provided by the client or generated by the server.
+ """
object: Literal["realtime.item"]
- """Identifier for the API object being returned - always `realtime.item`."""
+ """Identifier for the API object being returned - always `realtime.item`.
+
+ Optional when creating a new item.
+ """
status: Literal["completed", "incomplete", "in_progress"]
"""The status of the item. Has no effect on the conversation."""
src/openai/types/realtime/realtime_response.py
@@ -6,15 +6,39 @@ from typing_extensions import Literal
from ..._models import BaseModel
from ..shared.metadata import Metadata
from .conversation_item import ConversationItem
+from .realtime_audio_formats import RealtimeAudioFormats
from .realtime_response_usage import RealtimeResponseUsage
from .realtime_response_status import RealtimeResponseStatus
-__all__ = ["RealtimeResponse"]
+__all__ = ["RealtimeResponse", "Audio", "AudioOutput"]
+
+
+class AudioOutput(BaseModel):
+ format: Optional[RealtimeAudioFormats] = None
+ """The format of the output audio."""
+
+ voice: Union[
+ str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
+ ] = None
+ """The voice the model uses to respond.
+
+ Voice cannot be changed during the session once the model has responded with
+ audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
+ `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend
+ `marin` and `cedar` for best quality.
+ """
+
+
+class Audio(BaseModel):
+ output: Optional[AudioOutput] = None
class RealtimeResponse(BaseModel):
id: Optional[str] = None
- """The unique ID of the response."""
+ """The unique ID of the response, will look like `resp_1234`."""
+
+ audio: Optional[Audio] = None
+ """Configuration for audio output."""
conversation_id: Optional[str] = None
"""
@@ -23,8 +47,7 @@ class RealtimeResponse(BaseModel):
the default conversation and the value of `conversation_id` will be an id like
`conv_1234`. If `none`, the response will not be added to any conversation and
the value of `conversation_id` will be `null`. If responses are being triggered
- by server VAD, the response will be added to the default conversation, thus the
- `conversation_id` will be an id like `conv_1234`.
+    automatically by VAD, the response will be added to the default conversation.
"""
max_output_tokens: Union[int, Literal["inf"], None] = None
@@ -43,22 +66,19 @@ class RealtimeResponse(BaseModel):
a maximum length of 512 characters.
"""
- modalities: Optional[List[Literal["text", "audio"]]] = None
- """The set of modalities the model used to respond.
-
- If there are multiple modalities, the model will pick one, for example if
- `modalities` is `["text", "audio"]`, the model could be responding in either
- text or audio.
- """
-
object: Optional[Literal["realtime.response"]] = None
"""The object type, must be `realtime.response`."""
output: Optional[List[ConversationItem]] = None
"""The list of output items generated by the response."""
- output_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
- """The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`."""
+ output_modalities: Optional[List[Literal["text", "audio"]]] = None
+ """
+ The set of modalities the model used to respond, currently the only possible
+    values are `[\"audio\"]` and `[\"text\"]`. Audio output always includes a text
+    transcript. Setting the output mode to `text` will disable audio output from
+    the model.
+ """
status: Optional[Literal["completed", "cancelled", "failed", "incomplete", "in_progress"]] = None
"""
@@ -69,9 +89,6 @@ class RealtimeResponse(BaseModel):
status_details: Optional[RealtimeResponseStatus] = None
"""Additional details about the status."""
- temperature: Optional[float] = None
- """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
-
usage: Optional[RealtimeResponseUsage] = None
"""Usage statistics for the Response, this will correspond to billing.
@@ -79,11 +96,3 @@ class RealtimeResponse(BaseModel):
to the Conversation, thus output from previous turns (text and audio tokens)
will become the input for later turns.
"""
-
- voice: Union[
- str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
- ] = None
- """
- The voice the model used to respond. Current voice options are `alloy`, `ash`,
- `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
- """
src/openai/types/realtime/realtime_response_create_audio_output.py
@@ -0,0 +1,29 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Union, Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+from .realtime_audio_formats import RealtimeAudioFormats
+
+__all__ = ["RealtimeResponseCreateAudioOutput", "Output"]
+
+
+class Output(BaseModel):
+ format: Optional[RealtimeAudioFormats] = None
+ """The format of the output audio."""
+
+ voice: Union[
+ str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
+ ] = None
+ """The voice the model uses to respond.
+
+ Voice cannot be changed during the session once the model has responded with
+ audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
+ `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend
+ `marin` and `cedar` for best quality.
+ """
+
+
+class RealtimeResponseCreateAudioOutput(BaseModel):
+ output: Optional[Output] = None
src/openai/types/realtime/realtime_response_create_audio_output_param.py
@@ -0,0 +1,28 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import Union
+from typing_extensions import Literal, TypedDict
+
+from .realtime_audio_formats_param import RealtimeAudioFormatsParam
+
+__all__ = ["RealtimeResponseCreateAudioOutputParam", "Output"]
+
+
+class Output(TypedDict, total=False):
+ format: RealtimeAudioFormatsParam
+ """The format of the output audio."""
+
+ voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
+ """The voice the model uses to respond.
+
+ Voice cannot be changed during the session once the model has responded with
+ audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
+ `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend
+ `marin` and `cedar` for best quality.
+ """
+
+
+class RealtimeResponseCreateAudioOutputParam(TypedDict, total=False):
+ output: Output
src/openai/types/realtime/realtime_response_create_mcp_tool.py
@@ -0,0 +1,135 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Dict, List, Union, Optional
+from typing_extensions import Literal, TypeAlias
+
+from ..._models import BaseModel
+
+__all__ = [
+ "RealtimeResponseCreateMcpTool",
+ "AllowedTools",
+ "AllowedToolsMcpToolFilter",
+ "RequireApproval",
+ "RequireApprovalMcpToolApprovalFilter",
+ "RequireApprovalMcpToolApprovalFilterAlways",
+ "RequireApprovalMcpToolApprovalFilterNever",
+]
+
+
+class AllowedToolsMcpToolFilter(BaseModel):
+ read_only: Optional[bool] = None
+ """Indicates whether or not a tool modifies data or is read-only.
+
+ If an MCP server is
+ [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ it will match this filter.
+ """
+
+ tool_names: Optional[List[str]] = None
+ """List of allowed tool names."""
+
+
+AllowedTools: TypeAlias = Union[List[str], AllowedToolsMcpToolFilter, None]
+
+
+class RequireApprovalMcpToolApprovalFilterAlways(BaseModel):
+ read_only: Optional[bool] = None
+ """Indicates whether or not a tool modifies data or is read-only.
+
+ If an MCP server is
+ [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ it will match this filter.
+ """
+
+ tool_names: Optional[List[str]] = None
+ """List of allowed tool names."""
+
+
+class RequireApprovalMcpToolApprovalFilterNever(BaseModel):
+ read_only: Optional[bool] = None
+ """Indicates whether or not a tool modifies data or is read-only.
+
+ If an MCP server is
+ [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ it will match this filter.
+ """
+
+ tool_names: Optional[List[str]] = None
+ """List of allowed tool names."""
+
+
+class RequireApprovalMcpToolApprovalFilter(BaseModel):
+ always: Optional[RequireApprovalMcpToolApprovalFilterAlways] = None
+ """A filter object to specify which tools are allowed."""
+
+ never: Optional[RequireApprovalMcpToolApprovalFilterNever] = None
+ """A filter object to specify which tools are allowed."""
+
+
+RequireApproval: TypeAlias = Union[RequireApprovalMcpToolApprovalFilter, Literal["always", "never"], None]
+
+
+class RealtimeResponseCreateMcpTool(BaseModel):
+ server_label: str
+ """A label for this MCP server, used to identify it in tool calls."""
+
+ type: Literal["mcp"]
+ """The type of the MCP tool. Always `mcp`."""
+
+ allowed_tools: Optional[AllowedTools] = None
+ """List of allowed tool names or a filter object."""
+
+ authorization: Optional[str] = None
+ """
+ An OAuth access token that can be used with a remote MCP server, either with a
+ custom MCP server URL or a service connector. Your application must handle the
+ OAuth authorization flow and provide the token here.
+ """
+
+ connector_id: Optional[
+ Literal[
+ "connector_dropbox",
+ "connector_gmail",
+ "connector_googlecalendar",
+ "connector_googledrive",
+ "connector_microsoftteams",
+ "connector_outlookcalendar",
+ "connector_outlookemail",
+ "connector_sharepoint",
+ ]
+ ] = None
+ """Identifier for service connectors, like those available in ChatGPT.
+
+ One of `server_url` or `connector_id` must be provided. Learn more about service
+ connectors
+ [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors).
+
+ Currently supported `connector_id` values are:
+
+ - Dropbox: `connector_dropbox`
+ - Gmail: `connector_gmail`
+ - Google Calendar: `connector_googlecalendar`
+ - Google Drive: `connector_googledrive`
+ - Microsoft Teams: `connector_microsoftteams`
+ - Outlook Calendar: `connector_outlookcalendar`
+ - Outlook Email: `connector_outlookemail`
+ - SharePoint: `connector_sharepoint`
+ """
+
+ headers: Optional[Dict[str, str]] = None
+ """Optional HTTP headers to send to the MCP server.
+
+ Use for authentication or other purposes.
+ """
+
+ require_approval: Optional[RequireApproval] = None
+ """Specify which of the MCP server's tools require approval."""
+
+ server_description: Optional[str] = None
+ """Optional description of the MCP server, used to provide more context."""
+
+ server_url: Optional[str] = None
+ """The URL for the MCP server.
+
+ One of `server_url` or `connector_id` must be provided.
+ """
src/openai/types/realtime/realtime_response_create_mcp_tool_param.py
@@ -0,0 +1,135 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import Dict, Union, Optional
+from typing_extensions import Literal, Required, TypeAlias, TypedDict
+
+from ..._types import SequenceNotStr
+
+__all__ = [
+ "RealtimeResponseCreateMcpToolParam",
+ "AllowedTools",
+ "AllowedToolsMcpToolFilter",
+ "RequireApproval",
+ "RequireApprovalMcpToolApprovalFilter",
+ "RequireApprovalMcpToolApprovalFilterAlways",
+ "RequireApprovalMcpToolApprovalFilterNever",
+]
+
+
+class AllowedToolsMcpToolFilter(TypedDict, total=False):
+ read_only: bool
+ """Indicates whether or not a tool modifies data or is read-only.
+
+ If an MCP server is
+ [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ it will match this filter.
+ """
+
+ tool_names: SequenceNotStr[str]
+ """List of allowed tool names."""
+
+
+AllowedTools: TypeAlias = Union[SequenceNotStr[str], AllowedToolsMcpToolFilter]
+
+
+class RequireApprovalMcpToolApprovalFilterAlways(TypedDict, total=False):
+ read_only: bool
+ """Indicates whether or not a tool modifies data or is read-only.
+
+ If an MCP server is
+ [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ it will match this filter.
+ """
+
+ tool_names: SequenceNotStr[str]
+ """List of allowed tool names."""
+
+
+class RequireApprovalMcpToolApprovalFilterNever(TypedDict, total=False):
+ read_only: bool
+ """Indicates whether or not a tool modifies data or is read-only.
+
+ If an MCP server is
+ [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ it will match this filter.
+ """
+
+ tool_names: SequenceNotStr[str]
+ """List of allowed tool names."""
+
+
+class RequireApprovalMcpToolApprovalFilter(TypedDict, total=False):
+ always: RequireApprovalMcpToolApprovalFilterAlways
+ """A filter object to specify which tools are allowed."""
+
+ never: RequireApprovalMcpToolApprovalFilterNever
+ """A filter object to specify which tools are allowed."""
+
+
+RequireApproval: TypeAlias = Union[RequireApprovalMcpToolApprovalFilter, Literal["always", "never"]]
+
+
+class RealtimeResponseCreateMcpToolParam(TypedDict, total=False):
+ server_label: Required[str]
+ """A label for this MCP server, used to identify it in tool calls."""
+
+ type: Required[Literal["mcp"]]
+ """The type of the MCP tool. Always `mcp`."""
+
+ allowed_tools: Optional[AllowedTools]
+ """List of allowed tool names or a filter object."""
+
+ authorization: str
+ """
+ An OAuth access token that can be used with a remote MCP server, either with a
+ custom MCP server URL or a service connector. Your application must handle the
+ OAuth authorization flow and provide the token here.
+ """
+
+ connector_id: Literal[
+ "connector_dropbox",
+ "connector_gmail",
+ "connector_googlecalendar",
+ "connector_googledrive",
+ "connector_microsoftteams",
+ "connector_outlookcalendar",
+ "connector_outlookemail",
+ "connector_sharepoint",
+ ]
+ """Identifier for service connectors, like those available in ChatGPT.
+
+ One of `server_url` or `connector_id` must be provided. Learn more about service
+ connectors
+ [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors).
+
+ Currently supported `connector_id` values are:
+
+ - Dropbox: `connector_dropbox`
+ - Gmail: `connector_gmail`
+ - Google Calendar: `connector_googlecalendar`
+ - Google Drive: `connector_googledrive`
+ - Microsoft Teams: `connector_microsoftteams`
+ - Outlook Calendar: `connector_outlookcalendar`
+ - Outlook Email: `connector_outlookemail`
+ - SharePoint: `connector_sharepoint`
+ """
+
+ headers: Optional[Dict[str, str]]
+ """Optional HTTP headers to send to the MCP server.
+
+ Use for authentication or other purposes.
+ """
+
+ require_approval: Optional[RequireApproval]
+ """Specify which of the MCP server's tools require approval."""
+
+ server_description: str
+ """Optional description of the MCP server, used to provide more context."""
+
+ server_url: str
+ """The URL for the MCP server.
+
+ One of `server_url` or `connector_id` must be provided.
+ """
src/openai/types/realtime/realtime_response_create_params.py
@@ -0,0 +1,98 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Union, Optional
+from typing_extensions import Literal, TypeAlias
+
+from .models import Models
+from ..._models import BaseModel
+from ..shared.metadata import Metadata
+from .conversation_item import ConversationItem
+from ..responses.response_prompt import ResponsePrompt
+from ..responses.tool_choice_mcp import ToolChoiceMcp
+from ..responses.tool_choice_options import ToolChoiceOptions
+from ..responses.tool_choice_function import ToolChoiceFunction
+from .realtime_response_create_mcp_tool import RealtimeResponseCreateMcpTool
+from .realtime_response_create_audio_output import RealtimeResponseCreateAudioOutput
+
+__all__ = ["RealtimeResponseCreateParams", "ToolChoice", "Tool"]
+
+ToolChoice: TypeAlias = Union[ToolChoiceOptions, ToolChoiceFunction, ToolChoiceMcp]
+
+Tool: TypeAlias = Union[Models, RealtimeResponseCreateMcpTool]
+
+
+class RealtimeResponseCreateParams(BaseModel):
+ audio: Optional[RealtimeResponseCreateAudioOutput] = None
+ """Configuration for audio input and output."""
+
+ conversation: Union[str, Literal["auto", "none"], None] = None
+ """Controls which conversation the response is added to.
+
+ Currently supports `auto` and `none`, with `auto` as the default value. The
+ `auto` value means that the contents of the response will be added to the
+ default conversation. Set this to `none` to create an out-of-band response which
+    will not add items to the default conversation.
+ """
+
+ input: Optional[List[ConversationItem]] = None
+ """Input items to include in the prompt for the model.
+
+ Using this field creates a new context for this Response instead of using the
+ default conversation. An empty array `[]` will clear the context for this
+ Response. Note that this can include references to items that previously
+ appeared in the session using their id.
+ """
+
+ instructions: Optional[str] = None
+ """The default system instructions (i.e.
+
+ system message) prepended to model calls. This field allows the client to guide
+ the model on desired responses. The model can be instructed on response content
+ and format, (e.g. "be extremely succinct", "act friendly", "here are examples of
+ good responses") and on audio behavior (e.g. "talk quickly", "inject emotion
+ into your voice", "laugh frequently"). The instructions are not guaranteed to be
+ followed by the model, but they provide guidance to the model on the desired
+ behavior. Note that the server sets default instructions which will be used if
+ this field is not set and are visible in the `session.created` event at the
+ start of the session.
+ """
+
+ max_output_tokens: Union[int, Literal["inf"], None] = None
+ """
+ Maximum number of output tokens for a single assistant response, inclusive of
+ tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ """
+
+ metadata: Optional[Metadata] = None
+ """Set of 16 key-value pairs that can be attached to an object.
+
+ This can be useful for storing additional information about the object in a
+ structured format, and querying for objects via API or the dashboard.
+
+ Keys are strings with a maximum length of 64 characters. Values are strings with
+ a maximum length of 512 characters.
+ """
+
+ output_modalities: Optional[List[Literal["text", "audio"]]] = None
+ """
+ The set of modalities the model used to respond, currently the only possible
+    values are `[\"audio\"]` and `[\"text\"]`. Audio output always includes a text
+    transcript. Setting the output mode to `text` will disable audio output from
+    the model.
+ """
+
+ prompt: Optional[ResponsePrompt] = None
+ """Reference to a prompt template and its variables.
+
+ [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
+ """
+
+ tool_choice: Optional[ToolChoice] = None
+ """How the model chooses tools.
+
+ Provide one of the string modes or force a specific function/MCP tool.
+ """
+
+ tools: Optional[List[Tool]] = None
+ """Tools available to the model."""
src/openai/types/realtime/realtime_response_create_params_param.py
@@ -0,0 +1,99 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import List, Union, Iterable, Optional
+from typing_extensions import Literal, TypeAlias, TypedDict
+
+from .models_param import ModelsParam
+from ..shared_params.metadata import Metadata
+from .conversation_item_param import ConversationItemParam
+from ..responses.tool_choice_options import ToolChoiceOptions
+from ..responses.response_prompt_param import ResponsePromptParam
+from ..responses.tool_choice_mcp_param import ToolChoiceMcpParam
+from ..responses.tool_choice_function_param import ToolChoiceFunctionParam
+from .realtime_response_create_mcp_tool_param import RealtimeResponseCreateMcpToolParam
+from .realtime_response_create_audio_output_param import RealtimeResponseCreateAudioOutputParam
+
+__all__ = ["RealtimeResponseCreateParamsParam", "ToolChoice", "Tool"]
+
+ToolChoice: TypeAlias = Union[ToolChoiceOptions, ToolChoiceFunctionParam, ToolChoiceMcpParam]
+
+Tool: TypeAlias = Union[ModelsParam, RealtimeResponseCreateMcpToolParam]
+
+
+class RealtimeResponseCreateParamsParam(TypedDict, total=False):
+ audio: RealtimeResponseCreateAudioOutputParam
+ """Configuration for audio input and output."""
+
+ conversation: Union[str, Literal["auto", "none"]]
+ """Controls which conversation the response is added to.
+
+ Currently supports `auto` and `none`, with `auto` as the default value. The
+ `auto` value means that the contents of the response will be added to the
+ default conversation. Set this to `none` to create an out-of-band response which
+    will not add items to the default conversation.
+ """
+
+ input: Iterable[ConversationItemParam]
+ """Input items to include in the prompt for the model.
+
+ Using this field creates a new context for this Response instead of using the
+ default conversation. An empty array `[]` will clear the context for this
+ Response. Note that this can include references to items that previously
+ appeared in the session using their id.
+ """
+
+ instructions: str
+ """The default system instructions (i.e.
+
+ system message) prepended to model calls. This field allows the client to guide
+ the model on desired responses. The model can be instructed on response content
+    and format (e.g. "be extremely succinct", "act friendly", "here are examples of
+ good responses") and on audio behavior (e.g. "talk quickly", "inject emotion
+ into your voice", "laugh frequently"). The instructions are not guaranteed to be
+ followed by the model, but they provide guidance to the model on the desired
+ behavior. Note that the server sets default instructions which will be used if
+ this field is not set and are visible in the `session.created` event at the
+ start of the session.
+ """
+
+ max_output_tokens: Union[int, Literal["inf"]]
+ """
+ Maximum number of output tokens for a single assistant response, inclusive of
+ tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ """
+
+ metadata: Optional[Metadata]
+ """Set of 16 key-value pairs that can be attached to an object.
+
+ This can be useful for storing additional information about the object in a
+ structured format, and querying for objects via API or the dashboard.
+
+ Keys are strings with a maximum length of 64 characters. Values are strings with
+ a maximum length of 512 characters.
+ """
+
+ output_modalities: List[Literal["text", "audio"]]
+ """
+ The set of modalities the model used to respond, currently the only possible
+    values are `[\"audio\"]` and `[\"text\"]`. Audio output always includes a text
+    transcript. Setting the output mode to `text` will disable audio output from
+    the model.
+ """
+
+ prompt: Optional[ResponsePromptParam]
+ """Reference to a prompt template and its variables.
+
+ [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
+ """
+
+ tool_choice: ToolChoice
+ """How the model chooses tools.
+
+ Provide one of the string modes or force a specific function/MCP tool.
+ """
+
+ tools: Iterable[Tool]
+ """Tools available to the model."""
src/openai/types/realtime/realtime_response_usage.py
@@ -11,7 +11,13 @@ __all__ = ["RealtimeResponseUsage"]
class RealtimeResponseUsage(BaseModel):
input_token_details: Optional[RealtimeResponseUsageInputTokenDetails] = None
- """Details about the input tokens used in the Response."""
+ """Details about the input tokens used in the Response.
+
+ Cached tokens are tokens from previous turns in the conversation that are
+ included as context for the current response. Cached tokens here are counted as
+ a subset of input tokens, meaning input tokens will include cached and uncached
+ tokens.
+ """
input_tokens: Optional[int] = None
"""
src/openai/types/realtime/realtime_response_usage_input_token_details.py
@@ -4,15 +4,32 @@ from typing import Optional
from ..._models import BaseModel
-__all__ = ["RealtimeResponseUsageInputTokenDetails"]
+__all__ = ["RealtimeResponseUsageInputTokenDetails", "CachedTokensDetails"]
+
+
+class CachedTokensDetails(BaseModel):
+ audio_tokens: Optional[int] = None
+ """The number of cached audio tokens used as input for the Response."""
+
+ image_tokens: Optional[int] = None
+ """The number of cached image tokens used as input for the Response."""
+
+ text_tokens: Optional[int] = None
+ """The number of cached text tokens used as input for the Response."""
class RealtimeResponseUsageInputTokenDetails(BaseModel):
audio_tokens: Optional[int] = None
- """The number of audio tokens used in the Response."""
+ """The number of audio tokens used as input for the Response."""
cached_tokens: Optional[int] = None
- """The number of cached tokens used in the Response."""
+ """The number of cached tokens used as input for the Response."""
+
+ cached_tokens_details: Optional[CachedTokensDetails] = None
+ """Details about the cached tokens used as input for the Response."""
+
+ image_tokens: Optional[int] = None
+ """The number of image tokens used as input for the Response."""
text_tokens: Optional[int] = None
- """The number of text tokens used in the Response."""
+ """The number of text tokens used as input for the Response."""
src/openai/types/realtime/realtime_session.py
@@ -1,307 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List, Union, Optional
-from typing_extensions import Literal, TypeAlias
-
-from ..._models import BaseModel
-from ..responses.response_prompt import ResponsePrompt
-
-__all__ = [
- "RealtimeSession",
- "InputAudioNoiseReduction",
- "InputAudioTranscription",
- "Tool",
- "Tracing",
- "TracingTracingConfiguration",
- "TurnDetection",
-]
-
-
-class InputAudioNoiseReduction(BaseModel):
- type: Optional[Literal["near_field", "far_field"]] = None
- """Type of noise reduction.
-
- `near_field` is for close-talking microphones such as headphones, `far_field` is
- for far-field microphones such as laptop or conference room microphones.
- """
-
-
-class InputAudioTranscription(BaseModel):
- language: Optional[str] = None
- """The language of the input audio.
-
- Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- """
-
- model: Optional[str] = None
- """
- The model to use for transcription, current options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, and `whisper-1`.
- """
-
- prompt: Optional[str] = None
- """
- An optional text to guide the model's style or continue a previous audio
- segment. For `whisper-1`, the
- [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- "expect words related to technology".
- """
-
-
-class Tool(BaseModel):
- description: Optional[str] = None
- """
- The description of the function, including guidance on when and how to call it,
- and guidance about what to tell the user when calling (if anything).
- """
-
- name: Optional[str] = None
- """The name of the function."""
-
- parameters: Optional[object] = None
- """Parameters of the function in JSON Schema."""
-
- type: Optional[Literal["function"]] = None
- """The type of the tool, i.e. `function`."""
-
-
-class TracingTracingConfiguration(BaseModel):
- group_id: Optional[str] = None
- """
- The group id to attach to this trace to enable filtering and grouping in the
- traces dashboard.
- """
-
- metadata: Optional[object] = None
- """
- The arbitrary metadata to attach to this trace to enable filtering in the traces
- dashboard.
- """
-
- workflow_name: Optional[str] = None
- """The name of the workflow to attach to this trace.
-
- This is used to name the trace in the traces dashboard.
- """
-
-
-Tracing: TypeAlias = Union[Literal["auto"], TracingTracingConfiguration, None]
-
-
-class TurnDetection(BaseModel):
- create_response: Optional[bool] = None
- """
- Whether or not to automatically generate a response when a VAD stop event
- occurs.
- """
-
- eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
- """Used only for `semantic_vad` mode.
-
- The eagerness of the model to respond. `low` will wait longer for the user to
- continue speaking, `high` will respond more quickly. `auto` is the default and
- is equivalent to `medium`.
- """
-
- idle_timeout_ms: Optional[int] = None
- """
- Optional idle timeout after which turn detection will auto-timeout when no
- additional audio is received.
- """
-
- interrupt_response: Optional[bool] = None
- """
- Whether or not to automatically interrupt any ongoing response with output to
- the default conversation (i.e. `conversation` of `auto`) when a VAD start event
- occurs.
- """
-
- prefix_padding_ms: Optional[int] = None
- """Used only for `server_vad` mode.
-
- Amount of audio to include before the VAD detected speech (in milliseconds).
- Defaults to 300ms.
- """
-
- silence_duration_ms: Optional[int] = None
- """Used only for `server_vad` mode.
-
- Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
- With shorter values the model will respond more quickly, but may jump in on
- short pauses from the user.
- """
-
- threshold: Optional[float] = None
- """Used only for `server_vad` mode.
-
- Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
- threshold will require louder audio to activate the model, and thus might
- perform better in noisy environments.
- """
-
- type: Optional[Literal["server_vad", "semantic_vad"]] = None
- """Type of turn detection."""
-
-
-class RealtimeSession(BaseModel):
- id: Optional[str] = None
- """Unique identifier for the session that looks like `sess_1234567890abcdef`."""
-
- expires_at: Optional[int] = None
- """Expiration timestamp for the session, in seconds since epoch."""
-
- include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None
- """Additional fields to include in server outputs.
-
- - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- transcription.
- """
-
- input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
- """The format of input audio.
-
- Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
- be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
- byte order.
- """
-
- input_audio_noise_reduction: Optional[InputAudioNoiseReduction] = None
- """Configuration for input audio noise reduction.
-
- This can be set to `null` to turn off. Noise reduction filters audio added to
- the input audio buffer before it is sent to VAD and the model. Filtering the
- audio can improve VAD and turn detection accuracy (reducing false positives) and
- model performance by improving perception of the input audio.
- """
-
- input_audio_transcription: Optional[InputAudioTranscription] = None
- """
- Configuration for input audio transcription, defaults to off and can be set to
- `null` to turn off once on. Input audio transcription is not native to the
- model, since the model consumes audio directly. Transcription runs
- asynchronously through
- [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- and should be treated as guidance of input audio content rather than precisely
- what the model heard. The client can optionally set the language and prompt for
- transcription, these offer additional guidance to the transcription service.
- """
-
- instructions: Optional[str] = None
- """The default system instructions (i.e.
-
- system message) prepended to model calls. This field allows the client to guide
- the model on desired responses. The model can be instructed on response content
- and format, (e.g. "be extremely succinct", "act friendly", "here are examples of
- good responses") and on audio behavior (e.g. "talk quickly", "inject emotion
- into your voice", "laugh frequently"). The instructions are not guaranteed to be
- followed by the model, but they provide guidance to the model on the desired
- behavior.
-
- Note that the server sets default instructions which will be used if this field
- is not set and are visible in the `session.created` event at the start of the
- session.
- """
-
- max_response_output_tokens: Union[int, Literal["inf"], None] = None
- """
- Maximum number of output tokens for a single assistant response, inclusive of
- tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
- `inf` for the maximum available tokens for a given model. Defaults to `inf`.
- """
-
- modalities: Optional[List[Literal["text", "audio"]]] = None
- """The set of modalities the model can respond with.
-
- To disable audio, set this to ["text"].
- """
-
- model: Optional[
- Literal[
- "gpt-realtime",
- "gpt-realtime-2025-08-28",
- "gpt-4o-realtime-preview",
- "gpt-4o-realtime-preview-2024-10-01",
- "gpt-4o-realtime-preview-2024-12-17",
- "gpt-4o-realtime-preview-2025-06-03",
- "gpt-4o-mini-realtime-preview",
- "gpt-4o-mini-realtime-preview-2024-12-17",
- ]
- ] = None
- """The Realtime model used for this session."""
-
- object: Optional[Literal["realtime.session"]] = None
- """The object type. Always `realtime.session`."""
-
- output_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
- """The format of output audio.
-
- Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, output audio is
- sampled at a rate of 24kHz.
- """
-
- prompt: Optional[ResponsePrompt] = None
- """Reference to a prompt template and its variables.
-
- [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
- """
-
- speed: Optional[float] = None
- """The speed of the model's spoken response.
-
- 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
- This value can only be changed in between model turns, not while a response is
- in progress.
- """
-
- temperature: Optional[float] = None
- """Sampling temperature for the model, limited to [0.6, 1.2].
-
- For audio models a temperature of 0.8 is highly recommended for best
- performance.
- """
-
- tool_choice: Optional[str] = None
- """How the model chooses tools.
-
- Options are `auto`, `none`, `required`, or specify a function.
- """
-
- tools: Optional[List[Tool]] = None
- """Tools (functions) available to the model."""
-
- tracing: Optional[Tracing] = None
- """Configuration options for tracing.
-
- Set to null to disable tracing. Once tracing is enabled for a session, the
- configuration cannot be modified.
-
- `auto` will create a trace for the session with default values for the workflow
- name, group id, and metadata.
- """
-
- turn_detection: Optional[TurnDetection] = None
- """Configuration for turn detection, ether Server VAD or Semantic VAD.
-
- This can be set to `null` to turn off, in which case the client must manually
- trigger model response. Server VAD means that the model will detect the start
- and end of speech based on audio volume and respond at the end of user speech.
- Semantic VAD is more advanced and uses a turn detection model (in conjunction
- with VAD) to semantically estimate whether the user has finished speaking, then
- dynamically sets a timeout based on this probability. For example, if user audio
- trails off with "uhhm", the model will score a low probability of turn end and
- wait longer for the user to continue speaking. This can be useful for more
- natural conversations, but may have a higher latency.
- """
-
- voice: Union[
- str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
- ] = None
- """The voice the model uses to respond.
-
- Voice cannot be changed during the session once the model has responded with
- audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
- `coral`, `echo`, `sage`, `shimmer`, and `verse`.
- """
src/openai/types/realtime/realtime_session_client_secret.py
@@ -0,0 +1,20 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from ..._models import BaseModel
+
+__all__ = ["RealtimeSessionClientSecret"]
+
+
+class RealtimeSessionClientSecret(BaseModel):
+ expires_at: int
+ """Timestamp for when the token expires.
+
+ Currently, all tokens expire after one minute.
+ """
+
+ value: str
+ """
+ Ephemeral key usable in client environments to authenticate connections to the
+ Realtime API. Use this in client-side environments rather than a standard API
+ token, which should only be used server-side.
+ """
src/openai/types/realtime/realtime_session_create_request.py
@@ -10,43 +10,22 @@ from .realtime_tools_config import RealtimeToolsConfig
from .realtime_tracing_config import RealtimeTracingConfig
from ..responses.response_prompt import ResponsePrompt
from .realtime_tool_choice_config import RealtimeToolChoiceConfig
-from .realtime_client_secret_config import RealtimeClientSecretConfig
__all__ = ["RealtimeSessionCreateRequest"]
class RealtimeSessionCreateRequest(BaseModel):
- model: Union[
- str,
- Literal[
- "gpt-realtime",
- "gpt-realtime-2025-08-28",
- "gpt-4o-realtime",
- "gpt-4o-mini-realtime",
- "gpt-4o-realtime-preview",
- "gpt-4o-realtime-preview-2024-10-01",
- "gpt-4o-realtime-preview-2024-12-17",
- "gpt-4o-realtime-preview-2025-06-03",
- "gpt-4o-mini-realtime-preview",
- "gpt-4o-mini-realtime-preview-2024-12-17",
- ],
- ]
- """The Realtime model used for this session."""
-
type: Literal["realtime"]
"""The type of session to create. Always `realtime` for the Realtime API."""
audio: Optional[RealtimeAudioConfig] = None
"""Configuration for input and output audio."""
- client_secret: Optional[RealtimeClientSecretConfig] = None
- """Configuration options for the generated client secret."""
-
include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None
"""Additional fields to include in server outputs.
- - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- transcription.
+ `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ transcription.
"""
instructions: Optional[str] = None
@@ -72,10 +51,28 @@ class RealtimeSessionCreateRequest(BaseModel):
`inf` for the maximum available tokens for a given model. Defaults to `inf`.
"""
+ model: Union[
+ str,
+ Literal[
+ "gpt-realtime",
+ "gpt-realtime-2025-08-28",
+ "gpt-4o-realtime-preview",
+ "gpt-4o-realtime-preview-2024-10-01",
+ "gpt-4o-realtime-preview-2024-12-17",
+ "gpt-4o-realtime-preview-2025-06-03",
+ "gpt-4o-mini-realtime-preview",
+ "gpt-4o-mini-realtime-preview-2024-12-17",
+ ],
+ None,
+ ] = None
+ """The Realtime model used for this session."""
+
output_modalities: Optional[List[Literal["text", "audio"]]] = None
"""The set of modalities the model can respond with.
- To disable audio, set this to ["text"].
+ It defaults to `["audio"]`, indicating that the model will respond with audio
+ plus a transcript. `["text"]` can be used to make the model respond with text
+ only. It is not possible to request both `text` and `audio` at the same time.
"""
prompt: Optional[ResponsePrompt] = None
@@ -84,13 +81,6 @@ class RealtimeSessionCreateRequest(BaseModel):
[Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
"""
- temperature: Optional[float] = None
- """Sampling temperature for the model, limited to [0.6, 1.2].
-
- For audio models a temperature of 0.8 is highly recommended for best
- performance.
- """
-
tool_choice: Optional[RealtimeToolChoiceConfig] = None
"""How the model chooses tools.
@@ -101,10 +91,10 @@ class RealtimeSessionCreateRequest(BaseModel):
"""Tools available to the model."""
tracing: Optional[RealtimeTracingConfig] = None
- """Configuration options for tracing.
-
- Set to null to disable tracing. Once tracing is enabled for a session, the
- configuration cannot be modified.
+ """
+ The Realtime API can write session traces to the

+ [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once
+ tracing is enabled for a session, the configuration cannot be modified.
`auto` will create a trace for the session with default values for the workflow
name, group id, and metadata.
@@ -113,6 +103,5 @@ class RealtimeSessionCreateRequest(BaseModel):
truncation: Optional[RealtimeTruncation] = None
"""
Controls how the realtime conversation is truncated prior to model inference.
- The default is `auto`. When set to `retention_ratio`, the server retains a
- fraction of the conversation tokens prior to the instructions.
+ The default is `auto`.
"""
src/openai/types/realtime/realtime_session_create_request_param.py
@@ -11,45 +11,22 @@ from .realtime_tools_config_param import RealtimeToolsConfigParam
from .realtime_tracing_config_param import RealtimeTracingConfigParam
from ..responses.response_prompt_param import ResponsePromptParam
from .realtime_tool_choice_config_param import RealtimeToolChoiceConfigParam
-from .realtime_client_secret_config_param import RealtimeClientSecretConfigParam
__all__ = ["RealtimeSessionCreateRequestParam"]
class RealtimeSessionCreateRequestParam(TypedDict, total=False):
- model: Required[
- Union[
- str,
- Literal[
- "gpt-realtime",
- "gpt-realtime-2025-08-28",
- "gpt-4o-realtime",
- "gpt-4o-mini-realtime",
- "gpt-4o-realtime-preview",
- "gpt-4o-realtime-preview-2024-10-01",
- "gpt-4o-realtime-preview-2024-12-17",
- "gpt-4o-realtime-preview-2025-06-03",
- "gpt-4o-mini-realtime-preview",
- "gpt-4o-mini-realtime-preview-2024-12-17",
- ],
- ]
- ]
- """The Realtime model used for this session."""
-
type: Required[Literal["realtime"]]
"""The type of session to create. Always `realtime` for the Realtime API."""
audio: RealtimeAudioConfigParam
"""Configuration for input and output audio."""
- client_secret: RealtimeClientSecretConfigParam
- """Configuration options for the generated client secret."""
-
include: List[Literal["item.input_audio_transcription.logprobs"]]
"""Additional fields to include in server outputs.
- - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- transcription.
+ `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ transcription.
"""
instructions: str
@@ -75,10 +52,27 @@ class RealtimeSessionCreateRequestParam(TypedDict, total=False):
`inf` for the maximum available tokens for a given model. Defaults to `inf`.
"""
+ model: Union[
+ str,
+ Literal[
+ "gpt-realtime",
+ "gpt-realtime-2025-08-28",
+ "gpt-4o-realtime-preview",
+ "gpt-4o-realtime-preview-2024-10-01",
+ "gpt-4o-realtime-preview-2024-12-17",
+ "gpt-4o-realtime-preview-2025-06-03",
+ "gpt-4o-mini-realtime-preview",
+ "gpt-4o-mini-realtime-preview-2024-12-17",
+ ],
+ ]
+ """The Realtime model used for this session."""
+
output_modalities: List[Literal["text", "audio"]]
"""The set of modalities the model can respond with.
- To disable audio, set this to ["text"].
+ It defaults to `["audio"]`, indicating that the model will respond with audio
+ plus a transcript. `["text"]` can be used to make the model respond with text
+ only. It is not possible to request both `text` and `audio` at the same time.
"""
prompt: Optional[ResponsePromptParam]
@@ -87,13 +81,6 @@ class RealtimeSessionCreateRequestParam(TypedDict, total=False):
[Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
"""
- temperature: float
- """Sampling temperature for the model, limited to [0.6, 1.2].
-
- For audio models a temperature of 0.8 is highly recommended for best
- performance.
- """
-
tool_choice: RealtimeToolChoiceConfigParam
"""How the model chooses tools.
@@ -104,10 +91,10 @@ class RealtimeSessionCreateRequestParam(TypedDict, total=False):
"""Tools available to the model."""
tracing: Optional[RealtimeTracingConfigParam]
- """Configuration options for tracing.
-
- Set to null to disable tracing. Once tracing is enabled for a session, the
- configuration cannot be modified.
+ """
+ The Realtime API can write session traces to the
+ [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once
+ tracing is enabled for a session, the configuration cannot be modified.
`auto` will create a trace for the session with default values for the workflow
name, group id, and metadata.
@@ -116,6 +103,5 @@ class RealtimeSessionCreateRequestParam(TypedDict, total=False):
truncation: RealtimeTruncationParam
"""
Controls how the realtime conversation is truncated prior to model inference.
- The default is `auto`. When set to `retention_ratio`, the server retains a
- fraction of the conversation tokens prior to the instructions.
+ The default is `auto`.
"""
src/openai/types/realtime/realtime_session_create_response.py
@@ -1,74 +1,171 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import List, Union, Optional
+from typing import Dict, List, Union, Optional
from typing_extensions import Literal, TypeAlias
+from .models import Models
from ..._models import BaseModel
+from .audio_transcription import AudioTranscription
+from .realtime_truncation import RealtimeTruncation
+from .noise_reduction_type import NoiseReductionType
+from .realtime_audio_formats import RealtimeAudioFormats
+from ..responses.response_prompt import ResponsePrompt
+from ..responses.tool_choice_mcp import ToolChoiceMcp
+from ..responses.tool_choice_options import ToolChoiceOptions
+from .realtime_session_client_secret import RealtimeSessionClientSecret
+from ..responses.tool_choice_function import ToolChoiceFunction
__all__ = [
"RealtimeSessionCreateResponse",
"Audio",
"AudioInput",
"AudioInputNoiseReduction",
- "AudioInputTranscription",
"AudioInputTurnDetection",
"AudioOutput",
+ "ToolChoice",
"Tool",
+ "ToolMcpTool",
+ "ToolMcpToolAllowedTools",
+ "ToolMcpToolAllowedToolsMcpToolFilter",
+ "ToolMcpToolRequireApproval",
+ "ToolMcpToolRequireApprovalMcpToolApprovalFilter",
+ "ToolMcpToolRequireApprovalMcpToolApprovalFilterAlways",
+ "ToolMcpToolRequireApprovalMcpToolApprovalFilterNever",
"Tracing",
"TracingTracingConfiguration",
- "TurnDetection",
]
class AudioInputNoiseReduction(BaseModel):
- type: Optional[Literal["near_field", "far_field"]] = None
+ type: Optional[NoiseReductionType] = None
+ """Type of noise reduction.
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
-class AudioInputTranscription(BaseModel):
- language: Optional[str] = None
- """The language of the input audio."""
- model: Optional[str] = None
- """The model to use for transcription."""
+class AudioInputTurnDetection(BaseModel):
+ create_response: Optional[bool] = None
+ """
+ Whether or not to automatically generate a response when a VAD stop event
+ occurs.
+ """
- prompt: Optional[str] = None
- """Optional text to guide the model's style or continue a previous audio segment."""
+ eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
+ """Used only for `semantic_vad` mode.
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s,
+ 4s, and 2s respectively.
+ """
+
+ idle_timeout_ms: Optional[int] = None
+ """
+ Optional idle timeout after which turn detection will auto-timeout when no
+ additional audio is received.
+ """
+
+ interrupt_response: Optional[bool] = None
+ """
+ Whether or not to automatically interrupt any ongoing response with output to
+ the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ occurs.
+ """
-class AudioInputTurnDetection(BaseModel):
prefix_padding_ms: Optional[int] = None
+ """Used only for `server_vad` mode.
+
+ Amount of audio to include before the VAD detected speech (in milliseconds).
+ Defaults to 300ms.
+ """
silence_duration_ms: Optional[int] = None
+ """Used only for `server_vad` mode.
+
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
+ """
threshold: Optional[float] = None
+ """Used only for `server_vad` mode.
- type: Optional[str] = None
- """Type of turn detection, only `server_vad` is currently supported."""
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
+ """
+
+ type: Optional[Literal["server_vad", "semantic_vad"]] = None
+ """Type of turn detection."""
class AudioInput(BaseModel):
- format: Optional[str] = None
- """The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`."""
+ format: Optional[RealtimeAudioFormats] = None
+ """The format of the input audio."""
noise_reduction: Optional[AudioInputNoiseReduction] = None
- """Configuration for input audio noise reduction."""
+ """Configuration for input audio noise reduction.
- transcription: Optional[AudioInputTranscription] = None
- """Configuration for input audio transcription."""
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
+ transcription: Optional[AudioTranscription] = None
+ """
+ Configuration for input audio transcription, defaults to off and can be set to
+ `null` to turn off once on. Input audio transcription is not native to the
+ model, since the model consumes audio directly. Transcription runs
+ asynchronously through
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
+ """
turn_detection: Optional[AudioInputTurnDetection] = None
- """Configuration for turn detection."""
+ """Configuration for turn detection, ether Server VAD or Semantic VAD.
+
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+ Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
+ """
class AudioOutput(BaseModel):
- format: Optional[str] = None
- """The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`."""
+ format: Optional[RealtimeAudioFormats] = None
+ """The format of the output audio."""
speed: Optional[float] = None
+ """
+ The speed of the model's spoken response as a multiple of the original speed.
+ 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
+ This value can only be changed in between model turns, not while a response is
+ in progress.
+
+ This parameter is a post-processing adjustment to the audio after it is
+ generated; it's also possible to prompt the model to speak faster or slower.
+ """
voice: Union[
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
] = None
+ """The voice the model uses to respond.
+
+ Voice cannot be changed during the session once the model has responded with
+ audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
+ `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend
+ `marin` and `cedar` for best quality.
+ """
class Audio(BaseModel):
@@ -77,86 +174,168 @@ class Audio(BaseModel):
output: Optional[AudioOutput] = None
-class Tool(BaseModel):
- description: Optional[str] = None
- """
- The description of the function, including guidance on when and how to call it,
- and guidance about what to tell the user when calling (if anything).
+ToolChoice: TypeAlias = Union[ToolChoiceOptions, ToolChoiceFunction, ToolChoiceMcp]
+
+
+class ToolMcpToolAllowedToolsMcpToolFilter(BaseModel):
+ read_only: Optional[bool] = None
+ """Indicates whether or not a tool modifies data or is read-only.
+
+ If an MCP server is
+ [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ it will match this filter.
"""
- name: Optional[str] = None
- """The name of the function."""
+ tool_names: Optional[List[str]] = None
+ """List of allowed tool names."""
- parameters: Optional[object] = None
- """Parameters of the function in JSON Schema."""
- type: Optional[Literal["function"]] = None
- """The type of the tool, i.e. `function`."""
+ToolMcpToolAllowedTools: TypeAlias = Union[List[str], ToolMcpToolAllowedToolsMcpToolFilter, None]
-class TracingTracingConfiguration(BaseModel):
- group_id: Optional[str] = None
+class ToolMcpToolRequireApprovalMcpToolApprovalFilterAlways(BaseModel):
+ read_only: Optional[bool] = None
+ """Indicates whether or not a tool modifies data or is read-only.
+
+ If an MCP server is
+ [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ it will match this filter.
"""
- The group id to attach to this trace to enable filtering and grouping in the
- traces dashboard.
+
+ tool_names: Optional[List[str]] = None
+ """List of allowed tool names."""
+
+
+class ToolMcpToolRequireApprovalMcpToolApprovalFilterNever(BaseModel):
+ read_only: Optional[bool] = None
+ """Indicates whether or not a tool modifies data or is read-only.
+
+ If an MCP server is
+ [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ it will match this filter.
"""
- metadata: Optional[object] = None
+ tool_names: Optional[List[str]] = None
+ """List of allowed tool names."""
+
+
+class ToolMcpToolRequireApprovalMcpToolApprovalFilter(BaseModel):
+ always: Optional[ToolMcpToolRequireApprovalMcpToolApprovalFilterAlways] = None
+ """A filter object to specify which tools are allowed."""
+
+ never: Optional[ToolMcpToolRequireApprovalMcpToolApprovalFilterNever] = None
+ """A filter object to specify which tools are allowed."""
+
+
+ToolMcpToolRequireApproval: TypeAlias = Union[
+ ToolMcpToolRequireApprovalMcpToolApprovalFilter, Literal["always", "never"], None
+]
+
+
+class ToolMcpTool(BaseModel):
+ server_label: str
+ """A label for this MCP server, used to identify it in tool calls."""
+
+ type: Literal["mcp"]
+ """The type of the MCP tool. Always `mcp`."""
+
+ allowed_tools: Optional[ToolMcpToolAllowedTools] = None
+ """List of allowed tool names or a filter object."""
+
+ authorization: Optional[str] = None
"""
- The arbitrary metadata to attach to this trace to enable filtering in the traces
- dashboard.
+ An OAuth access token that can be used with a remote MCP server, either with a
+ custom MCP server URL or a service connector. Your application must handle the
+ OAuth authorization flow and provide the token here.
"""
- workflow_name: Optional[str] = None
- """The name of the workflow to attach to this trace.
-
- This is used to name the trace in the traces dashboard.
+ connector_id: Optional[
+ Literal[
+ "connector_dropbox",
+ "connector_gmail",
+ "connector_googlecalendar",
+ "connector_googledrive",
+ "connector_microsoftteams",
+ "connector_outlookcalendar",
+ "connector_outlookemail",
+ "connector_sharepoint",
+ ]
+ ] = None
+ """Identifier for service connectors, like those available in ChatGPT.
+
+ One of `server_url` or `connector_id` must be provided. Learn more about service
+ connectors
+ [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors).
+
+ Currently supported `connector_id` values are:
+
+ - Dropbox: `connector_dropbox`
+ - Gmail: `connector_gmail`
+ - Google Calendar: `connector_googlecalendar`
+ - Google Drive: `connector_googledrive`
+ - Microsoft Teams: `connector_microsoftteams`
+ - Outlook Calendar: `connector_outlookcalendar`
+ - Outlook Email: `connector_outlookemail`
+ - SharePoint: `connector_sharepoint`
"""
+ headers: Optional[Dict[str, str]] = None
+ """Optional HTTP headers to send to the MCP server.
+
+ Use for authentication or other purposes.
+ """
-Tracing: TypeAlias = Union[Literal["auto"], TracingTracingConfiguration]
+ require_approval: Optional[ToolMcpToolRequireApproval] = None
+ """Specify which of the MCP server's tools require approval."""
+ server_description: Optional[str] = None
+ """Optional description of the MCP server, used to provide more context."""
-class TurnDetection(BaseModel):
- prefix_padding_ms: Optional[int] = None
- """Amount of audio to include before the VAD detected speech (in milliseconds).
+ server_url: Optional[str] = None
+ """The URL for the MCP server.
- Defaults to 300ms.
+ One of `server_url` or `connector_id` must be provided.
"""
- silence_duration_ms: Optional[int] = None
- """Duration of silence to detect speech stop (in milliseconds).
- Defaults to 500ms. With shorter values the model will respond more quickly, but
- may jump in on short pauses from the user.
+Tool: TypeAlias = Union[Models, ToolMcpTool]
+
+
+class TracingTracingConfiguration(BaseModel):
+ group_id: Optional[str] = None
+ """
+ The group id to attach to this trace to enable filtering and grouping in the
+ Traces Dashboard.
"""
- threshold: Optional[float] = None
- """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+ metadata: Optional[object] = None
+ """
+ The arbitrary metadata to attach to this trace to enable filtering in the Traces
+ Dashboard.
+ """
- A higher threshold will require louder audio to activate the model, and thus
- might perform better in noisy environments.
+ workflow_name: Optional[str] = None
+ """The name of the workflow to attach to this trace.
+
+ This is used to name the trace in the Traces Dashboard.
"""
- type: Optional[str] = None
- """Type of turn detection, only `server_vad` is currently supported."""
+Tracing: TypeAlias = Union[Literal["auto"], TracingTracingConfiguration, None]
-class RealtimeSessionCreateResponse(BaseModel):
- id: Optional[str] = None
- """Unique identifier for the session that looks like `sess_1234567890abcdef`."""
+class RealtimeSessionCreateResponse(BaseModel):
audio: Optional[Audio] = None
- """Configuration for input and output audio for the session."""
+ """Configuration for input and output audio."""
- expires_at: Optional[int] = None
- """Expiration timestamp for the session, in seconds since epoch."""
+ client_secret: Optional[RealtimeSessionClientSecret] = None
+ """Ephemeral key returned by the API."""
include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None
"""Additional fields to include in server outputs.
- - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- transcription.
+ `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ transcription.
"""
instructions: Optional[str] = None
@@ -182,41 +361,60 @@ class RealtimeSessionCreateResponse(BaseModel):
`inf` for the maximum available tokens for a given model. Defaults to `inf`.
"""
- model: Optional[str] = None
+ model: Union[
+ str,
+ Literal[
+ "gpt-realtime",
+ "gpt-realtime-2025-08-28",
+ "gpt-4o-realtime-preview",
+ "gpt-4o-realtime-preview-2024-10-01",
+ "gpt-4o-realtime-preview-2024-12-17",
+ "gpt-4o-realtime-preview-2025-06-03",
+ "gpt-4o-mini-realtime-preview",
+ "gpt-4o-mini-realtime-preview-2024-12-17",
+ ],
+ None,
+ ] = None
"""The Realtime model used for this session."""
- object: Optional[str] = None
- """The object type. Always `realtime.session`."""
-
output_modalities: Optional[List[Literal["text", "audio"]]] = None
"""The set of modalities the model can respond with.
- To disable audio, set this to ["text"].
+ It defaults to `["audio"]`, indicating that the model will respond with audio
+ plus a transcript. `["text"]` can be used to make the model respond with text
+ only. It is not possible to request both `text` and `audio` at the same time.
+ """
+
+ prompt: Optional[ResponsePrompt] = None
+ """Reference to a prompt template and its variables.
+
+ [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
"""
- tool_choice: Optional[str] = None
+ tool_choice: Optional[ToolChoice] = None
"""How the model chooses tools.
- Options are `auto`, `none`, `required`, or specify a function.
+ Provide one of the string modes or force a specific function/MCP tool.
"""
tools: Optional[List[Tool]] = None
- """Tools (functions) available to the model."""
+ """Tools available to the model."""
tracing: Optional[Tracing] = None
- """Configuration options for tracing.
-
- Set to null to disable tracing. Once tracing is enabled for a session, the
- configuration cannot be modified.
+ """
+ The Realtime API can write session traces to the
+ [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once
+ tracing is enabled for a session, the configuration cannot be modified.
`auto` will create a trace for the session with default values for the workflow
name, group id, and metadata.
"""
- turn_detection: Optional[TurnDetection] = None
- """Configuration for turn detection.
-
- Can be set to `null` to turn off. Server VAD means that the model will detect
- the start and end of speech based on audio volume and respond at the end of user
- speech.
+ truncation: Optional[RealtimeTruncation] = None
"""
+ Controls how the realtime conversation is truncated prior to model inference.
+ The default is `auto`.
+ """
+
+ type: Optional[Literal["realtime"]] = None
+ """The type of session to create. Always `realtime` for the Realtime API."""
src/openai/types/realtime/realtime_tools_config_param.py
@@ -6,11 +6,11 @@ from typing import Dict, List, Union, Optional
from typing_extensions import Literal, Required, TypeAlias, TypedDict
from ..._types import SequenceNotStr
+from .models_param import ModelsParam
__all__ = [
"RealtimeToolsConfigParam",
"RealtimeToolsConfigUnionParam",
- "Function",
"Mcp",
"McpAllowedTools",
"McpAllowedToolsMcpToolFilter",
@@ -21,23 +21,6 @@ __all__ = [
]
-class Function(TypedDict, total=False):
- description: str
- """
- The description of the function, including guidance on when and how to call it,
- and guidance about what to tell the user when calling (if anything).
- """
-
- name: str
- """The name of the function."""
-
- parameters: object
- """Parameters of the function in JSON Schema."""
-
- type: Literal["function"]
- """The type of the tool, i.e. `function`."""
-
-
class McpAllowedToolsMcpToolFilter(TypedDict, total=False):
read_only: bool
"""Indicates whether or not a tool modifies data or is read-only.
@@ -155,6 +138,6 @@ class Mcp(TypedDict, total=False):
"""
-RealtimeToolsConfigUnionParam: TypeAlias = Union[Function, Mcp]
+RealtimeToolsConfigUnionParam: TypeAlias = Union[ModelsParam, Mcp]
RealtimeToolsConfigParam: TypeAlias = List[RealtimeToolsConfigUnionParam]
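The union now pairs the function-tool shape (re-exported here as ModelsParam) with the richer MCP entry. As a rough sketch, a tools list mixing the two might look like the dicts below, assuming ModelsParam keeps the familiar `type`/`name`/`description`/`parameters` function fields; the weather tool and server URL are placeholders.

# Hedged sketch of a tools list; shapes mirror the union above, values are placeholders.
tools = [
    {   # function tool (ModelsParam is assumed to keep the usual function-tool fields)
        "type": "function",
        "name": "get_weather",
        "description": "Look up the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
    {   # MCP tool: one of server_url or connector_id must be provided
        "type": "mcp",
        "server_label": "docs",
        "server_url": "https://example.com/mcp",
        "require_approval": "never",
    },
]
print(len(tools), "tools configured")
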
src/openai/types/realtime/realtime_tools_config_union.py
@@ -3,12 +3,12 @@
from typing import Dict, List, Union, Optional
from typing_extensions import Literal, Annotated, TypeAlias
+from .models import Models
from ..._utils import PropertyInfo
from ..._models import BaseModel
__all__ = [
"RealtimeToolsConfigUnion",
- "Function",
"Mcp",
"McpAllowedTools",
"McpAllowedToolsMcpToolFilter",
@@ -19,23 +19,6 @@ __all__ = [
]
-class Function(BaseModel):
- description: Optional[str] = None
- """
- The description of the function, including guidance on when and how to call it,
- and guidance about what to tell the user when calling (if anything).
- """
-
- name: Optional[str] = None
- """The name of the function."""
-
- parameters: Optional[object] = None
- """Parameters of the function in JSON Schema."""
-
- type: Optional[Literal["function"]] = None
- """The type of the tool, i.e. `function`."""
-
-
class McpAllowedToolsMcpToolFilter(BaseModel):
read_only: Optional[bool] = None
"""Indicates whether or not a tool modifies data or is read-only.
@@ -155,4 +138,4 @@ class Mcp(BaseModel):
"""
-RealtimeToolsConfigUnion: TypeAlias = Annotated[Union[Function, Mcp], PropertyInfo(discriminator="type")]
+RealtimeToolsConfigUnion: TypeAlias = Annotated[Union[Models, Mcp], PropertyInfo(discriminator="type")]
src/openai/types/realtime/realtime_tools_config_union_param.py
@@ -6,10 +6,10 @@ from typing import Dict, Union, Optional
from typing_extensions import Literal, Required, TypeAlias, TypedDict
from ..._types import SequenceNotStr
+from .models_param import ModelsParam
__all__ = [
"RealtimeToolsConfigUnionParam",
- "Function",
"Mcp",
"McpAllowedTools",
"McpAllowedToolsMcpToolFilter",
@@ -20,23 +20,6 @@ __all__ = [
]
-class Function(TypedDict, total=False):
- description: str
- """
- The description of the function, including guidance on when and how to call it,
- and guidance about what to tell the user when calling (if anything).
- """
-
- name: str
- """The name of the function."""
-
- parameters: object
- """Parameters of the function in JSON Schema."""
-
- type: Literal["function"]
- """The type of the tool, i.e. `function`."""
-
-
class McpAllowedToolsMcpToolFilter(TypedDict, total=False):
read_only: bool
"""Indicates whether or not a tool modifies data or is read-only.
@@ -154,4 +137,4 @@ class Mcp(TypedDict, total=False):
"""
-RealtimeToolsConfigUnionParam: TypeAlias = Union[Function, Mcp]
+RealtimeToolsConfigUnionParam: TypeAlias = Union[ModelsParam, Mcp]
src/openai/types/realtime/realtime_tracing_config.py
@@ -12,19 +12,19 @@ class TracingConfiguration(BaseModel):
group_id: Optional[str] = None
"""
The group id to attach to this trace to enable filtering and grouping in the
- traces dashboard.
+ Traces Dashboard.
"""
metadata: Optional[object] = None
"""
- The arbitrary metadata to attach to this trace to enable filtering in the traces
- dashboard.
+ The arbitrary metadata to attach to this trace to enable filtering in the Traces
+ Dashboard.
"""
workflow_name: Optional[str] = None
"""The name of the workflow to attach to this trace.
- This is used to name the trace in the traces dashboard.
+ This is used to name the trace in the Traces Dashboard.
"""
src/openai/types/realtime/realtime_tracing_config_param.py
@@ -12,19 +12,19 @@ class TracingConfiguration(TypedDict, total=False):
group_id: str
"""
The group id to attach to this trace to enable filtering and grouping in the
- traces dashboard.
+ Traces Dashboard.
"""
metadata: object
"""
- The arbitrary metadata to attach to this trace to enable filtering in the traces
- dashboard.
+ The arbitrary metadata to attach to this trace to enable filtering in the Traces
+ Dashboard.
"""
workflow_name: str
"""The name of the workflow to attach to this trace.
- This is used to name the trace in the traces dashboard.
+ This is used to name the trace in the Traces Dashboard.
"""
src/openai/types/realtime/realtime_transcription_session_audio.py
@@ -0,0 +1,12 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Optional
+
+from ..._models import BaseModel
+from .realtime_transcription_session_audio_input import RealtimeTranscriptionSessionAudioInput
+
+__all__ = ["RealtimeTranscriptionSessionAudio"]
+
+
+class RealtimeTranscriptionSessionAudio(BaseModel):
+ input: Optional[RealtimeTranscriptionSessionAudioInput] = None
src/openai/types/realtime/realtime_transcription_session_audio_input.py
@@ -0,0 +1,62 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Optional
+
+from ..._models import BaseModel
+from .audio_transcription import AudioTranscription
+from .noise_reduction_type import NoiseReductionType
+from .realtime_audio_formats import RealtimeAudioFormats
+from .realtime_transcription_session_audio_input_turn_detection import (
+ RealtimeTranscriptionSessionAudioInputTurnDetection,
+)
+
+__all__ = ["RealtimeTranscriptionSessionAudioInput", "NoiseReduction"]
+
+
+class NoiseReduction(BaseModel):
+ type: Optional[NoiseReductionType] = None
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
+
+
+class RealtimeTranscriptionSessionAudioInput(BaseModel):
+ format: Optional[RealtimeAudioFormats] = None
+ """The PCM audio format. Only a 24kHz sample rate is supported."""
+
+ noise_reduction: Optional[NoiseReduction] = None
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
+ transcription: Optional[AudioTranscription] = None
+ """
+ Configuration for input audio transcription, defaults to off and can be set to
+ `null` to turn off once on. Input audio transcription is not native to the
+ model, since the model consumes audio directly. Transcription runs
+ asynchronously through
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
+ """
+
+ turn_detection: Optional[RealtimeTranscriptionSessionAudioInputTurnDetection] = None
+ """Configuration for turn detection, ether Server VAD or Semantic VAD.
+
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+ Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
+ """
src/openai/types/realtime/realtime_transcription_session_audio_input_param.py
@@ -0,0 +1,63 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing_extensions import TypedDict
+
+from .noise_reduction_type import NoiseReductionType
+from .audio_transcription_param import AudioTranscriptionParam
+from .realtime_audio_formats_param import RealtimeAudioFormatsParam
+from .realtime_transcription_session_audio_input_turn_detection_param import (
+ RealtimeTranscriptionSessionAudioInputTurnDetectionParam,
+)
+
+__all__ = ["RealtimeTranscriptionSessionAudioInputParam", "NoiseReduction"]
+
+
+class NoiseReduction(TypedDict, total=False):
+ type: NoiseReductionType
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
+
+
+class RealtimeTranscriptionSessionAudioInputParam(TypedDict, total=False):
+ format: RealtimeAudioFormatsParam
+ """The PCM audio format. Only a 24kHz sample rate is supported."""
+
+ noise_reduction: NoiseReduction
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
+ transcription: AudioTranscriptionParam
+ """
+ Configuration for input audio transcription, defaults to off and can be set to
+ `null` to turn off once on. Input audio transcription is not native to the
+ model, since the model consumes audio directly. Transcription runs
+ asynchronously through
+ [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ and should be treated as guidance of input audio content rather than precisely
+ what the model heard. The client can optionally set the language and prompt for
+ transcription, these offer additional guidance to the transcription service.
+ """
+
+ turn_detection: RealtimeTranscriptionSessionAudioInputTurnDetectionParam
+ """Configuration for turn detection, ether Server VAD or Semantic VAD.
+
+ This can be set to `null` to turn off, in which case the client must manually
+ trigger model response. Server VAD means that the model will detect the start
+ and end of speech based on audio volume and respond at the end of user speech.
+ Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ with VAD) to semantically estimate whether the user has finished speaking, then
+ dynamically sets a timeout based on this probability. For example, if user audio
+ trails off with "uhhm", the model will score a low probability of turn end and
+ wait longer for the user to continue speaking. This can be useful for more
+ natural conversations, but may have a higher latency.
+ """
src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection.py
@@ -0,0 +1,63 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["RealtimeTranscriptionSessionAudioInputTurnDetection"]
+
+
+class RealtimeTranscriptionSessionAudioInputTurnDetection(BaseModel):
+ create_response: Optional[bool] = None
+ """
+ Whether or not to automatically generate a response when a VAD stop event
+ occurs.
+ """
+
+ eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
+ """Used only for `semantic_vad` mode.
+
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`.
+ """
+
+ idle_timeout_ms: Optional[int] = None
+ """
+ Optional idle timeout after which turn detection will auto-timeout when no
+ additional audio is received.
+ """
+
+ interrupt_response: Optional[bool] = None
+ """
+ Whether or not to automatically interrupt any ongoing response with output to
+ the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ occurs.
+ """
+
+ prefix_padding_ms: Optional[int] = None
+ """Used only for `server_vad` mode.
+
+ Amount of audio to include before the VAD detected speech (in milliseconds).
+ Defaults to 300ms.
+ """
+
+ silence_duration_ms: Optional[int] = None
+ """Used only for `server_vad` mode.
+
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
+ """
+
+ threshold: Optional[float] = None
+ """Used only for `server_vad` mode.
+
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
+ """
+
+ type: Optional[Literal["server_vad", "semantic_vad"]] = None
+ """Type of turn detection."""
src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection_param.py
@@ -0,0 +1,63 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import Optional
+from typing_extensions import Literal, TypedDict
+
+__all__ = ["RealtimeTranscriptionSessionAudioInputTurnDetectionParam"]
+
+
+class RealtimeTranscriptionSessionAudioInputTurnDetectionParam(TypedDict, total=False):
+ create_response: bool
+ """
+ Whether or not to automatically generate a response when a VAD stop event
+ occurs.
+ """
+
+ eagerness: Literal["low", "medium", "high", "auto"]
+ """Used only for `semantic_vad` mode.
+
+ The eagerness of the model to respond. `low` will wait longer for the user to
+ continue speaking, `high` will respond more quickly. `auto` is the default and
+ is equivalent to `medium`.
+ """
+
+ idle_timeout_ms: Optional[int]
+ """
+ Optional idle timeout after which turn detection will auto-timeout when no
+ additional audio is received.
+ """
+
+ interrupt_response: bool
+ """
+ Whether or not to automatically interrupt any ongoing response with output to
+ the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ occurs.
+ """
+
+ prefix_padding_ms: int
+ """Used only for `server_vad` mode.
+
+ Amount of audio to include before the VAD detected speech (in milliseconds).
+ Defaults to 300ms.
+ """
+
+ silence_duration_ms: int
+ """Used only for `server_vad` mode.
+
+ Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ With shorter values the model will respond more quickly, but may jump in on
+ short pauses from the user.
+ """
+
+ threshold: float
+ """Used only for `server_vad` mode.
+
+ Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ threshold will require louder audio to activate the model, and thus might
+ perform better in noisy environments.
+ """
+
+ type: Literal["server_vad", "semantic_vad"]
+ """Type of turn detection."""
src/openai/types/realtime/realtime_transcription_session_audio_param.py
@@ -0,0 +1,13 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing_extensions import TypedDict
+
+from .realtime_transcription_session_audio_input_param import RealtimeTranscriptionSessionAudioInputParam
+
+__all__ = ["RealtimeTranscriptionSessionAudioParam"]
+
+
+class RealtimeTranscriptionSessionAudioParam(TypedDict, total=False):
+ input: RealtimeTranscriptionSessionAudioInputParam
src/openai/types/realtime/realtime_transcription_session_client_secret.py
@@ -0,0 +1,20 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from ..._models import BaseModel
+
+__all__ = ["RealtimeTranscriptionSessionClientSecret"]
+
+
+class RealtimeTranscriptionSessionClientSecret(BaseModel):
+ expires_at: int
+ """Timestamp for when the token expires.
+
+ Currently, all tokens expire after one minute.
+ """
+
+ value: str
+ """
+ Ephemeral key usable in client environments to authenticate connections to the
+ Realtime API. Use this in client-side environments rather than a standard API
+ token, which should only be used server-side.
+ """
src/openai/types/realtime/realtime_transcription_session_create_request.py
@@ -1,128 +1,27 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import List, Union, Optional
+from typing import List, Optional
from typing_extensions import Literal
from ..._models import BaseModel
+from .realtime_transcription_session_audio import RealtimeTranscriptionSessionAudio
-__all__ = [
- "RealtimeTranscriptionSessionCreateRequest",
- "InputAudioNoiseReduction",
- "InputAudioTranscription",
- "TurnDetection",
-]
-
-
-class InputAudioNoiseReduction(BaseModel):
- type: Optional[Literal["near_field", "far_field"]] = None
- """Type of noise reduction.
-
- `near_field` is for close-talking microphones such as headphones, `far_field` is
- for far-field microphones such as laptop or conference room microphones.
- """
-
-
-class InputAudioTranscription(BaseModel):
- language: Optional[str] = None
- """The language of the input audio.
-
- Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- """
-
- model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None
- """
- The model to use for transcription, current options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, and `whisper-1`.
- """
-
- prompt: Optional[str] = None
- """
- An optional text to guide the model's style or continue a previous audio
- segment. For `whisper-1`, the
- [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- "expect words related to technology".
- """
-
-
-class TurnDetection(BaseModel):
- prefix_padding_ms: Optional[int] = None
- """Amount of audio to include before the VAD detected speech (in milliseconds).
-
- Defaults to 300ms.
- """
-
- silence_duration_ms: Optional[int] = None
- """Duration of silence to detect speech stop (in milliseconds).
-
- Defaults to 500ms. With shorter values the model will respond more quickly, but
- may jump in on short pauses from the user.
- """
-
- threshold: Optional[float] = None
- """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
-
- A higher threshold will require louder audio to activate the model, and thus
- might perform better in noisy environments.
- """
-
- type: Optional[Literal["server_vad"]] = None
- """Type of turn detection.
-
- Only `server_vad` is currently supported for transcription sessions.
- """
+__all__ = ["RealtimeTranscriptionSessionCreateRequest"]
class RealtimeTranscriptionSessionCreateRequest(BaseModel):
- model: Union[str, Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]]
- """ID of the model to use.
-
- The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`
- (which is powered by our open source Whisper V2 model).
- """
-
type: Literal["transcription"]
"""The type of session to create.
Always `transcription` for transcription sessions.
"""
- include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None
- """The set of items to include in the transcription. Current available items are:
-
- - `item.input_audio_transcription.logprobs`
- """
+ audio: Optional[RealtimeTranscriptionSessionAudio] = None
+ """Configuration for input and output audio."""
- input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
- """The format of input audio.
-
- Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
- be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
- byte order.
- """
-
- input_audio_noise_reduction: Optional[InputAudioNoiseReduction] = None
- """Configuration for input audio noise reduction.
-
- This can be set to `null` to turn off. Noise reduction filters audio added to
- the input audio buffer before it is sent to VAD and the model. Filtering the
- audio can improve VAD and turn detection accuracy (reducing false positives) and
- model performance by improving perception of the input audio.
- """
-
- input_audio_transcription: Optional[InputAudioTranscription] = None
- """Configuration for input audio transcription.
-
- The client can optionally set the language and prompt for transcription, these
- offer additional guidance to the transcription service.
- """
-
- turn_detection: Optional[TurnDetection] = None
- """Configuration for turn detection.
+ include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None
+ """Additional fields to include in server outputs.
- Can be set to `null` to turn off. Server VAD means that the model will detect
- the start and end of speech based on audio volume and respond at the end of user
- speech.
+ `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ transcription.
"""
src/openai/types/realtime/realtime_transcription_session_create_request_param.py
@@ -2,127 +2,27 @@
from __future__ import annotations
-from typing import List, Union
+from typing import List
from typing_extensions import Literal, Required, TypedDict
-__all__ = [
- "RealtimeTranscriptionSessionCreateRequestParam",
- "InputAudioNoiseReduction",
- "InputAudioTranscription",
- "TurnDetection",
-]
+from .realtime_transcription_session_audio_param import RealtimeTranscriptionSessionAudioParam
-
-class InputAudioNoiseReduction(TypedDict, total=False):
- type: Literal["near_field", "far_field"]
- """Type of noise reduction.
-
- `near_field` is for close-talking microphones such as headphones, `far_field` is
- for far-field microphones such as laptop or conference room microphones.
- """
-
-
-class InputAudioTranscription(TypedDict, total=False):
- language: str
- """The language of the input audio.
-
- Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- """
-
- model: Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]
- """
- The model to use for transcription, current options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, and `whisper-1`.
- """
-
- prompt: str
- """
- An optional text to guide the model's style or continue a previous audio
- segment. For `whisper-1`, the
- [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- "expect words related to technology".
- """
-
-
-class TurnDetection(TypedDict, total=False):
- prefix_padding_ms: int
- """Amount of audio to include before the VAD detected speech (in milliseconds).
-
- Defaults to 300ms.
- """
-
- silence_duration_ms: int
- """Duration of silence to detect speech stop (in milliseconds).
-
- Defaults to 500ms. With shorter values the model will respond more quickly, but
- may jump in on short pauses from the user.
- """
-
- threshold: float
- """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
-
- A higher threshold will require louder audio to activate the model, and thus
- might perform better in noisy environments.
- """
-
- type: Literal["server_vad"]
- """Type of turn detection.
-
- Only `server_vad` is currently supported for transcription sessions.
- """
+__all__ = ["RealtimeTranscriptionSessionCreateRequestParam"]
class RealtimeTranscriptionSessionCreateRequestParam(TypedDict, total=False):
- model: Required[Union[str, Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]]]
- """ID of the model to use.
-
- The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`
- (which is powered by our open source Whisper V2 model).
- """
-
type: Required[Literal["transcription"]]
"""The type of session to create.
Always `transcription` for transcription sessions.
"""
- include: List[Literal["item.input_audio_transcription.logprobs"]]
- """The set of items to include in the transcription. Current available items are:
-
- - `item.input_audio_transcription.logprobs`
- """
+ audio: RealtimeTranscriptionSessionAudioParam
+ """Configuration for input and output audio."""
- input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"]
- """The format of input audio.
-
- Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
- be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
- byte order.
- """
-
- input_audio_noise_reduction: InputAudioNoiseReduction
- """Configuration for input audio noise reduction.
-
- This can be set to `null` to turn off. Noise reduction filters audio added to
- the input audio buffer before it is sent to VAD and the model. Filtering the
- audio can improve VAD and turn detection accuracy (reducing false positives) and
- model performance by improving perception of the input audio.
- """
-
- input_audio_transcription: InputAudioTranscription
- """Configuration for input audio transcription.
-
- The client can optionally set the language and prompt for transcription, these
- offer additional guidance to the transcription service.
- """
-
- turn_detection: TurnDetection
- """Configuration for turn detection.
+ include: List[Literal["item.input_audio_transcription.logprobs"]]
+ """Additional fields to include in server outputs.
- Can be set to `null` to turn off. Server VAD means that the model will detect
- the start and end of speech based on audio volume and respond at the end of user
- speech.
+ `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ transcription.
"""
src/openai/types/realtime/realtime_transcription_session_create_response.py
@@ -0,0 +1,41 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import List, Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+from .realtime_transcription_session_client_secret import RealtimeTranscriptionSessionClientSecret
+from .realtime_transcription_session_turn_detection import RealtimeTranscriptionSessionTurnDetection
+from .realtime_transcription_session_input_audio_transcription import (
+ RealtimeTranscriptionSessionInputAudioTranscription,
+)
+
+__all__ = ["RealtimeTranscriptionSessionCreateResponse"]
+
+
+class RealtimeTranscriptionSessionCreateResponse(BaseModel):
+ client_secret: RealtimeTranscriptionSessionClientSecret
+ """Ephemeral key returned by the API.
+
+ Only present when the session is created on the server via REST API.
+ """
+
+ input_audio_format: Optional[str] = None
+ """The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`."""
+
+ input_audio_transcription: Optional[RealtimeTranscriptionSessionInputAudioTranscription] = None
+ """Configuration of the transcription model."""
+
+ modalities: Optional[List[Literal["text", "audio"]]] = None
+ """The set of modalities the model can respond with.
+
+ To disable audio, set this to ["text"].
+ """
+
+ turn_detection: Optional[RealtimeTranscriptionSessionTurnDetection] = None
+ """Configuration for turn detection.
+
+ Can be set to `null` to turn off. Server VAD means that the model will detect
+ the start and end of speech based on audio volume and respond at the end of user
+ speech.
+ """
src/openai/types/realtime/realtime_transcription_session_input_audio_transcription.py
@@ -0,0 +1,36 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Optional
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["RealtimeTranscriptionSessionInputAudioTranscription"]
+
+
+class RealtimeTranscriptionSessionInputAudioTranscription(BaseModel):
+ language: Optional[str] = None
+ """The language of the input audio.
+
+ Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+ """
+
+ model: Optional[Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"]] = (
+ None
+ )
+ """The model to use for transcription.
+
+ Current options are `whisper-1`, `gpt-4o-transcribe-latest`,
+ `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+ """
+
+ prompt: Optional[str] = None
+ """
+ An optional text to guide the model's style or continue a previous audio
+ segment. For `whisper-1`, the
+ [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ "expect words related to technology".
+ """
src/openai/types/realtime/realtime_transcription_session_turn_detection.py
@@ -0,0 +1,32 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Optional
+
+from ..._models import BaseModel
+
+__all__ = ["RealtimeTranscriptionSessionTurnDetection"]
+
+
+class RealtimeTranscriptionSessionTurnDetection(BaseModel):
+ prefix_padding_ms: Optional[int] = None
+ """Amount of audio to include before the VAD detected speech (in milliseconds).
+
+ Defaults to 300ms.
+ """
+
+ silence_duration_ms: Optional[int] = None
+ """Duration of silence to detect speech stop (in milliseconds).
+
+ Defaults to 500ms. With shorter values the model will respond more quickly, but
+ may jump in on short pauses from the user.
+ """
+
+ threshold: Optional[float] = None
+ """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+
+ A higher threshold will require louder audio to activate the model, and thus
+ might perform better in noisy environments.
+ """
+
+ type: Optional[str] = None
+ """Type of turn detection, only `server_vad` is currently supported."""
src/openai/types/realtime/realtime_truncation.py
@@ -1,22 +1,10 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import Union, Optional
+from typing import Union
from typing_extensions import Literal, TypeAlias
-from ..._models import BaseModel
+from .realtime_truncation_retention_ratio import RealtimeTruncationRetentionRatio
-__all__ = ["RealtimeTruncation", "RetentionRatioTruncation"]
+__all__ = ["RealtimeTruncation"]
-
-class RetentionRatioTruncation(BaseModel):
- retention_ratio: float
- """Fraction of pre-instruction conversation tokens to retain (0.0 - 1.0)."""
-
- type: Literal["retention_ratio"]
- """Use retention ratio truncation."""
-
- post_instructions_token_limit: Optional[int] = None
- """Optional cap on tokens allowed after the instructions."""
-
-
-RealtimeTruncation: TypeAlias = Union[Literal["auto", "disabled"], RetentionRatioTruncation]
+RealtimeTruncation: TypeAlias = Union[Literal["auto", "disabled"], RealtimeTruncationRetentionRatio]
src/openai/types/realtime/realtime_truncation_param.py
@@ -2,21 +2,11 @@
from __future__ import annotations
-from typing import Union, Optional
-from typing_extensions import Literal, Required, TypeAlias, TypedDict
+from typing import Union
+from typing_extensions import Literal, TypeAlias
-__all__ = ["RealtimeTruncationParam", "RetentionRatioTruncation"]
+from .realtime_truncation_retention_ratio_param import RealtimeTruncationRetentionRatioParam
+__all__ = ["RealtimeTruncationParam"]
-class RetentionRatioTruncation(TypedDict, total=False):
- retention_ratio: Required[float]
- """Fraction of pre-instruction conversation tokens to retain (0.0 - 1.0)."""
-
- type: Required[Literal["retention_ratio"]]
- """Use retention ratio truncation."""
-
- post_instructions_token_limit: Optional[int]
- """Optional cap on tokens allowed after the instructions."""
-
-
-RealtimeTruncationParam: TypeAlias = Union[Literal["auto", "disabled"], RetentionRatioTruncation]
+RealtimeTruncationParam: TypeAlias = Union[Literal["auto", "disabled"], RealtimeTruncationRetentionRatioParam]
src/openai/types/realtime/realtime_truncation_retention_ratio.py
@@ -0,0 +1,18 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing_extensions import Literal
+
+from ..._models import BaseModel
+
+__all__ = ["RealtimeTruncationRetentionRatio"]
+
+
+class RealtimeTruncationRetentionRatio(BaseModel):
+ retention_ratio: float
+ """
+ Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the
+ conversation exceeds the input token limit.
+ """
+
+ type: Literal["retention_ratio"]
+ """Use retention ratio truncation."""
src/openai/types/realtime/realtime_truncation_retention_ratio_param.py
@@ -0,0 +1,18 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing_extensions import Literal, Required, TypedDict
+
+__all__ = ["RealtimeTruncationRetentionRatioParam"]
+
+
+class RealtimeTruncationRetentionRatioParam(TypedDict, total=False):
+ retention_ratio: Required[float]
+ """
+ Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the
+ conversation exceeds the input token limit.
+ """
+
+ type: Required[Literal["retention_ratio"]]
+ """Use retention ratio truncation."""
src/openai/types/realtime/response_create_event.py
@@ -1,126 +1,12 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import List, Union, Optional
-from typing_extensions import Literal, TypeAlias
+from typing import Optional
+from typing_extensions import Literal
from ..._models import BaseModel
-from ..shared.metadata import Metadata
-from .conversation_item import ConversationItem
-from ..responses.response_prompt import ResponsePrompt
-from ..responses.tool_choice_mcp import ToolChoiceMcp
-from ..responses.tool_choice_options import ToolChoiceOptions
-from ..responses.tool_choice_function import ToolChoiceFunction
+from .realtime_response_create_params import RealtimeResponseCreateParams
-__all__ = ["ResponseCreateEvent", "Response", "ResponseToolChoice", "ResponseTool"]
-
-ResponseToolChoice: TypeAlias = Union[ToolChoiceOptions, ToolChoiceFunction, ToolChoiceMcp]
-
-
-class ResponseTool(BaseModel):
- description: Optional[str] = None
- """
- The description of the function, including guidance on when and how to call it,
- and guidance about what to tell the user when calling (if anything).
- """
-
- name: Optional[str] = None
- """The name of the function."""
-
- parameters: Optional[object] = None
- """Parameters of the function in JSON Schema."""
-
- type: Optional[Literal["function"]] = None
- """The type of the tool, i.e. `function`."""
-
-
-class Response(BaseModel):
- conversation: Union[str, Literal["auto", "none"], None] = None
- """Controls which conversation the response is added to.
-
- Currently supports `auto` and `none`, with `auto` as the default value. The
- `auto` value means that the contents of the response will be added to the
- default conversation. Set this to `none` to create an out-of-band response which
- will not add items to default conversation.
- """
-
- input: Optional[List[ConversationItem]] = None
- """Input items to include in the prompt for the model.
-
- Using this field creates a new context for this Response instead of using the
- default conversation. An empty array `[]` will clear the context for this
- Response. Note that this can include references to items from the default
- conversation.
- """
-
- instructions: Optional[str] = None
- """The default system instructions (i.e.
-
- system message) prepended to model calls. This field allows the client to guide
- the model on desired responses. The model can be instructed on response content
- and format, (e.g. "be extremely succinct", "act friendly", "here are examples of
- good responses") and on audio behavior (e.g. "talk quickly", "inject emotion
- into your voice", "laugh frequently"). The instructions are not guaranteed to be
- followed by the model, but they provide guidance to the model on the desired
- behavior.
-
- Note that the server sets default instructions which will be used if this field
- is not set and are visible in the `session.created` event at the start of the
- session.
- """
-
- max_output_tokens: Union[int, Literal["inf"], None] = None
- """
- Maximum number of output tokens for a single assistant response, inclusive of
- tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
- `inf` for the maximum available tokens for a given model. Defaults to `inf`.
- """
-
- metadata: Optional[Metadata] = None
- """Set of 16 key-value pairs that can be attached to an object.
-
- This can be useful for storing additional information about the object in a
- structured format, and querying for objects via API or the dashboard.
-
- Keys are strings with a maximum length of 64 characters. Values are strings with
- a maximum length of 512 characters.
- """
-
- modalities: Optional[List[Literal["text", "audio"]]] = None
- """The set of modalities the model can respond with.
-
- To disable audio, set this to ["text"].
- """
-
- output_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
- """The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`."""
-
- prompt: Optional[ResponsePrompt] = None
- """Reference to a prompt template and its variables.
-
- [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
- """
-
- temperature: Optional[float] = None
- """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
-
- tool_choice: Optional[ResponseToolChoice] = None
- """How the model chooses tools.
-
- Provide one of the string modes or force a specific function/MCP tool.
- """
-
- tools: Optional[List[ResponseTool]] = None
- """Tools (functions) available to the model."""
-
- voice: Union[
- str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
- ] = None
- """The voice the model uses to respond.
-
- Voice cannot be changed during the session once the model has responded with
- audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
- `coral`, `echo`, `sage`, `shimmer`, and `verse`.
- """
+__all__ = ["ResponseCreateEvent"]
class ResponseCreateEvent(BaseModel):
@@ -130,5 +16,5 @@ class ResponseCreateEvent(BaseModel):
event_id: Optional[str] = None
"""Optional client-generated ID used to identify this event."""
- response: Optional[Response] = None
+ response: Optional[RealtimeResponseCreateParams] = None
"""Create a new Realtime response with these parameters"""
src/openai/types/realtime/response_create_event_param.py
@@ -2,124 +2,11 @@
from __future__ import annotations
-from typing import List, Union, Iterable, Optional
-from typing_extensions import Literal, Required, TypeAlias, TypedDict
+from typing_extensions import Literal, Required, TypedDict
-from ..shared_params.metadata import Metadata
-from .conversation_item_param import ConversationItemParam
-from ..responses.tool_choice_options import ToolChoiceOptions
-from ..responses.response_prompt_param import ResponsePromptParam
-from ..responses.tool_choice_mcp_param import ToolChoiceMcpParam
-from ..responses.tool_choice_function_param import ToolChoiceFunctionParam
+from .realtime_response_create_params_param import RealtimeResponseCreateParamsParam
-__all__ = ["ResponseCreateEventParam", "Response", "ResponseToolChoice", "ResponseTool"]
-
-ResponseToolChoice: TypeAlias = Union[ToolChoiceOptions, ToolChoiceFunctionParam, ToolChoiceMcpParam]
-
-
-class ResponseTool(TypedDict, total=False):
- description: str
- """
- The description of the function, including guidance on when and how to call it,
- and guidance about what to tell the user when calling (if anything).
- """
-
- name: str
- """The name of the function."""
-
- parameters: object
- """Parameters of the function in JSON Schema."""
-
- type: Literal["function"]
- """The type of the tool, i.e. `function`."""
-
-
-class Response(TypedDict, total=False):
- conversation: Union[str, Literal["auto", "none"]]
- """Controls which conversation the response is added to.
-
- Currently supports `auto` and `none`, with `auto` as the default value. The
- `auto` value means that the contents of the response will be added to the
- default conversation. Set this to `none` to create an out-of-band response which
- will not add items to default conversation.
- """
-
- input: Iterable[ConversationItemParam]
- """Input items to include in the prompt for the model.
-
- Using this field creates a new context for this Response instead of using the
- default conversation. An empty array `[]` will clear the context for this
- Response. Note that this can include references to items from the default
- conversation.
- """
-
- instructions: str
- """The default system instructions (i.e.
-
- system message) prepended to model calls. This field allows the client to guide
- the model on desired responses. The model can be instructed on response content
- and format, (e.g. "be extremely succinct", "act friendly", "here are examples of
- good responses") and on audio behavior (e.g. "talk quickly", "inject emotion
- into your voice", "laugh frequently"). The instructions are not guaranteed to be
- followed by the model, but they provide guidance to the model on the desired
- behavior.
-
- Note that the server sets default instructions which will be used if this field
- is not set and are visible in the `session.created` event at the start of the
- session.
- """
-
- max_output_tokens: Union[int, Literal["inf"]]
- """
- Maximum number of output tokens for a single assistant response, inclusive of
- tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
- `inf` for the maximum available tokens for a given model. Defaults to `inf`.
- """
-
- metadata: Optional[Metadata]
- """Set of 16 key-value pairs that can be attached to an object.
-
- This can be useful for storing additional information about the object in a
- structured format, and querying for objects via API or the dashboard.
-
- Keys are strings with a maximum length of 64 characters. Values are strings with
- a maximum length of 512 characters.
- """
-
- modalities: List[Literal["text", "audio"]]
- """The set of modalities the model can respond with.
-
- To disable audio, set this to ["text"].
- """
-
- output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"]
- """The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`."""
-
- prompt: Optional[ResponsePromptParam]
- """Reference to a prompt template and its variables.
-
- [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
- """
-
- temperature: float
- """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8."""
-
- tool_choice: ResponseToolChoice
- """How the model chooses tools.
-
- Provide one of the string modes or force a specific function/MCP tool.
- """
-
- tools: Iterable[ResponseTool]
- """Tools (functions) available to the model."""
-
- voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
- """The voice the model uses to respond.
-
- Voice cannot be changed during the session once the model has responded with
- audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
- `coral`, `echo`, `sage`, `shimmer`, and `verse`.
- """
+__all__ = ["ResponseCreateEventParam"]
class ResponseCreateEventParam(TypedDict, total=False):
@@ -129,5 +16,5 @@ class ResponseCreateEventParam(TypedDict, total=False):
event_id: str
"""Optional client-generated ID used to identify this event."""
- response: Response
+ response: RealtimeResponseCreateParamsParam
"""Create a new Realtime response with these parameters"""
src/openai/types/realtime/session_created_event.py
@@ -1,19 +1,23 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing_extensions import Literal
+from typing import Union
+from typing_extensions import Literal, TypeAlias
from ..._models import BaseModel
-from .realtime_session import RealtimeSession
+from .realtime_session_create_request import RealtimeSessionCreateRequest
+from .realtime_transcription_session_create_request import RealtimeTranscriptionSessionCreateRequest
-__all__ = ["SessionCreatedEvent"]
+__all__ = ["SessionCreatedEvent", "Session"]
+
+Session: TypeAlias = Union[RealtimeSessionCreateRequest, RealtimeTranscriptionSessionCreateRequest]
class SessionCreatedEvent(BaseModel):
event_id: str
"""The unique ID of the server event."""
- session: RealtimeSession
- """Realtime session object."""
+ session: Session
+ """The session configuration."""
type: Literal["session.created"]
"""The event type, must be `session.created`."""
src/openai/types/realtime/session_update_event.py
@@ -1,20 +1,31 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import Optional
-from typing_extensions import Literal
+from typing import Union, Optional
+from typing_extensions import Literal, TypeAlias
from ..._models import BaseModel
from .realtime_session_create_request import RealtimeSessionCreateRequest
+from .realtime_transcription_session_create_request import RealtimeTranscriptionSessionCreateRequest
-__all__ = ["SessionUpdateEvent"]
+__all__ = ["SessionUpdateEvent", "Session"]
+
+Session: TypeAlias = Union[RealtimeSessionCreateRequest, RealtimeTranscriptionSessionCreateRequest]
class SessionUpdateEvent(BaseModel):
- session: RealtimeSessionCreateRequest
- """Realtime session object configuration."""
+ session: Session
+ """Update the Realtime session.
+
+ Choose either a realtime session or a transcription session.
+ """
type: Literal["session.update"]
"""The event type, must be `session.update`."""
event_id: Optional[str] = None
- """Optional client-generated ID used to identify this event."""
+ """Optional client-generated ID used to identify this event.
+
+ This is an arbitrary string that a client may assign. It will be passed back if
+ there is an error with the event, but the corresponding `session.updated` event
+ will not include it.
+ """
src/openai/types/realtime/session_update_event_param.py
@@ -2,19 +2,31 @@
from __future__ import annotations
-from typing_extensions import Literal, Required, TypedDict
+from typing import Union
+from typing_extensions import Literal, Required, TypeAlias, TypedDict
from .realtime_session_create_request_param import RealtimeSessionCreateRequestParam
+from .realtime_transcription_session_create_request_param import RealtimeTranscriptionSessionCreateRequestParam
-__all__ = ["SessionUpdateEventParam"]
+__all__ = ["SessionUpdateEventParam", "Session"]
+
+Session: TypeAlias = Union[RealtimeSessionCreateRequestParam, RealtimeTranscriptionSessionCreateRequestParam]
class SessionUpdateEventParam(TypedDict, total=False):
- session: Required[RealtimeSessionCreateRequestParam]
- """Realtime session object configuration."""
+ session: Required[Session]
+ """Update the Realtime session.
+
+ Choose either a realtime session or a transcription session.
+ """
type: Required[Literal["session.update"]]
"""The event type, must be `session.update`."""
event_id: str
- """Optional client-generated ID used to identify this event."""
+ """Optional client-generated ID used to identify this event.
+
+ This is an arbitrary string that a client may assign. It will be passed back if
+ there is an error with the event, but the corresponding `session.updated` event
+ will not include it.
+ """
src/openai/types/realtime/session_updated_event.py
@@ -1,19 +1,23 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing_extensions import Literal
+from typing import Union
+from typing_extensions import Literal, TypeAlias
from ..._models import BaseModel
-from .realtime_session import RealtimeSession
+from .realtime_session_create_request import RealtimeSessionCreateRequest
+from .realtime_transcription_session_create_request import RealtimeTranscriptionSessionCreateRequest
-__all__ = ["SessionUpdatedEvent"]
+__all__ = ["SessionUpdatedEvent", "Session"]
+
+Session: TypeAlias = Union[RealtimeSessionCreateRequest, RealtimeTranscriptionSessionCreateRequest]
class SessionUpdatedEvent(BaseModel):
event_id: str
"""The unique ID of the server event."""
- session: RealtimeSession
- """Realtime session object."""
+ session: Session
+ """The session configuration."""
type: Literal["session.updated"]
"""The event type, must be `session.updated`."""
src/openai/types/realtime/transcription_session_created.py
@@ -1,105 +1,24 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import List, Optional
from typing_extensions import Literal
from ..._models import BaseModel
+from .realtime_transcription_session_create_response import RealtimeTranscriptionSessionCreateResponse
-__all__ = [
- "TranscriptionSessionCreated",
- "Session",
- "SessionAudio",
- "SessionAudioInput",
- "SessionAudioInputNoiseReduction",
- "SessionAudioInputTranscription",
- "SessionAudioInputTurnDetection",
-]
-
-
-class SessionAudioInputNoiseReduction(BaseModel):
- type: Optional[Literal["near_field", "far_field"]] = None
-
-
-class SessionAudioInputTranscription(BaseModel):
- language: Optional[str] = None
- """The language of the input audio.
-
- Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- """
-
- model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None
- """The model to use for transcription.
-
- Can be `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, or `whisper-1`.
- """
-
- prompt: Optional[str] = None
- """An optional text to guide the model's style or continue a previous audio
- segment.
-
- The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
- """
-
-
-class SessionAudioInputTurnDetection(BaseModel):
- prefix_padding_ms: Optional[int] = None
-
- silence_duration_ms: Optional[int] = None
-
- threshold: Optional[float] = None
-
- type: Optional[str] = None
- """Type of turn detection, only `server_vad` is currently supported."""
-
-
-class SessionAudioInput(BaseModel):
- format: Optional[str] = None
- """The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`."""
-
- noise_reduction: Optional[SessionAudioInputNoiseReduction] = None
- """Configuration for input audio noise reduction."""
-
- transcription: Optional[SessionAudioInputTranscription] = None
- """Configuration of the transcription model."""
-
- turn_detection: Optional[SessionAudioInputTurnDetection] = None
- """Configuration for turn detection."""
-
-
-class SessionAudio(BaseModel):
- input: Optional[SessionAudioInput] = None
-
-
-class Session(BaseModel):
- id: Optional[str] = None
- """Unique identifier for the session that looks like `sess_1234567890abcdef`."""
-
- audio: Optional[SessionAudio] = None
- """Configuration for input audio for the session."""
-
- expires_at: Optional[int] = None
- """Expiration timestamp for the session, in seconds since epoch."""
-
- include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None
- """Additional fields to include in server outputs.
-
- - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- transcription.
- """
-
- object: Optional[str] = None
- """The object type. Always `realtime.transcription_session`."""
+__all__ = ["TranscriptionSessionCreated"]
class TranscriptionSessionCreated(BaseModel):
event_id: str
"""The unique ID of the server event."""
- session: Session
- """A Realtime transcription session configuration object."""
+ session: RealtimeTranscriptionSessionCreateResponse
+ """A new Realtime transcription session configuration.
+
+ When a session is created on the server via REST API, the session object also
+ contains an ephemeral key. Default TTL for keys is 10 minutes. This property is
+ not present when a session is updated via the WebSocket API.
+ """
type: Literal["transcription_session.created"]
"""The event type, must be `transcription_session.created`."""
src/openai/types/realtime/transcription_session_update.py
@@ -1,16 +1,94 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import Optional
+from typing import List, Optional
from typing_extensions import Literal
from ..._models import BaseModel
-from .realtime_transcription_session_create_request import RealtimeTranscriptionSessionCreateRequest
+from .audio_transcription import AudioTranscription
+from .noise_reduction_type import NoiseReductionType
-__all__ = ["TranscriptionSessionUpdate"]
+__all__ = ["TranscriptionSessionUpdate", "Session", "SessionInputAudioNoiseReduction", "SessionTurnDetection"]
+
+
+class SessionInputAudioNoiseReduction(BaseModel):
+ type: Optional[NoiseReductionType] = None
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
+
+
+class SessionTurnDetection(BaseModel):
+ prefix_padding_ms: Optional[int] = None
+ """Amount of audio to include before the VAD detected speech (in milliseconds).
+
+ Defaults to 300ms.
+ """
+
+ silence_duration_ms: Optional[int] = None
+ """Duration of silence to detect speech stop (in milliseconds).
+
+ Defaults to 500ms. With shorter values the model will respond more quickly, but
+ may jump in on short pauses from the user.
+ """
+
+ threshold: Optional[float] = None
+ """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+
+ A higher threshold will require louder audio to activate the model, and thus
+ might perform better in noisy environments.
+ """
+
+ type: Optional[Literal["server_vad"]] = None
+ """Type of turn detection.
+
+ Only `server_vad` is currently supported for transcription sessions.
+ """
+
+
+class Session(BaseModel):
+ include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None
+ """The set of items to include in the transcription.
+
+ Current available items are: `item.input_audio_transcription.logprobs`
+ """
+
+ input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
+ """The format of input audio.
+
+ Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
+ be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
+ byte order.
+ """
+
+ input_audio_noise_reduction: Optional[SessionInputAudioNoiseReduction] = None
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
+ input_audio_transcription: Optional[AudioTranscription] = None
+ """Configuration for input audio transcription.
+
+ The client can optionally set the language and prompt for transcription, these
+ offer additional guidance to the transcription service.
+ """
+
+ turn_detection: Optional[SessionTurnDetection] = None
+ """Configuration for turn detection.
+
+ Can be set to `null` to turn off. Server VAD means that the model will detect
+ the start and end of speech based on audio volume and respond at the end of user
+ speech.
+ """
class TranscriptionSessionUpdate(BaseModel):
- session: RealtimeTranscriptionSessionCreateRequest
+ session: Session
"""Realtime transcription session object configuration."""
type: Literal["transcription_session.update"]
src/openai/types/realtime/transcription_session_update_param.py
@@ -2,15 +2,94 @@
from __future__ import annotations
+from typing import List
from typing_extensions import Literal, Required, TypedDict
-from .realtime_transcription_session_create_request_param import RealtimeTranscriptionSessionCreateRequestParam
+from .noise_reduction_type import NoiseReductionType
+from .audio_transcription_param import AudioTranscriptionParam
-__all__ = ["TranscriptionSessionUpdateParam"]
+__all__ = ["TranscriptionSessionUpdateParam", "Session", "SessionInputAudioNoiseReduction", "SessionTurnDetection"]
+
+
+class SessionInputAudioNoiseReduction(TypedDict, total=False):
+ type: NoiseReductionType
+ """Type of noise reduction.
+
+ `near_field` is for close-talking microphones such as headphones, `far_field` is
+ for far-field microphones such as laptop or conference room microphones.
+ """
+
+
+class SessionTurnDetection(TypedDict, total=False):
+ prefix_padding_ms: int
+ """Amount of audio to include before the VAD detected speech (in milliseconds).
+
+ Defaults to 300ms.
+ """
+
+ silence_duration_ms: int
+ """Duration of silence to detect speech stop (in milliseconds).
+
+ Defaults to 500ms. With shorter values the model will respond more quickly, but
+ may jump in on short pauses from the user.
+ """
+
+ threshold: float
+ """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5.
+
+ A higher threshold will require louder audio to activate the model, and thus
+ might perform better in noisy environments.
+ """
+
+ type: Literal["server_vad"]
+ """Type of turn detection.
+
+ Only `server_vad` is currently supported for transcription sessions.
+ """
+
+
+class Session(TypedDict, total=False):
+ include: List[Literal["item.input_audio_transcription.logprobs"]]
+ """The set of items to include in the transcription.
+
+ Current available items are: `item.input_audio_transcription.logprobs`
+ """
+
+ input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"]
+ """The format of input audio.
+
+ Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
+ be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
+ byte order.
+ """
+
+ input_audio_noise_reduction: SessionInputAudioNoiseReduction
+ """Configuration for input audio noise reduction.
+
+ This can be set to `null` to turn off. Noise reduction filters audio added to
+ the input audio buffer before it is sent to VAD and the model. Filtering the
+ audio can improve VAD and turn detection accuracy (reducing false positives) and
+ model performance by improving perception of the input audio.
+ """
+
+ input_audio_transcription: AudioTranscriptionParam
+ """Configuration for input audio transcription.
+
+ The client can optionally set the language and prompt for transcription, these
+ offer additional guidance to the transcription service.
+ """
+
+ turn_detection: SessionTurnDetection
+ """Configuration for turn detection.
+
+ Can be set to `null` to turn off. Server VAD means that the model will detect
+ the start and end of speech based on audio volume and respond at the end of user
+ speech.
+ """
class TranscriptionSessionUpdateParam(TypedDict, total=False):
- session: Required[RealtimeTranscriptionSessionCreateRequestParam]
+ session: Required[Session]
"""Realtime transcription session object configuration."""
type: Required[Literal["transcription_session.update"]]
src/openai/types/realtime/transcription_session_updated_event.py
@@ -1,105 +1,24 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import List, Optional
from typing_extensions import Literal
from ..._models import BaseModel
+from .realtime_transcription_session_create_response import RealtimeTranscriptionSessionCreateResponse
-__all__ = [
- "TranscriptionSessionUpdatedEvent",
- "Session",
- "SessionAudio",
- "SessionAudioInput",
- "SessionAudioInputNoiseReduction",
- "SessionAudioInputTranscription",
- "SessionAudioInputTurnDetection",
-]
-
-
-class SessionAudioInputNoiseReduction(BaseModel):
- type: Optional[Literal["near_field", "far_field"]] = None
-
-
-class SessionAudioInputTranscription(BaseModel):
- language: Optional[str] = None
- """The language of the input audio.
-
- Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- """
-
- model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None
- """The model to use for transcription.
-
- Can be `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, or `whisper-1`.
- """
-
- prompt: Optional[str] = None
- """An optional text to guide the model's style or continue a previous audio
- segment.
-
- The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
- """
-
-
-class SessionAudioInputTurnDetection(BaseModel):
- prefix_padding_ms: Optional[int] = None
-
- silence_duration_ms: Optional[int] = None
-
- threshold: Optional[float] = None
-
- type: Optional[str] = None
- """Type of turn detection, only `server_vad` is currently supported."""
-
-
-class SessionAudioInput(BaseModel):
- format: Optional[str] = None
- """The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`."""
-
- noise_reduction: Optional[SessionAudioInputNoiseReduction] = None
- """Configuration for input audio noise reduction."""
-
- transcription: Optional[SessionAudioInputTranscription] = None
- """Configuration of the transcription model."""
-
- turn_detection: Optional[SessionAudioInputTurnDetection] = None
- """Configuration for turn detection."""
-
-
-class SessionAudio(BaseModel):
- input: Optional[SessionAudioInput] = None
-
-
-class Session(BaseModel):
- id: Optional[str] = None
- """Unique identifier for the session that looks like `sess_1234567890abcdef`."""
-
- audio: Optional[SessionAudio] = None
- """Configuration for input audio for the session."""
-
- expires_at: Optional[int] = None
- """Expiration timestamp for the session, in seconds since epoch."""
-
- include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None
- """Additional fields to include in server outputs.
-
- - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- transcription.
- """
-
- object: Optional[str] = None
- """The object type. Always `realtime.transcription_session`."""
+__all__ = ["TranscriptionSessionUpdatedEvent"]
class TranscriptionSessionUpdatedEvent(BaseModel):
event_id: str
"""The unique ID of the server event."""
- session: Session
- """A Realtime transcription session configuration object."""
+ session: RealtimeTranscriptionSessionCreateResponse
+ """A new Realtime transcription session configuration.
+
+ When a session is created on the server via REST API, the session object also
+ contains an ephemeral key. Default TTL for keys is 10 minutes. This property is
+ not present when a session is updated via the WebSocket API.
+ """
type: Literal["transcription_session.updated"]
"""The event type, must be `transcription_session.updated`."""
src/openai/_version.py
@@ -1,4 +1,4 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
__title__ = "openai"
-__version__ = "1.106.1" # x-release-please-version
+__version__ = "1.107.0" # x-release-please-version
tests/api_resources/realtime/test_client_secrets.py
@@ -30,11 +30,13 @@ class TestClientSecrets:
"seconds": 10,
},
session={
- "model": "string",
"type": "realtime",
"audio": {
"input": {
- "format": "pcm16",
+ "format": {
+ "rate": 24000,
+ "type": "audio/pcm",
+ },
"noise_reduction": {"type": "near_field"},
"transcription": {
"language": "language",
@@ -53,27 +55,24 @@ class TestClientSecrets:
},
},
"output": {
- "format": "pcm16",
+ "format": {
+ "rate": 24000,
+ "type": "audio/pcm",
+ },
"speed": 0.25,
"voice": "ash",
},
},
- "client_secret": {
- "expires_after": {
- "anchor": "created_at",
- "seconds": 0,
- }
- },
"include": ["item.input_audio_transcription.logprobs"],
"instructions": "instructions",
"max_output_tokens": 0,
+ "model": "string",
"output_modalities": ["text"],
"prompt": {
"id": "id",
"variables": {"foo": "string"},
"version": "version",
},
- "temperature": 0,
"tool_choice": "none",
"tools": [
{
@@ -128,11 +127,13 @@ class TestAsyncClientSecrets:
"seconds": 10,
},
session={
- "model": "string",
"type": "realtime",
"audio": {
"input": {
- "format": "pcm16",
+ "format": {
+ "rate": 24000,
+ "type": "audio/pcm",
+ },
"noise_reduction": {"type": "near_field"},
"transcription": {
"language": "language",
@@ -151,27 +152,24 @@ class TestAsyncClientSecrets:
},
},
"output": {
- "format": "pcm16",
+ "format": {
+ "rate": 24000,
+ "type": "audio/pcm",
+ },
"speed": 0.25,
"voice": "ash",
},
},
- "client_secret": {
- "expires_after": {
- "anchor": "created_at",
- "seconds": 0,
- }
- },
"include": ["item.input_audio_transcription.logprobs"],
"instructions": "instructions",
"max_output_tokens": 0,
+ "model": "string",
"output_modalities": ["text"],
"prompt": {
"id": "id",
"variables": {"foo": "string"},
"version": "version",
},
- "temperature": 0,
"tool_choice": "none",
"tools": [
{
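
The updated test payload above reflects the GA shape: audio formats are objects such as `{"type": "audio/pcm", "rate": 24000}` rather than `"pcm16"` strings, `model` is a plain session field, and the session-level `client_secret` and `temperature` blocks are gone. A minimal sketch of the corresponding call; the model name and expiry values are illustrative:

```python
from openai import OpenAI

client = OpenAI()

# Minimal client-secret creation mirroring the new session shape used in the tests.
secret = client.realtime.client_secrets.create(
    expires_after={"anchor": "created_at", "seconds": 600},  # illustrative expiry
    session={
        "type": "realtime",
        "model": "gpt-realtime",  # illustrative model name
        "audio": {
            "output": {
                "format": {"type": "audio/pcm", "rate": 24000},
                "voice": "ash",
            }
        },
    },
)
print(secret)  # inspect the ClientSecretCreateResponse for the ephemeral key
```
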
.release-please-manifest.json
@@ -1,3 +1,3 @@
{
- ".": "1.106.1"
+ ".": "1.107.0"
}
\ No newline at end of file
.stats.yml
@@ -1,4 +1,4 @@
configured_endpoints: 118
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-51afd6abbcb18c3086f62993f9379c18443b9e516cbc0548ddfb932e835657f8.yml
-openapi_spec_hash: dae6afeaefa15cb8700c7a870531e06f
-config_hash: b854932c0ea24b400bdd64e4376936bd
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-7807ec6037efcee1af7decbfd3974a42b761fb6c6a71b4050fe43484d7fcbac4.yml
+openapi_spec_hash: da6851e3891ad2659a50ed6a736fd32a
+config_hash: 74d955cdc2377213f5268ea309090f6c
api.md
@@ -863,6 +863,7 @@ Types:
```python
from openai.types.realtime import (
+ AudioTranscription,
ConversationCreatedEvent,
ConversationItem,
ConversationItemAdded,
@@ -891,11 +892,16 @@ from openai.types.realtime import (
McpListToolsCompleted,
McpListToolsFailed,
McpListToolsInProgress,
+ Models,
+ NoiseReductionType,
OutputAudioBufferClearEvent,
RateLimitsUpdatedEvent,
RealtimeAudioConfig,
+ RealtimeAudioConfigInput,
+ RealtimeAudioConfigOutput,
+ RealtimeAudioFormats,
+ RealtimeAudioInputTurnDetection,
RealtimeClientEvent,
- RealtimeClientSecretConfig,
RealtimeConversationItemAssistantMessage,
RealtimeConversationItemFunctionCall,
RealtimeConversationItemFunctionCallOutput,
@@ -911,6 +917,9 @@ from openai.types.realtime import (
RealtimeMcpToolExecutionError,
RealtimeMcphttpError,
RealtimeResponse,
+ RealtimeResponseCreateAudioOutput,
+ RealtimeResponseCreateMcpTool,
+ RealtimeResponseCreateParams,
RealtimeResponseStatus,
RealtimeResponseUsage,
RealtimeResponseUsageInputTokenDetails,
@@ -922,8 +931,12 @@ from openai.types.realtime import (
RealtimeToolsConfig,
RealtimeToolsConfigUnion,
RealtimeTracingConfig,
+ RealtimeTranscriptionSessionAudio,
+ RealtimeTranscriptionSessionAudioInput,
+ RealtimeTranscriptionSessionAudioInputTurnDetection,
RealtimeTranscriptionSessionCreateRequest,
RealtimeTruncation,
+ RealtimeTruncationRetentionRatio,
ResponseAudioDeltaEvent,
ResponseAudioDoneEvent,
ResponseAudioTranscriptDeltaEvent,
@@ -959,7 +972,15 @@ from openai.types.realtime import (
Types:
```python
-from openai.types.realtime import RealtimeSessionCreateResponse, ClientSecretCreateResponse
+from openai.types.realtime import (
+ RealtimeSessionClientSecret,
+ RealtimeSessionCreateResponse,
+ RealtimeTranscriptionSessionClientSecret,
+ RealtimeTranscriptionSessionCreateResponse,
+ RealtimeTranscriptionSessionInputAudioTranscription,
+ RealtimeTranscriptionSessionTurnDetection,
+ ClientSecretCreateResponse,
+)
```
Methods:
CHANGELOG.md
@@ -1,5 +1,18 @@
# Changelog
+## 1.107.0 (2025-09-08)
+
+Full Changelog: [v1.106.1...v1.107.0](https://github.com/openai/openai-python/compare/v1.106.1...v1.107.0)
+
+### Features
+
+* **api:** ship the RealtimeGA API shape ([dc319d8](https://github.com/openai/openai-python/commit/dc319d8bbb3a20108399c1d15f98e63bdd84eb5c))
+
+
+### Chores
+
+* **internal:** codegen related update ([b79b7ca](https://github.com/openai/openai-python/commit/b79b7ca3a72009a036db0a344b500f616ca0443f))
+
## 1.106.1 (2025-09-04)
Full Changelog: [v1.106.0...v1.106.1](https://github.com/openai/openai-python/compare/v1.106.0...v1.106.1)
pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "openai"
-version = "1.106.1"
+version = "1.107.0"
description = "The official Python library for the openai API"
dynamic = ["readme"]
license = "Apache-2.0"
requirements-dev.lock
@@ -70,7 +70,7 @@ filelock==3.12.4
frozenlist==1.7.0
# via aiohttp
# via aiosignal
-griffe==1.13.0
+griffe==1.14.0
h11==0.16.0
# via httpcore
httpcore==1.0.9