# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

from typing import List, Union, Optional
from typing_extensions import Literal, TypeAlias

from ...._models import BaseModel

__all__ = [
    "Session",
    "InputAudioNoiseReduction",
    "InputAudioTranscription",
    "Tool",
    "Tracing",
    "TracingTracingConfiguration",
    "TurnDetection",
]


class InputAudioNoiseReduction(BaseModel):
    type: Optional[Literal["near_field", "far_field"]] = None
    """Type of noise reduction.

    `near_field` is for close-talking microphones such as headphones, `far_field` is
    for far-field microphones such as laptop or conference room microphones.
    """
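
# Illustrative sketch (not part of the generated spec): selecting the noise
# reduction profile for a close-talking headset microphone; the chosen value is
# an assumption, not a default.
#
#   noise_reduction = InputAudioNoiseReduction(type="near_field")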


class InputAudioTranscription(BaseModel):
    language: Optional[str] = None
    """The language of the input audio.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
    format will improve accuracy and latency.
    """

    model: Optional[str] = None
    """
    The model to use for transcription; current options are `gpt-4o-transcribe`,
    `gpt-4o-mini-transcribe`, and `whisper-1`.
    """

    prompt: Optional[str] = None
    """
    An optional text to guide the model's style or continue a previous audio
    segment. For `whisper-1`, the
    [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
    "expect words related to technology".
    """
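
# Illustrative sketch (not part of the generated spec): a transcription config
# that pins the input language and nudges vocabulary. The values are assumptions
# for demonstration, not defaults.
#
#   transcription = InputAudioTranscription(
#       model="gpt-4o-transcribe",
#       language="en",
#       prompt="expect words related to technology",
#   )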


class Tool(BaseModel):
    description: Optional[str] = None
    """
    The description of the function, including guidance on when and how to call it,
    and guidance about what to tell the user when calling (if anything).
    """

    name: Optional[str] = None
    """The name of the function."""

    parameters: Optional[object] = None
    """Parameters of the function in JSON Schema."""

    type: Optional[Literal["function"]] = None
    """The type of the tool, i.e. `function`."""
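
# Illustrative sketch (not part of the generated spec): a function tool with a
# minimal JSON Schema. The function name and schema are assumptions.
#
#   get_weather = Tool(
#       type="function",
#       name="get_weather",
#       description="Look up the current weather for a city.",
#       parameters={
#           "type": "object",
#           "properties": {"city": {"type": "string"}},
#           "required": ["city"],
#       },
#   )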


class TracingTracingConfiguration(BaseModel):
    group_id: Optional[str] = None
    """
    The group id to attach to this trace to enable filtering and grouping in the
    traces dashboard.
    """

    metadata: Optional[object] = None
    """
    The arbitrary metadata to attach to this trace to enable filtering in the traces
    dashboard.
    """

    workflow_name: Optional[str] = None
    """The name of the workflow to attach to this trace.

    This is used to name the trace in the traces dashboard.
    """


Tracing: TypeAlias = Union[Literal["auto"], TracingTracingConfiguration]
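
# Illustrative sketch (not part of the generated spec): `Tracing` accepts either
# the literal "auto" or an explicit configuration. The workflow, group, and
# metadata values below are assumptions.
#
#   tracing: Tracing = "auto"
#   tracing = TracingTracingConfiguration(
#       workflow_name="support-call",
#       group_id="customer-1234",
#       metadata={"environment": "staging"},
#   )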


class TurnDetection(BaseModel):
    create_response: Optional[bool] = None
    """
    Whether or not to automatically generate a response when a VAD stop event
    occurs.
    """

    eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
    """Used only for `semantic_vad` mode.

    The eagerness of the model to respond. `low` will wait longer for the user to
    continue speaking, while `high` will respond more quickly. `auto` is the default
    and is equivalent to `medium`.
    """

    interrupt_response: Optional[bool] = None
    """
    Whether or not to automatically interrupt any ongoing response with output to
    the default conversation (i.e. `conversation` of `auto`) when a VAD start event
    occurs.
    """

    prefix_padding_ms: Optional[int] = None
    """Used only for `server_vad` mode.

    Amount of audio to include before the VAD detected speech (in milliseconds).
    Defaults to 300ms.
    """

    silence_duration_ms: Optional[int] = None
    """Used only for `server_vad` mode.

    Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
    With shorter values the model will respond more quickly, but may jump in on
    short pauses from the user.
    """

    threshold: Optional[float] = None
    """Used only for `server_vad` mode.

    Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
    threshold will require louder audio to activate the model, and thus might
    perform better in noisy environments.
    """

    type: Optional[Literal["server_vad", "semantic_vad"]] = None
    """Type of turn detection."""
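
# Illustrative sketches (not part of the generated spec): one server-VAD and one
# semantic-VAD turn-detection config. The numeric values are assumptions, not
# recommendations.
#
#   server_vad = TurnDetection(
#       type="server_vad",
#       threshold=0.5,
#       prefix_padding_ms=300,
#       silence_duration_ms=500,
#       create_response=True,
#   )
#   semantic_vad = TurnDetection(type="semantic_vad", eagerness="auto")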


class Session(BaseModel):
    id: Optional[str] = None
    """Unique identifier for the session that looks like `sess_1234567890abcdef`."""

    input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
    """The format of input audio.

    Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
    be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
    byte order.
    """

    input_audio_noise_reduction: Optional[InputAudioNoiseReduction] = None
    """Configuration for input audio noise reduction.

    This can be set to `null` to turn off. Noise reduction filters audio added to
    the input audio buffer before it is sent to VAD and the model. Filtering the
    audio can improve VAD and turn detection accuracy (reducing false positives) and
    model performance by improving perception of the input audio.
    """

    input_audio_transcription: Optional[InputAudioTranscription] = None
    """
    Configuration for input audio transcription. This defaults to off and can be set
    to `null` to turn off once on. Input audio transcription is not native to the
    model, since the model consumes audio directly. Transcription runs
    asynchronously through
    [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
    and should be treated as guidance of input audio content rather than precisely
    what the model heard. The client can optionally set the language and prompt for
    transcription; these offer additional guidance to the transcription service.
    """

    instructions: Optional[str] = None
    """The default system instructions (i.e. system message) prepended to model calls.

    This field allows the client to guide the model on desired responses. The model
    can be instructed on response content and format (e.g. "be extremely succinct",
    "act friendly", "here are examples of good responses") and on audio behavior
    (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
    instructions are not guaranteed to be followed by the model, but they provide
    guidance to the model on the desired behavior.

    Note that the server sets default instructions which will be used if this field
    is not set and are visible in the `session.created` event at the start of the
    session.
    """

    max_response_output_tokens: Union[int, Literal["inf"], None] = None
    """
    Maximum number of output tokens for a single assistant response, inclusive of
    tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
    `inf` for the maximum available tokens for a given model. Defaults to `inf`.
    """

    modalities: Optional[List[Literal["text", "audio"]]] = None
    """The set of modalities the model can respond with.

    To disable audio, set this to ["text"].
    """

    model: Optional[
        Literal[
            "gpt-realtime",
            "gpt-realtime-2025-08-28",
            "gpt-4o-realtime-preview",
            "gpt-4o-realtime-preview-2024-10-01",
            "gpt-4o-realtime-preview-2024-12-17",
            "gpt-4o-realtime-preview-2025-06-03",
            "gpt-4o-mini-realtime-preview",
            "gpt-4o-mini-realtime-preview-2024-12-17",
        ]
    ] = None
    """The Realtime model used for this session."""

    output_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
    """The format of output audio.

    Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, output audio is
    sampled at a rate of 24kHz.
    """

    speed: Optional[float] = None
    """The speed of the model's spoken response.

    1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
    This value can only be changed in between model turns, not while a response is
    in progress.
    """

    temperature: Optional[float] = None
    """Sampling temperature for the model, limited to [0.6, 1.2].

    For audio models a temperature of 0.8 is highly recommended for best
    performance.
    """

    tool_choice: Optional[str] = None
    """How the model chooses tools.

    Options are `auto`, `none`, `required`, or specify a function.
    """

    tools: Optional[List[Tool]] = None
    """Tools (functions) available to the model."""

    tracing: Optional[Tracing] = None
    """Configuration options for tracing.

    Set to null to disable tracing. Once tracing is enabled for a session, the
    configuration cannot be modified.

    `auto` will create a trace for the session with default values for the workflow
    name, group id, and metadata.
    """

    turn_detection: Optional[TurnDetection] = None
    """Configuration for turn detection, either Server VAD or Semantic VAD.

    This can be set to `null` to turn off, in which case the client must manually
    trigger a model response. Server VAD means that the model will detect the start
    and end of speech based on audio volume and respond at the end of user speech.
    Semantic VAD is more advanced and uses a turn detection model (in conjunction
    with VAD) to semantically estimate whether the user has finished speaking, then
    dynamically sets a timeout based on this probability. For example, if user audio
    trails off with "uhhm", the model will score a low probability of turn end and
    wait longer for the user to continue speaking. This can be useful for more
    natural conversations, but may have a higher latency.
    """

    voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"], None] = None
    """The voice the model uses to respond.

    Voice cannot be changed during the session once the model has responded with
    audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
    `coral`, `echo`, `sage`, `shimmer`, and `verse`.
    """
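

# Illustrative sketch (not part of the generated spec): assembling a Session from
# the models above. The specific values are assumptions for demonstration only.
#
#   session = Session(
#       model="gpt-realtime",
#       modalities=["text", "audio"],
#       output_audio_format="pcm16",
#       input_audio_noise_reduction=InputAudioNoiseReduction(type="near_field"),
#       input_audio_transcription=InputAudioTranscription(model="gpt-4o-transcribe"),
#       turn_detection=TurnDetection(type="server_vad", threshold=0.5),
#       tracing="auto",
#       voice="alloy",
#   )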