# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

from typing import List, Union, Optional
from typing_extensions import Literal, TypeAlias

from ...._models import BaseModel

__all__ = [
    "Session",
    "InputAudioNoiseReduction",
    "InputAudioTranscription",
    "Tool",
    "Tracing",
    "TracingTracingConfiguration",
    "TurnDetection",
]


class InputAudioNoiseReduction(BaseModel):
    type: Optional[Literal["near_field", "far_field"]] = None
    """Type of noise reduction.

    `near_field` is for close-talking microphones such as headphones, `far_field` is
    for far-field microphones such as laptop or conference room microphones.
    """


class InputAudioTranscription(BaseModel):
    language: Optional[str] = None
    """The language of the input audio.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
    format will improve accuracy and latency.
    """

    model: Optional[str] = None
    """
    The model to use for transcription, current options are `gpt-4o-transcribe`,
    `gpt-4o-mini-transcribe`, and `whisper-1`.
    """

    prompt: Optional[str] = None
    """
    An optional text to guide the model's style or continue a previous audio
    segment. For `whisper-1`, the
    [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
    For `gpt-4o-transcribe` models, the prompt is a free text string, for example
    "expect words related to technology".
    """


class Tool(BaseModel):
    description: Optional[str] = None
    """
    The description of the function, including guidance on when and how to call it,
    and guidance about what to tell the user when calling (if anything).
    """

    name: Optional[str] = None
    """The name of the function."""

    parameters: Optional[object] = None
    """Parameters of the function in JSON Schema."""

    type: Optional[Literal["function"]] = None
    """The type of the tool, i.e. `function`."""


class TracingTracingConfiguration(BaseModel):
    group_id: Optional[str] = None
    """
    The group id to attach to this trace to enable filtering and grouping in the
    traces dashboard.
    """

    metadata: Optional[object] = None
    """
    The arbitrary metadata to attach to this trace to enable filtering in the traces
    dashboard.
    """

    workflow_name: Optional[str] = None
    """The name of the workflow to attach to this trace.

    This is used to name the trace in the traces dashboard.
    """


# Tracing is either the string "auto" (trace with default workflow name, group
# id, and metadata) or an explicit TracingTracingConfiguration.
Tracing: TypeAlias = Union[Literal["auto"], TracingTracingConfiguration]


class TurnDetection(BaseModel):
    create_response: Optional[bool] = None
    """
    Whether or not to automatically generate a response when a VAD stop event
    occurs.
    """

    eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
    """Used only for `semantic_vad` mode.

    The eagerness of the model to respond. `low` will wait longer for the user to
    continue speaking, `high` will respond more quickly. `auto` is the default and
    is equivalent to `medium`.
    """

    interrupt_response: Optional[bool] = None
    """
    Whether or not to automatically interrupt any ongoing response with output to
    the default conversation (i.e. `conversation` of `auto`) when a VAD start event
    occurs.
    """

    prefix_padding_ms: Optional[int] = None
    """Used only for `server_vad` mode.

    Amount of audio to include before the VAD detected speech (in milliseconds).
    Defaults to 300ms.
    """

    silence_duration_ms: Optional[int] = None
    """Used only for `server_vad` mode.

    Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
    With shorter values the model will respond more quickly, but may jump in on
    short pauses from the user.
    """

    threshold: Optional[float] = None
    """Used only for `server_vad` mode.

    Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
    threshold will require louder audio to activate the model, and thus might
    perform better in noisy environments.
    """

    type: Optional[Literal["server_vad", "semantic_vad"]] = None
    """Type of turn detection."""


class Session(BaseModel):
    id: Optional[str] = None
    """Unique identifier for the session that looks like `sess_1234567890abcdef`."""

    input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
    """The format of input audio.

    Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must
    be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian
    byte order.
    """

    input_audio_noise_reduction: Optional[InputAudioNoiseReduction] = None
    """Configuration for input audio noise reduction.

    This can be set to `null` to turn off. Noise reduction filters audio added to
    the input audio buffer before it is sent to VAD and the model. Filtering the
    audio can improve VAD and turn detection accuracy (reducing false positives) and
    model performance by improving perception of the input audio.
    """

    input_audio_transcription: Optional[InputAudioTranscription] = None
    """
    Configuration for input audio transcription, defaults to off and can be set to
    `null` to turn off once on. Input audio transcription is not native to the
    model, since the model consumes audio directly. Transcription runs
    asynchronously through
    [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
    and should be treated as guidance of input audio content rather than precisely
    what the model heard. The client can optionally set the language and prompt for
    transcription, these offer additional guidance to the transcription service.
    """

    instructions: Optional[str] = None
    """The default system instructions (i.e.

    system message) prepended to model calls. This field allows the client to guide
    the model on desired responses. The model can be instructed on response content
    and format, (e.g. "be extremely succinct", "act friendly", "here are examples of
    good responses") and on audio behavior (e.g. "talk quickly", "inject emotion
    into your voice", "laugh frequently"). The instructions are not guaranteed to be
    followed by the model, but they provide guidance to the model on the desired
    behavior.

    Note that the server sets default instructions which will be used if this field
    is not set and are visible in the `session.created` event at the start of the
    session.
    """

    max_response_output_tokens: Union[int, Literal["inf"], None] = None
    """
    Maximum number of output tokens for a single assistant response, inclusive of
    tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
    `inf` for the maximum available tokens for a given model. Defaults to `inf`.
    """

    modalities: Optional[List[Literal["text", "audio"]]] = None
    """The set of modalities the model can respond with.

    To disable audio, set this to ["text"].
    """

    model: Optional[
        Literal[
            "gpt-realtime",
            "gpt-realtime-2025-08-28",
            "gpt-4o-realtime-preview",
            "gpt-4o-realtime-preview-2024-10-01",
            "gpt-4o-realtime-preview-2024-12-17",
            "gpt-4o-realtime-preview-2025-06-03",
            "gpt-4o-mini-realtime-preview",
            "gpt-4o-mini-realtime-preview-2024-12-17",
        ]
    ] = None
    """The Realtime model used for this session."""

    output_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None
    """The format of output audio.

    Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, output audio is
    sampled at a rate of 24kHz.
    """

    speed: Optional[float] = None
    """The speed of the model's spoken response.

    1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
    This value can only be changed in between model turns, not while a response is
    in progress.
    """

    temperature: Optional[float] = None
    """Sampling temperature for the model, limited to [0.6, 1.2].

    For audio models a temperature of 0.8 is highly recommended for best
    performance.
    """

    tool_choice: Optional[str] = None
    """How the model chooses tools.

    Options are `auto`, `none`, `required`, or specify a function.
    """

    tools: Optional[List[Tool]] = None
    """Tools (functions) available to the model."""

    tracing: Optional[Tracing] = None
    """Configuration options for tracing.

    Set to null to disable tracing. Once tracing is enabled for a session, the
    configuration cannot be modified.

    `auto` will create a trace for the session with default values for the workflow
    name, group id, and metadata.
    """

    turn_detection: Optional[TurnDetection] = None
    """Configuration for turn detection, either Server VAD or Semantic VAD.

    This can be set to `null` to turn off, in which case the client must manually
    trigger model response. Server VAD means that the model will detect the start
    and end of speech based on audio volume and respond at the end of user speech.
    Semantic VAD is more advanced and uses a turn detection model (in conjunction
    with VAD) to semantically estimate whether the user has finished speaking, then
    dynamically sets a timeout based on this probability. For example, if user audio
    trails off with "uhhm", the model will score a low probability of turn end and
    wait longer for the user to continue speaking. This can be useful for more
    natural conversations, but may have a higher latency.
    """

    voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"], None] = None
    """The voice the model uses to respond.

    Voice cannot be changed during the session once the model has responded with
    audio at least once. Current voice options are `alloy`, `ash`, `ballad`,
    `coral`, `echo`, `sage`, `shimmer`, and `verse`.
    """
279 """