Commit 8dcfe8b2
Changed files (1)
src/openai/resources/audio/transcriptions.py
@@ -69,9 +69,10 @@ class Transcriptions(SyncAPIResource):
model: Union[str, AudioModel],
chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
include: List[TranscriptionInclude] | Omit = omit,
- response_format: Union[Literal["json"], Omit] = omit,
language: str | Omit = omit,
prompt: str | Omit = omit,
+ response_format: Union[Literal["json"], Omit] = omit,
+ stream: Optional[Literal[False]] | Omit = omit,
temperature: float | Omit = omit,
timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
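For context, this first hunk reorders `response_format` and introduces `stream` on the non-streaming overload. A minimal usage sketch of that signature follows; the client setup, the `speech.mp3` file name, and the model choice are illustrative assumptions, not part of this commit.

from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

with open("speech.mp3", "rb") as audio_file:  # hypothetical input file
    transcription = client.audio.transcriptions.create(
        model="gpt-4o-transcribe",
        file=audio_file,
        response_format="json",  # the only format the gpt-4o-* transcribe models support
    )
print(transcription.text)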
@@ -80,7 +81,68 @@ class Transcriptions(SyncAPIResource):
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> Transcription: ...
+ ) -> TranscriptionCreateResponse:
+ """
+ Transcribes audio into the input language.
+
+ Args:
+ file:
+ The audio file object (not file name) to transcribe, in one of these formats:
+ flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+
+ model: ID of the model to use. The options are `gpt-4o-transcribe`,
+ `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+ Whisper V2 model).
+
+ chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
+ first normalizes loudness and then uses voice activity detection (VAD) to choose
+ boundaries. A `server_vad` object can be provided to tweak VAD detection
+ parameters manually. If unset, the audio is transcribed as a single block.
+
+ include: Additional information to include in the transcription response. `logprobs` will
+ return the log probabilities of the tokens in the response to understand the
+ model's confidence in the transcription. `logprobs` only works with
+ response_format set to `json` and only with the models `gpt-4o-transcribe` and
+ `gpt-4o-mini-transcribe`.
+
+ language: The language of the input audio. Supplying the input language in
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ format will improve accuracy and latency.
+
+ prompt: An optional text to guide the model's style or continue a previous audio
+ segment. The
+ [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+ should match the audio language.
+
+ response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
+ `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+ the only supported format is `json`.
+
+ stream: If set to true, the model response data will be streamed to the client as it is
+ generated using
+ [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+ See the
+ [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+ for more information.
+
+ Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+ temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
+ output more random, while lower values like 0.2 will make it more focused and
+ deterministic. If set to 0, the model will use
+ [log probability](https://en.wikipedia.org/wiki/Log_probability) to
+ automatically increase the temperature until certain thresholds are hit.
+
+ timestamp_granularities: The timestamp granularities to populate for this transcription.
+ `response_format` must be set to `verbose_json` to use timestamp granularities.
+ Either or both of these options are supported: `word` or `segment`. Note: There
+ is no additional latency for segment timestamps, but generating word timestamps
+ incurs additional latency.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+ """
@overload
def create(
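The docstring above describes the new `stream` parameter in terms of server-sent events. Below is a sketch of how a caller might consume the streaming variant, assuming the `transcript.text.delta` / `transcript.text.done` event types described in the Speech-to-Text streaming guide; the file name and model are again placeholders.

from openai import OpenAI

client = OpenAI()

with open("speech.mp3", "rb") as audio_file:  # hypothetical input file
    stream = client.audio.transcriptions.create(
        model="gpt-4o-transcribe",
        file=audio_file,
        stream=True,  # ignored by whisper-1, per the docstring above
    )
    for event in stream:
        # Assumed event shape: incremental text deltas, then a final done event.
        if event.type == "transcript.text.delta":
            print(event.delta, end="", flush=True)
        elif event.type == "transcript.text.done":
            print()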
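Similarly, a sketch of the `timestamp_granularities` path: per the docstring it requires `response_format` set to `verbose_json`, which in practice means `whisper-1`, since the `gpt-4o-*` transcribe models only return `json`. The setup mirrors the sketch above and the file name remains a placeholder.

from openai import OpenAI

client = OpenAI()

with open("speech.mp3", "rb") as audio_file:  # hypothetical input file
    transcription = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        response_format="verbose_json",
        timestamp_granularities=["word", "segment"],
    )
# verbose_json responses carry per-word and per-segment timing details
print(transcription.words)
print(transcription.segments)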
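One more sketch combines the `chunking_strategy` and `include` options documented above; the `"auto"` chunking choice and the logprobs request illustrate a typical call under assumed inputs rather than anything this commit mandates.

from openai import OpenAI

client = OpenAI()

with open("speech.mp3", "rb") as audio_file:  # hypothetical input file
    transcription = client.audio.transcriptions.create(
        model="gpt-4o-transcribe",
        file=audio_file,
        response_format="json",    # logprobs only work with json output
        include=["logprobs"],      # ask for per-token log probabilities
        chunking_strategy="auto",  # let the server pick VAD-based chunk boundaries
    )
print(transcription.text)
print(transcription.logprobs)  # assumed to be populated when "logprobs" is included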