Commit 8136a216
Changed files (11)
examples/audio.py
@@ -1,6 +1,5 @@
#!/usr/bin/env rye run python
-import time
from pathlib import Path
from openai import OpenAI
@@ -12,8 +11,6 @@ speech_file_path = Path(__file__).parent / "speech.mp3"
def main() -> None:
- stream_to_speakers()
-
# Create text-to-speech audio file
with openai.audio.speech.with_streaming_response.create(
model="tts-1",
@@ -37,28 +34,5 @@ def main() -> None:
print(translation.text)
-def stream_to_speakers() -> None:
- import pyaudio
-
- player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
-
- start_time = time.time()
-
- with openai.audio.speech.with_streaming_response.create(
- model="tts-1",
- voice="alloy",
- response_format="pcm", # similar to WAV, but without a header chunk at the start.
- input="""I see skies of blue and clouds of white
- The bright blessed days, the dark sacred nights
- And I think to myself
- What a wonderful world""",
- ) as response:
- print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
- for chunk in response.iter_bytes(chunk_size=1024):
- player_stream.write(chunk)
-
- print(f"Done in {int((time.time() - start_time) * 1000)}ms.")
-
-
if __name__ == "__main__":
main()
examples/speech_to_text.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env rye run python
+
+import asyncio
+
+from openai import AsyncOpenAI
+from openai.helpers import Microphone
+
+# gets OPENAI_API_KEY from your environment variables
+openai = AsyncOpenAI()
+
+
+async def main() -> None:
+    """Record ten seconds of microphone audio and print its "whisper-1" transcription."""
+    print("Recording for the next 10 seconds...")
+    microphone = Microphone(timeout=10)
+    recording = await microphone.record()
+    print("Recording complete")
+
+    # `record()` is called without `return_ndarray`, so `recording` is the
+    # file-upload value that is handed to the transcription endpoint as-is.
+    transcription = await openai.audio.transcriptions.create(
+        file=recording,
+        model="whisper-1",
+    )
+    print(transcription.text)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
examples/text_to_speech.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env rye run python
+
+import time
+import asyncio
+
+from openai import AsyncOpenAI
+from openai.helpers import LocalAudioPlayer
+
+# gets OPENAI_API_KEY from your environment variables
+openai = AsyncOpenAI()
+
+
+async def main() -> None:
+ # Request "tts-1" speech as raw PCM, play it with LocalAudioPlayer, and print two
+ # elapsed-time measurements taken relative to start_time (set just before the request).
+ start_time = time.time()
+
+ async with openai.audio.speech.with_streaming_response.create(
+ model="tts-1",
+ voice="alloy",
+ response_format="pcm", # similar to WAV, but without a header chunk at the start.
+ input="""I see skies of blue and clouds of white
+ The bright blessed days, the dark sacred nights
+ And I think to myself
+ What a wonderful world""",
+ ) as response:
+ # First statement after the streaming response object becomes available; nothing has been played yet.
+ print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
+ await LocalAudioPlayer().play(response)
+ # Elapsed time since start_time once play() has returned.
+ print(f"Time to play: {int((time.time() - start_time) * 1000)}ms")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
src/openai/helpers/__init__.py
@@ -0,0 +1,4 @@
+from .microphone import Microphone
+from .local_audio_player import LocalAudioPlayer
+
+__all__ = ["Microphone", "LocalAudioPlayer"]
src/openai/helpers/local_audio_player.py
@@ -0,0 +1,162 @@
+# mypy: ignore-errors
+import queue
+import asyncio
+from typing import Any, Union, Callable, AsyncGenerator, cast
+
+import numpy as np
+import sounddevice as sd # type: ignore
+import numpy.typing as npt
+
+from .. import _legacy_response
+from .._response import StreamedBinaryAPIResponse, AsyncStreamedBinaryAPIResponse
+
+SAMPLE_RATE = 24000
+
+
+class LocalAudioPlayer:
+    """Play audio through an ``sd.OutputStream`` opened at ``SAMPLE_RATE`` Hz.
+
+    Samples are played as float32; int16 input is scaled to float32 by
+    dividing by 32767.
+
+    Args:
+        should_stop: Optional zero-argument callable polled from the audio
+            callback of :meth:`play`. Playback ends as soon as it returns a
+            truthy value. :meth:`play_stream` does not consult it.
+    """
+
+    def __init__(
+        self,
+        should_stop: Union[Callable[[], bool], None] = None,
+    ):
+        self.channels = 1
+        self.dtype = np.float32
+        self.should_stop = should_stop
+
+    async def _tts_response_to_buffer(
+        self,
+        response: Union[
+            _legacy_response.HttpxBinaryResponseContent,
+            AsyncStreamedBinaryAPIResponse,
+            StreamedBinaryAPIResponse,
+        ],
+    ) -> npt.NDArray[np.float32]:
+        """Read the whole response body and decode it as int16 PCM.
+
+        Args:
+            response: A binary response whose ``iter_bytes`` yields the audio
+                bytes; the async variant is consumed with ``async for``.
+
+        Returns:
+            A float32 array of shape ``(frames, 1)``.
+        """
+        chunks: list[bytes]
+        if isinstance(response, (_legacy_response.HttpxBinaryResponseContent, StreamedBinaryAPIResponse)):
+            chunks = [chunk for chunk in response.iter_bytes(chunk_size=1024) if chunk]
+        else:
+            chunks = [chunk async for chunk in response.iter_bytes(chunk_size=1024) if chunk]
+
+        audio_bytes = b"".join(chunks)
+        audio_np = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0
+        return audio_np.reshape(-1, 1)
+
+    async def play(
+        self,
+        input: Union[
+            npt.NDArray[np.int16],
+            npt.NDArray[np.float32],
+            _legacy_response.HttpxBinaryResponseContent,
+            AsyncStreamedBinaryAPIResponse,
+            StreamedBinaryAPIResponse,
+        ],
+    ) -> None:
+        """Play a complete piece of audio and return once playback has stopped.
+
+        Args:
+            input: Either an int16/float32 array of samples or a binary
+                response, which is fully read and decoded as int16 PCM first.
+                A 1-D array is treated as ``(frames, self.channels)``.
+
+        Raises:
+            ValueError: If ``input`` is an array whose dtype is neither int16
+                nor float32.
+        """
+        audio_content: npt.NDArray[np.float32]
+        if isinstance(input, np.ndarray):
+            if input.dtype == np.int16 and self.dtype == np.float32:
+                audio_content = (input.astype(np.float32) / 32767.0).reshape(-1, self.channels)
+            elif input.dtype == np.float32:
+                audio_content = cast(npt.NDArray[np.float32], input)
+                # The stream below reads `shape[1]`, so a flat sample vector
+                # must be given an explicit channel axis (as the int16 path does).
+                if audio_content.ndim == 1:
+                    audio_content = audio_content.reshape(-1, self.channels)
+            else:
+                raise ValueError(f"Unsupported dtype: {input.dtype}")
+        else:
+            audio_content = await self._tts_response_to_buffer(input)
+
+        loop = asyncio.get_running_loop()
+        event = asyncio.Event()
+        idx = 0
+
+        def callback(
+            outdata: npt.NDArray[np.float32],
+            frame_count: int,
+            _time_info: Any,
+            _status: Any,
+        ):
+            nonlocal idx
+
+            remainder = len(audio_content) - idx
+            if remainder == 0 or (callable(self.should_stop) and self.should_stop()):
+                # Emit silence for this final block instead of whatever the
+                # buffer happened to contain.
+                outdata[:] = 0
+                # The callback is not invoked on the event-loop thread, hence
+                # the thread-safe hand-off.
+                loop.call_soon_threadsafe(event.set)
+                raise sd.CallbackStop
+            valid_frames = min(frame_count, remainder)
+            outdata[:valid_frames] = audio_content[idx : idx + valid_frames]
+            outdata[valid_frames:] = 0
+            idx += valid_frames
+
+        stream = sd.OutputStream(
+            samplerate=SAMPLE_RATE,
+            callback=callback,
+            dtype=audio_content.dtype,
+            channels=audio_content.shape[1],
+        )
+        with stream:
+            await event.wait()
+
+    async def play_stream(
+        self,
+        buffer_stream: AsyncGenerator[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None], None],
+    ) -> None:
+        """Play audio buffers as they are produced by an async generator.
+
+        Buffers are handed to the audio callback through a bounded queue
+        (50 entries). Playback ends when the generator is exhausted or yields
+        ``None``. If the generator raises, playback still terminates and the
+        exception is re-raised from this coroutine afterwards.
+
+        Args:
+            buffer_stream: Async generator of int16 or float32 sample arrays.
+                A 1-D array is treated as ``(frames, self.channels)``.
+        """
+        loop = asyncio.get_running_loop()
+        event = asyncio.Event()
+        buffer_queue: queue.Queue[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None]] = queue.Queue(maxsize=50)
+
+        async def buffer_producer():
+            try:
+                async for buffer in buffer_stream:
+                    if buffer is None:
+                        break
+                    # `Queue.put` blocks while the queue is full, so it runs in
+                    # the executor rather than on the event loop.
+                    await loop.run_in_executor(None, buffer_queue.put, buffer)
+            finally:
+                # Always signal completion: without the sentinel the callback
+                # would never set `event` and the caller would wait forever.
+                await loop.run_in_executor(None, buffer_queue.put, None)
+
+        # Playback cursor shared across callback invocations.
+        current_buffer = None
+        buffer_pos = 0
+
+        def callback(
+            outdata: npt.NDArray[np.float32],
+            frame_count: int,
+            _time_info: Any,
+            _status: Any,
+        ):
+            nonlocal current_buffer, buffer_pos
+
+            frames_written = 0
+            while frames_written < frame_count:
+                if current_buffer is None or buffer_pos >= len(current_buffer):
+                    try:
+                        current_buffer = buffer_queue.get(timeout=0.1)
+                    except queue.Empty:
+                        # Nothing ready yet: pad with silence and try again on
+                        # the next callback.
+                        outdata[frames_written:] = 0
+                        return
+
+                    if current_buffer is None:
+                        # End-of-stream sentinel: silence the rest of this
+                        # block before stopping.
+                        outdata[frames_written:] = 0
+                        loop.call_soon_threadsafe(event.set)
+                        raise sd.CallbackStop
+                    buffer_pos = 0
+
+                    if current_buffer.dtype == np.int16 and self.dtype == np.float32:
+                        current_buffer = (current_buffer.astype(np.float32) / 32767.0).reshape(-1, self.channels)
+                    elif current_buffer.ndim == 1:
+                        # `outdata` is (frames, channels); give flat buffers
+                        # the same layout so the slice assignment below works.
+                        current_buffer = current_buffer.reshape(-1, self.channels)
+
+                remaining_frames = len(current_buffer) - buffer_pos
+                frames_to_write = min(frame_count - frames_written, remaining_frames)
+                outdata[frames_written : frames_written + frames_to_write] = current_buffer[
+                    buffer_pos : buffer_pos + frames_to_write
+                ]
+                buffer_pos += frames_to_write
+                frames_written += frames_to_write
+
+        producer_task = asyncio.create_task(buffer_producer())
+
+        with sd.OutputStream(
+            samplerate=SAMPLE_RATE,
+            channels=self.channels,
+            dtype=self.dtype,
+            callback=callback,
+        ):
+            await event.wait()
+
+        # Surfaces any exception raised by the generator.
+        await producer_task
src/openai/helpers/microphone.py
@@ -0,0 +1,98 @@
+# mypy: ignore-errors
+import io
+import time
+import wave
+import asyncio
+from typing import Any, Type, Union, Generic, TypeVar, Callable, overload
+from typing_extensions import Literal
+
+import numpy as np
+import sounddevice as sd # type: ignore
+import numpy.typing as npt
+
+from openai._types import FileTypes, FileContent
+
+SAMPLE_RATE = 24000
+
+DType = TypeVar("DType", bound=np.generic)
+
+
+class Microphone(Generic[DType]):
+    """Record audio from an ``sd.InputStream`` opened at ``SAMPLE_RATE`` Hz.
+
+    Args:
+        channels: Number of input channels requested from the stream.
+        dtype: Sample dtype requested from the stream and used for the WAV
+            sample width.
+        should_record: Optional zero-argument callable polled on every audio
+            callback; recording stops the first time it returns a falsy value.
+        timeout: Optional limit in seconds; recording stops on the first audio
+            callback after this much time has elapsed.
+    """
+
+    def __init__(
+        self,
+        channels: int = 1,
+        dtype: Type[DType] = np.int16,
+        should_record: Union[Callable[[], bool], None] = None,
+        timeout: Union[float, None] = None,
+    ):
+        self.timeout = timeout
+        self.should_record = should_record
+        self.has_record_function = callable(should_record)
+        self.channels = channels
+        self.dtype = dtype
+        self.buffer_chunks = []
+
+    def _ndarray_to_wav(self, audio_data: npt.NDArray[DType]) -> FileTypes:
+        """Wrap raw samples in an in-memory WAV container.
+
+        Returns:
+            A ``("audio.wav", <BytesIO rewound to 0>, "audio/wav")`` tuple.
+        """
+        sample_width = np.dtype(self.dtype).itemsize
+        wav_bytes: FileContent = io.BytesIO()
+        with wave.open(wav_bytes, "w") as writer:
+            writer.setnchannels(self.channels)
+            writer.setsampwidth(sample_width)
+            writer.setframerate(SAMPLE_RATE)
+            writer.writeframes(audio_data.tobytes())
+        wav_bytes.seek(0)
+        return ("audio.wav", wav_bytes, "audio/wav")
+
+    @overload
+    async def record(self, return_ndarray: Literal[True]) -> npt.NDArray[DType]: ...
+
+    @overload
+    async def record(self, return_ndarray: Literal[False]) -> FileTypes: ...
+
+    @overload
+    async def record(self, return_ndarray: None = ...) -> FileTypes: ...
+
+    async def record(self, return_ndarray: Union[bool, None] = False) -> Union[npt.NDArray[DType], FileTypes]:
+        """Record until the timeout elapses or ``should_record`` returns falsy.
+
+        Captured blocks are also kept on ``self.buffer_chunks``, which is
+        reset at the start of every call.
+
+        Args:
+            return_ndarray: When truthy, return the concatenated samples as an
+                array; otherwise return them encoded as a WAV file tuple.
+
+        Returns:
+            The recorded samples, either as an ndarray or as the tuple built
+            by ``_ndarray_to_wav``. An empty array of ``self.dtype`` is used
+            when nothing was captured.
+        """
+        loop = asyncio.get_event_loop()
+        finished = asyncio.Event()
+        self.buffer_chunks: list[npt.NDArray[DType]] = []
+        started_at = time.perf_counter()
+
+        def on_audio(
+            indata: npt.NDArray[DType],
+            _frame_count: int,
+            _time_info: Any,
+            _status: Any,
+        ):
+            elapsed = time.perf_counter() - started_at
+            timed_out = self.timeout is not None and elapsed > self.timeout
+            # `should_record` is only polled when the timeout has not fired.
+            if timed_out or (callable(self.should_record) and not self.should_record()):
+                # The block delivered with the stopping callback is discarded.
+                loop.call_soon_threadsafe(finished.set)
+                raise sd.CallbackStop
+
+            # Copy the block before storing it.
+            self.buffer_chunks.append(indata.copy())
+
+        with sd.InputStream(
+            callback=on_audio,
+            dtype=self.dtype,
+            samplerate=SAMPLE_RATE,
+            channels=self.channels,
+        ):
+            await finished.wait()
+
+        # Join the captured blocks; fall back to an empty array when there are none.
+        recorded: npt.NDArray[DType]
+        if self.buffer_chunks:
+            recorded = np.concatenate(self.buffer_chunks, axis=0)
+        else:
+            recorded = np.array([], dtype=self.dtype)
+
+        if return_ndarray:
+            return recorded
+        return self._ndarray_to_wav(recorded)
src/openai/helpers.py
@@ -0,0 +1,4 @@
+from .helpers.microphone import Microphone
+from .helpers.local_audio_player import LocalAudioPlayer
+
+__all__ = ["LocalAudioPlayer", "Microphone"]
.gitignore
@@ -14,3 +14,7 @@ dist
.envrc
codegen.log
Brewfile.lock.json
+
+.DS_Store
+
+examples/*.mp3
pyproject.toml
@@ -16,6 +16,8 @@ dependencies = [
"sniffio",
"tqdm > 4",
"jiter>=0.4.0, <1",
+ "sounddevice>=0.5.1",
+ "numpy>=2.0.2",
]
requires-python = ">= 3.8"
classifiers = [
requirements-dev.lock
@@ -33,6 +33,7 @@ certifi==2023.7.22
# via requests
cffi==1.16.0
# via cryptography
+ # via sounddevice
charset-normalizer==3.3.2
# via requests
click==8.1.7
@@ -92,7 +93,7 @@ nest-asyncio==1.6.0
nodeenv==1.8.0
# via pyright
nox==2023.4.22
-numpy==1.26.3
+numpy==2.0.2
# via openai
# via pandas
# via pandas-stubs
@@ -102,7 +103,7 @@ packaging==23.2
# via black
# via nox
# via pytest
-pandas==2.1.4
+pandas==2.2.3
# via openai
pandas-stubs==2.1.4.231227
# via openai
@@ -154,6 +155,8 @@ sniffio==1.3.0
# via trio
sortedcontainers==2.4.0
# via trio
+sounddevice==0.5.1
+ # via openai
time-machine==2.9.0
toml==0.10.2
# via inline-snapshot
requirements.lock
@@ -18,6 +18,8 @@ anyio==4.1.0
certifi==2023.7.22
# via httpcore
# via httpx
+cffi==1.17.1
+ # via sounddevice
distro==1.8.0
# via openai
exceptiongroup==1.2.2
@@ -41,6 +43,8 @@ pandas==2.2.3
# via openai
pandas-stubs==2.2.2.240807
# via openai
+pycparser==2.22
+ # via cffi
pydantic==2.10.3
# via openai
pydantic-core==2.27.1
@@ -54,6 +58,8 @@ six==1.16.0
sniffio==1.3.0
# via anyio
# via openai
+sounddevice==0.5.1
+ # via openai
tqdm==4.66.5
# via openai
types-pytz==2024.2.0.20241003