Merge branch 'develop' into 114-improve-transcript-accuracy
src/reachy_mini_conversation_app/console.py
CHANGED
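Summary of this file's change: the capture loop stops hard-coding a 16 kHz microphone rate and instead tags each frame with the robot's actual input sample rate, while the playback path squeezes, casts to float32, and resamples handler output to the device's output rate with scipy.signal.resample. That also makes the old resample warm-up hack in __init__ unnecessary, so it is removed.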
@@ -8,8 +8,8 @@ import asyncio
 import logging
 from typing import List
 
-from fastrtc import AdditionalOutputs, audio_to_int16
-from librosa import resample
+from fastrtc import AdditionalOutputs, audio_to_float32
+from scipy.signal import resample
 
 from reachy_mini import ReachyMini
 from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler

@@ -30,11 +30,6 @@ class LocalStream:
         # Allow the handler to flush the player queue when appropriate.
         self.handler._clear_queue = self.clear_audio_queue
 
-        # Hack to avoid the first lenghty call to resample at runtime.
-        # This is likely caused by cache initialization overhead.
-        import numpy as np
-        resample(np.array([0.0]), orig_sr=1, target_sr=1)
-
     def launch(self) -> None:
         """Start the recorder/player and run the async processing loops."""
         self._stop_event.clear()

@@ -88,9 +83,7 @@ class LocalStream:
         while not self._stop_event.is_set():
             audio_frame = self._robot.media.get_audio_sample()
             if audio_frame is not None:
-                frame_mono = …
-                frame = audio_to_int16(frame_mono)
-                await self.handler.receive((16000, frame))
+                await self.handler.receive((self._robot.media.get_input_audio_samplerate(), audio_frame))
 
             await asyncio.sleep(0.01)  # avoid busy loop

@@ -110,18 +103,24 @@ class LocalStream:
                 )
 
             elif isinstance(handler_output, tuple):
-                input_sample_rate, …
-                …
-                if …
-                …
+                input_sample_rate, audio_data = handler_output
+                output_sample_rate = self._robot.media.get_output_audio_samplerate()
+
+                # Reshape if needed
+                if audio_data.ndim == 2:
+                    audio_data = audio_data.squeeze()
+
+                # Cast if needed
+                audio_frame = audio_to_float32(audio_data)
+
+                # Resample if needed
+                if input_sample_rate != output_sample_rate:
+                    audio_frame = resample(
+                        audio_frame,
+                        int(len(audio_frame) * output_sample_rate / input_sample_rate),
                     )
 
-                self._robot.media.push_audio_sample(…)
+                self._robot.media.push_audio_sample(audio_frame)
 
             else:
                 logger.debug("Ignoring output type=%s", type(handler_output).__name__)
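scipy.signal.resample is FFT-based and takes an explicit output length rather than a rate pair, which is why both new call sites compute int(len(frame) * target_rate / source_rate). A minimal standalone sketch of the playback-side conversion follows; the 24 kHz and 16 kHz rates are illustrative assumptions, not values queried from the robot API:

    import numpy as np
    from fastrtc import audio_to_float32
    from scipy.signal import resample

    input_sample_rate = 24000   # rate attached to the handler output (assumed)
    output_sample_rate = 16000  # rate the player expects (assumed)

    # A (1, N) int16 frame, as a handler might emit it: 1 s of silence
    audio_data = np.zeros((1, input_sample_rate), dtype=np.int16)

    # Reshape: (1, N) -> (N,)
    if audio_data.ndim == 2:
        audio_data = audio_data.squeeze()

    # Cast: int16 -> float32 in [-1, 1]
    audio_frame = audio_to_float32(audio_data)

    # Resample: the requested output length encodes the new rate
    if input_sample_rate != output_sample_rate:
        audio_frame = resample(audio_frame, int(len(audio_frame) * output_sample_rate / input_sample_rate))

    assert len(audio_frame) == output_sample_rate  # still 1 s of audio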
src/reachy_mini_conversation_app/openai_realtime.py
CHANGED
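Summary of this file's change: both stream rates are pinned to the 24 kHz PCM the OpenAI realtime API expects, via new Final[Literal[24000]] module constants; the hand-rolled np.interp resampler (resample_audio) is deleted; and receive() now normalizes each microphone frame itself, squeezing a (1, N) frame to 1-D, resampling to 24 kHz when the device rate differs, casting to int16, and base64-encoding the raw bytes.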
@@ -3,15 +3,16 @@ import base64
 import random
 import asyncio
 import logging
-from typing import Any, Tuple, Literal
+from typing import Any, Final, Tuple, Literal
 from datetime import datetime
 
 import cv2
 import numpy as np
 import gradio as gr
 from openai import AsyncOpenAI
-from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
+from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item, audio_to_int16
 from numpy.typing import NDArray
+from scipy.signal import resample
 from websockets.exceptions import ConnectionClosedError
 
 from reachy_mini_conversation_app.config import config

@@ -25,6 +26,9 @@ from reachy_mini_conversation_app.tools.core_tools import (
 
 logger = logging.getLogger(__name__)
 
+OPEN_AI_INPUT_SAMPLE_RATE: Final[Literal[24000]] = 24000
+OPEN_AI_OUTPUT_SAMPLE_RATE: Final[Literal[24000]] = 24000
+
 
 class OpenaiRealtimeHandler(AsyncStreamHandler):
     """An OpenAI realtime handler for fastrtc Stream."""

@@ -33,16 +37,19 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         """Initialize the handler."""
         super().__init__(
             expected_layout="mono",
-            output_sample_rate=…,
-            input_sample_rate=…,
+            output_sample_rate=OPEN_AI_OUTPUT_SAMPLE_RATE,
+            input_sample_rate=OPEN_AI_INPUT_SAMPLE_RATE,
         )
+
+        # Override typing of the sample rates to match OpenAI's requirements
+        self.output_sample_rate: Literal[24000] = self.output_sample_rate
+        self.input_sample_rate: Literal[24000] = self.input_sample_rate
+
         self.deps = deps
 
         # Override type annotations for OpenAI strict typing (only for values used in API)
-        self.output_sample_rate = …
-        self.…
-        # input_sample_rate rest as int for comparison logic
-        self.resample_ratio = self.target_input_rate / self.input_sample_rate
+        self.output_sample_rate = OPEN_AI_OUTPUT_SAMPLE_RATE
+        self.input_sample_rate = OPEN_AI_INPUT_SAMPLE_RATE
 
         self.connection: Any = None
         self.output_queue: "asyncio.Queue[Tuple[int, NDArray[np.int16]] | AdditionalOutputs]" = asyncio.Queue()

@@ -60,21 +67,6 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         """Create a copy of the handler."""
         return OpenaiRealtimeHandler(self.deps)
 
-    def resample_audio(self, audio: NDArray[np.int16]) -> NDArray[np.int16]:
-        """Resample audio using linear interpolation."""
-        if self.input_sample_rate == self.target_input_rate:
-            return audio
-
-        # Use numpy's interp for simple linear resampling
-        input_length = len(audio)
-        output_length = int(input_length * self.resample_ratio)
-
-        input_time = np.arange(input_length)
-        output_time = np.linspace(0, input_length - 1, output_length)
-
-        resampled = np.interp(output_time, input_time, audio.astype(np.float32))
-        return cast(NDArray[np.int16], resampled.astype(np.int16))
-
     async def _emit_debounced_partial(self, transcript: str, sequence: int) -> None:
         """Emit partial transcript after debounce delay."""
         try:

@@ -130,7 +122,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 "input": {
                     "format": {
                         "type": "audio/pcm",
-                        "rate": self.…,
+                        "rate": self.input_sample_rate,
                     },
                     "transcription": {
                         "model": "gpt-4o-transcribe",

@@ -348,17 +340,28 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 
     # Microphone receive
     async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None:
-        """Receive audio frame from the microphone and send it to the openai server."""
+        """Receive audio frame from the microphone and send it to the openai server.
+
+        Args:
+            frame: A tuple containing the sample rate and the audio frame.
+
+        """
         if not self.connection:
             return
-        …
-        …
+        input_sample_rate, audio_frame = frame
+
+        # Reshape if needed
+        if audio_frame.ndim == 2:
+            audio_frame = audio_frame.squeeze()
 
         # Resample if needed
-        if self.input_sample_rate != …:
-            …
+        if self.input_sample_rate != input_sample_rate:
+            audio_frame = resample(audio_frame, int(len(audio_frame) * self.input_sample_rate / input_sample_rate))
+
+        # Cast if needed
+        audio_frame = audio_to_int16(audio_frame)
 
-        audio_message = base64.b64encode(…)
+        audio_message = base64.b64encode(audio_frame.tobytes()).decode("utf-8")
         await self.connection.input_audio_buffer.append(audio=audio_message)
 
     async def emit(self) -> Tuple[int, NDArray[np.int16]] | AdditionalOutputs | None:
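For a sense of what receive() now does per frame, here is the same pipeline as a standalone sketch; the 48 kHz device rate and 10 ms frame size are illustrative assumptions, while audio_to_int16 is the fastrtc helper the diff itself imports:

    import base64

    import numpy as np
    from fastrtc import audio_to_int16
    from scipy.signal import resample

    OPEN_AI_INPUT_SAMPLE_RATE = 24000  # OpenAI realtime expects 24 kHz PCM

    # A 10 ms mono frame at an assumed 48 kHz device rate, shaped (1, N)
    frame = (48000, np.zeros((1, 480), dtype=np.int16))

    input_sample_rate, audio_frame = frame

    # Reshape if needed: (1, N) -> (N,)
    if audio_frame.ndim == 2:
        audio_frame = audio_frame.squeeze()

    # Resample if needed: 480 samples at 48 kHz -> 240 samples at 24 kHz
    if OPEN_AI_INPUT_SAMPLE_RATE != input_sample_rate:
        audio_frame = resample(audio_frame, int(len(audio_frame) * OPEN_AI_INPUT_SAMPLE_RATE / input_sample_rate))

    # Cast if needed: scipy returns float64, the API wants 16-bit PCM
    audio_frame = audio_to_int16(audio_frame)

    # Base64-encode the raw PCM bytes for input_audio_buffer.append
    audio_message = base64.b64encode(audio_frame.tobytes()).decode("utf-8")

On the design choice: replacing the np.interp-based resample_audio with scipy.signal.resample swaps linear interpolation for FFT-based resampling, which preserves the spectral content of speech better at the cost of operating on whole frames. Reading that as the mechanism behind the branch's transcript-accuracy goal is an inference from the branch name, not something stated in the diff.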