Merge branch 'develop' into 114-improve-transcript-accuracy
src/reachy_mini_conversation_app/console.py
CHANGED
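Summary of this file's change: the capture loop stops hard-coding a 16 kHz microphone rate and instead tags each frame with the robot's actual input sample rate, while the playback path squeezes, casts to float32, and resamples handler output to the device's output rate with scipy.signal.resample. That also makes the old resample warm-up hack in __init__ unnecessary, so it is removed.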
@@ -8,8 +8,8 @@ import asyncio
 import logging
 from typing import List
 
-from fastrtc import AdditionalOutputs, audio_to_int16
-from librosa import resample
+from fastrtc import AdditionalOutputs, audio_to_float32
+from scipy.signal import resample
 
 from reachy_mini import ReachyMini
 from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler

@@ -30,11 +30,6 @@ class LocalStream:
         # Allow the handler to flush the player queue when appropriate.
         self.handler._clear_queue = self.clear_audio_queue
 
-        # Hack to avoid the first lenghty call to resample at runtime.
-        # This is likely caused by cache initialization overhead.
-        import numpy as np
-        resample(np.array([0.0]), orig_sr=1, target_sr=1)
-
     def launch(self) -> None:
         """Start the recorder/player and run the async processing loops."""
         self._stop_event.clear()

@@ -88,9 +83,7 @@ class LocalStream:
         while not self._stop_event.is_set():
             audio_frame = self._robot.media.get_audio_sample()
             if audio_frame is not None:
-                frame_mono = …
-                frame = audio_to_int16(frame_mono)
-                await self.handler.receive((16000, frame))
+                await self.handler.receive((self._robot.media.get_input_audio_samplerate(), audio_frame))
 
             await asyncio.sleep(0.01)  # avoid busy loop

@@ -110,18 +103,24 @@ class LocalStream:
                 )
 
             elif isinstance(handler_output, tuple):
-                input_sample_rate, …
-                …
-                if …
-                …
+                input_sample_rate, audio_data = handler_output
+                output_sample_rate = self._robot.media.get_output_audio_samplerate()
+
+                # Reshape if needed
+                if audio_data.ndim == 2:
+                    audio_data = audio_data.squeeze()
+
+                # Cast if needed
+                audio_frame = audio_to_float32(audio_data)
+
+                # Resample if needed
+                if input_sample_rate != output_sample_rate:
+                    audio_frame = resample(
+                        audio_frame,
+                        int(len(audio_frame) * output_sample_rate / input_sample_rate),
                     )
 
-                self._robot.media.push_audio_sample(…)
+                self._robot.media.push_audio_sample(audio_frame)
 
             else:
                 logger.debug("Ignoring output type=%s", type(handler_output).__name__)
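scipy.signal.resample is FFT-based and takes an explicit output length rather than a rate pair, which is why both new call sites compute int(len(frame) * target_rate / source_rate). A minimal standalone sketch of the playback-side conversion follows; the 24 kHz and 16 kHz rates are illustrative assumptions, not values queried from the robot API:

    import numpy as np
    from fastrtc import audio_to_float32
    from scipy.signal import resample

    input_sample_rate = 24000   # rate attached to the handler output (assumed)
    output_sample_rate = 16000  # rate the player expects (assumed)

    # A (1, N) int16 frame, as a handler might emit it: 1 s of silence
    audio_data = np.zeros((1, input_sample_rate), dtype=np.int16)

    # Reshape: (1, N) -> (N,)
    if audio_data.ndim == 2:
        audio_data = audio_data.squeeze()

    # Cast: int16 -> float32 in [-1, 1]
    audio_frame = audio_to_float32(audio_data)

    # Resample: the requested output length encodes the new rate
    if input_sample_rate != output_sample_rate:
        audio_frame = resample(audio_frame, int(len(audio_frame) * output_sample_rate / input_sample_rate))

    assert len(audio_frame) == output_sample_rate  # still 1 s of audio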
src/reachy_mini_conversation_app/openai_realtime.py
CHANGED
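Summary of this file's change: both stream rates are pinned to the 24 kHz PCM the OpenAI realtime API expects, via new Final[Literal[24000]] module constants; the hand-rolled np.interp resampler (resample_audio) is deleted; and receive() now normalizes each microphone frame itself, squeezing a (1, N) frame to 1-D, resampling to 24 kHz when the device rate differs, casting to int16, and base64-encoding the raw bytes.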
@@ -3,15 +3,16 @@ import base64
 import random
 import asyncio
 import logging
-from typing import Any, Tuple, Literal
+from typing import Any, Final, Tuple, Literal
 from datetime import datetime
 
 import cv2
 import numpy as np
 import gradio as gr
 from openai import AsyncOpenAI
-from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
+from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item, audio_to_int16
 from numpy.typing import NDArray
+from scipy.signal import resample
 from websockets.exceptions import ConnectionClosedError
 
 from reachy_mini_conversation_app.config import config

@@ -25,6 +26,9 @@ from reachy_mini_conversation_app.tools.core_tools import (
 
 logger = logging.getLogger(__name__)
 
+OPEN_AI_INPUT_SAMPLE_RATE: Final[Literal[24000]] = 24000
+OPEN_AI_OUTPUT_SAMPLE_RATE: Final[Literal[24000]] = 24000
+
 
 class OpenaiRealtimeHandler(AsyncStreamHandler):
     """An OpenAI realtime handler for fastrtc Stream."""

@@ -33,16 +37,19 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         """Initialize the handler."""
         super().__init__(
             expected_layout="mono",
-            output_sample_rate=…,
-            input_sample_rate=…,
+            output_sample_rate=OPEN_AI_OUTPUT_SAMPLE_RATE,
+            input_sample_rate=OPEN_AI_INPUT_SAMPLE_RATE,
         )
+
+        # Override typing of the sample rates to match OpenAI's requirements
+        self.output_sample_rate: Literal[24000] = self.output_sample_rate
+        self.input_sample_rate: Literal[24000] = self.input_sample_rate
+
         self.deps = deps
 
         # Override type annotations for OpenAI strict typing (only for values used in API)
-        self.output_sample_rate = …
-        self.…
-        # input_sample_rate rest as int for comparison logic
-        self.resample_ratio = self.target_input_rate / self.input_sample_rate
+        self.output_sample_rate = OPEN_AI_OUTPUT_SAMPLE_RATE
+        self.input_sample_rate = OPEN_AI_INPUT_SAMPLE_RATE
 
         self.connection: Any = None
         self.output_queue: "asyncio.Queue[Tuple[int, NDArray[np.int16]] | AdditionalOutputs]" = asyncio.Queue()

@@ -60,21 +67,6 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         """Create a copy of the handler."""
         return OpenaiRealtimeHandler(self.deps)
 
-    def resample_audio(self, audio: NDArray[np.int16]) -> NDArray[np.int16]:
-        """Resample audio using linear interpolation."""
-        if self.input_sample_rate == self.target_input_rate:
-            return audio
-
-        # Use numpy's interp for simple linear resampling
-        input_length = len(audio)
-        output_length = int(input_length * self.resample_ratio)
-
-        input_time = np.arange(input_length)
-        output_time = np.linspace(0, input_length - 1, output_length)
-
-        resampled = np.interp(output_time, input_time, audio.astype(np.float32))
-        return cast(NDArray[np.int16], resampled.astype(np.int16))
-
     async def _emit_debounced_partial(self, transcript: str, sequence: int) -> None:
         """Emit partial transcript after debounce delay."""
         try:

@@ -130,7 +122,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 "input": {
                     "format": {
                         "type": "audio/pcm",
-                        "rate": self.…,
+                        "rate": self.input_sample_rate,
                     },
                     "transcription": {
                         "model": "gpt-4o-transcribe",

@@ -348,17 +340,28 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 
     # Microphone receive
     async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None:
-        """Receive audio frame from the microphone and send it to the openai server."""
+        """Receive audio frame from the microphone and send it to the openai server.
+
+        Args:
+            frame: A tuple containing the sample rate and the audio frame.
+
+        """
         if not self.connection:
             return
-        …
-        …
+        input_sample_rate, audio_frame = frame
+
+        # Reshape if needed
+        if audio_frame.ndim == 2:
+            audio_frame = audio_frame.squeeze()
 
         # Resample if needed
-        if self.input_sample_rate != …:
-            …
+        if self.input_sample_rate != input_sample_rate:
+            audio_frame = resample(audio_frame, int(len(audio_frame) * self.input_sample_rate / input_sample_rate))
+
+        # Cast if needed
+        audio_frame = audio_to_int16(audio_frame)
 
-        audio_message = base64.b64encode(…)
+        audio_message = base64.b64encode(audio_frame.tobytes()).decode("utf-8")
         await self.connection.input_audio_buffer.append(audio=audio_message)
 
     async def emit(self) -> Tuple[int, NDArray[np.int16]] | AdditionalOutputs | None:
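For a sense of what receive() now does per frame, here is the same pipeline as a standalone sketch; the 48 kHz device rate and 10 ms frame size are illustrative assumptions, while audio_to_int16 is the fastrtc helper the diff itself imports:

    import base64

    import numpy as np
    from fastrtc import audio_to_int16
    from scipy.signal import resample

    OPEN_AI_INPUT_SAMPLE_RATE = 24000  # OpenAI realtime expects 24 kHz PCM

    # A 10 ms mono frame at an assumed 48 kHz device rate, shaped (1, N)
    frame = (48000, np.zeros((1, 480), dtype=np.int16))

    input_sample_rate, audio_frame = frame

    # Reshape if needed: (1, N) -> (N,)
    if audio_frame.ndim == 2:
        audio_frame = audio_frame.squeeze()

    # Resample if needed: 480 samples at 48 kHz -> 240 samples at 24 kHz
    if OPEN_AI_INPUT_SAMPLE_RATE != input_sample_rate:
        audio_frame = resample(audio_frame, int(len(audio_frame) * OPEN_AI_INPUT_SAMPLE_RATE / input_sample_rate))

    # Cast if needed: scipy returns float64, the API wants 16-bit PCM
    audio_frame = audio_to_int16(audio_frame)

    # Base64-encode the raw PCM bytes for input_audio_buffer.append
    audio_message = base64.b64encode(audio_frame.tobytes()).decode("utf-8")

On the design choice: replacing the np.interp-based resample_audio with scipy.signal.resample swaps linear interpolation for FFT-based resampling, which preserves the spectral content of speech better at the cost of operating on whole frames. Reading that as the mechanism behind the branch's transcript-accuracy goal is an inference from the branch name, not something stated in the diff.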