Alina Lozovskaya committed on
Commit
c10cc03
·
2 Parent(s): 6eddfa9 cced4d1

Merge branch 'develop' into 114-improve-transcript-accuracy

Browse files
src/reachy_mini_conversation_app/console.py CHANGED
@@ -8,8 +8,8 @@ import asyncio
8
  import logging
9
  from typing import List
10
 
11
- from fastrtc import AdditionalOutputs, audio_to_int16, audio_to_float32
12
- from librosa import resample
13
 
14
  from reachy_mini import ReachyMini
15
  from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
@@ -30,11 +30,6 @@ class LocalStream:
30
  # Allow the handler to flush the player queue when appropriate.
31
  self.handler._clear_queue = self.clear_audio_queue
32
 
33
- # Hack to avoid the first lengthy call to resample at runtime.
34
- # This is likely caused by cache initialization overhead.
35
- import numpy as np
36
- resample(np.array([0.0]), orig_sr=1, target_sr=1)
37
-
38
  def launch(self) -> None:
39
  """Start the recorder/player and run the async processing loops."""
40
  self._stop_event.clear()
@@ -88,9 +83,7 @@ class LocalStream:
88
  while not self._stop_event.is_set():
89
  audio_frame = self._robot.media.get_audio_sample()
90
  if audio_frame is not None:
91
- frame_mono = audio_frame.T[0] # both channels are identical
92
- frame = audio_to_int16(frame_mono)
93
- await self.handler.receive((16000, frame))
94
 
95
  await asyncio.sleep(0.01) # avoid busy loop
96
 
@@ -110,18 +103,24 @@ class LocalStream:
110
  )
111
 
112
  elif isinstance(handler_output, tuple):
113
- input_sample_rate, audio_frame = handler_output
114
- device_sample_rate = self._robot.media.get_audio_samplerate()
115
- audio_frame_float = audio_to_float32(audio_frame.squeeze())
116
-
117
- if input_sample_rate != device_sample_rate:
118
- audio_frame_float = resample(
119
- audio_frame_float,
120
- orig_sr=input_sample_rate,
121
- target_sr=device_sample_rate,
 
 
 
 
 
 
122
  )
123
 
124
- self._robot.media.push_audio_sample(audio_frame_float)
125
 
126
  else:
127
  logger.debug("Ignoring output type=%s", type(handler_output).__name__)
 
8
  import logging
9
  from typing import List
10
 
11
+ from fastrtc import AdditionalOutputs, audio_to_float32
12
+ from scipy.signal import resample
13
 
14
  from reachy_mini import ReachyMini
15
  from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
 
30
  # Allow the handler to flush the player queue when appropriate.
31
  self.handler._clear_queue = self.clear_audio_queue
32
 
 
 
 
 
 
33
  def launch(self) -> None:
34
  """Start the recorder/player and run the async processing loops."""
35
  self._stop_event.clear()
 
83
  while not self._stop_event.is_set():
84
  audio_frame = self._robot.media.get_audio_sample()
85
  if audio_frame is not None:
86
+ await self.handler.receive((self._robot.media.get_input_audio_samplerate(), audio_frame))
 
 
87
 
88
  await asyncio.sleep(0.01) # avoid busy loop
89
 
 
103
  )
104
 
105
  elif isinstance(handler_output, tuple):
106
+ input_sample_rate, audio_data = handler_output
107
+ output_sample_rate = self._robot.media.get_output_audio_samplerate()
108
+
109
+ # Reshape if needed
110
+ if audio_data.ndim == 2:
111
+ audio_data = audio_data.squeeze()
112
+
113
+ # Cast if needed
114
+ audio_frame = audio_to_float32(audio_data)
115
+
116
+ # Resample if needed
117
+ if input_sample_rate != output_sample_rate:
118
+ audio_frame = resample(
119
+ audio_frame,
120
+ int(len(audio_frame) * output_sample_rate / input_sample_rate),
121
  )
122
 
123
+ self._robot.media.push_audio_sample(audio_frame)
124
 
125
  else:
126
  logger.debug("Ignoring output type=%s", type(handler_output).__name__)
src/reachy_mini_conversation_app/openai_realtime.py CHANGED
@@ -3,15 +3,16 @@ import base64
3
  import random
4
  import asyncio
5
  import logging
6
- from typing import Any, Tuple, Literal, cast
7
  from datetime import datetime
8
 
9
  import cv2
10
  import numpy as np
11
  import gradio as gr
12
  from openai import AsyncOpenAI
13
- from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
14
  from numpy.typing import NDArray
 
15
  from websockets.exceptions import ConnectionClosedError
16
 
17
  from reachy_mini_conversation_app.config import config
@@ -25,6 +26,9 @@ from reachy_mini_conversation_app.tools.core_tools import (
25
 
26
  logger = logging.getLogger(__name__)
27
 
 
 
 
28
 
29
  class OpenaiRealtimeHandler(AsyncStreamHandler):
30
  """An OpenAI realtime handler for fastrtc Stream."""
@@ -33,16 +37,19 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
33
  """Initialize the handler."""
34
  super().__init__(
35
  expected_layout="mono",
36
- output_sample_rate=24000, # openai outputs
37
- input_sample_rate=16000, # respeaker output
38
  )
 
 
 
 
 
39
  self.deps = deps
40
 
41
  # Override type annotations for OpenAI strict typing (only for values used in API)
42
- self.output_sample_rate: Literal[24000]
43
- self.target_input_rate: Literal[24000] = 24000
44
- # input_sample_rate rest as int for comparison logic
45
- self.resample_ratio = self.target_input_rate / self.input_sample_rate
46
 
47
  self.connection: Any = None
48
  self.output_queue: "asyncio.Queue[Tuple[int, NDArray[np.int16]] | AdditionalOutputs]" = asyncio.Queue()
@@ -60,21 +67,6 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
60
  """Create a copy of the handler."""
61
  return OpenaiRealtimeHandler(self.deps)
62
 
63
- def resample_audio(self, audio: NDArray[np.int16]) -> NDArray[np.int16]:
64
- """Resample audio using linear interpolation."""
65
- if self.input_sample_rate == self.target_input_rate:
66
- return audio
67
-
68
- # Use numpy's interp for simple linear resampling
69
- input_length = len(audio)
70
- output_length = int(input_length * self.resample_ratio)
71
-
72
- input_time = np.arange(input_length)
73
- output_time = np.linspace(0, input_length - 1, output_length)
74
-
75
- resampled = np.interp(output_time, input_time, audio.astype(np.float32))
76
- return cast(NDArray[np.int16], resampled.astype(np.int16))
77
-
78
  async def _emit_debounced_partial(self, transcript: str, sequence: int) -> None:
79
  """Emit partial transcript after debounce delay."""
80
  try:
@@ -130,7 +122,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
130
  "input": {
131
  "format": {
132
  "type": "audio/pcm",
133
- "rate": self.target_input_rate,
134
  },
135
  "transcription": {
136
  "model": "gpt-4o-transcribe",
@@ -348,17 +340,28 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
348
 
349
  # Microphone receive
350
  async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None:
351
- """Receive audio frame from the microphone and send it to the openai server."""
 
 
 
 
 
352
  if not self.connection:
353
  return
354
- _, array = frame
355
- array = array.squeeze()
 
 
 
356
 
357
  # Resample if needed
358
- if self.input_sample_rate != self.target_input_rate:
359
- array = self.resample_audio(array)
 
 
 
360
 
361
- audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
362
  await self.connection.input_audio_buffer.append(audio=audio_message)
363
 
364
  async def emit(self) -> Tuple[int, NDArray[np.int16]] | AdditionalOutputs | None:
 
3
  import random
4
  import asyncio
5
  import logging
6
+ from typing import Any, Final, Tuple, Literal
7
  from datetime import datetime
8
 
9
  import cv2
10
  import numpy as np
11
  import gradio as gr
12
  from openai import AsyncOpenAI
13
+ from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item, audio_to_int16
14
  from numpy.typing import NDArray
15
+ from scipy.signal import resample
16
  from websockets.exceptions import ConnectionClosedError
17
 
18
  from reachy_mini_conversation_app.config import config
 
26
 
27
  logger = logging.getLogger(__name__)
28
 
29
+ OPEN_AI_INPUT_SAMPLE_RATE: Final[Literal[24000]] = 24000
30
+ OPEN_AI_OUTPUT_SAMPLE_RATE: Final[Literal[24000]] = 24000
31
+
32
 
33
  class OpenaiRealtimeHandler(AsyncStreamHandler):
34
  """An OpenAI realtime handler for fastrtc Stream."""
 
37
  """Initialize the handler."""
38
  super().__init__(
39
  expected_layout="mono",
40
+ output_sample_rate=OPEN_AI_OUTPUT_SAMPLE_RATE,
41
+ input_sample_rate=OPEN_AI_INPUT_SAMPLE_RATE,
42
  )
43
+
44
+ # Override typing of the sample rates to match OpenAI's requirements
45
+ self.output_sample_rate: Literal[24000] = self.output_sample_rate
46
+ self.input_sample_rate: Literal[24000] = self.input_sample_rate
47
+
48
  self.deps = deps
49
 
50
  # Override type annotations for OpenAI strict typing (only for values used in API)
51
+ self.output_sample_rate = OPEN_AI_OUTPUT_SAMPLE_RATE
52
+ self.input_sample_rate = OPEN_AI_INPUT_SAMPLE_RATE
 
 
53
 
54
  self.connection: Any = None
55
  self.output_queue: "asyncio.Queue[Tuple[int, NDArray[np.int16]] | AdditionalOutputs]" = asyncio.Queue()
 
67
  """Create a copy of the handler."""
68
  return OpenaiRealtimeHandler(self.deps)
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  async def _emit_debounced_partial(self, transcript: str, sequence: int) -> None:
71
  """Emit partial transcript after debounce delay."""
72
  try:
 
122
  "input": {
123
  "format": {
124
  "type": "audio/pcm",
125
+ "rate": self.input_sample_rate,
126
  },
127
  "transcription": {
128
  "model": "gpt-4o-transcribe",
 
340
 
341
  # Microphone receive
342
  async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None:
343
+ """Receive audio frame from the microphone and send it to the openai server.
344
+
345
+ Args:
346
+ frame: A tuple containing the sample rate and the audio frame.
347
+
348
+ """
349
  if not self.connection:
350
  return
351
+ input_sample_rate, audio_frame = frame
352
+
353
+ # Reshape if needed
354
+ if audio_frame.ndim == 2:
355
+ audio_frame = audio_frame.squeeze()
356
 
357
  # Resample if needed
358
+ if self.input_sample_rate != input_sample_rate:
359
+ audio_frame = resample(audio_frame, int(len(audio_frame) * self.input_sample_rate / input_sample_rate))
360
+
361
+ # Cast if needed
362
+ audio_frame = audio_to_int16(audio_frame)
363
 
364
+ audio_message = base64.b64encode(audio_frame.tobytes()).decode("utf-8")
365
  await self.connection.input_audio_buffer.append(audio=audio_message)
366
 
367
  async def emit(self) -> Tuple[int, NDArray[np.int16]] | AdditionalOutputs | None: