Spaces:

pollen-robotics
/

reachy_mini_conversation_app

Running

App Files Files Community

Alina Lozovskaya committed on Oct 10

Commit

d112528

2 Parent(s): b0cb5ad eaa4ab7

Merge branch 'develop' into 49-improve-readme

Browse files

Files changed (11) hide show

.gitignore +3 -0
README.md +2 -1
pyproject.toml +1 -2
src/reachy_mini_conversation_demo/audio/gstreamer.py +0 -214
src/reachy_mini_conversation_demo/camera_worker.py +3 -0
src/reachy_mini_conversation_demo/console.py +115 -0
src/reachy_mini_conversation_demo/main.py +37 -14
src/reachy_mini_conversation_demo/moves.py +2 -2
src/reachy_mini_conversation_demo/openai_realtime.py +3 -3
src/reachy_mini_conversation_demo/utils.py +2 -3
uv.lock +0 -0

.gitignore CHANGED Viewed

@@ -79,6 +79,9 @@ coverage.xml
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot

 .pytest_cache/
 cover/
+# Ruff cache
+.ruff_cache/
 # Translations
 *.mo
 *.pot

README.md CHANGED Viewed

@@ -93,9 +93,10 @@ The app starts a Gradio UI served locally (http://127.0.0.1:7860/). When running
 |--------|---------|-------------|
 | `--head-tracker {yolo,mediapipe}` | `None` | Select a face-tracking backend when a camera is available. Requires the matching optional extra. |
 | `--no-camera` | `False` | Run without camera capture or face tracking. |
-| `--headless` | `False` | Suppress launching the Gradio UI (useful on remote machines). |
 | `--debug` | `False` | Enable verbose logging for troubleshooting. |
 ### Examples
 - Run on hardware with MediaPipe face tracking:

 |--------|---------|-------------|
 | `--head-tracker {yolo,mediapipe}` | `None` | Select a face-tracking backend when a camera is available. Requires the matching optional extra. |
 | `--no-camera` | `False` | Run without camera capture or face tracking. |
+| `--gradio` | `False` | Launch the Gradio web UI. Without this flag, runs in console mode. Required when running in simulation mode. |
 | `--debug` | `False` | Enable verbose logging for troubleshooting. |
 ### Examples
 - Run on hardware with MediaPipe face tracking:

pyproject.toml CHANGED Viewed

@@ -16,7 +16,6 @@ dependencies = [
     "gradio>=5.49.0",
     "huggingface_hub>=0.34.4",
     "opencv-python>=4.12.0.88",
-    "PyGObject>=3.42.2,<=3.46.0",
     #Environment variables
     "python-dotenv",
@@ -88,4 +87,4 @@ split-on-trailing-comma = true
 quote-style = "double"
 indent-style = "space"
 skip-magic-trailing-comma = false
-line-ending = "auto"

     "gradio>=5.49.0",
     "huggingface_hub>=0.34.4",
     "opencv-python>=4.12.0.88",
     #Environment variables
     "python-dotenv",
 quote-style = "double"
 indent-style = "space"
 skip-magic-trailing-comma = false
+line-ending = "auto"

src/reachy_mini_conversation_demo/audio/gstreamer.py DELETED Viewed

@@ -1,214 +0,0 @@
-import logging
-from typing import Optional
-from threading import Thread
-import gi
-gi.require_version("Gst", "1.0")
-gi.require_version("GstApp", "1.0")
-from gi.repository import Gst, GLib  # noqa: E402
-class GstPlayer:
-    """Audio player using GStreamer."""
-    def __init__(self, sample_rate: int = 24000, device_name: Optional[str] = None):
-        """Initialize player."""
-        self._logger = logging.getLogger(__name__)
-        Gst.init(None)
-        self._loop = GLib.MainLoop()
-        self._thread_bus_calls: Optional[Thread] = None
-        self.pipeline = Gst.Pipeline.new("audio_player")
-        # Create elements
-        self.appsrc = Gst.ElementFactory.make("appsrc", None)
-        self.appsrc.set_property("format", Gst.Format.TIME)
-        self.appsrc.set_property("is-live", True)
-        caps = Gst.Caps.from_string(f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved")
-        self.appsrc.set_property("caps", caps)
-        queue = Gst.ElementFactory.make("queue")
-        audioconvert = Gst.ElementFactory.make("audioconvert")
-        audioresample = Gst.ElementFactory.make("audioresample")
-        # Try to pin specific output device; fallback to autoaudiosink
-        audiosink = _create_device_element(direction="sink", name_substr=device_name) or Gst.ElementFactory.make(
-            "autoaudiosink"
-        )
-        self.pipeline.add(self.appsrc)
-        self.pipeline.add(queue)
-        self.pipeline.add(audioconvert)
-        self.pipeline.add(audioresample)
-        self.pipeline.add(audiosink)
-        self.appsrc.link(queue)
-        queue.link(audioconvert)
-        audioconvert.link(audioresample)
-        audioresample.link(audiosink)
-    def _on_bus_message(self, bus: Gst.Bus, msg: Gst.Message, loop) -> bool:  # type: ignore[no-untyped-def]
-        t = msg.type
-        if t == Gst.MessageType.EOS:
-            self._logger.warning("End-of-stream")
-            return False
-        elif t == Gst.MessageType.ERROR:
-            err, debug = msg.parse_error()
-            self._logger.error(f"Error: {err} {debug}")
-            return False
-        return True
-    def _handle_bus_calls(self) -> None:
-        self._logger.debug("starting bus message loop")
-        bus = self.pipeline.get_bus()
-        bus.add_watch(GLib.PRIORITY_DEFAULT, self._on_bus_message, self._loop)
-        self._loop.run()  # type: ignore[no-untyped-call]
-        bus.remove_watch()
-        self._logger.debug("bus message loop stopped")
-    def play(self):
-        """Start playback."""
-        self.pipeline.set_state(Gst.State.PLAYING)
-        self._thread_bus_calls = Thread(target=self._handle_bus_calls, daemon=True)
-        self._thread_bus_calls.start()
-    def push_sample(self, data: bytes):
-        """Push audio sample (bytes) to playback pipeline."""
-        buf = Gst.Buffer.new_wrapped(data)
-        self.appsrc.push_buffer(buf)
-    def stop(self):
-        """Stop playback and clean up."""
-        logger = logging.getLogger(__name__)
-        self._loop.quit()
-        self.pipeline.set_state(Gst.State.NULL)
-        if self._thread_bus_calls is not None:
-            self._thread_bus_calls.join()
-        logger.info("Stopped Player")
-class GstRecorder:
-    """Audio recorder using GStreamer."""
-    def __init__(self, sample_rate: int = 24000, device_name: Optional[str] = None):
-        """Initialize recorder."""
-        self._logger = logging.getLogger(__name__)
-        Gst.init(None)
-        self._loop = GLib.MainLoop()
-        self._thread_bus_calls: Optional[Thread] = None
-        self.pipeline = Gst.Pipeline.new("audio_recorder")
-        # Create elements: try specific mic; fallback to default
-        autoaudiosrc = _create_device_element(direction="source", name_substr=device_name) or Gst.ElementFactory.make(
-            "autoaudiosrc", None
-        )
-        queue = Gst.ElementFactory.make("queue", None)
-        audioconvert = Gst.ElementFactory.make("audioconvert", None)
-        audioresample = Gst.ElementFactory.make("audioresample", None)
-        self.appsink = Gst.ElementFactory.make("appsink", None)
-        if not all([autoaudiosrc, queue, audioconvert, audioresample, self.appsink]):
-            raise RuntimeError("Failed to create GStreamer elements")
-        # Force mono/S16LE at 24000; resample handles device SR (e.g., 16000 → 24000)
-        caps = Gst.Caps.from_string(f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE")
-        self.appsink.set_property("caps", caps)
-        # Build pipeline
-        self.pipeline.add(autoaudiosrc)
-        self.pipeline.add(queue)
-        self.pipeline.add(audioconvert)
-        self.pipeline.add(audioresample)
-        self.pipeline.add(self.appsink)
-        autoaudiosrc.link(queue)
-        queue.link(audioconvert)
-        audioconvert.link(audioresample)
-        audioresample.link(self.appsink)
-    def _on_bus_message(self, bus: Gst.Bus, msg: Gst.Message, loop) -> bool:  # type: ignore[no-untyped-def]
-        t = msg.type
-        if t == Gst.MessageType.EOS:
-            self._logger.warning("End-of-stream")
-            return False
-        elif t == Gst.MessageType.ERROR:
-            err, debug = msg.parse_error()
-            self._logger.error(f"Error: {err} {debug}")
-            return False
-        return True
-    def _handle_bus_calls(self) -> None:
-        self._logger.debug("starting bus message loop")
-        bus = self.pipeline.get_bus()
-        bus.add_watch(GLib.PRIORITY_DEFAULT, self._on_bus_message, self._loop)
-        self._loop.run()  # type: ignore[no-untyped-call]
-        bus.remove_watch()
-        self._logger.debug("bus message loop stopped")
-    def record(self):
-        """Start recording."""
-        self.pipeline.set_state(Gst.State.PLAYING)
-        self._thread_bus_calls = Thread(target=self._handle_bus_calls, daemon=True)
-        self._thread_bus_calls.start()
-    def get_sample(self):
-        """Return next audio sample as bytes, or None if no sample available."""
-        sample = self.appsink.pull_sample()
-        data = None
-        if isinstance(sample, Gst.Sample):
-            buf = sample.get_buffer()
-            if buf is None:
-                self._logger.warning("Buffer is None")
-            data = buf.extract_dup(0, buf.get_size())
-        return data
-    def stop(self):
-        """Stop recording and clean up."""
-        logger = logging.getLogger(__name__)
-        self._loop.quit()
-        self.pipeline.set_state(Gst.State.NULL)
-        if self._thread_bus_calls is not None:
-            self._thread_bus_calls.join()
-        logger.info("Stopped Recorder")
-def _create_device_element(direction: str, name_substr: Optional[str]) -> Optional[Gst.Element]:
-    """direction: 'source' or 'sink'.
-    name_substr: case-insensitive substring matching device display name/description.
-    """
-    logger = logging.getLogger(__name__)
-    if not name_substr:
-        logger.error(f"Device select: no name_substr for {direction}; returning None")
-        return None
-    monitor = Gst.DeviceMonitor.new()
-    klass = "Audio/Source" if direction == "source" else "Audio/Sink"
-    monitor.add_filter(klass, None)
-    monitor.start()
-    try:
-        for dev in monitor.get_devices() or []:
-            disp = dev.get_display_name() or ""
-            props = dev.get_properties()
-            desc = props.get_string("device.description") if props and props.has_field("device.description") else ""
-            logger.info(f"Device candidate: disp='{disp}', desc='{desc}'")
-            if name_substr.lower() in disp.lower() or name_substr.lower() in desc.lower():
-                elem = dev.create_element(None)
-                factory = elem.get_factory().get_name() if elem and elem.get_factory() else "<?>"
-                logger.info(f"Using {direction} device: '{disp or desc}' (factory='{factory}')")
-                return elem
-    finally:
-        monitor.stop()
-    logging.getLogger(__name__).warning("Requested %s '%s' not found; using auto*", direction, name_substr)
-    return None

src/reachy_mini_conversation_demo/camera_worker.py CHANGED Viewed

@@ -92,6 +92,7 @@ class CameraWorker:
         self._stop_event.set()
         if self._thread is not None:
             self._thread.join()
         logger.debug("Camera worker stopped")
     def working_loop(self) -> None:
@@ -108,6 +109,8 @@ class CameraWorker:
         while not self._stop_event.is_set():
             try:
                 current_time = time.time()
                 frame = self.reachy_mini.media.get_frame()
                 if frame is not None:

         self._stop_event.set()
         if self._thread is not None:
             self._thread.join()
         logger.debug("Camera worker stopped")
     def working_loop(self) -> None:
         while not self._stop_event.is_set():
             try:
                 current_time = time.time()
+                # Get frame from robot
                 frame = self.reachy_mini.media.get_frame()
                 if frame is not None:

src/reachy_mini_conversation_demo/console.py ADDED Viewed

	@@ -0,0 +1,115 @@

+"""Bidirectional local audio stream.
+Records mic frames and forwards them to the handler, and plays the handler's audio frames on the speaker.
+"""
+import asyncio
+import logging
+import librosa
+from fastrtc import AdditionalOutputs, audio_to_int16, audio_to_float32
+from reachy_mini import ReachyMini
+from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
+logger = logging.getLogger(__name__)
+class LocalStream:
+    """Console-mode audio bridge between Reachy Mini's media interface and the realtime handler.
+    Microphone samples read from ``robot.media`` are forwarded to the handler, and audio
+    emitted by the handler is pushed back to ``robot.media`` for playback.
+    """
+    def __init__(self, handler: OpenaiRealtimeHandler, robot: ReachyMini):
+        """Store the handler and robot, and register the queue-flush callback on the handler."""
+        self.handler = handler
+        self._robot = robot
+        self._stop_event = asyncio.Event()
+        self._tasks = []
+        # Allow the handler to flush the player queue when appropriate.
+        self.handler._clear_queue = self.clear_queue  # type: ignore[assignment]
+    def launch(self) -> None:
+        """Start recording/playback and run the async processing loops.
+        Blocks the calling thread inside ``asyncio.run`` until the handler, record and
+        play tasks finish or are cancelled; the handler is always shut down on exit.
+        """
+        self._stop_event.clear()
+        self._robot.media.start_recording()
+        self._robot.media.start_playing()
+        async def runner() -> None:
+            self._tasks = [
+                asyncio.create_task(self.handler.start_up(), name="openai-handler"),
+                asyncio.create_task(self.record_loop(), name="stream-record-loop"),
+                asyncio.create_task(self.play_loop(), name="stream-play-loop"),
+            ]
+            try:
+                await asyncio.gather(*self._tasks)
+            except asyncio.CancelledError:
+                logger.info("Tasks cancelled during shutdown")
+            finally:
+                # Ensure handler connection is closed
+                await self.handler.shutdown()
+        asyncio.run(runner())
+    def stop(self) -> None:
+        """Stop the stream and the robot's audio recording/playback.
+        This method:
+        - Sets the stop event to signal async loops to terminate
+        - Cancels all pending async tasks (openai-handler, record-loop, play-loop)
+        - Stops audio recording and playback
+        """
+        logger.info("Stopping LocalStream...")
+        self._stop_event.set()
+        # Cancel all running tasks
+        for task in self._tasks:
+            if not task.done():
+                task.cancel()
+        self._robot.media.stop_recording()
+        self._robot.media.stop_playing()
+    def close(self) -> None:
+        """Stop the stream; alias of :meth:`stop`.
+        The entrypoint shuts down whichever stream manager it created by calling
+        ``close()`` on it, so this class must expose that name as well as ``stop()``.
+        """
+        self.stop()
+    def clear_queue(self) -> None:
+        """Drop any queued handler output by replacing the handler's output queue with a fresh one."""
+        logger.info("User intervention: flushing player queue")
+        self.handler.output_queue = asyncio.Queue()
+    async def record_loop(self) -> None:
+        """Read mic frames from the robot and forward them to the handler until stopped."""
+        logger.info("Starting receive loop")
+        while not self._stop_event.is_set():
+            data = self._robot.media.get_audio_sample()
+            if data is not None:
+                frame_mono = data.T[0]  # both channels are identical
+                frame = audio_to_int16(frame_mono)
+                # Frames are tagged as 16 kHz, the input rate the handler is configured with.
+                await self.handler.receive((16000, frame))
+                # await asyncio.sleep(0)  # yield to event loop
+            else:
+                await asyncio.sleep(0.01)  # avoid busy loop
+    async def play_loop(self) -> None:
+        """Fetch outputs from the handler: log text and play audio frames.
+        ``AdditionalOutputs`` messages are logged (string content truncated to 500
+        characters); ``(sample_rate, frame)`` tuples are converted to float32, resampled
+        to the device rate when it differs, and pushed to the robot for playback.
+        """
+        while not self._stop_event.is_set():
+            data = await self.handler.emit()
+            if isinstance(data, AdditionalOutputs):
+                for msg in data.args:
+                    content = msg.get("content", "")
+                    if isinstance(content, str):
+                        logger.info(
+                            "role=%s content=%s",
+                            msg.get("role"),
+                            content if len(content) < 500 else content[:500] + "…",
+                        )
+            elif isinstance(data, tuple):
+                sample_rate, frame = data
+                device_sample_rate = self._robot.media.get_audio_samplerate()
+                frame = audio_to_float32(frame.squeeze())
+                if sample_rate != device_sample_rate:
+                    frame = librosa.resample(frame, orig_sr=sample_rate, target_sr=device_sample_rate)
+                self._robot.media.push_audio_sample(frame)
+            # else: ignore None/unknown outputs
+            await asyncio.sleep(0)  # yield to event loop

src/reachy_mini_conversation_demo/main.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """Entrypoint for the Reachy Mini conversation demo."""
 import os
 import gradio as gr
 from fastapi import FastAPI
@@ -14,6 +15,7 @@ from reachy_mini_conversation_demo.utils import (
     setup_logger,
     handle_vision_stuff,
 )
 from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
 from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
@@ -31,8 +33,19 @@ def main():
     logger = setup_logger(args.debug)
     logger.info("Starting Reachy Mini Conversation Demo")
     robot = ReachyMini()
     camera_worker, _, vision_manager = handle_vision_stuff(args, robot)
     movement_manager = MovementManager(
@@ -62,18 +75,24 @@ def main():
     logger.debug(f"Chatbot avatar images: {chatbot.avatar_images}")
     handler = OpenaiRealtimeHandler(deps)
-    stream = Stream(
-        handler=handler,
-        mode="send-receive",
-        modality="audio",
-        additional_inputs=[chatbot],
-        additional_outputs=[chatbot],
-        additional_outputs_handler=update_chatbot,
-        ui_args={"title": "Talk with Reachy Mini"},
-    )
-    app = FastAPI()
-    app = gr.mount_gradio_app(app, stream.ui, path="/")
     # Each async service → its own thread/loop
     movement_manager.start()
@@ -82,11 +101,14 @@ def main():
         camera_worker.start()
     try:
-        stream.ui.launch()
     except KeyboardInterrupt:
-        logger.info("Exiting...")
     finally:
         movement_manager.stop()
         head_wobbler.stop()
         if camera_worker:
@@ -94,6 +116,7 @@ def main():
         # prevent connection to keep alive some threads
         robot.client.disconnect()
 if __name__ == "__main__":

 """Entrypoint for the Reachy Mini conversation demo."""
 import os
+import sys
 import gradio as gr
 from fastapi import FastAPI
     setup_logger,
     handle_vision_stuff,
 )
+from reachy_mini_conversation_demo.console import LocalStream
 from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
 from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
     logger = setup_logger(args.debug)
     logger.info("Starting Reachy Mini Conversation Demo")
+    if args.no_camera and args.head_tracker is not None:
+        logger.warning("Head tracking is not activated due to --no-camera.")
     robot = ReachyMini()
+    # Check if running in simulation mode without --gradio
+    if robot.client.get_status()["simulation_enabled"] and not args.gradio:
+        logger.error(
+            "Simulation mode requires Gradio interface. Please use --gradio flag when running in simulation mode."
+        )
+        robot.client.disconnect()
+        sys.exit(1)
     camera_worker, _, vision_manager = handle_vision_stuff(args, robot)
     movement_manager = MovementManager(
     logger.debug(f"Chatbot avatar images: {chatbot.avatar_images}")
     handler = OpenaiRealtimeHandler(deps)
+    stream_manager = None
+    if args.gradio:
+        stream = Stream(
+            handler=handler,
+            mode="send-receive",
+            modality="audio",
+            additional_inputs=[chatbot],
+            additional_outputs=[chatbot],
+            additional_outputs_handler=update_chatbot,
+            ui_args={"title": "Talk with Reachy Mini"},
+        )
+        stream_manager = stream.ui
+        app = FastAPI()
+        app = gr.mount_gradio_app(app, stream.ui, path="/")
+    else:
+        stream_manager = LocalStream(handler, robot)
     # Each async service → its own thread/loop
     movement_manager.start()
         camera_worker.start()
     try:
+        stream_manager.launch()
     except KeyboardInterrupt:
+        logger.info("Keyboard interruption in main thread... closing server.")
     finally:
+        # Stop the stream manager and its pipelines
+        stream_manager.close()
+        # Stop other services
         movement_manager.stop()
         head_wobbler.stop()
         if camera_worker:
         # prevent connection to keep alive some threads
         robot.client.disconnect()
+        logger.info("Shutdown complete.")
 if __name__ == "__main__":

src/reachy_mini_conversation_demo/moves.py CHANGED Viewed

@@ -498,7 +498,7 @@ class MovementManager:
                 self.state.move_start_time = current_time
                 # Any real move cancels breathing mode flag
                 self._breathing_active = isinstance(self.state.current_move, BreathingMove)
-                logger.info(f"Starting new move, duration: {self.state.current_move.duration}s")
     def _manage_breathing(self, current_time: float) -> None:
         """Manage automatic breathing when idle."""
@@ -525,7 +525,7 @@ class MovementManager:
                         interpolation_duration=1.0,
                     )
                     self.move_queue.append(breathing_move)
-                    logger.info("Started breathing after %.1fs of inactivity", idle_for)
                 except Exception as e:
                     self._breathing_active = False
                     logger.error("Failed to start breathing: %s", e)

                 self.state.move_start_time = current_time
                 # Any real move cancels breathing mode flag
                 self._breathing_active = isinstance(self.state.current_move, BreathingMove)
+                logger.debug(f"Starting new move, duration: {self.state.current_move.duration}s")
     def _manage_breathing(self, current_time: float) -> None:
         """Manage automatic breathing when idle."""
                         interpolation_duration=1.0,
                     )
                     self.move_queue.append(breathing_move)
+                    logger.debug("Started breathing after %.1fs of inactivity", idle_for)
                 except Exception as e:
                     self._breathing_active = False
                     logger.error("Failed to start breathing: %s", e)

src/reachy_mini_conversation_demo/openai_realtime.py CHANGED Viewed

@@ -27,8 +27,8 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         """Initialize the handler."""
         super().__init__(
             expected_layout="mono",
-            output_sample_rate=24000,
-            input_sample_rate=24000,
         )
         self.deps = deps
@@ -169,7 +169,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                         )
                     )
-                    if tool_name == "camera":
                         b64_im = json.dumps(tool_result["b64_im"])
                         await self.connection.conversation.item.create(
                             item={

         """Initialize the handler."""
         super().__init__(
             expected_layout="mono",
+            output_sample_rate=24000,  # openai outputs
+            input_sample_rate=16000,  # respeaker output
         )
         self.deps = deps
                         )
                     )
+                    if tool_name == "camera" and "b64_im" in tool_result:
                         b64_im = json.dumps(tool_result["b64_im"])
                         await self.connection.conversation.item.create(
                             item={

src/reachy_mini_conversation_demo/utils.py CHANGED Viewed

@@ -8,15 +8,14 @@ from reachy_mini_conversation_demo.camera_worker import CameraWorker
 def parse_args():
     """Parse command line arguments."""
     parser = argparse.ArgumentParser("Reachy Mini Conversation Demo")
-    parser.add_argument("--sim", action="store_true", help="Run in simulation mode")
     parser.add_argument(
         "--head-tracker",
         choices=["yolo", "mediapipe", None],
         default=None,
-        help="Choose head tracker (default: mediapipe)",
     )
     parser.add_argument("--no-camera", default=False, action="store_true", help="Disable camera usage")
-    parser.add_argument("--headless", default=False, action="store_true", help="Run in headless mode")
     parser.add_argument("--debug", default=False, action="store_true", help="Enable debug logging")
     return parser.parse_args()

 def parse_args():
     """Parse command line arguments."""
     parser = argparse.ArgumentParser("Reachy Mini Conversation Demo")
     parser.add_argument(
         "--head-tracker",
         choices=["yolo", "mediapipe", None],
         default=None,
+        help="Choose head tracker (default: None)",
     )
     parser.add_argument("--no-camera", default=False, action="store_true", help="Disable camera usage")
+    parser.add_argument("--gradio", default=False, action="store_true", help="Open gradio interface")
     parser.add_argument("--debug", default=False, action="store_true", help="Enable debug logging")
     return parser.parse_args()

uv.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff