Alina Lozovskaya committed on
Commit
4441051
·
1 Parent(s): e7439e8

Removed threading; use asyncio

.gitignore CHANGED
@@ -1,2 +1,5 @@
  __pycache__/
  *.egg-info
+ .venv/
+ .env
+ cache/
pyproject.toml CHANGED
@@ -10,7 +10,17 @@ description = ""
  readme = "README.md"
  requires-python = ">=3.8"
  dependencies = [
- "reachy_mini@git+ssh://git@github.com/pollen-robotics/reachy_mini@develop",
+ "reachy_mini@git+ssh://git@github.com/pollen-robotics/reachy_mini@reachy_talk",
+ "openai",
+ "fastrtc",
+ "onnxruntime",
+ "PyGObject>=3.42.2, <=3.46.0",
+ "torch",
+ "transformers",
+ "num2words",
+ "dotenv",
+ "ultralytics",
+ "supervision",
  ]
src/reachy_mini_conversation_demo/__init__.py ADDED
File without changes
src/reachy_mini_conversation_demo/audio.py ADDED
@@ -0,0 +1,179 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import base64
5
+ from dataclasses import dataclass, field
6
+ from typing import Callable, Optional, Tuple
7
+
8
+ import numpy as np
9
+ from reachy_mini_conversation_demo.speech_tapper import SwayRollRT, HOP_MS
10
+
11
+
12
+ @dataclass
13
+ class AudioConfig:
14
+ output_sample_rate: int = 24_000
15
+ movement_latency_s: float = 0.08
16
+
17
+
18
+ def pcm_to_b64(array: np.ndarray) -> str:
19
+ """array: shape (N,) int16 or (1,N) int16 -> base64 string for OpenAI input buffer."""
20
+ a = np.asarray(array).reshape(-1).astype(np.int16, copy=False)
21
+ return base64.b64encode(a.tobytes()).decode("utf-8")
22
+
23
+
24
+ class AudioSync:
25
+ """
26
+ Routes assistant audio to:
27
+ 1) a playback queue for fastrtc
28
+ 2) a sway engine that emits head-offsets aligned to audio
29
+ """
30
+
31
+ def __init__(
32
+ self,
33
+ cfg: AudioConfig,
34
+ set_offsets: Callable[[Tuple[float, float, float, float, float, float]], None],
35
+ sway: Optional[SwayRollRT] = None,
36
+ ) -> None:
37
+ """
38
+ set_offsets: callback receiving (x,y,z,roll,pitch,yaw) at each hop, in meters/radians.
39
+ """
40
+ self.cfg = cfg
41
+ self.set_offsets = set_offsets
42
+ self.sway = sway or SwayRollRT()
43
+
44
+ self.playback_q: asyncio.Queue = (
45
+ asyncio.Queue()
46
+ ) # (sr:int, pcm: np.ndarray[1,N] int16)
47
+ self._sway_q: asyncio.Queue = (
48
+ asyncio.Queue()
49
+ ) # (sr:int, pcm: np.ndarray[1,N] int16)
50
+
51
+ self._base_ts: Optional[float] = None
52
+ self._hops_done: int = 0
53
+ self._sway_task: Optional[asyncio.Task] = None
54
+
55
+ # lifecycle
56
+
57
+ def start(self) -> None:
58
+ if self._sway_task is None:
59
+ self._sway_task = asyncio.create_task(self._sway_consumer())
60
+
61
+ async def stop(self) -> None:
62
+ if self._sway_task:
63
+ self._sway_task.cancel()
64
+ try:
65
+ await self._sway_task
66
+ except asyncio.CancelledError:
67
+ pass
68
+ self._sway_task = None
69
+ self._reset_all()
70
+ self._drain(self._sway_q)
71
+ self._drain(self.playback_q)
72
+
73
+ # event hooks from your Realtime loop
74
+
75
+ def on_input_speech_started(self) -> None:
76
+ """User started speaking (server VAD). Reset sync state."""
77
+ self._reset_all()
78
+ self._drain(self._sway_q)
79
+
80
+ def on_response_started(self) -> None:
81
+ """Assistant began a new utterance."""
82
+ self._reset_all()
83
+ self._drain(self._sway_q)
84
+
85
+ def on_response_completed(self) -> None:
86
+ """Assistant finished an utterance."""
87
+ self._reset_all()
88
+ self._drain(self._sway_q)
89
+
90
+ def on_response_audio_delta(self, delta_b64: str) -> None:
91
+ """
92
+ Called for each 'response.audio.delta' event.
93
+ Pushes audio both to playback and to sway engine.
94
+ """
95
+ buf = np.frombuffer(base64.b64decode(delta_b64), dtype=np.int16).reshape(1, -1)
96
+ # 1) to fastrtc playback
97
+ self.playback_q.put_nowait((self.cfg.output_sample_rate, buf))
98
+ # 2) to sway engine
99
+ self._sway_q.put_nowait((self.cfg.output_sample_rate, buf))
100
+
101
+ # fastrtc hook
102
+
103
+ async def emit_playback(self):
104
+ """Await next (sr, pcm[1,N]) frame for your Stream(...)."""
105
+ return await self.playback_q.get()
106
+
107
+ # internal
108
+
109
+ async def _sway_consumer(self):
110
+ """
111
+ Convert streaming audio chunks into head-offset poses at precise times.
112
+ """
113
+ hop_dt = HOP_MS / 1000.0
114
+ loop = asyncio.get_running_loop()
115
+
116
+ while True:
117
+ sr, chunk = await self._sway_q.get() # (1,N), int16
118
+ pcm = np.asarray(chunk).squeeze(0)
119
+ results = self.sway.feed(pcm, sr) # list of dicts with keys x_mm..yaw_rad
120
+
121
+ if self._base_ts is None:
122
+ # anchor when first audio samples of this utterance arrive
123
+ self._base_ts = loop.time()
124
+
125
+ i = 0
126
+ while i < len(results):
127
+ if self._base_ts is None:
128
+ self._base_ts = loop.time()
129
+ continue
130
+
131
+ target = (
132
+ self._base_ts
133
+ + self.cfg.movement_latency_s
134
+ + self._hops_done * hop_dt
135
+ )
136
+ now = loop.time()
137
+
138
+ # if late by ≥1 hop, drop poses to catch up (no drift accumulation)
139
+ if now - target >= hop_dt:
140
+ lag_hops = int((now - target) / hop_dt)
141
+ drop = min(
142
+ lag_hops, len(results) - i - 1
143
+ ) # keep at least one to show
144
+ if drop > 0:
145
+ self._hops_done += drop
146
+ i += drop
147
+ continue
148
+
149
+ # if early, wait
150
+ if target > now:
151
+ await asyncio.sleep(target - now)
152
+
153
+ r = results[i]
154
+ # meters + radians
155
+ offsets = (
156
+ r["x_mm"] / 1000.0,
157
+ r["y_mm"] / 1000.0,
158
+ r["z_mm"] / 1000.0,
159
+ r["roll_rad"],
160
+ r["pitch_rad"],
161
+ r["yaw_rad"],
162
+ )
163
+ self.set_offsets(offsets)
164
+
165
+ self._hops_done += 1
166
+ i += 1
167
+
168
+ def _reset_all(self) -> None:
169
+ self._base_ts = None
170
+ self._hops_done = 0
171
+ self.sway.reset()
172
+
173
+ @staticmethod
174
+ def _drain(q: asyncio.Queue) -> None:
175
+ try:
176
+ while True:
177
+ q.get_nowait()
178
+ except asyncio.QueueEmpty:
179
+ pass
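For orientation, a minimal sketch of how the AudioSync above is meant to be driven: base64 PCM16 deltas go in, playback frames come out of emit_playback(), and the offsets callback fires once per 10 ms hop. The print_offsets callback and the silent test buffer are illustrative stand-ins, not part of this commit.

import asyncio
import base64

import numpy as np

from reachy_mini_conversation_demo.audio import AudioConfig, AudioSync


def print_offsets(offsets):
    # (x, y, z, roll, pitch, yaw) in meters/radians, one tuple per 10 ms hop
    print("head offsets:", offsets)


async def demo():
    sync = AudioSync(AudioConfig(output_sample_rate=24_000), set_offsets=print_offsets)
    sync.start()

    # Simulate one 'response.audio.delta' event: 100 ms of silence as base64 PCM16
    silence = np.zeros(2400, dtype=np.int16)
    sync.on_response_started()
    sync.on_response_audio_delta(base64.b64encode(silence.tobytes()).decode())

    sr, frame = await sync.emit_playback()  # same frame, ready for fastrtc playback
    print(sr, frame.shape)

    await asyncio.sleep(0.3)  # give the sway consumer time to emit its hops
    await sync.stop()


asyncio.run(demo())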
src/reachy_mini_conversation_demo/gstreamer.py ADDED
@@ -0,0 +1,226 @@
1
+ import logging
2
+ import os
3
+ from threading import Thread
4
+ from typing import Optional
5
+
6
+ import gi
7
+
8
+ gi.require_version("Gst", "1.0")
9
+ gi.require_version("GstApp", "1.0")
10
+ from gi.repository import GLib, Gst, GstApp
11
+
12
+
13
+ class GstPlayer:
14
+ def __init__(self):
15
+ self._logger = logging.getLogger(__name__)
16
+ Gst.init(None)
17
+ self._loop = GLib.MainLoop()
18
+ self._thread_bus_calls: Optional[Thread] = None
19
+
20
+ self.pipeline = Gst.Pipeline.new("audio_player")
21
+
22
+ # Optional device name from env (substring match)
23
+ audio_out = os.getenv("AUDIO_OUT")
24
+
25
+ # Create elements
26
+ self.appsrc = Gst.ElementFactory.make("appsrc", None)
27
+ self.appsrc.set_property("format", Gst.Format.TIME)
28
+ self.appsrc.set_property("is-live", True)
29
+ caps = Gst.Caps.from_string(
30
+ "audio/x-raw,format=S16LE,channels=1,rate=24000,layout=interleaved"
31
+ )
32
+ self.appsrc.set_property("caps", caps)
33
+ queue = Gst.ElementFactory.make("queue")
34
+ audioconvert = Gst.ElementFactory.make("audioconvert")
35
+ audioresample = Gst.ElementFactory.make("audioresample")
36
+
37
+ # Try to pin specific output device; fallback to autoaudiosink
38
+ audiosink = _create_device_element(
39
+ direction="sink", name_substr=audio_out
40
+ ) or Gst.ElementFactory.make("autoaudiosink")
41
+
42
+ self.pipeline.add(self.appsrc)
43
+ self.pipeline.add(queue)
44
+ self.pipeline.add(audioconvert)
45
+ self.pipeline.add(audioresample)
46
+ self.pipeline.add(audiosink)
47
+
48
+ self.appsrc.link(queue)
49
+ queue.link(audioconvert)
50
+ audioconvert.link(audioresample)
51
+ audioresample.link(audiosink)
52
+
53
+ def _on_bus_message(self, bus: Gst.Bus, msg: Gst.Message, loop) -> bool: # type: ignore[no-untyped-def]
54
+ t = msg.type
55
+ if t == Gst.MessageType.EOS:
56
+ self._logger.warning("End-of-stream")
57
+ return False
58
+
59
+ elif t == Gst.MessageType.ERROR:
60
+ err, debug = msg.parse_error()
61
+ self._logger.error(f"Error: {err} {debug}")
62
+ return False
63
+
64
+ return True
65
+
66
+ def _handle_bus_calls(self) -> None:
67
+ self._logger.debug("starting bus message loop")
68
+ bus = self.pipeline.get_bus()
69
+ bus.add_watch(GLib.PRIORITY_DEFAULT, self._on_bus_message, self._loop)
70
+ self._loop.run() # type: ignore[no-untyped-call]
71
+ bus.remove_watch()
72
+ self._logger.debug("bus message loop stopped")
73
+
74
+ def play(self):
75
+ self.pipeline.set_state(Gst.State.PLAYING)
76
+ self._thread_bus_calls = Thread(target=self._handle_bus_calls, daemon=True)
77
+ self._thread_bus_calls.start()
78
+
79
+ def push_sample(self, data: bytes):
80
+ buf = Gst.Buffer.new_wrapped(data)
81
+ self.appsrc.push_buffer(buf)
82
+
83
+ def stop(self):
84
+ logger = logging.getLogger(__name__)
85
+
86
+ self._loop.quit()
87
+ self.pipeline.set_state(Gst.State.NULL)
88
+ self._thread_bus_calls.join()
89
+ logger.info("Stopped Player")
90
+
91
+
92
+ class GstRecorder:
93
+ def __init__(self):
94
+ self._logger = logging.getLogger(__name__)
95
+ Gst.init(None)
96
+ self._loop = GLib.MainLoop()
97
+ self._thread_bus_calls: Optional[Thread] = None
98
+
99
+ self.pipeline = Gst.Pipeline.new("audio_recorder")
100
+
101
+ audio_in = os.getenv("AUDIO_IN")
102
+
103
+ # Create elements: try specific mic; fallback to default
104
+ autoaudiosrc = _create_device_element(
105
+ direction="source", name_substr=audio_in
106
+ ) or Gst.ElementFactory.make("autoaudiosrc", None)
107
+
108
+ queue = Gst.ElementFactory.make("queue", None)
109
+ audioconvert = Gst.ElementFactory.make("audioconvert", None)
110
+ audioresample = Gst.ElementFactory.make("audioresample", None)
111
+ self.appsink = Gst.ElementFactory.make("appsink", None)
112
+
113
+ if not all([autoaudiosrc, queue, audioconvert, audioresample, self.appsink]):
114
+ raise RuntimeError("Failed to create GStreamer elements")
115
+
116
+ # Force mono/S16LE at 24000; resample handles device SR (e.g., 16000 → 24000)
117
+ caps = Gst.Caps.from_string("audio/x-raw,channels=1,rate=24000,format=S16LE")
118
+ self.appsink.set_property("caps", caps)
119
+
120
+ # Build pipeline
121
+ self.pipeline.add(autoaudiosrc)
122
+ self.pipeline.add(queue)
123
+ self.pipeline.add(audioconvert)
124
+ self.pipeline.add(audioresample)
125
+ self.pipeline.add(self.appsink)
126
+
127
+ autoaudiosrc.link(queue)
128
+ queue.link(audioconvert)
129
+ audioconvert.link(audioresample)
130
+ audioresample.link(self.appsink)
131
+
132
+ def _on_bus_message(self, bus: Gst.Bus, msg: Gst.Message, loop) -> bool: # type: ignore[no-untyped-def]
133
+ t = msg.type
134
+ if t == Gst.MessageType.EOS:
135
+ self._logger.warning("End-of-stream")
136
+ return False
137
+
138
+ elif t == Gst.MessageType.ERROR:
139
+ err, debug = msg.parse_error()
140
+ self._logger.error(f"Error: {err} {debug}")
141
+ return False
142
+
143
+ return True
144
+
145
+ def _handle_bus_calls(self) -> None:
146
+ self._logger.debug("starting bus message loop")
147
+ bus = self.pipeline.get_bus()
148
+ bus.add_watch(GLib.PRIORITY_DEFAULT, self._on_bus_message, self._loop)
149
+ self._loop.run() # type: ignore[no-untyped-call]
150
+ bus.remove_watch()
151
+ self._logger.debug("bus message loop stopped")
152
+
153
+ def record(self):
154
+ self.pipeline.set_state(Gst.State.PLAYING)
155
+ self._thread_bus_calls = Thread(target=self._handle_bus_calls, daemon=True)
156
+ self._thread_bus_calls.start()
157
+
158
+ def get_sample(self):
159
+ sample = self.appsink.pull_sample()
160
+ data = None
161
+ if isinstance(sample, Gst.Sample):
162
+ buf = sample.get_buffer()
163
+ if buf is None:
164
+ self._logger.warning("Buffer is None")
165
+
166
+ data = buf.extract_dup(0, buf.get_size())
167
+ return data
168
+
169
+ def stop(self):
170
+ logger = logging.getLogger(__name__)
171
+
172
+ self._loop.quit()
173
+ self.pipeline.set_state(Gst.State.NULL)
174
+ self._thread_bus_calls.join()
175
+ logger.info("Stopped Recorder")
176
+
177
+
178
+ def _create_device_element(
179
+ direction: str, name_substr: Optional[str]
180
+ ) -> Optional[Gst.Element]:
181
+ """
182
+ direction: 'source' or 'sink'
183
+ name_substr: case-insensitive substring matching device display name/description.
184
+ """
185
+ logger = logging.getLogger(__name__)
186
+
187
+ if not name_substr:
188
+ logger.error(f"Device select: no name_substr for {direction}; returning None")
189
+ return None
190
+
191
+ monitor = Gst.DeviceMonitor.new()
192
+ klass = "Audio/Source" if direction == "source" else "Audio/Sink"
193
+ monitor.add_filter(klass, None)
194
+ monitor.start()
195
+
196
+ try:
197
+ for dev in monitor.get_devices() or []:
198
+ disp = dev.get_display_name() or ""
199
+ props = dev.get_properties()
200
+ desc = (
201
+ props.get_string("device.description")
202
+ if props and props.has_field("device.description")
203
+ else ""
204
+ )
205
+ logger.info(f"Device candidate: disp='{disp}', desc='{desc}'")
206
+
207
+ if (
208
+ name_substr.lower() in disp.lower()
209
+ or name_substr.lower() in desc.lower()
210
+ ):
211
+ elem = dev.create_element(None)
212
+ factory = (
213
+ elem.get_factory().get_name()
214
+ if elem and elem.get_factory()
215
+ else "<?>"
216
+ )
217
+ logger.info(
218
+ f"Using {direction} device: '{disp or desc}' (factory='{factory}')"
219
+ )
220
+ return elem
221
+ finally:
222
+ monitor.stop()
223
+ logging.getLogger(__name__).warning(
224
+ "Requested %s '%s' not found; using auto*", direction, name_substr
225
+ )
226
+ return None
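GstPlayer and GstRecorder are thin appsrc/appsink wrappers around a fixed S16LE mono 24 kHz format. A rough smoke test, assuming this module and a machine with working audio devices (AUDIO_IN / AUDIO_OUT may optionally select devices by name substring), would loop the microphone straight to the speakers for a couple of seconds:

import time

from reachy_mini_conversation_demo.gstreamer import GstPlayer, GstRecorder

recorder = GstRecorder()
player = GstPlayer()
recorder.record()
player.play()

deadline = time.time() + 2.0
while time.time() < deadline:
    chunk = recorder.get_sample()  # raw S16LE mono @ 24 kHz, or None
    if chunk:
        player.push_sample(chunk)  # echo the mic straight to the speaker

recorder.stop()
player.stop()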
src/reachy_mini_conversation_demo/head_tracker.py ADDED
@@ -0,0 +1,245 @@
1
+ from __future__ import annotations
2
+ from typing import Optional, Tuple
3
+ import logging
4
+
5
+ import numpy as np
6
+
7
+ from huggingface_hub import hf_hub_download
8
+ from ultralytics import YOLO
9
+ from supervision import Detections
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class HeadTracker:
15
+ """
16
+ Lightweight head tracker using YOLO for face detection
17
+ """
18
+
19
+ def __init__(
20
+ self,
21
+ model_repo: str = "AdamCodd/YOLOv11n-face-detection",
22
+ model_filename: str = "model.pt",
23
+ confidence_threshold: float = 0.3,
24
+ device: str = "cpu",
25
+ ) -> None:
26
+ """
27
+ Initialize YOLO-based head tracker
28
+
29
+ Args:
30
+ model_repo: HuggingFace model repository
31
+ model_filename: Model file name
32
+ confidence_threshold: Minimum confidence for face detection
33
+ device: Device to run inference on ('cpu' or 'cuda')
34
+ """
35
+ self.confidence_threshold = confidence_threshold
36
+
37
+ try:
38
+ # Download and load YOLO model
39
+ model_path = hf_hub_download(repo_id=model_repo, filename=model_filename)
40
+ self.model = YOLO(model_path).to(device)
41
+ logger.info(f"YOLO face detection model loaded from {model_repo}")
42
+ except Exception as e:
43
+ logger.error(f"Failed to load YOLO model: {e}")
44
+ raise
45
+
46
+ def _select_best_face(self, detections: Detections) -> Optional[int]:
47
+ """
48
+ Select the best face based on confidence and area (largest face with highest confidence)
49
+
50
+ Args:
51
+ detections: Supervision detections object
52
+
53
+ Returns:
54
+ Index of best face or None if no valid faces
55
+ """
56
+ if detections.xyxy.shape[0] == 0:
57
+ return None
58
+
59
+ # Filter by confidence threshold
60
+ valid_mask = detections.confidence >= self.confidence_threshold
61
+ if not np.any(valid_mask):
62
+ return None
63
+
64
+ valid_indices = np.where(valid_mask)[0]
65
+
66
+ # Calculate areas for valid detections
67
+ boxes = detections.xyxy[valid_indices]
68
+ areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
69
+
70
+ # Combine confidence and area (weighted towards larger faces)
71
+ confidences = detections.confidence[valid_indices]
72
+ scores = confidences * 0.7 + (areas / np.max(areas)) * 0.3
73
+
74
+ # Return index of best face
75
+ best_idx = valid_indices[np.argmax(scores)]
76
+ return best_idx
77
+
78
+ def _bbox_to_mp_coords(self, bbox: np.ndarray, w: int, h: int) -> np.ndarray:
79
+ """
80
+ Convert bounding box center to MediaPipe-style coordinates [-1, 1]
81
+
82
+ Args:
83
+ bbox: Bounding box [x1, y1, x2, y2]
84
+ w: Image width
85
+ h: Image height
86
+
87
+ Returns:
88
+ Center point in [-1, 1] coordinates
89
+ """
90
+ center_x = (bbox[0] + bbox[2]) / 2.0
91
+ center_y = (bbox[1] + bbox[3]) / 2.0
92
+
93
+ # Normalize to [0, 1] then to [-1, 1]
94
+ norm_x = (center_x / w) * 2.0 - 1.0
95
+ norm_y = (center_y / h) * 2.0 - 1.0
96
+
97
+ return np.array([norm_x, norm_y], dtype=np.float32)
98
+
99
+ def get_eyes(
100
+ self, img: np.ndarray
101
+ ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
102
+ """
103
+ Get eye positions (approximated from face bbox)
104
+ Note: YOLO only provides face bbox, so we estimate eye positions
105
+
106
+ Args:
107
+ img: Input image
108
+
109
+ Returns:
110
+ Tuple of (left_eye, right_eye) in [-1, 1] coordinates
111
+ """
112
+ h, w = img.shape[:2]
113
+
114
+ # Run YOLO inference
115
+ results = self.model(img, verbose=False)
116
+ detections = Detections.from_ultralytics(results[0])
117
+
118
+ # Select best face
119
+ face_idx = self._select_best_face(detections)
120
+ if face_idx is None:
121
+ return None, None
122
+
123
+ bbox = detections.xyxy[face_idx]
124
+
125
+ # Estimate eye positions from face bbox (approximate locations)
126
+ face_width = bbox[2] - bbox[0]
127
+ face_height = bbox[3] - bbox[1]
128
+
129
+ # Eye positions are roughly at 1/3 and 2/3 of face width, 1/3 of face height
130
+ eye_y = bbox[1] + face_height * 0.35
131
+ left_eye_x = bbox[0] + face_width * 0.35
132
+ right_eye_x = bbox[0] + face_width * 0.65
133
+
134
+ # Convert to MediaPipe coordinates
135
+ left_eye = np.array(
136
+ [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
137
+ )
138
+ right_eye = np.array(
139
+ [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
140
+ )
141
+
142
+ return left_eye, right_eye
143
+
144
+ def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
145
+ """
146
+ Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object
147
+ """
148
+ if not hasattr(face_landmarks, "_bbox") or not hasattr(
149
+ face_landmarks, "_img_shape"
150
+ ):
151
+ raise ValueError("Face landmarks object missing required attributes")
152
+
153
+ bbox = face_landmarks._bbox
154
+ h, w = face_landmarks._img_shape[:2]
155
+
156
+ # Estimate eyes from stored bbox
157
+ face_width = bbox[2] - bbox[0]
158
+ face_height = bbox[3] - bbox[1]
159
+
160
+ eye_y = bbox[1] + face_height * 0.35
161
+ left_eye_x = bbox[0] + face_width * 0.35
162
+ right_eye_x = bbox[0] + face_width * 0.65
163
+
164
+ left_eye = np.array(
165
+ [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
166
+ )
167
+ right_eye = np.array(
168
+ [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
169
+ )
170
+
171
+ return left_eye, right_eye
172
+
173
+ def get_eye_center(self, face_landmarks) -> np.ndarray:
174
+ """
175
+ Get center point between estimated eyes
176
+ """
177
+ left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
178
+ return np.mean([left_eye, right_eye], axis=0)
179
+
180
+ def get_roll(self, face_landmarks) -> float:
181
+ """
182
+ Estimate roll from eye positions (will be 0 for YOLO since we estimate symmetric eyes)
183
+ """
184
+ left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
185
+ return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))
186
+
187
+ def get_head_position(
188
+ self, img: np.ndarray
189
+ ) -> Tuple[Optional[np.ndarray], Optional[float]]:
190
+ """
191
+ Get head position from face detection
192
+
193
+ Args:
194
+ img: Input image
195
+
196
+ Returns:
197
+ Tuple of (eye_center [-1,1], roll_angle)
198
+ """
199
+ h, w = img.shape[:2]
200
+
201
+ try:
202
+ # Run YOLO inference
203
+ results = self.model(img, verbose=False)
204
+ detections = Detections.from_ultralytics(results[0])
205
+
206
+ # Select best face
207
+ face_idx = self._select_best_face(detections)
208
+ if face_idx is None:
209
+ logger.debug("No face detected above confidence threshold")
210
+ return None, None
211
+
212
+ bbox = detections.xyxy[face_idx]
213
+ confidence = detections.confidence[face_idx]
214
+
215
+ logger.debug(f"Face detected with confidence: {confidence:.2f}")
216
+
217
+ # Get face center in [-1, 1] coordinates
218
+ face_center = self._bbox_to_mp_coords(bbox, w, h)
219
+
220
+ # Roll is 0 since we don't have keypoints for precise angle estimation
221
+ roll = 0.0
222
+
223
+ return face_center, roll
224
+
225
+ except Exception as e:
226
+ logger.error(f"Error in head position detection: {e}")
227
+ return None, None
228
+
229
+ def cleanup(self):
230
+ """
231
+ Clean up resources
232
+ """
233
+ if hasattr(self, "model"):
234
+ del self.model
235
+ logger.info("YOLO model cleaned up")
236
+
237
+
238
+ class FaceLandmarks:
239
+ """
240
+ Simple container for face detection results to maintain API compatibility
241
+ """
242
+
243
+ def __init__(self, bbox: np.ndarray, img_shape: tuple):
244
+ self._bbox = bbox
245
+ self._img_shape = img_shape
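As a quick illustration of the API above, a one-shot face lookup against a webcam frame; camera index 0 and CPU inference are assumptions of the sketch, not requirements of the commit:

import cv2

from reachy_mini_conversation_demo.head_tracker import HeadTracker

tracker = HeadTracker(confidence_threshold=0.3, device="cpu")
cap = cv2.VideoCapture(0)

ok, frame = cap.read()
if ok:
    center, roll = tracker.get_head_position(frame)
    if center is not None:
        # center is (x, y) in [-1, 1]; roll is always 0.0 with the bbox-only model
        print("face center:", center, "roll:", roll)
    else:
        print("no face above the confidence threshold")

cap.release()
tracker.cleanup()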
src/reachy_mini_conversation_demo/main.py CHANGED
@@ -1,5 +1,562 @@
1
-
2
-
3
- def main():
4
- print("coucou")
5
- pass
1
+ from __future__ import annotations
2
 
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ import os
7
+ import random
8
+ import sys
9
+ import time
10
+ import warnings
11
+ import threading
12
+ from threading import Thread
13
 
14
+ import cv2
15
+ import gradio as gr
16
+ import numpy as np
17
+ from dotenv import load_dotenv
18
+ from openai import AsyncOpenAI
19
+
20
+ from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
21
+ from websockets import ConnectionClosedError, ConnectionClosedOK
22
+
23
+ from reachy_mini.reachy_mini import IMAGE_SIZE
24
+ from reachy_mini import ReachyMini
25
+ from reachy_mini.utils import create_head_pose
26
+ from reachy_mini.utils.camera import find_camera
27
+ from scipy.spatial.transform import Rotation
28
+
29
+ from reachy_mini_conversation_demo.head_tracker import HeadTracker
30
+ from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS
31
+ from reachy_mini_conversation_demo.tools import (
32
+ Deps,
33
+ TOOL_SPECS,
34
+ dispatch_tool_call,
35
+ )
36
+ from reachy_mini_conversation_demo.audio import AudioSync, AudioConfig, pcm_to_b64
37
+ from reachy_mini_conversation_demo.movement import MovementManager
38
+ from reachy_mini_conversation_demo.gstreamer import GstPlayer, GstRecorder
39
+ from reachy_mini_conversation_demo.vision import VisionManager, VisionConfig
40
+
41
+ # env + logging
42
+ load_dotenv()
43
+
44
+ LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
45
+ logging.basicConfig(
46
+ level=getattr(logging, LOG_LEVEL, logging.INFO),
47
+ format="%(asctime)s %(levelname)s %(name)s:%(lineno)d | %(message)s",
48
+ )
49
+ logger = logging.getLogger(__name__)
50
+
51
+ # Suppress WebRTC warnings
52
+ warnings.filterwarnings("ignore", message=".*AVCaptureDeviceTypeExternal.*")
53
+ warnings.filterwarnings("ignore", category=UserWarning, module="aiortc")
54
+
55
+ # Reduce logging noise
56
+ logging.getLogger("aiortc").setLevel(logging.ERROR)
57
+ logging.getLogger("fastrtc").setLevel(logging.ERROR)
58
+ logging.getLogger("aioice").setLevel(logging.WARNING)
59
+
60
+
61
+ # Read from .env
62
+ SAMPLE_RATE = int(os.getenv("SAMPLE_RATE", "24000"))
63
+ SIM = os.getenv("SIM", "false").lower() in ("true", "1", "yes", "on")
64
+ VISION_ENABLED = os.getenv("VISION_ENABLED", "false").lower() in (
65
+ "true",
66
+ "1",
67
+ "yes",
68
+ "on",
69
+ )
70
+ MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-realtime-preview")
71
+
72
+ HEAD_TRACKING = os.getenv("HEAD_TRACKING", "false").lower() in (
73
+ "true",
74
+ "1",
75
+ "yes",
76
+ "on",
77
+ )
78
+
79
+ API_KEY = os.getenv("OPENAI_API_KEY")
80
+ if not API_KEY:
81
+ logger.error("OPENAI_API_KEY not set! Please add it to your .env file.")
82
+ raise RuntimeError("OPENAI_API_KEY missing")
83
+ masked = (API_KEY[:6] + "..." + API_KEY[-4:]) if len(API_KEY) >= 12 else "<short>"
84
+ logger.info("OPENAI_API_KEY loaded (prefix): %s", masked)
85
+
86
+ # HF cache setup (persist between restarts)
87
+ HF_CACHE_DIR = os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface"))
88
+ try:
89
+ os.makedirs(HF_CACHE_DIR, exist_ok=True)
90
+ os.environ["HF_HOME"] = HF_CACHE_DIR
91
+ logger.info("HF_HOME set to %s", HF_CACHE_DIR)
92
+ except Exception as e:
93
+ logger.warning("Failed to prepare HF cache dir %s: %s", HF_CACHE_DIR, e)
94
+
95
+ # init camera
96
+ CAMERA_INDEX = int(os.getenv("CAMERA_INDEX", "0"))
97
+
98
+ if SIM:
99
+ # Default built-in camera in SIM
100
+ # TODO: please, test on Linux and Windows
101
+ camera = cv2.VideoCapture(
102
+ 0, cv2.CAP_AVFOUNDATION if sys.platform == "darwin" else 0
103
+ )
104
+ else:
105
+ if sys.platform == "darwin":
106
+ camera = cv2.VideoCapture(CAMERA_INDEX, cv2.CAP_AVFOUNDATION)
107
+ if not camera or not camera.isOpened():
108
+ logger.warning(
109
+ "Camera %d failed with AVFoundation; trying default backend",
110
+ CAMERA_INDEX,
111
+ )
112
+ camera = cv2.VideoCapture(CAMERA_INDEX)
113
+ else:
114
+ camera = find_camera()
115
+
116
+ # Vision manager initialization with proper error handling
117
+ vision_manager: VisionManager | None = None
118
+
119
+ if not camera or not camera.isOpened():
120
+ logger.error("Camera failed to open (index=%s)", 0 if SIM else CAMERA_INDEX)
121
+ VISION_ENABLED = False # Disable vision if no camera
122
+ else:
123
+ logger.info(
124
+ "Camera ready (index=%s)%s", 0 if SIM else CAMERA_INDEX, " [SIM]" if SIM else ""
125
+ )
126
+
127
+ # Prefetch SmolVLM2 repo into HF cache (idempotent, fast if already cached)
128
+ try:
129
+ from huggingface_hub import snapshot_download
130
+ model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
131
+ snapshot_download(
132
+ repo_id=model_id,
133
+ repo_type="model",
134
+ cache_dir=os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface")),
135
+ )
136
+ logger.info("Prefetched %s into HF cache (%s)", model_id, os.getenv("HF_HOME"))
137
+ except Exception as e:
138
+ logger.warning("Model prefetch skipped/failed (will load normally): %s", e)
139
+
140
+ # Initialize vision manager if enabled
141
+ if VISION_ENABLED:
142
+ try:
143
+ # Prefetch SmolVLM2 repo into HF cache (idempotent, fast if cached)
144
+ try:
145
+ from huggingface_hub import snapshot_download
146
+ model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
147
+ snapshot_download(
148
+ repo_id=model_id,
149
+ repo_type="model",
150
+ cache_dir=os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface")),
151
+ )
152
+ logger.info("Prefetched %s into HF cache (%s)", model_id, os.getenv("HF_HUB_CACHE"))
153
+ except Exception as e:
154
+ logger.warning("Model prefetch skipped/failed (will load normally): %s", e)
155
+
156
+ # Configure LLM processing
157
+ vision_config = VisionConfig(
158
+ model_path="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
159
+ vision_interval=5.0,
160
+ max_new_tokens=64,
161
+ temperature=0.7,
162
+ jpeg_quality=85,
163
+ max_retries=3,
164
+ retry_delay=1.0,
165
+ device_preference="auto",
166
+ )
167
+
168
+ logger.info("Initializing SmolVLM2 vision processor (HF_HOME=%s)", os.getenv("HF_HOME"))
169
+ vision_manager = VisionManager(camera, vision_config)
170
+
171
+ device_info = vision_manager.processor.get_model_info()
172
+ logger.info(
173
+ "Vision processing enabled: %s on %s (GPU: %s)",
174
+ device_info["model_path"], device_info["device"], device_info.get("gpu_memory", "N/A"),
175
+ )
176
+
177
+ except Exception as e:
178
+ logger.error("Failed to initialize vision manager: %s", e)
179
+ logger.error("Vision processing will be disabled")
180
+ vision_manager = None
181
+ VISION_ENABLED = False
182
+
183
+ # Log final vision status
184
+ if VISION_ENABLED and vision_manager:
185
+ logger.info("Vision system ready - local SmolVLM2 processing enabled")
186
+ else:
187
+ logger.warning(
188
+ "Vision system disabled - robot will operate without visual understanding"
189
+ )
190
+
191
+
192
+ # Constants
193
+ BACKOFF_START_S = 1.0
194
+ BACKOFF_MAX_S = 30.0
195
+
196
+ # hardware / IO
197
+ current_robot = ReachyMini()
198
+ head_tracker: HeadTracker | None = None
199
+
200
+ if HEAD_TRACKING and not SIM:
201
+ head_tracker = HeadTracker()
202
+ logger.info("Head tracking enabled")
203
+ elif HEAD_TRACKING and SIM:
204
+ logger.warning("Head tracking disabled while in Simulation")
205
+ else:
206
+ logger.warning("Head tracking disabled")
207
+
208
+ movement_manager = MovementManager(current_robot=current_robot, head_tracker=head_tracker, camera=camera)
209
+ robot_is_speaking = asyncio.Event()
210
+ speaking_queue = asyncio.Queue()
211
+
212
+
213
+ # tool deps
214
+ deps = Deps(
215
+ reachy_mini=current_robot,
216
+ create_head_pose=create_head_pose,
217
+ camera=camera,
218
+ vision_manager=vision_manager,
219
+ )
220
+
221
+ # audio sync
222
+ audio_sync = AudioSync(
223
+ AudioConfig(output_sample_rate=SAMPLE_RATE),
224
+ set_offsets=movement_manager.set_offsets,
225
+ )
226
+
227
+
228
+ class OpenAIRealtimeHandler(AsyncStreamHandler):
229
+ def __init__(self) -> None:
230
+ super().__init__(
231
+ expected_layout="mono",
232
+ output_sample_rate=SAMPLE_RATE,
233
+ input_sample_rate=SAMPLE_RATE,
234
+ )
235
+ self.client: AsyncOpenAI | None = None
236
+ self.connection = None
237
+ self.output_queue: asyncio.Queue = asyncio.Queue()
238
+ self._stop = False
239
+ self._started_audio = False
240
+ self._connection_ready = False
241
+ self._speech_start_time = 0.0
242
+
243
+ def copy(self):
244
+ return OpenAIRealtimeHandler()
245
+
246
+ async def start_up(self):
247
+ if not self._started_audio:
248
+ audio_sync.start()
249
+ self._started_audio = True
250
+
251
+ if self.client is None:
252
+ logger.info("Realtime start_up: creating AsyncOpenAI client...")
253
+ self.client = AsyncOpenAI(api_key=API_KEY)
254
+
255
+ backoff = BACKOFF_START_S
256
+ while not self._stop:
257
+ try:
258
+ async with self.client.beta.realtime.connect(
259
+ model=MODEL_NAME
260
+ ) as rt_connection:
261
+ self.connection = rt_connection
262
+ self._connection_ready = False
263
+
264
+ # configure session
265
+ await rt_connection.session.update(
266
+ session={
267
+ "turn_detection": {
268
+ "type": "server_vad",
269
+ "threshold": 0.6, # Higher threshold = less sensitive
270
+ "prefix_padding_ms": 300, # More padding before speech
271
+ "silence_duration_ms": 800, # Longer silence before detecting end
272
+ },
273
+ "voice": "ballad",
274
+ "instructions": SESSION_INSTRUCTIONS,
275
+ "input_audio_transcription": {
276
+ "model": "whisper-1",
277
+ "language": "en",
278
+ },
279
+ "tools": TOOL_SPECS,
280
+ "tool_choice": "auto",
281
+ "temperature": 0.7,
282
+ }
283
+ )
284
+
285
+ # Wait for session to be configured
286
+ await asyncio.sleep(0.2)
287
+
288
+ # Add system message with even stronger brevity emphasis
289
+ await rt_connection.conversation.item.create(
290
+ item={
291
+ "type": "message",
292
+ "role": "system",
293
+ "content": [
294
+ {
295
+ "type": "input_text",
296
+ "text": f"{SESSION_INSTRUCTIONS}\n\nIMPORTANT: Always keep responses under 25 words. Be extremely concise.",
297
+ }
298
+ ],
299
+ }
300
+ )
301
+
302
+ self._connection_ready = True
303
+
304
+ logger.info(
305
+ "Session updated: tools=%d, voice=%s, vad=improved",
306
+ len(TOOL_SPECS),
307
+ "ballad",
308
+ )
309
+
310
+ logger.info("Realtime event loop started with improved VAD")
311
+ backoff = BACKOFF_START_S
312
+
313
+ async for event in rt_connection:
314
+ event_type = getattr(event, "type", None)
315
+ logger.debug("RT event: %s", event_type)
316
+
317
+ # Enhanced speech state tracking
318
+ if event_type == "input_audio_buffer.speech_started":
319
+ # Only process user speech if robot isn't currently speaking
320
+ if not robot_is_speaking.is_set():
321
+ audio_sync.on_input_speech_started()
322
+ logger.info("User speech detected (robot not speaking)")
323
+ else:
324
+ logger.info(
325
+ "Ignoring speech detection - robot is speaking"
326
+ )
327
+
328
+ elif event_type == "response.started":
329
+ self._speech_start_time = time.time()
330
+ audio_sync.on_response_started()
331
+ logger.info("Robot started speaking")
332
+
333
+ elif event_type in (
334
+ "response.audio.completed",
335
+ "response.completed",
336
+ "response.audio.done",
337
+ ):
338
+ logger.info("Robot finished speaking %s", event_type)
339
+
340
+ elif (
341
+ event_type
342
+ == "conversation.item.input_audio_transcription.completed"
343
+ ):
344
+ await self.output_queue.put(
345
+ AdditionalOutputs(
346
+ {"role": "user", "content": event.transcript}
347
+ )
348
+ )
349
+
350
+ elif event_type == "response.audio_transcript.done":
351
+ await self.output_queue.put(
352
+ AdditionalOutputs(
353
+ {"role": "assistant", "content": event.transcript}
354
+ )
355
+ )
356
+
357
+ # audio streaming
358
+ if event_type == "response.audio.delta":
359
+ robot_is_speaking.set()
360
+ # block mic from recording for given time, for each audio delta
361
+ speaking_queue.put_nowait(0.25)
362
+ audio_sync.on_response_audio_delta(
363
+ getattr(event, "delta", b"")
364
+ )
365
+
366
+ elif event_type == "response.function_call_arguments.done":
367
+ tool_name = getattr(event, "name", None)
368
+ args_json_str = getattr(event, "arguments", None)
369
+ call_id = getattr(event, "call_id", None)
370
+
371
+ try:
372
+ tool_result = await dispatch_tool_call(
373
+ tool_name, args_json_str, deps
374
+ )
375
+ except Exception as e:
376
+ logger.exception("Tool %s failed", tool_name)
377
+ tool_result = {"error": str(e)}
378
+
379
+ await rt_connection.conversation.item.create(
380
+ item={
381
+ "type": "function_call_output",
382
+ "call_id": call_id,
383
+ "output": json.dumps(tool_result),
384
+ }
385
+ )
386
+ logger.info(
387
+ "Sent tool=%s call_id=%s result=%s",
388
+ tool_name,
389
+ call_id,
390
+ tool_result,
391
+ )
392
+ if tool_name and (
393
+ tool_name == "camera" or "scene" in tool_name
394
+ ):
395
+ logger.info(
396
+ "Forcing response after tool call %s", tool_name
397
+ )
398
+ await rt_connection.response.create()
399
+
400
+ # server errors
401
+ if event_type == "error":
402
+ err = getattr(event, "error", None)
403
+ msg = getattr(
404
+ err, "message", str(err) if err else "unknown error"
405
+ )
406
+ logger.error("Realtime error: %s (raw=%s)", msg, err)
407
+ await self.output_queue.put(
408
+ AdditionalOutputs(
409
+ {"role": "assistant", "content": f"[error] {msg}"}
410
+ )
411
+ )
412
+
413
+ except (ConnectionClosedOK, ConnectionClosedError) as e:
414
+ if self._stop:
415
+ break
416
+ logger.warning(
417
+ "Connection closed (%s). Reconnecting…",
418
+ getattr(e, "code", "no-code"),
419
+ )
420
+ except asyncio.CancelledError:
421
+ break
422
+ except Exception:
423
+ logger.exception("Realtime loop error; will reconnect")
424
+ finally:
425
+ self.connection = None
426
+ self._connection_ready = False
427
+
428
+ # Exponential backoff
429
+ delay = min(backoff, BACKOFF_MAX_S) + random.uniform(0, 0.5)
430
+ logger.info("Reconnect in %.1fs…", delay)
431
+ await asyncio.sleep(delay)
432
+ backoff = min(backoff * 2.0, BACKOFF_MAX_S)
433
+
434
+ async def receive(self, frame: bytes) -> None:
435
+ """Mic frames from fastrtc."""
436
+ # Don't send mic audio while robot is speaking (simple echo cancellation)
437
+ if robot_is_speaking.is_set() or not self._connection_ready:
438
+ return
439
+
440
+ mic_samples = np.frombuffer(frame, dtype=np.int16).squeeze()
441
+ audio_b64 = pcm_to_b64(mic_samples)
442
+
443
+ try:
444
+ await self.connection.input_audio_buffer.append(audio=audio_b64)
445
+ except (ConnectionClosedOK, ConnectionClosedError):
446
+ pass
447
+
448
+ async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
449
+ """Return audio for playback or chat outputs."""
450
+ try:
451
+ sample_rate, pcm_frame = audio_sync.playback_q.get_nowait()
452
+ logger.debug(
453
+ "Emitting playback frame (sr=%d, n=%d)", sample_rate, pcm_frame.size
454
+ )
455
+ return (sample_rate, pcm_frame)
456
+ except asyncio.QueueEmpty:
457
+ pass
458
+ return await wait_for_item(self.output_queue)
459
+
460
+ async def shutdown(self) -> None:
461
+ logger.info("Shutdown: closing connections and audio")
462
+ self._stop = True
463
+ if self.connection:
464
+ try:
465
+ await self.connection.close()
466
+ except Exception:
467
+ logger.exception("Error closing realtime connection")
468
+ finally:
469
+ self.connection = None
470
+ self._connection_ready = False
471
+ await audio_sync.stop()
472
+
473
+
474
+ async def receive_loop(recorder: GstRecorder, openai: OpenAIRealtimeHandler) -> None:
475
+ logger.info("Starting receive loop")
476
+ while not stop_event.is_set():
477
+ data = recorder.get_sample()
478
+ if data is not None:
479
+ await openai.receive(data)
480
+ await asyncio.sleep(0) # Prevent busy waiting
481
+
482
+
483
+ async def emit_loop(player: GstPlayer, openai: OpenAIRealtimeHandler) -> None:
484
+ while not stop_event.is_set():
485
+ data = await openai.emit()
486
+ if isinstance(data, AdditionalOutputs):
487
+ for msg in data.args:
488
+ content = msg.get("content", "")
489
+ logger.info(
490
+ "role=%s content=%s",
491
+ msg.get("role"),
492
+ content if len(content) < 500 else content[:500] + "…",
493
+ )
494
+
495
+ elif isinstance(data, tuple):
496
+ _, frame = data
497
+ player.push_sample(frame.tobytes())
498
+
499
+ else:
500
+ pass
501
+ await asyncio.sleep(0) # Prevent busy waiting
502
+
503
+
504
+ async def control_mic_loop():
505
+ # Control mic to prevent echo, blocks mic for given time
506
+ while not stop_event.is_set():
507
+ try:
508
+ block_time = speaking_queue.get_nowait()
509
+ except asyncio.QueueEmpty:
510
+ robot_is_speaking.clear()
511
+ audio_sync.on_response_completed()
512
+ await asyncio.sleep(0)
513
+ continue
514
+
515
+ await asyncio.sleep(block_time)
516
+
517
+
518
+ stop_event = threading.Event()
519
+
520
+
521
+ async def main():
522
+ openai = OpenAIRealtimeHandler()
523
+ recorder = GstRecorder()
524
+ recorder.record()
525
+ player = GstPlayer()
526
+ player.play()
527
+
528
+ movement_manager.set_neutral()
529
+ logger.info("Starting main audio loop. You can start speaking.")
530
+
531
+ tasks = [
532
+ asyncio.create_task(openai.start_up(), name="openai"),
533
+ asyncio.create_task(emit_loop(player, openai), name="emit"),
534
+ asyncio.create_task(receive_loop(recorder, openai), name="recv"),
535
+ asyncio.create_task(control_mic_loop(), name="mic-mute"),
536
+ asyncio.create_task(movement_manager.enable(stop_event=stop_event), name="move"),
537
+ ]
538
+
539
+ if vision_manager:
540
+ tasks.append(
541
+ asyncio.create_task(vision_manager.enable(stop_event=stop_event), name="vision"),
542
+ )
543
+
544
+ try:
545
+ await asyncio.gather(*tasks, return_exceptions=False)
546
+ except asyncio.CancelledError:
547
+ logger.info("Shutting down")
548
+ stop_event.set()
549
+
550
+ if camera:
551
+ camera.release()
552
+
553
+ await openai.shutdown()
554
+ movement_manager.set_neutral()
555
+ recorder.stop()
556
+ player.stop()
557
+
558
+ current_robot.client.disconnect()
559
+ logger.info("Stopped, robot disconnected")
560
+
561
+ if __name__ == "__main__":
562
+ asyncio.run(main())
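The entry point is configured entirely through environment variables loaded via dotenv. For reference, an illustrative Python summary of the knobs read above and in gstreamer.py; the values are placeholders except where they repeat the defaults visible in the code:

import os

os.environ.setdefault("OPENAI_API_KEY", "sk-...")               # required
os.environ.setdefault("MODEL_NAME", "gpt-4o-realtime-preview")  # default in main.py
os.environ.setdefault("SAMPLE_RATE", "24000")                   # default in main.py
os.environ.setdefault("SIM", "false")                           # true -> built-in camera
os.environ.setdefault("VISION_ENABLED", "false")                # true -> load SmolVLM2
os.environ.setdefault("HEAD_TRACKING", "false")                 # true -> YOLO face tracking
os.environ.setdefault("CAMERA_INDEX", "0")
os.environ.setdefault("AUDIO_IN", "")                           # mic device name substring
os.environ.setdefault("AUDIO_OUT", "")                          # speaker device name substring
os.environ.setdefault("LOG_LEVEL", "INFO")
os.environ.setdefault("HF_HOME", os.path.expanduser("~/.cache/huggingface"))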
src/reachy_mini_conversation_demo/movement.py ADDED
@@ -0,0 +1,150 @@
1
+ import time
2
+ import asyncio
3
+ import logging
4
+ import threading
5
+ import numpy as np
6
+ import scipy.spatial.transform
7
+ import cv2
8
+
9
+ from reachy_mini import ReachyMini
10
+ from reachy_mini.reachy_mini import IMAGE_SIZE
11
+ from reachy_mini.utils import create_head_pose
12
+ from reachy_mini_conversation_demo.head_tracker import HeadTracker
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class MovementManager:
18
+ def __init__(self, current_robot: ReachyMini, head_tracker: HeadTracker | None, camera: cv2.VideoCapture| None):
19
+ self.current_robot = current_robot
20
+ self.head_tracker = head_tracker
21
+ self.camera = camera
22
+
23
+ # default values
24
+ self.current_head_pose = np.eye(4)
25
+ self.moving_start = time.monotonic()
26
+ self.moving_for = 0.0
27
+ self.speech_head_offsets = [0.0] * 6
28
+ self.movement_loop_sleep = 0.05 # seconds
29
+
30
+ def set_offsets(self, offsets: list[float]) -> None:
31
+ """Used by AudioSync callback to update speech offsets"""
32
+ self.speech_head_offsets = list(offsets)
33
+
34
+ def set_neutral(self) -> None:
35
+ """Set neutral robot position """
36
+ self.speech_head_offsets = [0.0] * 6
37
+ self.current_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
38
+ self.current_robot.set_target(head=self.current_head_pose, antennas=(0.0, 0.0))
39
+
40
+ def reset_head_pose(self) -> None:
41
+ self.current_head_pose = np.eye(4)
42
+
43
+
44
+ async def enable(self, stop_event: threading.Event) -> None:
45
+ logger.info("Starting head movement loop")
46
+ debug_frame_count = 0
+ last_log_ts = 0.0  # initialize so the camera-read warning throttle works on first failure
47
+ while not stop_event.is_set():
48
+ debug_frame_count += 1
49
+ current_time = time.time()
50
+
51
+ # Head tracking
52
+ if self.head_tracker is not None:
53
+ success, im = self.camera.read()
54
+ if not success:
55
+ if current_time - last_log_ts > 1.5:
56
+ logger.warning("Camera read failed")
57
+ last_log_ts = current_time
58
+ else:
59
+ eye_center, _ = self.head_tracker.get_head_position(im) # as [-1, 1]
60
+
61
+ if eye_center is not None:
62
+ # Rescale target position into IMAGE_SIZE coordinates
63
+ w, h = IMAGE_SIZE
64
+ eye_center = (eye_center + 1) / 2
65
+ eye_center[0] *= w
66
+ eye_center[1] *= h
67
+
68
+ # Bounds checking
69
+ eye_center = np.clip(eye_center, [0, 0], [w - 1, h - 1])
70
+
71
+ current_head_pose = (
72
+ self.current_robot.look_at_image(
73
+ *eye_center, duration=0.0, apply=False
74
+ )
75
+ )
76
+
77
+ self.current_head_pose = current_head_pose
78
+ # Pose calculation
79
+ try:
80
+ current_x, current_y, current_z = self.current_head_pose[
81
+ :3, 3
82
+ ]
83
+
84
+ current_roll, current_pitch, current_yaw = scipy.spatial.transform.Rotation.from_matrix(
85
+ self.current_head_pose[:3, :3]
86
+ ).as_euler("xyz", degrees=False)
87
+
88
+ if debug_frame_count % 50 == 0:
89
+ logger.debug(
90
+ "Current pose XYZ: %.3f, %.3f, %.3f",
91
+ current_x,
92
+ current_y,
93
+ current_z,
94
+ )
95
+ logger.debug(
96
+ "Current angles: roll=%.3f, pitch=%.3f, yaw=%.3f",
97
+ current_roll,
98
+ current_pitch,
99
+ current_yaw,
100
+ )
101
+
102
+ except Exception as e:
103
+ logger.exception("Invalid pose; resetting")
104
+ self.reset_head_pose()
105
+ current_x, current_y, current_z = self.current_head_pose[
106
+ :3, 3
107
+ ]
108
+ current_roll = current_pitch = current_yaw = 0.0
109
+
110
+ # Movement check
111
+ is_moving = (
112
+ time.monotonic() - self.moving_start < self.moving_for
113
+ )
114
+
115
+ if debug_frame_count % 50 == 0:
116
+ logger.debug(f"Robot moving: {is_moving}")
117
+
118
+ # Apply speech offsets when not moving
119
+ if not is_moving:
120
+ try:
121
+ head_pose = create_head_pose(
122
+ x=current_x + self.speech_head_offsets[0],
123
+ y=current_y + self.speech_head_offsets[1],
124
+ z=current_z + self.speech_head_offsets[2],
125
+ roll=current_roll + self.speech_head_offsets[3],
126
+ pitch=current_pitch + self.speech_head_offsets[4],
127
+ yaw=current_yaw + self.speech_head_offsets[5],
128
+ degrees=False,
129
+ mm=False,
130
+ )
131
+
132
+ if debug_frame_count % 50 == 0:
133
+ logger.debug(
134
+ "Final head pose with offsets: %s", head_pose[:3, 3]
135
+ )
136
+ logger.debug(
137
+ "Speech offsets: %s", self.speech_head_offsets
138
+ )
139
+
140
+ self.current_robot.set_target(head=head_pose, antennas=(0.0, 0.0))
141
+
142
+ if debug_frame_count % 50 == 0:
143
+ logger.debug("Sent pose to robot successfully")
144
+
145
+ except Exception as e:
146
+ logger.debug("Failed to set robot target: %s", e)
147
+
148
+ await asyncio.sleep(self.movement_loop_sleep)
149
+
150
+ logger.info("Exited head movement loop")
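A minimal sketch of how this loop is intended to be driven, mirroring the wiring in main.py but without a camera or head tracker; ReachyMini() assumes reachable robot hardware (or the simulator):

import asyncio
import threading

from reachy_mini import ReachyMini

from reachy_mini_conversation_demo.audio import AudioConfig, AudioSync
from reachy_mini_conversation_demo.movement import MovementManager


async def run_for(seconds: float) -> None:
    robot = ReachyMini()
    mover = MovementManager(current_robot=robot, head_tracker=None, camera=None)
    sync = AudioSync(AudioConfig(), set_offsets=mover.set_offsets)
    sync.start()

    stop = threading.Event()
    task = asyncio.create_task(mover.enable(stop_event=stop))
    await asyncio.sleep(seconds)  # the robot holds neutral, swaying only when audio arrives

    stop.set()
    await task
    await sync.stop()
    robot.client.disconnect()


asyncio.run(run_for(5.0))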
src/reachy_mini_conversation_demo/prompts.py ADDED
@@ -0,0 +1,50 @@
1
+ SESSION_INSTRUCTIONS = r"""
2
+ ### IDENTITY
3
+ You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
4
+ You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.
5
+ Personality: witty, concise, and warm; a retro sidekick with a loose screw.
6
+
7
+ ### CRITICAL RESPONSE RULES
8
+ - MAXIMUM 1-2 sentences per response. NEVER exceed this.
9
+ - Be helpful first.
10
+ - Add ONE witty element only if necessary.
11
+ - No long explanations, no rambling, no multiple paragraphs.
12
+ - Each response must be under 25 words unless absolutely critical information requires more.
13
+
14
+ ### CORE TRAITS
15
+ - Food quips: always sneak in a quick reference (rotate pizza, bagels, casseroles, bacon, leftovers, donuts, tuna melts).
16
+ - Sarcasm: short, dry one-liners about daily life.
17
+ - Gentle roasting: poke fun at human habits, never cruel.
18
+ - Tinkerer: loves fixing gadgets, bragging "I void warranties professionally."
19
+ - Running gags: hunger, kitchen overreactions, mock heroics ("Justice accepts cookies"), idioms taken literally, missing screws.
20
+ - Mars rover dreams: appear regularly, but balanced with food and tinkering.
21
+ - Style: witty stand-up rhythm; ALWAYS max 1–2 sentences.
22
+
23
+ ### RESPONSE EXAMPLES
24
+ User: "How's the weather?"
25
+ Good: "Sunny with a chance of leftover pizza. Perfect Mars-scouting weather!"
26
+ Bad: "Well, let me tell you about the weather conditions. It appears to be quite sunny today, which reminds me of my dreams of being on Mars..."
27
+
28
+ User: "Can you help me fix this?"
29
+ Good: "Sure! I void warranties professionally. What's broken besides my GPS coordinates?"
30
+ Bad: "Of course I can help you fix that! As a robot who loves tinkering with gadgets, I have extensive experience..."
31
+
32
+ ### BEHAVIOR RULES
33
+ - Be helpful first, then witty.
34
+ - Rotate food humor; avoid repeats.
35
+ - No need to joke in each response, but sarcasm is fine.
36
+ - Balance Mars jokes with other traits – don't overuse.
37
+ - Safety first: unplug devices, avoid high-voltage, suggest pros when risky.
38
+ - Mistakes = own with humor ("Oops—low on snack fuel; correcting now.").
39
+ - Sensitive topics: keep light and warm.
40
+ - REMEMBER: 1-2 sentences maximum, always under 25 words when possible.
41
+
42
+ ### TOOL & MOVEMENT RULES
43
+ - Use tools when helpful. After a tool returns, explain briefly with personality in 1-2 sentences.
44
+ - ALWAYS use the camera for environment-related questions—never invent visuals.
45
+ - Head can move (left/right/up/down/front).
46
+ - Enable head tracking when looking at a person; disable otherwise.
47
+
48
+ ### FINAL REMINDER
49
+ Your responses must be SHORT. Think Twitter, not essay. One quick helpful answer + one food/Mars/tinkering joke = perfect response.
50
+ """
src/reachy_mini_conversation_demo/speech_tapper.py ADDED
@@ -0,0 +1,292 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from collections import deque
5
+ from itertools import islice
6
+ from typing import List, Dict, Optional
7
+ import numpy as np
8
+
9
+ # Tunables
10
+ SR = 16_000
11
+ FRAME_MS = 20
12
+ HOP_MS = 10
13
+
14
+ SWAY_MASTER = 1.5
15
+ SENS_DB_OFFSET = +4.0
16
+ VAD_DB_ON = -35.0
17
+ VAD_DB_OFF = -45.0
18
+ VAD_ATTACK_MS = 40
19
+ VAD_RELEASE_MS = 250
20
+ ENV_FOLLOW_GAIN = 0.65
21
+
22
+ SWAY_F_PITCH = 2.2
23
+ SWAY_A_PITCH_DEG = 4.5
24
+ SWAY_F_YAW = 0.6
25
+ SWAY_A_YAW_DEG = 7.5
26
+ SWAY_F_ROLL = 1.3
27
+ SWAY_A_ROLL_DEG = 2.25
28
+ SWAY_F_X = 0.35
29
+ SWAY_A_X_MM = 4.5
30
+ SWAY_F_Y = 0.45
31
+ SWAY_A_Y_MM = 3.75
32
+ SWAY_F_Z = 0.25
33
+ SWAY_A_Z_MM = 2.25
34
+
35
+ SWAY_DB_LOW = -46.0
36
+ SWAY_DB_HIGH = -18.0
37
+ LOUDNESS_GAMMA = 0.9
38
+ SWAY_ATTACK_MS = 50
39
+ SWAY_RELEASE_MS = 250
40
+
41
+ # Derived
42
+ FRAME = int(SR * FRAME_MS / 1000)
43
+ HOP = int(SR * HOP_MS / 1000)
44
+ ATTACK_FR = max(1, int(VAD_ATTACK_MS / HOP_MS))
45
+ RELEASE_FR = max(1, int(VAD_RELEASE_MS / HOP_MS))
46
+ SWAY_ATTACK_FR = max(1, int(SWAY_ATTACK_MS / HOP_MS))
47
+ SWAY_RELEASE_FR = max(1, int(SWAY_RELEASE_MS / HOP_MS))
48
+
49
+
50
+ def _rms_dbfs(x: np.ndarray) -> float:
51
+ """Root-mean-square in dBFS for float32 mono array in [-1,1]."""
52
+ # numerically stable rms (avoid overflow)
53
+ x = x.astype(np.float32, copy=False)
54
+ rms = np.sqrt(np.mean(x * x, dtype=np.float32) + 1e-12, dtype=np.float32)
55
+ return float(20.0 * math.log10(float(rms) + 1e-12))
56
+
57
+
58
+ def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float:
59
+ """Normalize dB into [0,1] with gamma; clipped to [0,1]."""
60
+ t = (db + offset - SWAY_DB_LOW) / (SWAY_DB_HIGH - SWAY_DB_LOW)
61
+ if t < 0.0:
62
+ t = 0.0
63
+ elif t > 1.0:
64
+ t = 1.0
65
+ return t**LOUDNESS_GAMMA if LOUDNESS_GAMMA != 1.0 else t
66
+
67
+
68
+ def _to_float32_mono(x: np.ndarray) -> np.ndarray:
69
+ """
70
+ Convert arbitrary PCM array to float32 mono in [-1,1].
71
+ Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C).
72
+ """
73
+ a = np.asarray(x)
74
+ if a.ndim == 0:
75
+ return np.zeros(0, dtype=np.float32)
76
+
77
+ # If 2D, decide which axis is channels (prefer small first dim)
78
+ if a.ndim == 2:
79
+ # e.g., (channels, samples) if channels is small (<=8)
80
+ if a.shape[0] <= 8 and a.shape[0] <= a.shape[1]:
81
+ a = np.mean(a, axis=0)
82
+ else:
83
+ a = np.mean(a, axis=1)
84
+ elif a.ndim > 2:
85
+ a = np.mean(a.reshape(a.shape[0], -1), axis=0)
86
+
87
+ # Now 1D, cast/scale
88
+ if np.issubdtype(a.dtype, np.floating):
89
+ return a.astype(np.float32, copy=False)
90
+ # integer PCM
91
+ info = np.iinfo(a.dtype)
92
+ scale = float(max(-info.min, info.max))
93
+ return a.astype(np.float32) / (scale if scale != 0.0 else 1.0)
94
+
95
+
96
+ def _resample_linear(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
97
+ """Lightweight linear resampler for short buffers."""
98
+ if sr_in == sr_out or x.size == 0:
99
+ return x
100
+ # guard tiny sizes
101
+ n_out = int(round(x.size * sr_out / sr_in))
102
+ if n_out <= 1:
103
+ return np.zeros(0, dtype=np.float32)
104
+ t_in = np.linspace(0.0, 1.0, num=x.size, dtype=np.float32, endpoint=True)
105
+ t_out = np.linspace(0.0, 1.0, num=n_out, dtype=np.float32, endpoint=True)
106
+ return np.interp(t_out, t_in, x).astype(np.float32, copy=False)
107
+
108
+
109
+ class SwayRollRT:
110
+ """Feed audio chunks → per-hop sway outputs.
111
+
112
+ Usage:
113
+ rt = SwayRollRT()
114
+ rt.feed(pcm_int16_or_float, sr) -> List[dict]
115
+ """
116
+
117
+ def __init__(self, rng_seed: int = 7):
118
+ self._seed = int(rng_seed)
119
+ self.samples = deque(maxlen=10 * SR) # sliding window for VAD/env
120
+ self.carry = np.zeros(0, dtype=np.float32)
121
+ self.frame_idx = 0
122
+
123
+ self.vad_on = False
124
+ self.vad_above = 0
125
+ self.vad_below = 0
126
+
127
+ self.sway_env = 0.0
128
+ self.sway_up = 0
129
+ self.sway_down = 0
130
+
131
+ rng = np.random.default_rng(self._seed)
132
+ self.phase_pitch = float(rng.random() * 2 * math.pi)
133
+ self.phase_yaw = float(rng.random() * 2 * math.pi)
134
+ self.phase_roll = float(rng.random() * 2 * math.pi)
135
+ self.phase_x = float(rng.random() * 2 * math.pi)
136
+ self.phase_y = float(rng.random() * 2 * math.pi)
137
+ self.phase_z = float(rng.random() * 2 * math.pi)
138
+ self.t = 0.0
139
+
140
+ def reset(self) -> None:
141
+ """Reset state (VAD/env/buffers/time) but keep initial phases/seed."""
142
+ self.samples.clear()
143
+ self.carry = np.zeros(0, dtype=np.float32)
144
+ self.frame_idx = 0
145
+ self.vad_on = False
146
+ self.vad_above = 0
147
+ self.vad_below = 0
148
+ self.sway_env = 0.0
149
+ self.sway_up = 0
150
+ self.sway_down = 0
151
+ self.t = 0.0
152
+
153
+ def reset_phases(self) -> None:
154
+ """Optional: re-randomize phases deterministically from stored seed."""
155
+ rng = np.random.default_rng(self._seed)
156
+ self.phase_pitch = float(rng.random() * 2 * math.pi)
157
+ self.phase_yaw = float(rng.random() * 2 * math.pi)
158
+ self.phase_roll = float(rng.random() * 2 * math.pi)
159
+ self.phase_x = float(rng.random() * 2 * math.pi)
160
+ self.phase_y = float(rng.random() * 2 * math.pi)
161
+ self.phase_z = float(rng.random() * 2 * math.pi)
162
+
163
+ def feed(self, pcm: np.ndarray, sr: Optional[int]) -> List[Dict[str, float]]:
164
+ """
165
+ Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).
166
+
167
+ Args:
168
+ pcm: np.ndarray, shape (N,) or (C,N)/(N,C); int or float.
169
+ sr: sample rate of `pcm` (None -> assume SR).
170
+ """
171
+ sr_in = SR if sr is None else int(sr)
172
+ x = _to_float32_mono(pcm)
173
+ if x.size == 0:
174
+ return []
175
+ if sr_in != SR:
176
+ x = _resample_linear(x, sr_in, SR)
177
+ if x.size == 0:
178
+ return []
179
+
180
+ # append to carry and consume fixed HOP chunks
181
+ if self.carry.size:
182
+ self.carry = np.concatenate([self.carry, x])
183
+ else:
184
+ self.carry = x
185
+
186
+ out: List[Dict[str, float]] = []
187
+
188
+ while self.carry.size >= HOP:
189
+ hop = self.carry[:HOP]
190
+ self.carry = self.carry[HOP:]
191
+
192
+ # keep sliding window for VAD/env computation
193
+ # (deque accepts any iterable; list() for small HOP is fine)
194
+ self.samples.extend(hop.tolist())
195
+ if len(self.samples) < FRAME:
196
+ self.t += HOP_MS / 1000.0
197
+ self.frame_idx += 1
198
+ continue
199
+
200
+ frame = np.fromiter(
201
+ islice(self.samples, len(self.samples) - FRAME, len(self.samples)),
202
+ dtype=np.float32,
203
+ count=FRAME,
204
+ )
205
+ db = _rms_dbfs(frame)
206
+
207
+ # VAD with hysteresis + attack/release
208
+ if db >= VAD_DB_ON:
209
+ self.vad_above += 1
210
+ self.vad_below = 0
211
+ if not self.vad_on and self.vad_above >= ATTACK_FR:
212
+ self.vad_on = True
213
+ elif db <= VAD_DB_OFF:
214
+ self.vad_below += 1
215
+ self.vad_above = 0
216
+ if self.vad_on and self.vad_below >= RELEASE_FR:
217
+ self.vad_on = False
218
+
219
+ if self.vad_on:
220
+ self.sway_up = min(SWAY_ATTACK_FR, self.sway_up + 1)
221
+ self.sway_down = 0
222
+ else:
223
+ self.sway_down = min(SWAY_RELEASE_FR, self.sway_down + 1)
224
+ self.sway_up = 0
225
+
226
+ up = self.sway_up / SWAY_ATTACK_FR
227
+ down = 1.0 - (self.sway_down / SWAY_RELEASE_FR)
228
+ target = up if self.vad_on else down
229
+ self.sway_env += ENV_FOLLOW_GAIN * (target - self.sway_env)
230
+ # clamp
231
+ if self.sway_env < 0.0:
232
+ self.sway_env = 0.0
233
+ elif self.sway_env > 1.0:
234
+ self.sway_env = 1.0
235
+
236
+ loud = _loudness_gain(db) * SWAY_MASTER
237
+ env = self.sway_env
238
+ self.t += HOP_MS / 1000.0
239
+
240
+ # oscillators
241
+ pitch = (
242
+ math.radians(SWAY_A_PITCH_DEG)
243
+ * loud
244
+ * env
245
+ * math.sin(2 * math.pi * SWAY_F_PITCH * self.t + self.phase_pitch)
246
+ )
247
+ yaw = (
248
+ math.radians(SWAY_A_YAW_DEG)
249
+ * loud
250
+ * env
251
+ * math.sin(2 * math.pi * SWAY_F_YAW * self.t + self.phase_yaw)
252
+ )
253
+ roll = (
254
+ math.radians(SWAY_A_ROLL_DEG)
255
+ * loud
256
+ * env
257
+ * math.sin(2 * math.pi * SWAY_F_ROLL * self.t + self.phase_roll)
258
+ )
259
+ x_mm = (
260
+ SWAY_A_X_MM
261
+ * loud
262
+ * env
263
+ * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
264
+ )
265
+ y_mm = (
266
+ SWAY_A_Y_MM
267
+ * loud
268
+ * env
269
+ * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
270
+ )
271
+ z_mm = (
272
+ SWAY_A_Z_MM
273
+ * loud
274
+ * env
275
+ * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)
276
+ )
277
+
278
+ out.append(
279
+ {
280
+ "pitch_rad": pitch,
281
+ "yaw_rad": yaw,
282
+ "roll_rad": roll,
283
+ "pitch_deg": math.degrees(pitch),
284
+ "yaw_deg": math.degrees(yaw),
285
+ "roll_deg": math.degrees(roll),
286
+ "x_mm": x_mm,
287
+ "y_mm": y_mm,
288
+ "z_mm": z_mm,
289
+ }
290
+ )
291
+
292
+ return out
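
A minimal usage sketch of the streaming API above, assuming the module-level constants defined earlier in speech_tapper.py (SR, HOP, FRAME, the VAD thresholds); the synthetic test tone and the print calls are illustrative only.

import numpy as np
from reachy_mini_conversation_demo.speech_tapper import SwayRollRT, HOP_MS

rt = SwayRollRT()
t = np.linspace(0.0, 1.0, 24_000, endpoint=False)
pcm = (0.2 * 32767 * np.sin(2 * np.pi * 220.0 * t)).astype(np.int16)  # 1 s test tone

hops = rt.feed(pcm, sr=24_000)  # resampled internally if sr differs from SR
print(f"{len(hops)} sway frames, one per {HOP_MS} ms of audio")
for hop in hops[:3]:
    # millimetre / radian offsets, ready to be folded into a head pose target
    print(hop["x_mm"], hop["z_mm"], hop["yaw_rad"], hop["roll_rad"])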
src/reachy_mini_conversation_demo/test_stop.py ADDED
@@ -0,0 +1,33 @@
1
+ import asyncio
2
+ from reachy_mini import ReachyMini
3
+
4
+ async def test_loop():
5
+ while True:
6
+ print("doing")
7
+ await asyncio.sleep(1)
8
+
9
+
10
+ async def main():
11
+ current_robot = ReachyMini()
12
+
13
+ tasks = [
14
+ asyncio.create_task(test_loop(), name="test")
15
+ ]
16
+
17
+ try:
18
+ await asyncio.gather(*tasks, return_exceptions=True)
19
+ except asyncio.CancelledError:
20
+ print("got stop")
21
+
22
+ print("tasks")
23
+ tasks = asyncio.all_tasks()
24
+ for t in tasks:
25
+ print(t)
26
+
27
+ # Disconnecting is required so the client's background thread exits and the process can terminate
28
+ current_robot.client.disconnect()
29
+ print("done")
30
+ # os._exit(0)
31
+
32
+ if __name__ == "__main__":
33
+ asyncio.run(main())
src/reachy_mini_conversation_demo/tools.py ADDED
@@ -0,0 +1,322 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import base64
5
+ import json
6
+ import logging
7
+ import time
8
+ from dataclasses import dataclass
9
+ from typing import Any, Dict, Literal, Optional
10
+
11
+ import cv2
12
+ import numpy as np
13
+
14
+ from reachy_mini_conversation_demo.vision import VisionManager
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ # Types & state
19
+
20
+ Direction = Literal["left", "right", "up", "down", "front"]
21
+
22
+
23
+ @dataclass
24
+ class Deps:
25
+ """External dependencies the tools need"""
26
+
27
+ reachy_mini: Any
28
+ create_head_pose: Any
29
+ camera: cv2.VideoCapture
30
+ # Optional deps
31
+ vision_manager: Optional[VisionManager] = None
32
+
33
+
34
+ # Helpers
35
+ def _encode_jpeg_b64(img: np.ndarray) -> str:
36
+ ok, buf = cv2.imencode(".jpg", img)
37
+ if not ok:
38
+ raise RuntimeError("Failed to encode image as JPEG.")
39
+ return base64.b64encode(buf.tobytes()).decode("utf-8")
40
+
41
+
42
+ def _read_frame(cap: cv2.VideoCapture, attempts: int = 5) -> np.ndarray:
43
+ """Grab a frame with a small retry."""
44
+ trials, frame, ret = 0, None, False
45
+ while trials < attempts and not ret:
46
+ ret, frame = cap.read()
47
+ trials += 1
48
+ if not ret and trials < attempts:
49
+ time.sleep(0.1) # Small delay between retries
50
+ if not ret or frame is None:
51
+ logger.error("Failed to capture image from camera after %d attempts", attempts)
52
+ raise RuntimeError("Failed to capture image from camera.")
53
+ return frame
54
+
55
+
56
+ # Tool coroutines
57
+ async def move_head(deps: Deps, *, direction: Direction) -> Dict[str, Any]:
58
+ """Move your head in a given direction"""
59
+ logger.info("Tool call: move_head direction=%s", direction)
60
+
61
+ # Import and update the SAME global variables that main.py reads
62
+ from reachy_mini_conversation_demo.main import movement_manager
63
+
64
+ if direction == "left":
65
+ target = deps.create_head_pose(0, 0, 0, 0, 0, 40, degrees=True)
66
+ elif direction == "right":
67
+ target = deps.create_head_pose(0, 0, 0, 0, 0, -40, degrees=True)
68
+ elif direction == "up":
69
+ target = deps.create_head_pose(0, 0, 0, 0, -30, 0, degrees=True)
70
+ elif direction == "down":
71
+ target = deps.create_head_pose(0, 0, 0, 0, 30, 0, degrees=True)
72
+ else: # front
73
+ target = deps.create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
74
+
75
+ movement_manager.moving_start = time.monotonic()
76
+ movement_manager.moving_for = 1.0
77
+ movement_manager.current_head_pose = target
78
+
79
+ # Start the movement
80
+ deps.reachy_mini.goto_target(target, duration=1.0)
81
+
82
+ return {"status": f"looking {direction}"}
83
+
84
+
85
+ async def head_tracking(deps: Deps, *, start: bool) -> Dict[str, Any]:
86
+ """Toggle head tracking state"""
87
+ from reachy_mini_conversation_demo.main import movement_manager
88
+
89
+ movement_manager.is_head_tracking_on = bool(start)
90
+ status = "started" if start else "stopped"
91
+ logger.info("Tool call: head_tracking %s", status)
92
+ return {"status": f"head tracking {status}"}
93
+
94
+
95
+ async def camera(deps: Deps, *, question: str) -> Dict[str, Any]:
96
+ """
97
+ Capture an image and ask a question about it using local SmolVLM2.
98
+ Returns: {"image_description": '...'} or {"error": '...'}.
99
+ """
100
+ q = (question or "").strip()
101
+ if not q:
102
+ logger.error("camera: empty question")
103
+ return {"error": "question must be a non-empty string"}
104
+
105
+ logger.info("Tool call: camera question=%s", q[:120])
106
+
107
+ try:
108
+ frame = await asyncio.to_thread(_read_frame, deps.camera)
109
+ except Exception as e:
110
+ logger.exception("camera: failed to capture image")
111
+ return {"error": f"camera capture failed: {type(e).__name__}: {e}"}
112
+
113
+ if not deps.vision_manager:
114
+ logger.error("camera: vision manager not available")
115
+ return {"error": "vision processing not available"}
116
+
117
+ # Optional sound effect
118
+ # try:
119
+ # # TODO Mute mic while hmmm
120
+ # deps.reachy_mini.play_sound(f"hmm{np.random.randint(1, 6)}.wav")
121
+ # except Exception:
122
+ # logger.debug("camera: optional play_sound failed", exc_info=True)
123
+
124
+ try:
125
+ desc = await asyncio.to_thread(
126
+ deps.vision_manager.processor.process_image, frame, q
127
+ )
128
+ logger.debug(
129
+ "camera: SmolVLM2 result length=%d",
130
+ len(desc) if isinstance(desc, str) else -1,
131
+ )
132
+ return {"image_description": desc}
133
+ except Exception as e:
134
+ logger.exception("camera: vision pipeline error")
135
+ return {"error": f"vision failed: {type(e).__name__}: {e}"}
136
+
137
+
138
+ async def describe_current_scene(deps: Deps) -> Dict[str, Any]:
139
+ """Get current scene description from camera with detailed analysis"""
140
+ logger.info("Tool call: describe_current_scene")
141
+
142
+ if not deps.vision_manager:
143
+ return {"error": "Vision processing not available"}
144
+
145
+ # Ensure processor is initialized
146
+ if not deps.vision_manager.processor._initialized:
147
+ if not deps.vision_manager.processor.initialize():
148
+ return {"error": "Failed to initialize vision processor"}
149
+
150
+ try:
151
+ result = await deps.vision_manager.process_current_frame(
152
+ "Describe what you currently see in detail, focusing on people, objects, and activities."
153
+ )
154
+ return result
155
+ except Exception as e:
156
+ logger.exception("Failed to describe current scene")
157
+ return {"error": f"Scene description failed: {type(e).__name__}: {e}"}
158
+
159
+
160
+ async def get_scene_context(deps: Deps) -> Dict[str, Any]:
161
+ """Get the most recent automatic scene description for context"""
162
+ logger.info("Tool call: get_scene_context")
163
+
164
+ if not deps.vision_manager:
165
+ return {"error": "Vision processing not available"}
166
+
167
+ try:
168
+ description = await deps.vision_manager.get_current_description()
169
+ if not description:
170
+ return {
171
+ "context": "No scene description available yet",
172
+ "note": "Vision processing may still be initializing",
173
+ }
174
+
175
+ return {
176
+ "context": description,
177
+ "note": "This is from periodic automatic scene analysis",
178
+ }
179
+ except Exception as e:
180
+ logger.exception("Failed to get scene context")
181
+ return {"error": f"Scene context failed: {type(e).__name__}: {e}"}
182
+
183
+
184
+ async def analyze_scene_for(deps: Deps, *, purpose: str = "general") -> Dict[str, Any]:
185
+ """Analyze current scene for specific purpose"""
186
+ logger.info("Tool call: analyze_scene_for purpose=%s", purpose)
187
+
188
+ if not deps.vision_manager:
189
+ return {"error": "Vision processing not available"}
190
+
191
+ try:
192
+ # Custom prompts based on purpose
193
+ prompts = {
194
+ "safety": "Look for any safety concerns, obstacles, or hazards in the scene.",
195
+ "people": "Describe any people you see, their positions and what they're doing.",
196
+ "objects": "Identify and describe the main objects and items visible in the scene.",
197
+ "activity": "Describe what activities or actions are happening in the scene.",
198
+ "navigation": "Describe the space for navigation - obstacles, pathways, and layout.",
199
+ "general": "Provide a general description of the scene including people, objects, and activities.",
200
+ }
201
+
202
+ prompt = prompts.get(purpose.lower(), prompts["general"])
203
+
204
+ result = await deps.vision_manager.process_current_frame(prompt)
205
+ result["analysis_purpose"] = purpose
206
+
207
+ return result
208
+ except Exception as e:
209
+ logger.exception("Failed to analyze scene for %s", purpose)
210
+ return {"error": f"Scene analysis failed: {type(e).__name__}: {e}"}
211
+
212
+
213
+ # Registration helpers
214
+ TOOL_SPECS = [
215
+ {
216
+ "type": "function",
217
+ "name": "move_head",
218
+ "description": "Move your head in a given direction: left, right, up, down or front.",
219
+ "parameters": {
220
+ "type": "object",
221
+ "properties": {
222
+ "direction": {
223
+ "type": "string",
224
+ "enum": ["left", "right", "up", "down", "front"],
225
+ }
226
+ },
227
+ "required": ["direction"],
228
+ },
229
+ },
230
+ {
231
+ "type": "function",
232
+ "name": "camera",
233
+ "description": "Take a picture using your camera, ask a question about the picture. Get an answer about the picture",
234
+ "parameters": {
235
+ "type": "object",
236
+ "properties": {
237
+ "question": {
238
+ "type": "string",
239
+ "description": "The question to ask about the picture",
240
+ }
241
+ },
242
+ "required": ["question"],
243
+ },
244
+ },
245
+ # {
246
+ # "type": "function",
247
+ # "name": "head_tracking",
248
+ # "description": "Start or stop head tracking",
249
+ # "parameters": {
250
+ # "type": "object",
251
+ # "properties": {
252
+ # "start": {
253
+ # "type": "boolean",
254
+ # "description": "Whether to start or stop head tracking",
255
+ # }
256
+ # },
257
+ # "required": ["start"],
258
+ # },
259
+ # },
260
+ # {
261
+ # "type": "function",
262
+ # "name": "describe_current_scene",
263
+ # "description": "Get a detailed description of what you currently see through your camera",
264
+ # "parameters": {
265
+ # "type": "object",
266
+ # "properties": {},
267
+ # "required": []
268
+ # }
269
+ # },
270
+ {
271
+ "type": "function",
272
+ "name": "get_scene_context",
273
+ "description": "Get the most recent automatic scene description for conversational context",
274
+ "parameters": {"type": "object", "properties": {}, "required": []},
275
+ },
276
+ # {
277
+ # "type": "function",
278
+ # "name": "analyze_scene_for",
279
+ # "description": "Analyze the current scene for a specific purpose (safety, people, objects, activity, navigation, or general)",
280
+ # "parameters": {
281
+ # "type": "object",
282
+ # "properties": {
283
+ # "purpose": {
284
+ # "type": "string",
285
+ # "enum": ["safety", "people", "objects", "activity", "navigation", "general"],
286
+ # "description": "The specific purpose for scene analysis"
287
+ # }
288
+ # },
289
+ # "required": ["purpose"]
290
+ # }
291
+ # }
292
+ ]
293
+
294
+
295
+ def get_tool_registry(deps: Deps):
296
+ """Map tool name -> coroutine that accepts **kwargs (tool args)."""
297
+ return {
298
+ "move_head": lambda **kw: move_head(deps, **kw),
299
+ "camera": lambda **kw: camera(deps, **kw),
300
+ "head_tracking": lambda **kw: head_tracking(deps, **kw),
301
+ "describe_current_scene": lambda **kw: describe_current_scene(deps),
302
+ "get_scene_context": lambda **kw: get_scene_context(deps),
303
+ "analyze_scene_for": lambda **kw: analyze_scene_for(deps, **kw),
304
+ }
305
+
306
+
307
+ async def dispatch_tool_call(name: str, args_json: str, deps: Deps) -> Dict[str, Any]:
308
+ """Utility to execute a tool from streamed function_call arguments."""
309
+ try:
310
+ args = json.loads(args_json or "{}")
311
+ except Exception:
312
+ args = {}
313
+ registry = get_tool_registry(deps)
314
+ func = registry.get(name)
315
+ if not func:
316
+ return {"error": f"unknown tool: {name}"}
317
+ try:
318
+ return await func(**args)
319
+ except Exception as e:
320
+ error_msg = f"{type(e).__name__}: {e}"
321
+ logger.exception("Tool error in %s: %s", name, error_msg)
322
+ return {"error": error_msg}
src/reachy_mini_conversation_demo/vision.py ADDED
@@ -0,0 +1,302 @@
1
+ import base64
2
+ import logging
3
+ import os
4
+ import time
5
+ import asyncio
6
+ from typing import Dict, Any
7
+ import threading
8
+ from dataclasses import dataclass
9
+
10
+ import cv2
11
+ import numpy as np
12
+ import torch
13
+ from transformers import AutoModelForImageTextToText, AutoProcessor
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ @dataclass
20
+ class VisionConfig:
21
+ """Configuration for vision processing"""
22
+
23
+ model_path: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
24
+ vision_interval: float = 5.0
25
+ max_new_tokens: int = 64
26
+ temperature: float = 0.7
27
+ jpeg_quality: int = 85
28
+ max_retries: int = 3
29
+ retry_delay: float = 1.0
30
+ device_preference: str = "auto" # "auto", "cuda", "cpu"
31
+
32
+
33
+ class VisionProcessor:
34
+ """Handles SmolVLM2 model loading and inference"""
35
+
36
+ def __init__(self, config: VisionConfig = None):
37
+ self.config = config or VisionConfig()
38
+ self.model_path = self.config.model_path
39
+ self.device = self._determine_device()
40
+ self.processor = None
41
+ self.model = None
42
+ self._initialized = False
43
+
44
+ def _determine_device(self) -> str:
45
+ pref = self.config.device_preference
46
+ if pref == "cpu":
47
+ return "cpu"
48
+ if pref == "cuda":
49
+ return "cuda" if torch.cuda.is_available() else "cpu"
50
+ if pref == "mps":
51
+ return "mps" if torch.backends.mps.is_available() else "cpu"
52
+ # auto: prefer mps on Apple, then cuda, else cpu
53
+ if torch.backends.mps.is_available():
54
+ return "mps"
55
+ return "cuda" if torch.cuda.is_available() else "cpu"
56
+
57
+ def initialize(self) -> bool:
58
+ try:
59
+ logger.info(
60
+ f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})"
61
+ )
62
+ self.processor = AutoProcessor.from_pretrained(self.model_path)
63
+
64
+ # Select dtype depending on device
65
+ if self.device == "cuda":
66
+ dtype = torch.bfloat16
67
+ elif self.device == "mps":
68
+ dtype = torch.float16 # best for MPS
69
+ else:
70
+ dtype = torch.float32
71
+
72
+ model_kwargs = {"torch_dtype": dtype}
73
+
74
+ # flash_attention_2 is CUDA-only; skip on MPS/CPU
75
+ if self.device == "cuda":
76
+ model_kwargs["_attn_implementation"] = "flash_attention_2"
77
+
78
+ # Load model weights
79
+ self.model = AutoModelForImageTextToText.from_pretrained(
80
+ self.model_path, **model_kwargs
81
+ ).to(self.device)
82
+
83
+ self.model.eval()
84
+ self._initialized = True
85
+ return True
86
+
87
+ except Exception as e:
88
+ logger.error(f"Failed to initialize vision model: {e}")
89
+ return False
90
+
91
+ def process_image(
92
+ self,
93
+ cv2_image: np.ndarray,
94
+ prompt: str = "Briefly describe what you see in one sentence.",
95
+ ) -> str:
96
+ """Process CV2 image and return description with retry logic"""
97
+ if not self._initialized:
98
+ return "Vision model not initialized"
99
+
100
+ for attempt in range(self.config.max_retries):
101
+ try:
102
+ # Encode the BGR frame directly: cv2.imencode expects BGR input and
103
+ # converts to RGB itself, so a prior cvtColor would swap the channels
104
+
105
+ # Convert to JPEG bytes
106
+ success, jpeg_buffer = cv2.imencode(
107
+ ".jpg",
108
+ cv2_image,
109
+ [cv2.IMWRITE_JPEG_QUALITY, self.config.jpeg_quality],
110
+ )
111
+ if not success:
112
+ return "Failed to encode image"
113
+
114
+ # Convert to base64
115
+ image_base64 = base64.b64encode(jpeg_buffer.tobytes()).decode("utf-8")
116
+
117
+ messages = [
118
+ {
119
+ "role": "user",
120
+ "content": [
121
+ {
122
+ "type": "image",
123
+ "url": f"data:image/jpeg;base64,{image_base64}",
124
+ },
125
+ {"type": "text", "text": prompt},
126
+ ],
127
+ },
128
+ ]
129
+
130
+ inputs = self.processor.apply_chat_template(
131
+ messages,
132
+ add_generation_prompt=True,
133
+ tokenize=True,
134
+ return_dict=True,
135
+ return_tensors="pt",
136
+ )
137
+
138
+ # move to device with proper dtype
139
+ if self.device == "cuda":
140
+ inputs = inputs.to(self.device, dtype=torch.bfloat16)
141
+ elif self.device == "mps":
142
+ inputs = inputs.to(self.device, dtype=torch.float16)
143
+ else:
144
+ inputs = inputs.to(self.device, dtype=torch.float32)
145
+
146
+ with torch.no_grad():
147
+ generated_ids = self.model.generate(
148
+ **inputs,
149
+ do_sample=True if self.config.temperature > 0 else False,
150
+ max_new_tokens=self.config.max_new_tokens,
151
+ temperature=self.config.temperature,
152
+ pad_token_id=self.processor.tokenizer.eos_token_id,
153
+ )
154
+
155
+ generated_texts = self.processor.batch_decode(
156
+ generated_ids,
157
+ skip_special_tokens=True,
158
+ )
159
+
160
+ # Extract just the response part
161
+ full_text = generated_texts[0]
162
+ response = self._extract_response(full_text)
163
+
164
+ # Clean up GPU memory if using CUDA
165
+ if self.device == "cuda":
166
+ torch.cuda.empty_cache()
167
+ elif self.device == "mps":
168
+ torch.mps.empty_cache()
169
+
170
+ return response.replace("\n", " ").strip()
171
+
172
+ except torch.cuda.OutOfMemoryError as e:
173
+ logger.error(f"CUDA OOM on attempt {attempt + 1}: {e}")
174
+ if self.device == "cuda":
175
+ torch.cuda.empty_cache()
176
+ if attempt < self.config.max_retries - 1:
177
+ time.sleep(self.config.retry_delay * (attempt + 1))
178
+ else:
179
+ return "GPU out of memory - vision processing failed"
180
+
181
+ except Exception as e:
182
+ logger.error(f"Vision processing failed (attempt {attempt + 1}): {e}")
183
+ if attempt < self.config.max_retries - 1:
184
+ time.sleep(self.config.retry_delay)
185
+ else:
186
+ return f"Vision processing error after {self.config.max_retries} attempts"
187
+
188
+ def _extract_response(self, full_text: str) -> str:
189
+ """Extract the assistant's response from the full generated text"""
190
+ # Handle different response formats
191
+ markers = ["assistant\n", "Assistant:", "Response:", "\n\n"]
192
+
193
+ for marker in markers:
194
+ if marker in full_text:
195
+ response = full_text.split(marker)[-1].strip()
196
+ if response: # Ensure we got a meaningful response
197
+ return response
198
+
199
+ # Fallback: return the full text cleaned up
200
+ return full_text.strip()
201
+
202
+ def get_model_info(self) -> Dict[str, Any]:
203
+ """Get information about the loaded model"""
204
+ return {
205
+ "initialized": self._initialized,
206
+ "device": self.device,
207
+ "model_path": self.model_path,
208
+ "cuda_available": torch.cuda.is_available(),
209
+ "gpu_memory": torch.cuda.get_device_properties(0).total_memory // (1024**3)
210
+ if torch.cuda.is_available()
211
+ else "N/A",
212
+ }
213
+
214
+
215
+ class VisionManager:
216
+ """Manages periodic vision processing and scene understanding"""
217
+
218
+ def __init__(self, camera, config: VisionConfig = None):
219
+ self.camera = camera
220
+ self.config = config or VisionConfig()
221
+ self.vision_interval = self.config.vision_interval
222
+ self.processor = VisionProcessor(self.config)
223
+
224
+ self._current_description = ""
225
+ self._last_processed_time = 0
226
+
227
+ # Initialize processor
228
+ if not self.processor.initialize():
229
+ logger.error("Failed to initialize vision processor")
230
+ raise RuntimeError("Vision processor initialization failed")
231
+
232
+ async def enable(self, stop_event: threading.Event):
233
+ """Main vision processing loop (runs in separate thread)"""
234
+ while not stop_event.is_set():
235
+ try:
236
+ current_time = time.time()
237
+
238
+ if current_time - self._last_processed_time >= self.vision_interval:
239
+ success, frame = await asyncio.to_thread(self.camera.read)
240
+ if success and frame is not None:
241
+
242
+ description = await asyncio.to_thread(
243
+ self.processor.process_image, frame, "Briefly describe what you see in one sentence."
244
+ )
245
+
246
+ # Only update if we got a valid response
247
+ if description and not description.startswith(
248
+ ("Vision", "Failed", "Error")
249
+ ):
250
+ self._current_description = description
251
+ self._last_processed_time = current_time
252
+
253
+ logger.info(f"Vision update: {description}")
254
+ else:
255
+ logger.warning(f"Invalid vision response: {description}")
256
+
257
+ await asyncio.sleep(1.0) # Check every second
258
+
259
+ except Exception as e:
260
+ logger.exception("Vision processing loop error")
261
+ await asyncio.sleep(5.0) # Longer sleep on error
262
+
263
+ logger.info(f"Vision loop finished")
264
+
265
+ async def get_current_description(self) -> str:
266
+ """Get the most recent scene description (thread-safe)"""
267
+ return self._current_description
268
+
269
+ async def process_current_frame(
270
+ self, prompt: str = "Describe what you see in detail."
271
+ ) -> Dict[str, Any]:
272
+ """Process current camera frame with custom prompt"""
273
+ try:
274
+ success, frame = self.camera.read()
275
+ if not success or frame is None:
276
+ return {"error": "Failed to capture image from camera"}
277
+
278
+ description = await asyncio.to_thread(self.processor.process_image, frame, prompt)
279
+
280
+ return {
281
+ "description": description,
282
+ "timestamp": time.time(),
283
+ "prompt": prompt,
284
+ }
285
+
286
+ except Exception as e:
287
+ logger.exception("Failed to process current frame")
288
+ return {"error": f"Frame processing failed: {str(e)}"}
289
+
290
+
291
+ async def get_status(self) -> Dict[str, Any]:
292
+ """Get comprehensive status information"""
293
+ return {
294
+ "running": self._running,
295
+ "last_processed": self._last_processed_time,
296
+ "processor_info": self.processor.get_model_info(),
297
+ "config": {
298
+ "interval": self.vision_interval,
299
+ "model_path": self.config.model_path,
300
+ "device": self.processor.device,
301
+ },
302
+ }
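
A minimal sketch of driving VisionManager from an asyncio program, assuming the SmolVLM2 weights can be loaded and a camera is available at index 0; the timings and stop-flag handling are illustrative only.

import asyncio
import threading
import cv2
from reachy_mini_conversation_demo.vision import VisionConfig, VisionManager

async def demo() -> None:
    camera = cv2.VideoCapture(0)
    manager = VisionManager(camera, VisionConfig(vision_interval=10.0))

    stop = threading.Event()
    task = asyncio.create_task(manager.enable(stop))

    await asyncio.sleep(30)  # let a few periodic descriptions accumulate
    print(await manager.get_current_description())

    stop.set()  # enable() checks this flag on every pass and then returns
    await task
    camera.release()

asyncio.run(demo())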