Commit 77a9cb2 (committed by Alina Lozovskaya)
Parents: af50a23 46f3109

Merge origin/develop into branch: resolve audio/moves conflicts and sync deps
.env.example CHANGED
@@ -5,4 +5,7 @@ MODEL_NAME="gpt-4o-realtime-preview-2025-06-03"
 OPENAI_VISION_MODEL="gpt-4.1-mini"
 
 # Cache for local VLM
-HF_HOME=./cache
+HF_HOME=./cache
+
+# Hugging Face token for accessing datasets/models
+HF_TOKEN=
README.md CHANGED
@@ -3,10 +3,14 @@
 Working repo, we should turn this into a ReachyMini app at some point maybe ?
 
 ## Installation
+You can set up the project quickly using [uv](https://docs.astral.sh/uv/):
 
 ```bash
-pip install -e .
+uv venv --python 3.12.1 # Create a virtual environment with Python 3.12.1
+source .venv/bin/activate
+uv sync
 ```
+> Note: The `pyproject.toml` expects `reachy-mini-dances-library` to be located in the same directory as this project.
 
 ## Run
 
pyproject.toml CHANGED
@@ -10,44 +10,78 @@ description = ""
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-    # "reachy_mini@git+ssh://git@github.com/pollen-robotics/reachy_mini@develop",
-    "openai",
+    "aiortc>=1.13.0",
     "fastrtc",
+    "gradio>=5.49.0",
+    "huggingface_hub>=0.34.4",
+    "mediapipe>=0.10.14",
+    "num2words",
     "onnxruntime",
-    "PyGObject>=3.42.2, <=3.46.0",
+    "opencv-python>=4.12.0.88",
+    "openai>=2.1",
+    "PyGObject>=3.42.2,<=3.46.0",
+    "python-dotenv",
+    "reachy_mini_dances_library",
+    "reachy_mini_toolbox",
+    "reachy_mini",
+    "supervision",
     "torch",
     "transformers",
-    "num2words",
-    "dotenv",
     "ultralytics",
-    "supervision",
-    "reachy_mini_toolbox@git+ssh://git@github.com/pollen-robotics/reachy_mini_toolbox@main",
-    "reachy_mini_dances_library@git+ssh://git@github.com/pollen-robotics/reachy_mini_dances_library@main"
 ]
 
+[project.scripts]
+reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
 
-[project.optional-dependencies]
+[dependency-groups]
 dev = ["pytest", "ruff==0.12.0"]
 
-[project.scripts]
-reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
+[tool.uv.sources]
+reachy_mini_dances_library = { path = "../reachy_mini_dances_library", editable = true }
+reachy_mini = { git = "ssh://git@github.com/pollen-robotics/reachy_mini.git", branch = "develop" }
+reachy_mini_toolbox = { git = "ssh://git@github.com/pollen-robotics/reachy_mini_toolbox.git", branch = "main" }
+fastrtc = { git = "ssh://git@github.com/gradio-app/fastrtc.git", branch = "main" }
 
 [tool.setuptools]
 package-dir = { "" = "src" }
 include-package-data = true
 
-
 [tool.setuptools.packages.find]
 where = ["src"]
 
-
 [tool.setuptools.package-data]
-reachy_mini = ["**/*"]  # Includes all non-.py files
+reachy_mini_conversation_demo = ["images/*"]
 
 [tool.ruff]
-exclude = []
-lint.extend-select = ["I", "D"]
-lint.ignore = [
-    "D203",  # Incompatible with D211
-    "D213",  # Incompatible with D212
+line-length = 119
+exclude = [".venv", "dist", "build", "**/__pycache__", "*.egg-info", ".mypy_cache", ".pytest_cache"]
+
+[tool.ruff.lint]
+select = [
+    "E",   # pycodestyle errors
+    "F",   # pyflakes
+    "W",   # pycodestyle warnings
+    "I",   # isort
+    "C4",  # flake8-comprehensions
+    "D",   # pydocstyle
 ]
+ignore = [
+    "E501",  # handled by formatter
+    "D100",  # ignore missing module docstrings
+    "D203",  # blank line before class docstring (conflicts with D211)
+    "D213",  # summary on second line (conflicts with D212)
+]
+
+[tool.ruff.lint.isort]
+length-sort = true
+lines-after-imports = 2
+no-lines-before = ["standard-library", "local-folder"]
+known-local-folder = ["reachy_mini_conversation_demo"]
+known-first-party = ["reachy_mini", "reachy_mini_dances_library", "reachy_mini_toolbox"]
+split-on-trailing-comma = true
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
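
The new `[project.scripts]` table wires the `reachy-mini-conversation-demo` command to `reachy_mini_conversation_demo.main:main`. As a rough illustration (a sketch, not part of the commit), running the console script is equivalent to importing and calling that function directly once the package is installed (for example via `uv sync`):

```python
# Sketch: invoke the console-script target by hand.
# Assumes the package is installed so the import resolves.
from reachy_mini_conversation_demo.main import main

if __name__ == "__main__":
    main()  # same function the `reachy-mini-conversation-demo` command runs
```
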
src/reachy_mini_conversation_demo/__init__.py CHANGED
@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
src/reachy_mini_conversation_demo/audio/__init__.py CHANGED
@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
src/reachy_mini_conversation_demo/audio/gstreamer.py CHANGED
@@ -1,12 +1,13 @@
-import logging  # noqa: D100
-from threading import Thread
+import logging
 from typing import Optional
+from threading import Thread
 
 import gi
 
+
 gi.require_version("Gst", "1.0")
 gi.require_version("GstApp", "1.0")
-from gi.repository import GLib, Gst  # noqa: E402
+from gi.repository import Gst, GLib  # noqa: E402
 
 
 class GstPlayer:
@@ -25,18 +26,16 @@ class GstPlayer:
         self.appsrc = Gst.ElementFactory.make("appsrc", None)
         self.appsrc.set_property("format", Gst.Format.TIME)
         self.appsrc.set_property("is-live", True)
-        caps = Gst.Caps.from_string(
-            f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved"
-        )
+        caps = Gst.Caps.from_string(f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved")
         self.appsrc.set_property("caps", caps)
         queue = Gst.ElementFactory.make("queue")
         audioconvert = Gst.ElementFactory.make("audioconvert")
         audioresample = Gst.ElementFactory.make("audioresample")
 
         # Try to pin specific output device; fallback to autoaudiosink
-        audiosink = _create_device_element(
-            direction="sink", name_substr=device_name
-        ) or Gst.ElementFactory.make("autoaudiosink")
+        audiosink = _create_device_element(direction="sink", name_substr=device_name) or Gst.ElementFactory.make(
+            "autoaudiosink"
+        )
 
         self.pipeline.add(self.appsrc)
         self.pipeline.add(queue)
@@ -104,9 +103,9 @@ class GstRecorder:
         self.pipeline = Gst.Pipeline.new("audio_recorder")
 
         # Create elements: try specific mic; fallback to default
-        autoaudiosrc = _create_device_element(
-            direction="source", name_substr=device_name
-        ) or Gst.ElementFactory.make("autoaudiosrc", None)
+        autoaudiosrc = _create_device_element(direction="source", name_substr=device_name) or Gst.ElementFactory.make(
+            "autoaudiosrc", None
+        )
 
         queue = Gst.ElementFactory.make("queue", None)
         audioconvert = Gst.ElementFactory.make("audioconvert", None)
@@ -117,9 +116,7 @@ class GstRecorder:
             raise RuntimeError("Failed to create GStreamer elements")
 
         # Force mono/S16LE at 24000; resample handles device SR (e.g., 16000 → 24000)
-        caps = Gst.Caps.from_string(
-            f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE"
-        )
+        caps = Gst.Caps.from_string(f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE")
         self.appsink.set_property("caps", caps)
 
         # Build pipeline
@@ -183,9 +180,7 @@ class GstRecorder:
         logger.info("Stopped Recorder")
 
 
-def _create_device_element(
-    direction: str, name_substr: Optional[str]
-) -> Optional[Gst.Element]:
+def _create_device_element(direction: str, name_substr: Optional[str]) -> Optional[Gst.Element]:
     """direction: 'source' or 'sink'.
 
     name_substr: case-insensitive substring matching device display name/description.
@@ -205,30 +200,15 @@ def _create_device_element(
         for dev in monitor.get_devices() or []:
             disp = dev.get_display_name() or ""
             props = dev.get_properties()
-            desc = (
-                props.get_string("device.description")
-                if props and props.has_field("device.description")
-                else ""
-            )
+            desc = props.get_string("device.description") if props and props.has_field("device.description") else ""
            logger.info(f"Device candidate: disp='{disp}', desc='{desc}'")
 
-            if (
-                name_substr.lower() in disp.lower()
-                or name_substr.lower() in desc.lower()
-            ):
+            if name_substr.lower() in disp.lower() or name_substr.lower() in desc.lower():
                 elem = dev.create_element(None)
-                factory = (
-                    elem.get_factory().get_name()
-                    if elem and elem.get_factory()
-                    else "<?>"
-                )
-                logger.info(
-                    f"Using {direction} device: '{disp or desc}' (factory='{factory}')"
-                )
+                factory = elem.get_factory().get_name() if elem and elem.get_factory() else "<?>"
+                logger.info(f"Using {direction} device: '{disp or desc}' (factory='{factory}')")
                 return elem
     finally:
         monitor.stop()
-    logging.getLogger(__name__).warning(
-        "Requested %s '%s' not found; using auto*", direction, name_substr
-    )
+    logging.getLogger(__name__).warning("Requested %s '%s' not found; using auto*", direction, name_substr)
    return None
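
The caps one-liners collapsed above build a plain GStreamer caps string from the configured sample rate. A minimal standalone sketch with an assumed 24 kHz rate (no GStreamer import is needed just to see the value):

```python
# Illustrative only: the caps string GstPlayer builds for mono S16LE audio.
sample_rate = 24000  # assumed here; the real value is passed in by the caller
caps_str = f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved"
print(caps_str)  # audio/x-raw,format=S16LE,channels=1,rate=24000,layout=interleaved
```
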
src/reachy_mini_conversation_demo/audio/head_wobbler.py CHANGED
@@ -1,16 +1,17 @@
 """Moves head given audio samples."""
 
+import time
+import queue
 import base64
 import logging
-import queue
 import threading
-import time
 from typing import Optional, Tuple
 
 import numpy as np
 
 from reachy_mini_conversation_demo.audio.speech_tapper import HOP_MS, SwayRollRT
 
+
 SAMPLE_RATE = 24000
 MOVEMENT_LATENCY_S = 0.08  # seconds between audio and robot movement
 logger = logging.getLogger(__name__)
@@ -25,9 +26,7 @@ class HeadWobbler:
         self._base_ts: Optional[float] = None
         self._hops_done: int = 0
 
-        self.audio_queue: queue.Queue[
-            Tuple[int, int, np.ndarray]
-        ] = queue.Queue()
+        self.audio_queue: queue.Queue[Tuple[int, int, np.ndarray]] = queue.Queue()
         self.sway = SwayRollRT()
 
         # Synchronization primitives
@@ -50,14 +49,14 @@ class HeadWobbler:
         self._stop_event.clear()
         self._thread = threading.Thread(target=self.working_loop, daemon=True)
         self._thread.start()
-        logger.info("Head wobbler started")
+        logger.debug("Head wobbler started")
 
     def stop(self) -> None:
         """Stop the head wobbler loop."""
         self._stop_event.set()
         if self._thread is not None:
             self._thread.join()
-        logger.info("Head wobbler stopped")
+        logger.debug("Head wobbler stopped")
 
     def working_loop(self) -> None:
         """Convert audio deltas into head movement offsets."""
src/reachy_mini_conversation_demo/audio/speech_tapper.py CHANGED
@@ -1,12 +1,12 @@
-from __future__ import annotations  # noqa: D100
-
+from __future__ import annotations
 import math
-from collections import deque
-from itertools import islice
 from typing import Dict, List, Optional
+from itertools import islice
+from collections import deque
 
 import numpy as np
 
+
 # Tunables
 SR = 16_000
 FRAME_MS = 20
@@ -68,7 +68,7 @@ def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float:
 
 def _to_float32_mono(x: np.ndarray) -> np.ndarray:
     """Convert arbitrary PCM array to float32 mono in [-1,1].
-
+
     Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C).
     """
     a = np.asarray(x)
@@ -258,24 +258,9 @@ class SwayRollRT:
                 * env
                 * math.sin(2 * math.pi * SWAY_F_ROLL * self.t + self.phase_roll)
             )
-            x_mm = (
-                SWAY_A_X_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
-            )
-            y_mm = (
-                SWAY_A_Y_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
-            )
-            z_mm = (
-                SWAY_A_Z_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)
-            )
+            x_mm = SWAY_A_X_MM * loud * env * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
+            y_mm = SWAY_A_Y_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
+            z_mm = SWAY_A_Z_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)
 
             out.append(
                 {
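
The reformatted sway one-liners all share the shape amplitude * loudness * envelope * sinusoid. A standalone sketch with made-up tunables (the real SWAY_A_* / SWAY_F_* values are the constants defined at the top of speech_tapper.py):

```python
import math

# Hypothetical tunables; the repo defines its own SWAY_A_X_MM / SWAY_F_X constants.
SWAY_A_X_MM, SWAY_F_X = 2.0, 0.8             # amplitude in mm, frequency in Hz (assumed)
loud, env, t, phase_x = 0.7, 1.0, 0.25, 0.0  # loudness gain, envelope, time, phase

x_mm = SWAY_A_X_MM * loud * env * math.sin(2 * math.pi * SWAY_F_X * t + phase_x)
print(f"x sway: {x_mm:.3f} mm")
```
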
src/reachy_mini_conversation_demo/camera_worker.py CHANGED
@@ -6,16 +6,18 @@ Ported from main_works.py camera_worker() function to provide:
 - Latest frame always available for tools
 """
 
+import time
 import logging
 import threading
-import time
-from typing import Optional, Tuple
+from typing import Tuple, Optional
 
 import cv2
 import numpy as np
+from scipy.spatial.transform import Rotation as R
+
 from reachy_mini import ReachyMini
 from reachy_mini.utils.interpolation import linear_pose_interpolation
-from scipy.spatial.transform import Rotation as R
+
 
 logger = logging.getLogger(__name__)
 
@@ -83,14 +85,14 @@ class CameraWorker:
         self._stop_event.clear()
         self._thread = threading.Thread(target=self.working_loop, daemon=True)
         self._thread.start()
-        logger.info("Camera worker started")
+        logger.debug("Camera worker started")
 
     def stop(self) -> None:
         """Stop the camera worker loop."""
         self._stop_event.set()
         if self._thread is not None:
             self._thread.join()
-        logger.info("Camera worker stopped")
+        logger.debug("Camera worker stopped")
 
     def working_loop(self) -> None:
         """Enable the camera worker loop.
@@ -114,17 +116,10 @@ class CameraWorker:
             self.latest_frame = frame  # .copy()
 
             # Check if face tracking was just disabled
-            if (
-                self.previous_head_tracking_state
-                and not self.is_head_tracking_enabled
-            ):
+            if self.previous_head_tracking_state and not self.is_head_tracking_enabled:
                 # Face tracking was just disabled - start interpolation to neutral
-                self.last_face_detected_time = (
-                    current_time  # Trigger the face-lost logic
-                )
-                self.interpolation_start_time = (
-                    None  # Will be set by the face-lost interpolation
-                )
+                self.last_face_detected_time = current_time  # Trigger the face-lost logic
+                self.interpolation_start_time = None  # Will be set by the face-lost interpolation
                 self.interpolation_start_pose = None
 
             # Update tracking state
@@ -137,9 +132,7 @@ class CameraWorker:
                 if eye_center is not None:
                     # Face detected - immediately switch to tracking
                     self.last_face_detected_time = current_time
-                    self.interpolation_start_time = (
-                        None  # Stop any interpolation
-                    )
+                    self.interpolation_start_time = None  # Stop any interpolation
 
                     # Convert normalized coordinates to pixel coordinates
                     h, w, _ = frame.shape
@@ -159,9 +152,7 @@ class CameraWorker:
 
                     # Extract translation and rotation from the target pose directly
                     translation = target_pose[:3, 3]
-                    rotation = R.from_matrix(target_pose[:3, :3]).as_euler(
-                        "xyz", degrees=False
-                    )
+                    rotation = R.from_matrix(target_pose[:3, :3]).as_euler("xyz", degrees=False)
 
                     # Thread-safe update of face tracking offsets (use pose as-is)
                     with self.face_tracking_lock:
@@ -176,19 +167,14 @@ class CameraWorker:
 
                 else:
                     # No face detected while tracking enabled - set face lost timestamp
-                    if (
-                        self.last_face_detected_time is None
-                        or self.last_face_detected_time == current_time
-                    ):
+                    if self.last_face_detected_time is None or self.last_face_detected_time == current_time:
                         # Only update if we haven't already set a face lost time
                         # (current_time check prevents overriding the disable-triggered timestamp)
                         pass
 
            # Handle smooth interpolation (works for both face-lost and tracking-disabled cases)
            if self.last_face_detected_time is not None:
-                time_since_face_lost = (
-                    current_time - self.last_face_detected_time
-                )
+                time_since_face_lost = current_time - self.last_face_detected_time
 
                if time_since_face_lost >= self.face_lost_delay:
                    # Start interpolation if not already started
@@ -197,27 +183,17 @@ class CameraWorker:
                     # Capture current pose as start of interpolation
                     with self.face_tracking_lock:
                         current_translation = self.face_tracking_offsets[:3]
-                        current_rotation_euler = self.face_tracking_offsets[
-                            3:
-                        ]
+                        current_rotation_euler = self.face_tracking_offsets[3:]
                     # Convert to 4x4 pose matrix
                     self.interpolation_start_pose = np.eye(4)
-                    self.interpolation_start_pose[:3, 3] = (
-                        current_translation
-                    )
-                    self.interpolation_start_pose[:3, :3] = (
-                        R.from_euler(
-                            "xyz", current_rotation_euler
-                        ).as_matrix()
-                    )
+                    self.interpolation_start_pose[:3, 3] = current_translation
+                    self.interpolation_start_pose[:3, :3] = R.from_euler(
+                        "xyz", current_rotation_euler
+                    ).as_matrix()
 
                 # Calculate interpolation progress (t from 0 to 1)
-                elapsed_interpolation = (
-                    current_time - self.interpolation_start_time
-                )
-                t = min(
-                    1.0, elapsed_interpolation / self.interpolation_duration
-                )
+                elapsed_interpolation = current_time - self.interpolation_start_time
+                t = min(1.0, elapsed_interpolation / self.interpolation_duration)
 
                 # Interpolate between current pose and neutral pose
                 interpolated_pose = linear_pose_interpolation(
@@ -226,9 +202,7 @@ class CameraWorker:
 
                 # Extract translation and rotation from interpolated pose
                 translation = interpolated_pose[:3, 3]
-                rotation = R.from_matrix(
-                    interpolated_pose[:3, :3]
-                ).as_euler("xyz", degrees=False)
+                rotation = R.from_matrix(interpolated_pose[:3, :3]).as_euler("xyz", degrees=False)
 
                 # Thread-safe update of face tracking offsets
                 with self.face_tracking_lock:
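
The collapsed rotation one-liners above rely on SciPy to turn the 3x3 block of a 4x4 pose into Euler angles, exactly as the worker does for its 6-element offset vector. A minimal sketch with a toy pose (assumed values, not taken from the repo):

```python
import numpy as np
from scipy.spatial.transform import Rotation as R

# Toy 4x4 head pose: identity orientation, 5 cm translation along x (assumed).
pose = np.eye(4)
pose[:3, 3] = [0.05, 0.0, 0.0]

translation = pose[:3, 3]                                              # metres
rotation = R.from_matrix(pose[:3, :3]).as_euler("xyz", degrees=False)  # radians
print(np.concatenate([translation, rotation]))  # the 6 offsets the worker stores
```
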
src/reachy_mini_conversation_demo/config.py CHANGED
@@ -1,7 +1,8 @@
-import os  # noqa: D100
+import os
 
 from dotenv import load_dotenv
 
+
 load_dotenv()
 
 
@@ -15,7 +16,7 @@ def getenv_bool(key: str, default: bool = False) -> bool:
 
 class Config:
     """Configuration class for the conversation demo."""
-
+
     # Required
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
     if not OPENAI_API_KEY:
@@ -25,6 +26,7 @@ class Config:
     MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-realtime-preview")
     OPENAI_VISION_MODEL = os.getenv("OPENAI_VISION_MODEL", "gpt-4.1-mini")
     HF_HOME = os.getenv("HF_HOME", "./cache")
+    HF_TOKEN = os.getenv("HF_TOKEN")  # Optional, falls back to hf auth login if not set
 
 
 config = Config()
src/reachy_mini_conversation_demo/dance_emotion_moves.py CHANGED
@@ -5,15 +5,16 @@ and executed sequentially by the MovementManager.
 """
 
 from __future__ import annotations
-
 import logging
 from typing import Tuple
 
 import numpy as np
+
 from reachy_mini.motion.move import Move
 from reachy_mini.motion.recorded_move import RecordedMoves
 from reachy_mini_dances_library.dance_move import DanceMove
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -30,9 +31,7 @@ class DanceQueueMove(Move):
         """Duration property required by official Move interface."""
         return self.dance_move.duration
 
-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate dance move at time t."""
         try:
             # Get the pose from the dance move
@@ -45,9 +44,7 @@ class DanceQueueMove(Move):
             return (head_pose, antennas, body_yaw)
 
         except Exception as e:
-            logger.error(
-                f"Error evaluating dance move '{self.move_name}' at t={t}: {e}"
-            )
+            logger.error(f"Error evaluating dance move '{self.move_name}' at t={t}: {e}")
             # Return neutral pose on error
             from reachy_mini.utils import create_head_pose
 
@@ -68,9 +65,7 @@ class EmotionQueueMove(Move):
         """Duration property required by official Move interface."""
         return self.emotion_move.duration
 
-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate emotion move at time t."""
         try:
             # Get the pose from the emotion move
@@ -83,9 +78,7 @@ class EmotionQueueMove(Move):
             return (head_pose, antennas, body_yaw)
 
         except Exception as e:
-            logger.error(
-                f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}"
-            )
+            logger.error(f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}")
             # Return neutral pose on error
             from reachy_mini.utils import create_head_pose
 
@@ -120,9 +113,7 @@ class GotoQueueMove(Move):
         """Duration property required by official Move interface."""
         return self._duration
 
-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate goto move at time t using linear interpolation."""
         try:
             from reachy_mini.utils import create_head_pose
@@ -138,32 +129,23 @@ class GotoQueueMove(Move):
                 start_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
 
             # Interpolate head pose
-            head_pose = linear_pose_interpolation(
-                start_pose, self.target_head_pose, t_clamped
-            )
+            head_pose = linear_pose_interpolation(start_pose, self.target_head_pose, t_clamped)
 
             # Interpolate antennas - return as numpy array
             antennas = np.array(
                 [
-                    self.start_antennas[0]
-                    + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
-                    self.start_antennas[1]
-                    + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
+                    self.start_antennas[0] + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
+                    self.start_antennas[1] + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
                 ]
             )
 
             # Interpolate body yaw
-            body_yaw = (
-                self.start_body_yaw
-                + (self.target_body_yaw - self.start_body_yaw) * t_clamped
-            )
+            body_yaw = self.start_body_yaw + (self.target_body_yaw - self.start_body_yaw) * t_clamped
 
             return (head_pose, antennas, body_yaw)
 
         except Exception as e:
             logger.error(f"Error evaluating goto move at t={t}: {e}")
             # Return target pose on error - convert antennas to numpy array
-            target_antennas_array = np.array(
-                [self.target_antennas[0], self.target_antennas[1]]
-            )
+            target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]])
             return (self.target_head_pose, target_antennas_array, self.target_body_yaw)
src/reachy_mini_conversation_demo/main.py CHANGED
@@ -5,17 +5,17 @@ import os
 import gradio as gr
 from fastapi import FastAPI
 from fastrtc import Stream
-from reachy_mini import ReachyMini
 
-from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
+from reachy_mini import ReachyMini
 from reachy_mini_conversation_demo.moves import MovementManager
-from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
 from reachy_mini_conversation_demo.tools import ToolDependencies
 from reachy_mini_conversation_demo.utils import (
-    handle_vision_stuff,
     parse_args,
     setup_logger,
+    handle_vision_stuff,
 )
+from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
+from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
 
 
 def update_chatbot(chatbot: list[dict], response: dict):
@@ -50,7 +50,7 @@ def main():
         head_wobbler=head_wobbler,
     )
     current_file_path = os.path.dirname(os.path.abspath(__file__))
-    logger.info(f"Current file absolute path: {current_file_path}")
+    logger.debug(f"Current file absolute path: {current_file_path}")
     chatbot = gr.Chatbot(
         type="messages",
         resizable=True,
@@ -59,7 +59,7 @@ def main():
             os.path.join(current_file_path, "images", "reachymini_avatar.png"),
         ),
     )
-    logger.info(f"Chatbot avatar images: {chatbot.avatar_images}")
+    logger.debug(f"Chatbot avatar images: {chatbot.avatar_images}")
 
     handler = OpenaiRealtimeHandler(deps)
     stream = Stream(
src/reachy_mini_conversation_demo/moves.py CHANGED
@@ -32,33 +32,33 @@ Safety
 """
 
 from __future__ import annotations
-
+import time
 import logging
 import threading
-import time
+from typing import Tuple, Optional
 from collections import deque
 from dataclasses import dataclass
 from queue import Empty, Queue
 from typing import Any, Optional, Tuple
 
 import numpy as np
+
 from reachy_mini import ReachyMini
-from reachy_mini.motion.move import Move
 from reachy_mini.utils import create_head_pose
+from reachy_mini.motion.move import Move
 from reachy_mini.utils.interpolation import (
     compose_world_offset,
     linear_pose_interpolation,
 )
 
+
 logger = logging.getLogger(__name__)
 
 # Configuration constants
 CONTROL_LOOP_FREQUENCY_HZ = 100.0  # Hz - Target frequency for the movement control loop
 
 # Type definitions
-FullBodyPose = Tuple[
-    np.ndarray, Tuple[float, float], float
-]  # (head_pose_4x4, antennas, body_yaw)
+FullBodyPose = Tuple[np.ndarray, Tuple[float, float], float]  # (head_pose_4x4, antennas, body_yaw)
 
 
 class BreathingMove(Move):
@@ -97,9 +97,7 @@ class BreathingMove(Move):
         """Duration property required by official Move interface."""
         return float("inf")  # Continuous breathing (never ends naturally)
 
-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate breathing move at time t."""
         if t < self.interpolation_duration:
             # Phase 1: Interpolate to neutral base position
@@ -112,35 +110,26 @@ class BreathingMove(Move):
 
             # Interpolate antennas
             antennas = (
-                (1 - interpolation_t) * self.interpolation_start_antennas
-                + interpolation_t * self.neutral_antennas
-            )
+                1 - interpolation_t
+            ) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas
 
         else:
             # Phase 2: Breathing patterns from neutral base
             breathing_time = t - self.interpolation_duration
 
             # Gentle z-axis breathing
-            z_offset = self.breathing_z_amplitude * np.sin(
-                2 * np.pi * self.breathing_frequency * breathing_time
-            )
-            head_pose = create_head_pose(
-                x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False
-            )
+            z_offset = self.breathing_z_amplitude * np.sin(2 * np.pi * self.breathing_frequency * breathing_time)
+            head_pose = create_head_pose(x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False)
 
             # Antenna sway (opposite directions)
-            antenna_sway = self.antenna_sway_amplitude * np.sin(
-                2 * np.pi * self.antenna_frequency * breathing_time
-            )
+            antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
             antennas = np.array([antenna_sway, -antenna_sway])
 
         # Return in official Move interface format: (head_pose, antennas_array, body_yaw)
         return (head_pose, antennas, 0.0)
 
 
-def combine_full_body(
-    primary_pose: FullBodyPose, secondary_pose: FullBodyPose
-) -> FullBodyPose:
+def combine_full_body(primary_pose: FullBodyPose, secondary_pose: FullBodyPose) -> FullBodyPose:
     """Combine primary and secondary full body poses.
 
     Args:
@@ -157,9 +146,7 @@ def combine_full_body(
     # Combine head poses using compose_world_offset; the secondary pose must be an
    # offset expressed in the world frame (T_off_world) applied to the absolute
    # primary transform (T_abs).
-    combined_head = compose_world_offset(
-        primary_head, secondary_head, reorthonormalize=True
-    )
+    combined_head = compose_world_offset(primary_head, secondary_head, reorthonormalize=True)
 
     # Sum antennas and body_yaw
     combined_antennas = (
@@ -288,9 +275,7 @@ class MovementManager:
         self._stop_event = threading.Event()
         self._thread: Optional[threading.Thread] = None
         self._is_listening = False
-        self._last_commanded_pose: FullBodyPose = clone_full_body_pose(
-            self.state.last_primary_pose
-        )
+        self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
         self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
         self._antenna_unfreeze_blend = 1.0
         self._antenna_blend_duration = 0.4  # seconds to blend back after listening
@@ -348,9 +333,7 @@ class MovementManager:
         """
         self._command_queue.put(("clear_queue", None))
 
-    def set_speech_offsets(
-        self, offsets: Tuple[float, float, float, float, float, float]
-    ) -> None:
+    def set_speech_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
         """Update speech-induced secondary offsets (x, y, z, roll, pitch, yaw).
 
         Offsets are interpreted as metres for translation and radians for
@@ -506,8 +489,7 @@ class MovementManager:
         """Manage the primary move queue (sequential execution)."""
         if self.state.current_move is None or (
             self.state.move_start_time is not None
-            and current_time - self.state.move_start_time
-            >= self.state.current_move.duration
+            and current_time - self.state.move_start_time >= self.state.current_move.duration
         ):
             self.state.current_move = None
             self.state.move_start_time = None
@@ -516,12 +498,8 @@ class MovementManager:
             self.state.current_move = self.move_queue.popleft()
             self.state.move_start_time = current_time
             # Any real move cancels breathing mode flag
-            self._breathing_active = isinstance(
-                self.state.current_move, BreathingMove
-            )
-            logger.info(
-                f"Starting new move, duration: {self.state.current_move.duration}s"
-            )
+            self._breathing_active = isinstance(self.state.current_move, BreathingMove)
+            logger.info(f"Starting new move, duration: {self.state.current_move.duration}s")
 
     def _manage_breathing(self, current_time: float) -> None:
         """Manage automatic breathing when idle."""
@@ -553,18 +531,13 @@ class MovementManager:
                 self._breathing_active = False
                 logger.error("Failed to start breathing: %s", e)
 
-        if (
-            isinstance(self.state.current_move, BreathingMove)
-            and self.move_queue
-        ):
+        if isinstance(self.state.current_move, BreathingMove) and self.move_queue:
             self.state.current_move = None
             self.state.move_start_time = None
             self._breathing_active = False
             logger.info("Stopping breathing due to new move activity")
 
-        if self.state.current_move is not None and not isinstance(
-            self.state.current_move, BreathingMove
-        ):
+        if self.state.current_move is not None and not isinstance(self.state.current_move, BreathingMove):
             self._breathing_active = False
 
     def _get_primary_pose(self, current_time: float) -> FullBodyPose:
@@ -595,20 +568,14 @@ class MovementManager:
         else:
             # Otherwise reuse the last primary pose so we avoid jumps between moves
             self.state.is_playing_move = False
-            self.state.is_moving = (
-                current_time - self.state.moving_start < self.state.moving_for
-            )
+            self.state.is_moving = current_time - self.state.moving_start < self.state.moving_for
 
             if self.state.last_primary_pose is not None:
-                primary_full_body_pose = clone_full_body_pose(
-                    self.state.last_primary_pose
-                )
+                primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
             else:
                 neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
                 primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
-                self.state.last_primary_pose = clone_full_body_pose(
-                    primary_full_body_pose
-                )
+                self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
 
         return primary_full_body_pose
 
@@ -647,9 +614,7 @@ class MovementManager:
         self._manage_move_queue(current_time)
         self._manage_breathing(current_time)
 
-    def _calculate_blended_antennas(
-        self, target_antennas: Tuple[float, float]
-    ) -> Tuple[float, float]:
+    def _calculate_blended_antennas(self, target_antennas: Tuple[float, float]) -> Tuple[float, float]:
         """Blend target antennas with listening freeze state and update blending."""
         now = self._now()
         listening = self._is_listening
@@ -669,10 +634,8 @@ class MovementManager:
         else:
             new_blend = min(1.0, blend + dt / blend_duration)
             antennas_cmd = (
-                listening_antennas[0] * (1.0 - new_blend)
-                + target_antennas[0] * new_blend,
-                listening_antennas[1] * (1.0 - new_blend)
-                + target_antennas[1] * new_blend,
+                listening_antennas[0] * (1.0 - new_blend) + target_antennas[0] * new_blend,
+                listening_antennas[1] * (1.0 - new_blend) + target_antennas[1] * new_blend,
             )
 
         if listening:
@@ -687,9 +650,7 @@ class MovementManager:
 
         return antennas_cmd
 
-    def _issue_control_command(
-        self, head: np.ndarray, antennas: Tuple[float, float], body_yaw: float
-    ) -> None:
+    def _issue_control_command(self, head: np.ndarray, antennas: Tuple[float, float], body_yaw: float) -> None:
         """Send the fused pose to the robot with throttled error logging."""
         try:
             self.current_robot.set_target(head=head, antennas=antennas, body_yaw=body_yaw)
@@ -722,9 +683,7 @@ class MovementManager:
         stats.min_freq = min(stats.min_freq, stats.last_freq)
         return stats
 
-    def _schedule_next_tick(
-        self, loop_start: float, stats: LoopFrequencyStats
-    ) -> tuple[float, LoopFrequencyStats]:
+    def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) -> tuple[float, LoopFrequencyStats]:
         """Compute sleep time to maintain target frequency and update potential freq."""
         computation_time = self._now() - loop_start
         stats.potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
@@ -743,9 +702,7 @@ class MovementManager:
             potential_freq=stats.potential_freq,
         )
 
-    def _maybe_log_frequency(
-        self, loop_count: int, print_interval_loops: int, stats: LoopFrequencyStats
-    ) -> None:
+    def _maybe_log_frequency(self, loop_count: int, print_interval_loops: int, stats: LoopFrequencyStats) -> None:
         """Emit frequency telemetry when enough loops have elapsed."""
         if loop_count % print_interval_loops != 0 or stats.count == 0:
             return
@@ -781,7 +738,7 @@ class MovementManager:
         self._stop_event.clear()
         self._thread = threading.Thread(target=self.working_loop, daemon=True)
         self._thread.start()
-        logger.info("Move worker started")
+        logger.debug("Move worker started")
 
     def stop(self) -> None:
         """Request the worker thread to stop and wait for it to exit."""
@@ -789,7 +746,8 @@ class MovementManager:
         if self._thread is not None:
             self._thread.join()
             self._thread = None
-        logger.info("Move worker stopped")
+        logger.debug("Move worker stopped")
+        logger.debug("Move worker stopped")
 
     def get_status(self) -> dict[str, Any]:
         """Return a lightweight status snapshot for observability."""
src/reachy_mini_conversation_demo/openai_realtime.py CHANGED
@@ -1,19 +1,21 @@
-import asyncio  # noqa: D100
-import base64
 import json
+import base64
+import asyncio
 import logging
 from datetime import datetime
 
-import gradio as gr
 import numpy as np
-from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
+import gradio as gr
 from openai import AsyncOpenAI
+from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
 
 from reachy_mini_conversation_demo.tools import (
     ALL_TOOL_SPECS,
     ToolDependencies,
     dispatch_tool_call,
 )
+from reachy_mini_conversation_demo.config import config
+
 
 logger = logging.getLogger(__name__)
 
@@ -45,7 +47,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 
     async def start_up(self):
         """Start the handler."""
-        self.client = AsyncOpenAI()
+        self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
         async with self.client.beta.realtime.connect(model="gpt-realtime") as conn:
             await conn.session.update(
                 session={
@@ -92,35 +94,22 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 pass
                 # self.deps.head_wobbler.reset()
 
-            if (
-                event.type
-                == "conversation.item.input_audio_transcription.completed"
-            ):
+            if event.type == "conversation.item.input_audio_transcription.completed":
                 logger.debug(f"user transcript: {event.transcript}")
-                await self.output_queue.put(
-                    AdditionalOutputs({"role": "user", "content": event.transcript})
-                )
+                await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
 
             if event.type == "response.audio_transcript.done":
                 logger.debug(f"assistant transcript: {event.transcript}")
-                await self.output_queue.put(
-                    AdditionalOutputs(
-                        {"role": "assistant", "content": event.transcript}
-                    )
-                )
+                await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
 
             if event.type == "response.audio.delta":
                 self.deps.head_wobbler.feed(event.delta)
                 self.last_activity_time = asyncio.get_event_loop().time()
-                logger.debug(
-                    "last activity time updated to %s", self.last_activity_time
-                )
+                logger.debug("last activity time updated to %s", self.last_activity_time)
                 await self.output_queue.put(
                     (
                         self.output_sample_rate,
-                        np.frombuffer(
-                            base64.b64decode(event.delta), dtype=np.int16
-                        ).reshape(1, -1),
+                        np.frombuffer(base64.b64decode(event.delta), dtype=np.int16).reshape(1, -1),
                     ),
                 )
 
@@ -154,9 +143,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 args_json_str = info["args_buf"] or "{}"
 
                 try:
-                    tool_result = await dispatch_tool_call(
-                        tool_name, args_json_str, self.deps
-                    )
+                    tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
                     logger.debug("[Tool %s executed]", tool_name)
                     logger.debug("Tool result: %s", tool_result)
                 except Exception as e:
@@ -177,9 +164,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                         {
                             "role": "assistant",
                             "content": json.dumps(tool_result),
-                            "metadata": dict(
-                                title="🛠️ Used tool " + tool_name, status="done"
-                            ),
+                            "metadata": {"title": "🛠️ Used tool " + tool_name, "status": "done"},
                         },
                     )
                 )
@@ -231,11 +216,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
             err = getattr(event, "error", None)
             msg = getattr(err, "message", str(err) if err else "unknown error")
             logger.error("Realtime error: %s (raw=%s)", msg, err)
-            await self.output_queue.put(
-                AdditionalOutputs(
-                    {"role": "assistant", "content": f"[error] {msg}"}
-                )
-            )
+            await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
 
     # Microphone receive
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
@@ -258,9 +239,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
             await self.send_idle_signal(idle_duration)
 
-            self.last_activity_time = (
-                asyncio.get_event_loop().time()
-            )  # avoid repeated resets
+            self.last_activity_time = asyncio.get_event_loop().time()  # avoid repeated resets
 
         return await wait_for_item(self.output_queue)
 
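For context on the audio path changed above: `response.audio.delta` arrives base64-encoded as 16-bit PCM, and the handler now decodes each delta into a `(1, n_samples)` int16 frame before queuing it alongside `self.output_sample_rate`. A minimal standalone sketch of that decoding step (the 24 kHz default below is an assumption for illustration, not taken from the handler):

```python
import base64

import numpy as np


def decode_audio_delta(delta_b64: str, sample_rate: int = 24000) -> tuple[int, np.ndarray]:
    """Decode one base64 PCM16 delta into the (sample_rate, frame) tuple the handler queues."""
    pcm_bytes = base64.b64decode(delta_b64)                          # raw little-endian 16-bit samples
    frame = np.frombuffer(pcm_bytes, dtype=np.int16).reshape(1, -1)  # shape (1, n_samples)
    return sample_rate, frame
```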
src/reachy_mini_conversation_demo/prompts.py CHANGED
@@ -1,6 +1,6 @@
  """Nothing (for ruff)."""

- SESSION_INSTRUCTIONS = r"""
  ### IDENTITY
  You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
  You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.

  """Nothing (for ruff)."""

+ SESSION_INSTRUCTIONS = r"""
  ### IDENTITY
  You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
  You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.
src/reachy_mini_conversation_demo/tools.py CHANGED
@@ -1,17 +1,17 @@
- from __future__ import annotations # noqa: D100
-
  import abc
  import asyncio
  import inspect
- import json
  import logging
- import time
- from dataclasses import dataclass
  from typing import Any, Dict, Literal, Optional

  from reachy_mini import ReachyMini
  from reachy_mini.utils import create_head_pose

  # from reachy_mini_conversation_demo.vision.processors import VisionManager

  logger = logging.getLogger(__name__)
@@ -22,14 +22,14 @@ ENABLE_FACE_RECOGNITION = False
  try:
  from reachy_mini.motion.recorded_move import RecordedMoves
  from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
-
  from reachy_mini_conversation_demo.dance_emotion_moves import (
  DanceQueueMove,
  EmotionQueueMove,
- GotoQueueMove,
  )

  # Initialize recorded moves for emotions
  RECORDED_MOVES = RecordedMoves("pollen-robotics/reachy-mini-emotions-library")
  DANCE_AVAILABLE = True
  EMOTION_AVAILABLE = True
@@ -183,9 +183,7 @@ class MoveHead(Tool):
  current_antennas[1],
  ), # Skip body_yaw
  target_body_yaw=0, # Reset body yaw
- start_body_yaw=current_antennas[
- 0
- ], # body_yaw is first in joint positions
  duration=deps.motion_duration_s,
  )

@@ -236,15 +234,11 @@ class Camera(Tool):

  # Use vision manager for processing if available
  if deps.vision_manager is not None:
- result = await asyncio.to_thread(
- deps.vision_manager.processor.process_image, frame, image_query
- )
  if isinstance(result, dict) and "error" in result:
  return result
  return (
- {"image_description": result}
- if isinstance(result, str)
- else {"error": "vision returned non-string"}
  )
  else:
  # Return base64 encoded image like main_works.py camera tool
@@ -388,8 +382,8 @@ class Dance(Tool):
  "properties": {
  "move": {
  "type": "string",
- "description": """Name of the move; use 'random' or omit for random.
- Here is a list of the available moves:
  simple_nod: A simple, continuous up-and-down nodding motion.
  head_tilt_roll: A continuous side-to-side head roll (ear to shoulder).
  side_to_side_sway: A smooth, side-to-side sway of the entire head.
@@ -436,9 +430,7 @@ class Dance(Tool):
  move_name = random.choice(list(AVAILABLE_MOVES.keys()))

  if move_name not in AVAILABLE_MOVES:
- return {
- "error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"
- }

  # Add dance moves to queue
  movement_manager = deps.movement_manager
@@ -523,9 +515,7 @@ class PlayEmotion(Tool):
  try:
  emotion_names = RECORDED_MOVES.list_moves()
  if emotion_name not in emotion_names:
- return {
- "error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"
- }

  # Add emotion to queue
  movement_manager = deps.movement_manager
@@ -604,9 +594,7 @@ class FaceRecognition(Tool):
  cv2.imwrite(temp_path, frame)

  # Use DeepFace to find face
- results = await asyncio.to_thread(
- DeepFace.find, img_path=temp_path, db_path="./pollen_faces"
- )

  if len(results) == 0:
  return {"error": "Didn't recognize the face"}
@@ -681,9 +669,7 @@ def _safe_load_obj(args_json: str) -> dict[str, Any]:
  return {}


- async def dispatch_tool_call(
- tool_name: str, args_json: str, deps: ToolDependencies
- ) -> Dict[str, Any]:
  """Dispatch a tool call by name with JSON args and dependencies."""
  tool = ALL_TOOLS.get(tool_name)

+ from __future__ import annotations
  import abc
+ import json
+ import time
  import asyncio
  import inspect
  import logging
  from typing import Any, Dict, Literal, Optional
+ from dataclasses import dataclass

  from reachy_mini import ReachyMini
  from reachy_mini.utils import create_head_pose

+
  # from reachy_mini_conversation_demo.vision.processors import VisionManager

  logger = logging.getLogger(__name__)

  try:
  from reachy_mini.motion.recorded_move import RecordedMoves
  from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
  from reachy_mini_conversation_demo.dance_emotion_moves import (
+ GotoQueueMove,
  DanceQueueMove,
  EmotionQueueMove,
  )

  # Initialize recorded moves for emotions
+ # Note: huggingface_hub automatically reads HF_TOKEN from environment variables
  RECORDED_MOVES = RecordedMoves("pollen-robotics/reachy-mini-emotions-library")
  DANCE_AVAILABLE = True
  EMOTION_AVAILABLE = True

  current_antennas[1],
  ), # Skip body_yaw
  target_body_yaw=0, # Reset body yaw
+ start_body_yaw=current_antennas[0], # body_yaw is first in joint positions
  duration=deps.motion_duration_s,
  )

  # Use vision manager for processing if available
  if deps.vision_manager is not None:
+ result = await asyncio.to_thread(deps.vision_manager.processor.process_image, frame, image_query)
  if isinstance(result, dict) and "error" in result:
  return result
  return (
+ {"image_description": result} if isinstance(result, str) else {"error": "vision returned non-string"}
  )
  else:
  # Return base64 encoded image like main_works.py camera tool

  "properties": {
  "move": {
  "type": "string",
+ "description": """Name of the move; use 'random' or omit for random.
+ Here is a list of the available moves:
  simple_nod: A simple, continuous up-and-down nodding motion.
  head_tilt_roll: A continuous side-to-side head roll (ear to shoulder).
  side_to_side_sway: A smooth, side-to-side sway of the entire head.

  move_name = random.choice(list(AVAILABLE_MOVES.keys()))

  if move_name not in AVAILABLE_MOVES:
+ return {"error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"}

  # Add dance moves to queue
  movement_manager = deps.movement_manager

  try:
  emotion_names = RECORDED_MOVES.list_moves()
  if emotion_name not in emotion_names:
+ return {"error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"}

  # Add emotion to queue
  movement_manager = deps.movement_manager

  cv2.imwrite(temp_path, frame)

  # Use DeepFace to find face
+ results = await asyncio.to_thread(DeepFace.find, img_path=temp_path, db_path="./pollen_faces")

  if len(results) == 0:
  return {"error": "Didn't recognize the face"}

  return {}


+ async def dispatch_tool_call(tool_name: str, args_json: str, deps: ToolDependencies) -> Dict[str, Any]:
  """Dispatch a tool call by name with JSON args and dependencies."""
  tool = ALL_TOOLS.get(tool_name)

src/reachy_mini_conversation_demo/utils.py CHANGED
@@ -1,5 +1,5 @@
- import argparse # noqa: D100
  import logging
  import warnings

  from reachy_mini_conversation_demo.camera_worker import CameraWorker
@@ -15,15 +15,9 @@ def parse_args():
  default=None,
  help="Choose head tracker (default: mediapipe)",
  )
- parser.add_argument(
- "--no-camera", default=False, action="store_true", help="Disable camera usage"
- )
- parser.add_argument(
- "--headless", default=False, action="store_true", help="Run in headless mode"
- )
- parser.add_argument(
- "--debug", default=False, action="store_true", help="Enable debug logging"
- )
  return parser.parse_args()


  import logging
+ import argparse
  import warnings

  from reachy_mini_conversation_demo.camera_worker import CameraWorker

  default=None,
  help="Choose head tracker (default: mediapipe)",
  )
+ parser.add_argument("--no-camera", default=False, action="store_true", help="Disable camera usage")
+ parser.add_argument("--headless", default=False, action="store_true", help="Run in headless mode")
+ parser.add_argument("--debug", default=False, action="store_true", help="Enable debug logging")
  return parser.parse_args()
 
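A quick check of the flags collapsed onto single lines above: `argparse` maps `--no-camera` to `args.no_camera`, and all three switches default to `False`. A small sketch reproducing only the three `add_argument` calls from the diff (the rest of the parser is omitted here):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--no-camera", default=False, action="store_true", help="Disable camera usage")
parser.add_argument("--headless", default=False, action="store_true", help="Run in headless mode")
parser.add_argument("--debug", default=False, action="store_true", help="Enable debug logging")

args = parser.parse_args(["--headless", "--debug"])
assert args.headless and args.debug and not args.no_camera  # dashes become underscores on the namespace
```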
src/reachy_mini_conversation_demo/vision/processors.py CHANGED
@@ -1,18 +1,19 @@
- import asyncio # noqa: D100
- import base64
- import logging
  import os
  import sys
- import threading
  import time
- from dataclasses import dataclass
  from typing import Any, Dict

  import cv2
  import numpy as np
  import torch
  from huggingface_hub import snapshot_download
- from transformers import AutoModelForImageTextToText, AutoProcessor

  logger = logging.getLogger(__name__)

@@ -61,9 +62,7 @@ class VisionProcessor:
  def initialize(self) -> bool:
  """Load model and processor onto the selected device."""
  try:
- logger.info(
- f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})"
- )
  self.processor = AutoProcessor.from_pretrained(self.model_path)

  # Select dtype depending on device
@@ -81,9 +80,7 @@ class VisionProcessor:
  model_kwargs["_attn_implementation"] = "flash_attention_2"

  # Load model weights
- self.model = AutoModelForImageTextToText.from_pretrained(
- self.model_path, **model_kwargs
- ).to(self.device)

  self.model.eval()
  self._initialized = True
@@ -138,10 +135,7 @@ class VisionProcessor:
  )

  # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
- inputs = {
- k: (v.to(self.device) if hasattr(v, "to") else v)
- for k, v in inputs.items()
- }

  with torch.no_grad():
  generated_ids = self.model.generate(
@@ -246,9 +240,7 @@ class VisionManager:
  )

  # Only update if we got a valid response
- if description and not description.startswith(
- ("Vision", "Failed", "Error")
- ):
  self._current_description = description
  self._last_processed_time = current_time

@@ -268,18 +260,14 @@ class VisionManager:
  """Get the most recent scene description (thread-safe)."""
  return self._current_description

- async def process_current_frame(
- self, prompt: str = "Describe what you see in detail."
- ) -> Dict[str, Any]:
  """Process current camera frame with custom prompt."""
  try:
  success, frame = self.camera.read()
  if not success or frame is None:
  return {"error": "Failed to capture image from camera"}

- description = await asyncio.to_thread(
- lambda: self.processor.process_image(frame, prompt)
- )

  return {
  "description": description,
@@ -335,9 +323,7 @@ def create_vision_processor(config: VisionConfig):
  return VisionProcessor(config)


- def init_vision(
- camera: cv2.VideoCapture, processor_type: str = "local"
- ) -> VisionManager:
  """Initialize vision manager with the specified processor type."""
  model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

  import os
  import sys
  import time
+ import base64
+ import asyncio
+ import logging
+ import threading
  from typing import Any, Dict
+ from dataclasses import dataclass

  import cv2
  import numpy as np
  import torch
+ from transformers import AutoProcessor, AutoModelForImageTextToText
  from huggingface_hub import snapshot_download
+

  logger = logging.getLogger(__name__)

  def initialize(self) -> bool:
  """Load model and processor onto the selected device."""
  try:
+ logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})")
  self.processor = AutoProcessor.from_pretrained(self.model_path)

  # Select dtype depending on device

  model_kwargs["_attn_implementation"] = "flash_attention_2"

  # Load model weights
+ self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)

  self.model.eval()
  self._initialized = True

  )

  # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
+ inputs = {k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()}

  with torch.no_grad():
  generated_ids = self.model.generate(

  )

  # Only update if we got a valid response
+ if description and not description.startswith(("Vision", "Failed", "Error")):
  self._current_description = description
  self._last_processed_time = current_time

  """Get the most recent scene description (thread-safe)."""
  return self._current_description

+ async def process_current_frame(self, prompt: str = "Describe what you see in detail.") -> Dict[str, Any]:
  """Process current camera frame with custom prompt."""
  try:
  success, frame = self.camera.read()
  if not success or frame is None:
  return {"error": "Failed to capture image from camera"}

+ description = await asyncio.to_thread(lambda: self.processor.process_image(frame, prompt))

  return {
  "description": description,

  return VisionProcessor(config)


+ def init_vision(camera: cv2.VideoCapture, processor_type: str = "local") -> VisionManager:
  """Initialize vision manager with the specified processor type."""
  model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
 
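The vision edits above repeatedly collapse `asyncio.to_thread(...)` calls onto one line; the underlying pattern is unchanged: the blocking SmolVLM2 inference runs in a worker thread so the async audio loop stays responsive. A minimal illustration of that pattern (only `process_image(frame, prompt)` is taken from the diff; the surrounding names are illustrative):

```python
import asyncio


async def describe_frame(processor, frame, prompt: str = "Describe what you see in detail.") -> dict:
    """Run the synchronous VLM call off the event loop, as VisionManager.process_current_frame does."""
    # process_image is CPU/GPU-bound and blocking, so it is offloaded with asyncio.to_thread;
    # awaiting it keeps other coroutines (audio streaming, tool dispatch) running meanwhile.
    description = await asyncio.to_thread(processor.process_image, frame, prompt)
    return {"description": description}
```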
src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py CHANGED
@@ -1,12 +1,12 @@
- from __future__ import annotations # noqa: D100
-
  import logging
- from typing import Optional, Tuple

  import numpy as np
- from huggingface_hub import hf_hub_download
  from supervision import Detections
  from ultralytics import YOLO

  logger = logging.getLogger(__name__)

@@ -94,9 +94,7 @@ class HeadTracker:

  return np.array([norm_x, norm_y], dtype=np.float32)

- def get_eyes(
- self, img: np.ndarray
- ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
  """Get eye positions (approximated from face bbox).

  Note: YOLO only provides face bbox, so we estimate eye positions
@@ -131,20 +129,14 @@ class HeadTracker:
  right_eye_x = bbox[0] + face_width * 0.65

  # Convert to MediaPipe coordinates
- left_eye = np.array(
- [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
- )
- right_eye = np.array(
- [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
- )

  return left_eye, right_eye

  def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
  """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
- if not hasattr(face_landmarks, "_bbox") or not hasattr(
- face_landmarks, "_img_shape"
- ):
  raise ValueError("Face landmarks object missing required attributes")

  bbox = face_landmarks._bbox
@@ -158,12 +150,8 @@ class HeadTracker:
  left_eye_x = bbox[0] + face_width * 0.35
  right_eye_x = bbox[0] + face_width * 0.65

- left_eye = np.array(
- [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
- )
- right_eye = np.array(
- [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
- )

  return left_eye, right_eye

@@ -177,9 +165,7 @@ class HeadTracker:
  left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
  return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))

- def get_head_position(
- self, img: np.ndarray
- ) -> Tuple[Optional[np.ndarray], Optional[float]]:
  """Get head position from face detection.

  Args:

+ from __future__ import annotations
  import logging
+ from typing import Tuple, Optional

  import numpy as np
  from supervision import Detections
  from ultralytics import YOLO
+ from huggingface_hub import hf_hub_download
+

  logger = logging.getLogger(__name__)

  return np.array([norm_x, norm_y], dtype=np.float32)

+ def get_eyes(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
  """Get eye positions (approximated from face bbox).

  Note: YOLO only provides face bbox, so we estimate eye positions

  right_eye_x = bbox[0] + face_width * 0.65

  # Convert to MediaPipe coordinates
+ left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
+ right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)

  return left_eye, right_eye

  def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
  """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
+ if not hasattr(face_landmarks, "_bbox") or not hasattr(face_landmarks, "_img_shape"):
  raise ValueError("Face landmarks object missing required attributes")

  bbox = face_landmarks._bbox

  left_eye_x = bbox[0] + face_width * 0.35
  right_eye_x = bbox[0] + face_width * 0.65

+ left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
+ right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)

  return left_eye, right_eye

  left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
  return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))

+ def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
  """Get head position from face detection.

  Args: