Commit 77a9cb2 (committed by Alina Lozovskaya)
Parents: af50a23 46f3109

Merge origin/develop into branch: resolve audio/moves conflicts and sync deps
.env.example CHANGED
@@ -5,4 +5,7 @@ MODEL_NAME="gpt-4o-realtime-preview-2025-06-03"
 OPENAI_VISION_MODEL="gpt-4.1-mini"
 
 # Cache for local VLM
-HF_HOME=./cache
+HF_HOME=./cache
+
+# Hugging Face token for accessing datasets/models
+HF_TOKEN=
README.md CHANGED
@@ -3,10 +3,14 @@
 Working repo, we should turn this into a ReachyMini app at some point maybe ?
 
 ## Installation
+You can set up the project quickly using [uv](https://docs.astral.sh/uv/):
 
 ```bash
-pip install -e .
+uv venv --python 3.12.1 # Create a virtual environment with Python 3.12.1
+source .venv/bin/activate
+uv sync
 ```
+> Note: The `pyproject.toml` expects `reachy-mini-dances-library` to be located in the same directory as this project.
 
 ## Run
 
pyproject.toml CHANGED
@@ -10,44 +10,78 @@ description = ""
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-    # "reachy_mini@git+ssh://git@github.com/pollen-robotics/reachy_mini@develop",
-    "openai",
+    "aiortc>=1.13.0",
     "fastrtc",
+    "gradio>=5.49.0",
+    "huggingface_hub>=0.34.4",
+    "mediapipe>=0.10.14",
+    "num2words",
     "onnxruntime",
-    "PyGObject>=3.42.2, <=3.46.0",
+    "opencv-python>=4.12.0.88",
+    "openai>=2.1",
+    "PyGObject>=3.42.2,<=3.46.0",
+    "python-dotenv",
+    "reachy_mini_dances_library",
+    "reachy_mini_toolbox",
+    "reachy_mini",
+    "supervision",
     "torch",
     "transformers",
-    "num2words",
-    "dotenv",
     "ultralytics",
-    "supervision",
-    "reachy_mini_toolbox@git+ssh://git@github.com/pollen-robotics/reachy_mini_toolbox@main",
-    "reachy_mini_dances_library@git+ssh://git@github.com/pollen-robotics/reachy_mini_dances_library@main"
 ]
 
+[project.scripts]
+reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
 
-[project.optional-dependencies]
+[dependency-groups]
 dev = ["pytest", "ruff==0.12.0"]
 
-[project.scripts]
-reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
+[tool.uv.sources]
+reachy_mini_dances_library = { path = "../reachy_mini_dances_library", editable = true }
+reachy_mini = { git = "ssh://git@github.com/pollen-robotics/reachy_mini.git", branch = "develop" }
+reachy_mini_toolbox = { git = "ssh://git@github.com/pollen-robotics/reachy_mini_toolbox.git", branch = "main" }
+fastrtc = { git = "ssh://git@github.com/gradio-app/fastrtc.git", branch = "main" }
 
 [tool.setuptools]
 package-dir = { "" = "src" }
 include-package-data = true
 
-
 [tool.setuptools.packages.find]
 where = ["src"]
 
-
 [tool.setuptools.package-data]
-reachy_mini = ["**/*"]  # Includes all non-.py files
+reachy_mini_conversation_demo = ["images/*"]
 
 [tool.ruff]
-exclude = []
-lint.extend-select = ["I", "D"]
-lint.ignore = [
-    "D203",  # Incompatible with D211
-    "D213",  # Incompatible with D212
+line-length = 119
+exclude = [".venv", "dist", "build", "**/__pycache__", "*.egg-info", ".mypy_cache", ".pytest_cache"]
+
+[tool.ruff.lint]
+select = [
+    "E",   # pycodestyle errors
+    "F",   # pyflakes
+    "W",   # pycodestyle warnings
+    "I",   # isort
+    "C4",  # flake8-comprehensions
+    "D",   # pydocstyle
 ]
+ignore = [
+    "E501",  # handled by formatter
+    "D100",  # ignore missing module docstrings
+    "D203",  # blank line before class docstring (conflicts with D211)
+    "D213",  # summary on second line (conflicts with D212)
+]
+
+[tool.ruff.lint.isort]
+length-sort = true
+lines-after-imports = 2
+no-lines-before = ["standard-library", "local-folder"]
+known-local-folder = ["reachy_mini_conversation_demo"]
+known-first-party = ["reachy_mini", "reachy_mini_dances_library", "reachy_mini_toolbox"]
+split-on-trailing-comma = true
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
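
The new `[project.scripts]` table wires the `reachy-mini-conversation-demo` command to `reachy_mini_conversation_demo.main:main`. As a rough illustration (a sketch, not part of the commit), running the console script is equivalent to importing and calling that function directly once the package is installed (for example via `uv sync`):

```python
# Sketch: invoke the console-script target by hand.
# Assumes the package is installed so the import resolves.
from reachy_mini_conversation_demo.main import main

if __name__ == "__main__":
    main()  # same function the `reachy-mini-conversation-demo` command runs
```
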
src/reachy_mini_conversation_demo/__init__.py CHANGED
@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
src/reachy_mini_conversation_demo/audio/__init__.py CHANGED
@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
src/reachy_mini_conversation_demo/audio/gstreamer.py CHANGED
@@ -1,12 +1,13 @@
-import logging  # noqa: D100
-from threading import Thread
+import logging
 from typing import Optional
+from threading import Thread
 
 import gi
 
+
 gi.require_version("Gst", "1.0")
 gi.require_version("GstApp", "1.0")
-from gi.repository import GLib, Gst  # noqa: E402
+from gi.repository import Gst, GLib  # noqa: E402
 
 
 class GstPlayer:
@@ -25,18 +26,16 @@ class GstPlayer:
         self.appsrc = Gst.ElementFactory.make("appsrc", None)
         self.appsrc.set_property("format", Gst.Format.TIME)
         self.appsrc.set_property("is-live", True)
-        caps = Gst.Caps.from_string(
-            f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved"
-        )
+        caps = Gst.Caps.from_string(f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved")
         self.appsrc.set_property("caps", caps)
         queue = Gst.ElementFactory.make("queue")
         audioconvert = Gst.ElementFactory.make("audioconvert")
         audioresample = Gst.ElementFactory.make("audioresample")
 
         # Try to pin specific output device; fallback to autoaudiosink
-        audiosink = _create_device_element(
-            direction="sink", name_substr=device_name
-        ) or Gst.ElementFactory.make("autoaudiosink")
+        audiosink = _create_device_element(direction="sink", name_substr=device_name) or Gst.ElementFactory.make(
+            "autoaudiosink"
+        )
 
         self.pipeline.add(self.appsrc)
         self.pipeline.add(queue)
@@ -104,9 +103,9 @@ class GstRecorder:
         self.pipeline = Gst.Pipeline.new("audio_recorder")
 
         # Create elements: try specific mic; fallback to default
-        autoaudiosrc = _create_device_element(
-            direction="source", name_substr=device_name
-        ) or Gst.ElementFactory.make("autoaudiosrc", None)
+        autoaudiosrc = _create_device_element(direction="source", name_substr=device_name) or Gst.ElementFactory.make(
+            "autoaudiosrc", None
+        )
 
         queue = Gst.ElementFactory.make("queue", None)
         audioconvert = Gst.ElementFactory.make("audioconvert", None)
@@ -117,9 +116,7 @@ class GstRecorder:
             raise RuntimeError("Failed to create GStreamer elements")
 
         # Force mono/S16LE at 24000; resample handles device SR (e.g., 16000 → 24000)
-        caps = Gst.Caps.from_string(
-            f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE"
-        )
+        caps = Gst.Caps.from_string(f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE")
         self.appsink.set_property("caps", caps)
 
         # Build pipeline
@@ -183,9 +180,7 @@ class GstRecorder:
         logger.info("Stopped Recorder")
 
 
-def _create_device_element(
-    direction: str, name_substr: Optional[str]
-) -> Optional[Gst.Element]:
+def _create_device_element(direction: str, name_substr: Optional[str]) -> Optional[Gst.Element]:
     """direction: 'source' or 'sink'.
 
     name_substr: case-insensitive substring matching device display name/description.
@@ -205,30 +200,15 @@ def _create_device_element(
         for dev in monitor.get_devices() or []:
             disp = dev.get_display_name() or ""
             props = dev.get_properties()
-            desc = (
-                props.get_string("device.description")
-                if props and props.has_field("device.description")
-                else ""
-            )
+            desc = props.get_string("device.description") if props and props.has_field("device.description") else ""
            logger.info(f"Device candidate: disp='{disp}', desc='{desc}'")
 
-            if (
-                name_substr.lower() in disp.lower()
-                or name_substr.lower() in desc.lower()
-            ):
+            if name_substr.lower() in disp.lower() or name_substr.lower() in desc.lower():
                 elem = dev.create_element(None)
-                factory = (
-                    elem.get_factory().get_name()
-                    if elem and elem.get_factory()
-                    else "<?>"
-                )
-                logger.info(
-                    f"Using {direction} device: '{disp or desc}' (factory='{factory}')"
-                )
+                factory = elem.get_factory().get_name() if elem and elem.get_factory() else "<?>"
+                logger.info(f"Using {direction} device: '{disp or desc}' (factory='{factory}')")
                 return elem
     finally:
         monitor.stop()
-    logging.getLogger(__name__).warning(
-        "Requested %s '%s' not found; using auto*", direction, name_substr
-    )
+    logging.getLogger(__name__).warning("Requested %s '%s' not found; using auto*", direction, name_substr)
    return None
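
The caps one-liners collapsed above build a plain GStreamer caps string from the configured sample rate. A minimal standalone sketch with an assumed 24 kHz rate (no GStreamer import is needed just to see the value):

```python
# Illustrative only: the caps string GstPlayer builds for mono S16LE audio.
sample_rate = 24000  # assumed here; the real value is passed in by the caller
caps_str = f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved"
print(caps_str)  # audio/x-raw,format=S16LE,channels=1,rate=24000,layout=interleaved
```
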
src/reachy_mini_conversation_demo/audio/head_wobbler.py CHANGED
@@ -1,16 +1,17 @@
 """Moves head given audio samples."""
 
+import time
+import queue
 import base64
 import logging
-import queue
 import threading
-import time
 from typing import Optional, Tuple
 
 import numpy as np
 
 from reachy_mini_conversation_demo.audio.speech_tapper import HOP_MS, SwayRollRT
 
+
 SAMPLE_RATE = 24000
 MOVEMENT_LATENCY_S = 0.08  # seconds between audio and robot movement
 logger = logging.getLogger(__name__)
@@ -25,9 +26,7 @@ class HeadWobbler:
         self._base_ts: Optional[float] = None
         self._hops_done: int = 0
 
-        self.audio_queue: queue.Queue[
-            Tuple[int, int, np.ndarray]
-        ] = queue.Queue()
+        self.audio_queue: queue.Queue[Tuple[int, int, np.ndarray]] = queue.Queue()
         self.sway = SwayRollRT()
 
         # Synchronization primitives
@@ -50,14 +49,14 @@ class HeadWobbler:
         self._stop_event.clear()
         self._thread = threading.Thread(target=self.working_loop, daemon=True)
         self._thread.start()
-        logger.info("Head wobbler started")
+        logger.debug("Head wobbler started")
 
     def stop(self) -> None:
         """Stop the head wobbler loop."""
         self._stop_event.set()
         if self._thread is not None:
             self._thread.join()
-        logger.info("Head wobbler stopped")
+        logger.debug("Head wobbler stopped")
 
     def working_loop(self) -> None:
         """Convert audio deltas into head movement offsets."""
src/reachy_mini_conversation_demo/audio/speech_tapper.py CHANGED
@@ -1,12 +1,12 @@
-from __future__ import annotations  # noqa: D100
-
+from __future__ import annotations
 import math
-from collections import deque
-from itertools import islice
 from typing import Dict, List, Optional
+from itertools import islice
+from collections import deque
 
 import numpy as np
 
+
 # Tunables
 SR = 16_000
 FRAME_MS = 20
@@ -68,7 +68,7 @@ def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float:
 
 def _to_float32_mono(x: np.ndarray) -> np.ndarray:
     """Convert arbitrary PCM array to float32 mono in [-1,1].
-
+
     Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C).
     """
     a = np.asarray(x)
@@ -258,24 +258,9 @@ class SwayRollRT:
                 * env
                 * math.sin(2 * math.pi * SWAY_F_ROLL * self.t + self.phase_roll)
             )
-            x_mm = (
-                SWAY_A_X_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
-            )
-            y_mm = (
-                SWAY_A_Y_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
-            )
-            z_mm = (
-                SWAY_A_Z_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)
-            )
+            x_mm = SWAY_A_X_MM * loud * env * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
+            y_mm = SWAY_A_Y_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
+            z_mm = SWAY_A_Z_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)
 
             out.append(
                 {
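
The reformatted sway one-liners all share the shape amplitude * loudness * envelope * sinusoid. A standalone sketch with made-up tunables (the real SWAY_A_* / SWAY_F_* values are the constants defined at the top of speech_tapper.py):

```python
import math

# Hypothetical tunables; the repo defines its own SWAY_A_X_MM / SWAY_F_X constants.
SWAY_A_X_MM, SWAY_F_X = 2.0, 0.8             # amplitude in mm, frequency in Hz (assumed)
loud, env, t, phase_x = 0.7, 1.0, 0.25, 0.0  # loudness gain, envelope, time, phase

x_mm = SWAY_A_X_MM * loud * env * math.sin(2 * math.pi * SWAY_F_X * t + phase_x)
print(f"x sway: {x_mm:.3f} mm")
```
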
src/reachy_mini_conversation_demo/camera_worker.py CHANGED
@@ -6,16 +6,18 @@ Ported from main_works.py camera_worker() function to provide:
 - Latest frame always available for tools
 """
 
+import time
 import logging
 import threading
-import time
-from typing import Optional, Tuple
+from typing import Tuple, Optional
 
 import cv2
 import numpy as np
+from scipy.spatial.transform import Rotation as R
+
 from reachy_mini import ReachyMini
 from reachy_mini.utils.interpolation import linear_pose_interpolation
-from scipy.spatial.transform import Rotation as R
+
 
 logger = logging.getLogger(__name__)
 
@@ -83,14 +85,14 @@ class CameraWorker:
         self._stop_event.clear()
         self._thread = threading.Thread(target=self.working_loop, daemon=True)
         self._thread.start()
-        logger.info("Camera worker started")
+        logger.debug("Camera worker started")
 
     def stop(self) -> None:
         """Stop the camera worker loop."""
         self._stop_event.set()
         if self._thread is not None:
             self._thread.join()
-        logger.info("Camera worker stopped")
+        logger.debug("Camera worker stopped")
 
     def working_loop(self) -> None:
         """Enable the camera worker loop.
@@ -114,17 +116,10 @@ class CameraWorker:
             self.latest_frame = frame  # .copy()
 
             # Check if face tracking was just disabled
-            if (
-                self.previous_head_tracking_state
-                and not self.is_head_tracking_enabled
-            ):
+            if self.previous_head_tracking_state and not self.is_head_tracking_enabled:
                 # Face tracking was just disabled - start interpolation to neutral
-                self.last_face_detected_time = (
-                    current_time  # Trigger the face-lost logic
-                )
-                self.interpolation_start_time = (
-                    None  # Will be set by the face-lost interpolation
-                )
+                self.last_face_detected_time = current_time  # Trigger the face-lost logic
+                self.interpolation_start_time = None  # Will be set by the face-lost interpolation
                 self.interpolation_start_pose = None
 
             # Update tracking state
@@ -137,9 +132,7 @@ class CameraWorker:
                 if eye_center is not None:
                     # Face detected - immediately switch to tracking
                     self.last_face_detected_time = current_time
-                    self.interpolation_start_time = (
-                        None  # Stop any interpolation
-                    )
+                    self.interpolation_start_time = None  # Stop any interpolation
 
                     # Convert normalized coordinates to pixel coordinates
                     h, w, _ = frame.shape
@@ -159,9 +152,7 @@ class CameraWorker:
 
                     # Extract translation and rotation from the target pose directly
                     translation = target_pose[:3, 3]
-                    rotation = R.from_matrix(target_pose[:3, :3]).as_euler(
-                        "xyz", degrees=False
-                    )
+                    rotation = R.from_matrix(target_pose[:3, :3]).as_euler("xyz", degrees=False)
 
                     # Thread-safe update of face tracking offsets (use pose as-is)
                     with self.face_tracking_lock:
@@ -176,19 +167,14 @@ class CameraWorker:
 
                 else:
                     # No face detected while tracking enabled - set face lost timestamp
-                    if (
-                        self.last_face_detected_time is None
-                        or self.last_face_detected_time == current_time
-                    ):
+                    if self.last_face_detected_time is None or self.last_face_detected_time == current_time:
                         # Only update if we haven't already set a face lost time
                         # (current_time check prevents overriding the disable-triggered timestamp)
                         pass
 
            # Handle smooth interpolation (works for both face-lost and tracking-disabled cases)
            if self.last_face_detected_time is not None:
-                time_since_face_lost = (
-                    current_time - self.last_face_detected_time
-                )
+                time_since_face_lost = current_time - self.last_face_detected_time
 
                if time_since_face_lost >= self.face_lost_delay:
                    # Start interpolation if not already started
@@ -197,27 +183,17 @@ class CameraWorker:
                     # Capture current pose as start of interpolation
                     with self.face_tracking_lock:
                         current_translation = self.face_tracking_offsets[:3]
-                        current_rotation_euler = self.face_tracking_offsets[
-                            3:
-                        ]
+                        current_rotation_euler = self.face_tracking_offsets[3:]
                     # Convert to 4x4 pose matrix
                     self.interpolation_start_pose = np.eye(4)
-                    self.interpolation_start_pose[:3, 3] = (
-                        current_translation
-                    )
-                    self.interpolation_start_pose[:3, :3] = (
-                        R.from_euler(
-                            "xyz", current_rotation_euler
-                        ).as_matrix()
-                    )
+                    self.interpolation_start_pose[:3, 3] = current_translation
+                    self.interpolation_start_pose[:3, :3] = R.from_euler(
+                        "xyz", current_rotation_euler
+                    ).as_matrix()
 
                 # Calculate interpolation progress (t from 0 to 1)
-                elapsed_interpolation = (
-                    current_time - self.interpolation_start_time
-                )
-                t = min(
-                    1.0, elapsed_interpolation / self.interpolation_duration
-                )
+                elapsed_interpolation = current_time - self.interpolation_start_time
+                t = min(1.0, elapsed_interpolation / self.interpolation_duration)
 
                 # Interpolate between current pose and neutral pose
                 interpolated_pose = linear_pose_interpolation(
@@ -226,9 +202,7 @@ class CameraWorker:
 
                 # Extract translation and rotation from interpolated pose
                 translation = interpolated_pose[:3, 3]
-                rotation = R.from_matrix(
-                    interpolated_pose[:3, :3]
-                ).as_euler("xyz", degrees=False)
+                rotation = R.from_matrix(interpolated_pose[:3, :3]).as_euler("xyz", degrees=False)
 
                 # Thread-safe update of face tracking offsets
                 with self.face_tracking_lock:
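
The collapsed rotation one-liners above rely on SciPy to turn the 3x3 block of a 4x4 pose into Euler angles, exactly as the worker does for its 6-element offset vector. A minimal sketch with a toy pose (assumed values, not taken from the repo):

```python
import numpy as np
from scipy.spatial.transform import Rotation as R

# Toy 4x4 head pose: identity orientation, 5 cm translation along x (assumed).
pose = np.eye(4)
pose[:3, 3] = [0.05, 0.0, 0.0]

translation = pose[:3, 3]                                              # metres
rotation = R.from_matrix(pose[:3, :3]).as_euler("xyz", degrees=False)  # radians
print(np.concatenate([translation, rotation]))  # the 6 offsets the worker stores
```
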
src/reachy_mini_conversation_demo/config.py CHANGED
@@ -1,7 +1,8 @@
-import os  # noqa: D100
+import os
 
 from dotenv import load_dotenv
 
+
 load_dotenv()
 
 
@@ -15,7 +16,7 @@ def getenv_bool(key: str, default: bool = False) -> bool:
 
 class Config:
     """Configuration class for the conversation demo."""
-
+
     # Required
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
     if not OPENAI_API_KEY:
@@ -25,6 +26,7 @@ class Config:
     MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-realtime-preview")
     OPENAI_VISION_MODEL = os.getenv("OPENAI_VISION_MODEL", "gpt-4.1-mini")
     HF_HOME = os.getenv("HF_HOME", "./cache")
+    HF_TOKEN = os.getenv("HF_TOKEN")  # Optional, falls back to hf auth login if not set
 
 
 config = Config()
src/reachy_mini_conversation_demo/dance_emotion_moves.py CHANGED
@@ -5,15 +5,16 @@ and executed sequentially by the MovementManager.
 """
 
 from __future__ import annotations
-
 import logging
 from typing import Tuple
 
 import numpy as np
+
 from reachy_mini.motion.move import Move
 from reachy_mini.motion.recorded_move import RecordedMoves
 from reachy_mini_dances_library.dance_move import DanceMove
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -30,9 +31,7 @@ class DanceQueueMove(Move):
         """Duration property required by official Move interface."""
         return self.dance_move.duration
 
-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate dance move at time t."""
         try:
             # Get the pose from the dance move
@@ -45,9 +44,7 @@ class DanceQueueMove(Move):
             return (head_pose, antennas, body_yaw)
 
         except Exception as e:
-            logger.error(
-                f"Error evaluating dance move '{self.move_name}' at t={t}: {e}"
-            )
+            logger.error(f"Error evaluating dance move '{self.move_name}' at t={t}: {e}")
             # Return neutral pose on error
             from reachy_mini.utils import create_head_pose
 
@@ -68,9 +65,7 @@ class EmotionQueueMove(Move):
         """Duration property required by official Move interface."""
         return self.emotion_move.duration
 
-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate emotion move at time t."""
         try:
             # Get the pose from the emotion move
@@ -83,9 +78,7 @@ class EmotionQueueMove(Move):
             return (head_pose, antennas, body_yaw)
 
         except Exception as e:
-            logger.error(
-                f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}"
-            )
+            logger.error(f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}")
             # Return neutral pose on error
             from reachy_mini.utils import create_head_pose
 
@@ -120,9 +113,7 @@ class GotoQueueMove(Move):
         """Duration property required by official Move interface."""
         return self._duration
 
-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate goto move at time t using linear interpolation."""
         try:
             from reachy_mini.utils import create_head_pose
@@ -138,32 +129,23 @@ class GotoQueueMove(Move):
                 start_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
 
             # Interpolate head pose
-            head_pose = linear_pose_interpolation(
-                start_pose, self.target_head_pose, t_clamped
-            )
+            head_pose = linear_pose_interpolation(start_pose, self.target_head_pose, t_clamped)
 
             # Interpolate antennas - return as numpy array
             antennas = np.array(
                 [
-                    self.start_antennas[0]
-                    + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
-                    self.start_antennas[1]
-                    + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
+                    self.start_antennas[0] + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
+                    self.start_antennas[1] + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
                 ]
             )
 
             # Interpolate body yaw
-            body_yaw = (
-                self.start_body_yaw
-                + (self.target_body_yaw - self.start_body_yaw) * t_clamped
-            )
+            body_yaw = self.start_body_yaw + (self.target_body_yaw - self.start_body_yaw) * t_clamped
 
             return (head_pose, antennas, body_yaw)
 
         except Exception as e:
             logger.error(f"Error evaluating goto move at t={t}: {e}")
             # Return target pose on error - convert antennas to numpy array
-            target_antennas_array = np.array(
-                [self.target_antennas[0], self.target_antennas[1]]
-            )
+            target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]])
             return (self.target_head_pose, target_antennas_array, self.target_body_yaw)
src/reachy_mini_conversation_demo/main.py CHANGED
@@ -5,17 +5,17 @@ import os
 import gradio as gr
 from fastapi import FastAPI
 from fastrtc import Stream
-from reachy_mini import ReachyMini
 
-from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
+from reachy_mini import ReachyMini
 from reachy_mini_conversation_demo.moves import MovementManager
-from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
 from reachy_mini_conversation_demo.tools import ToolDependencies
 from reachy_mini_conversation_demo.utils import (
-    handle_vision_stuff,
     parse_args,
     setup_logger,
+    handle_vision_stuff,
 )
+from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
+from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
 
 
 def update_chatbot(chatbot: list[dict], response: dict):
@@ -50,7 +50,7 @@ def main():
         head_wobbler=head_wobbler,
     )
     current_file_path = os.path.dirname(os.path.abspath(__file__))
-    logger.info(f"Current file absolute path: {current_file_path}")
+    logger.debug(f"Current file absolute path: {current_file_path}")
     chatbot = gr.Chatbot(
         type="messages",
         resizable=True,
@@ -59,7 +59,7 @@ def main():
             os.path.join(current_file_path, "images", "reachymini_avatar.png"),
         ),
     )
-    logger.info(f"Chatbot avatar images: {chatbot.avatar_images}")
+    logger.debug(f"Chatbot avatar images: {chatbot.avatar_images}")
 
     handler = OpenaiRealtimeHandler(deps)
     stream = Stream(
src/reachy_mini_conversation_demo/moves.py CHANGED
@@ -32,33 +32,33 @@ Safety
 """
 
 from __future__ import annotations
-
+import time
 import logging
 import threading
-import time
+from typing import Tuple, Optional
 from collections import deque
 from dataclasses import dataclass
 from queue import Empty, Queue
 from typing import Any, Optional, Tuple
 
 import numpy as np
+
 from reachy_mini import ReachyMini
-from reachy_mini.motion.move import Move
 from reachy_mini.utils import create_head_pose
+from reachy_mini.motion.move import Move
 from reachy_mini.utils.interpolation import (
     compose_world_offset,
     linear_pose_interpolation,
 )
 
+
 logger = logging.getLogger(__name__)
 
 # Configuration constants
 CONTROL_LOOP_FREQUENCY_HZ = 100.0  # Hz - Target frequency for the movement control loop
 
 # Type definitions
-FullBodyPose = Tuple[
-    np.ndarray, Tuple[float, float], float
-]  # (head_pose_4x4, antennas, body_yaw)
+FullBodyPose = Tuple[np.ndarray, Tuple[float, float], float]  # (head_pose_4x4, antennas, body_yaw)
 
 
 class BreathingMove(Move):
@@ -97,9 +97,7 @@ class BreathingMove(Move):
         """Duration property required by official Move interface."""
         return float("inf")  # Continuous breathing (never ends naturally)
 
-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate breathing move at time t."""
         if t < self.interpolation_duration:
             # Phase 1: Interpolate to neutral base position
@@ -112,35 +110,26 @@ class BreathingMove(Move):
 
             # Interpolate antennas
             antennas = (
-                (1 - interpolation_t) * self.interpolation_start_antennas
-                + interpolation_t * self.neutral_antennas
-            )
+                1 - interpolation_t
+            ) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas
 
         else:
             # Phase 2: Breathing patterns from neutral base
             breathing_time = t - self.interpolation_duration
 
             # Gentle z-axis breathing
-            z_offset = self.breathing_z_amplitude * np.sin(
-                2 * np.pi * self.breathing_frequency * breathing_time
-            )
-            head_pose = create_head_pose(
-                x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False
-            )
+            z_offset = self.breathing_z_amplitude * np.sin(2 * np.pi * self.breathing_frequency * breathing_time)
+            head_pose = create_head_pose(x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False)
 
             # Antenna sway (opposite directions)
-            antenna_sway = self.antenna_sway_amplitude * np.sin(
-                2 * np.pi * self.antenna_frequency * breathing_time
-            )
+            antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
             antennas = np.array([antenna_sway, -antenna_sway])
 
         # Return in official Move interface format: (head_pose, antennas_array, body_yaw)
         return (head_pose, antennas, 0.0)
 
 
-def combine_full_body(
-    primary_pose: FullBodyPose, secondary_pose: FullBodyPose
-) -> FullBodyPose:
+def combine_full_body(primary_pose: FullBodyPose, secondary_pose: FullBodyPose) -> FullBodyPose:
     """Combine primary and secondary full body poses.
 
     Args:
@@ -157,9 +146,7 @@ def combine_full_body(
     # Combine head poses using compose_world_offset; the secondary pose must be an
    # offset expressed in the world frame (T_off_world) applied to the absolute
    # primary transform (T_abs).
-    combined_head = compose_world_offset(
-        primary_head, secondary_head, reorthonormalize=True
-    )
+    combined_head = compose_world_offset(primary_head, secondary_head, reorthonormalize=True)
 
     # Sum antennas and body_yaw
     combined_antennas = (
@@ -288,9 +275,7 @@ class MovementManager:
         self._stop_event = threading.Event()
         self._thread: Optional[threading.Thread] = None
         self._is_listening = False
-        self._last_commanded_pose: FullBodyPose = clone_full_body_pose(
-            self.state.last_primary_pose
-        )
+        self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
         self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
         self._antenna_unfreeze_blend = 1.0
         self._antenna_blend_duration = 0.4  # seconds to blend back after listening
@@ -348,9 +333,7 @@ class MovementManager:
         """
         self._command_queue.put(("clear_queue", None))
 
-    def set_speech_offsets(
-        self, offsets: Tuple[float, float, float, float, float, float]
-    ) -> None:
+    def set_speech_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
         """Update speech-induced secondary offsets (x, y, z, roll, pitch, yaw).
 
         Offsets are interpreted as metres for translation and radians for
@@ -506,8 +489,7 @@ class MovementManager:
         """Manage the primary move queue (sequential execution)."""
         if self.state.current_move is None or (
             self.state.move_start_time is not None
-            and current_time - self.state.move_start_time
-            >= self.state.current_move.duration
+            and current_time - self.state.move_start_time >= self.state.current_move.duration
         ):
             self.state.current_move = None
             self.state.move_start_time = None
@@ -516,12 +498,8 @@ class MovementManager:
             self.state.current_move = self.move_queue.popleft()
             self.state.move_start_time = current_time
             # Any real move cancels breathing mode flag
-            self._breathing_active = isinstance(
-                self.state.current_move, BreathingMove
-            )
-            logger.info(
-                f"Starting new move, duration: {self.state.current_move.duration}s"
-            )
+            self._breathing_active = isinstance(self.state.current_move, BreathingMove)
+            logger.info(f"Starting new move, duration: {self.state.current_move.duration}s")
 
     def _manage_breathing(self, current_time: float) -> None:
         """Manage automatic breathing when idle."""
@@ -553,18 +531,13 @@ class MovementManager:
                 self._breathing_active = False
                 logger.error("Failed to start breathing: %s", e)
 
-        if (
-            isinstance(self.state.current_move, BreathingMove)
-            and self.move_queue
-        ):
+        if isinstance(self.state.current_move, BreathingMove) and self.move_queue:
             self.state.current_move = None
             self.state.move_start_time = None
             self._breathing_active = False
             logger.info("Stopping breathing due to new move activity")
 
-        if self.state.current_move is not None and not isinstance(
-            self.state.current_move, BreathingMove
-        ):
+        if self.state.current_move is not None and not isinstance(self.state.current_move, BreathingMove):
             self._breathing_active = False
 
     def _get_primary_pose(self, current_time: float) -> FullBodyPose:
@@ -595,20 +568,14 @@ class MovementManager:
         else:
             # Otherwise reuse the last primary pose so we avoid jumps between moves
             self.state.is_playing_move = False
-            self.state.is_moving = (
-                current_time - self.state.moving_start < self.state.moving_for
-            )
+            self.state.is_moving = current_time - self.state.moving_start < self.state.moving_for
 
             if self.state.last_primary_pose is not None:
-                primary_full_body_pose = clone_full_body_pose(
-                    self.state.last_primary_pose
-                )
+                primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
             else:
                 neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
                 primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
-                self.state.last_primary_pose = clone_full_body_pose(
-                    primary_full_body_pose
-                )
+                self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
 
         return primary_full_body_pose
 
@@ -647,9 +614,7 @@ class MovementManager:
         self._manage_move_queue(current_time)
         self._manage_breathing(current_time)
 
-    def _calculate_blended_antennas(
-        self, target_antennas: Tuple[float, float]
-    ) -> Tuple[float, float]:
+    def _calculate_blended_antennas(self, target_antennas: Tuple[float, float]) -> Tuple[float, float]:
         """Blend target antennas with listening freeze state and update blending."""
         now = self._now()
         listening = self._is_listening
@@ -669,10 +634,8 @@ class MovementManager:
         else:
             new_blend = min(1.0, blend + dt / blend_duration)
             antennas_cmd = (
-                listening_antennas[0] * (1.0 - new_blend)
-                + target_antennas[0] * new_blend,
-                listening_antennas[1] * (1.0 - new_blend)
-                + target_antennas[1] * new_blend,
+                listening_antennas[0] * (1.0 - new_blend) + target_antennas[0] * new_blend,
+                listening_antennas[1] * (1.0 - new_blend) + target_antennas[1] * new_blend,
             )
 
         if listening:
@@ -687,9 +650,7 @@ class MovementManager:
 
         return antennas_cmd
 
-    def _issue_control_command(
-        self, head: np.ndarray, antennas: Tuple[float, float], body_yaw: float
-    ) -> None:
+    def _issue_control_command(self, head: np.ndarray, antennas: Tuple[float, float], body_yaw: float) -> None:
         """Send the fused pose to the robot with throttled error logging."""
         try:
             self.current_robot.set_target(head=head, antennas=antennas, body_yaw=body_yaw)
@@ -722,9 +683,7 @@ class MovementManager:
         stats.min_freq = min(stats.min_freq, stats.last_freq)
         return stats
 
-    def _schedule_next_tick(
-        self, loop_start: float, stats: LoopFrequencyStats
-    ) -> tuple[float, LoopFrequencyStats]:
+    def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) -> tuple[float, LoopFrequencyStats]:
         """Compute sleep time to maintain target frequency and update potential freq."""
         computation_time = self._now() - loop_start
         stats.potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
@@ -743,9 +702,7 @@ class MovementManager:
             potential_freq=stats.potential_freq,
         )
 
-    def _maybe_log_frequency(
-        self, loop_count: int, print_interval_loops: int, stats: LoopFrequencyStats
-    ) -> None:
+    def _maybe_log_frequency(self, loop_count: int, print_interval_loops: int, stats: LoopFrequencyStats) -> None:
         """Emit frequency telemetry when enough loops have elapsed."""
         if loop_count % print_interval_loops != 0 or stats.count == 0:
             return
@@ -781,7 +738,7 @@ class MovementManager:
         self._stop_event.clear()
         self._thread = threading.Thread(target=self.working_loop, daemon=True)
         self._thread.start()
-        logger.info("Move worker started")
+        logger.debug("Move worker started")
 
     def stop(self) -> None:
         """Request the worker thread to stop and wait for it to exit."""
@@ -789,7 +746,8 @@ class MovementManager:
         if self._thread is not None:
             self._thread.join()
             self._thread = None
-        logger.info("Move worker stopped")
+        logger.debug("Move worker stopped")
+        logger.debug("Move worker stopped")
 
     def get_status(self) -> dict[str, Any]:
         """Return a lightweight status snapshot for observability."""
src/reachy_mini_conversation_demo/openai_realtime.py CHANGED
@@ -1,19 +1,21 @@
-import asyncio  # noqa: D100
-import base64
 import json
+import base64
+import asyncio
 import logging
 from datetime import datetime
 
-import gradio as gr
 import numpy as np
-from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
+import gradio as gr
 from openai import AsyncOpenAI
+from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
 
 from reachy_mini_conversation_demo.tools import (
     ALL_TOOL_SPECS,
     ToolDependencies,
     dispatch_tool_call,
 )
+from reachy_mini_conversation_demo.config import config
+
 
 logger = logging.getLogger(__name__)
 
@@ -45,7 +47,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 
     async def start_up(self):
         """Start the handler."""
-        self.client = AsyncOpenAI()
+        self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
         async with self.client.beta.realtime.connect(model="gpt-realtime") as conn:
             await conn.session.update(
                 session={
@@ -92,35 +94,22 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 pass
                 # self.deps.head_wobbler.reset()
 
-            if (
-                event.type
-                == "conversation.item.input_audio_transcription.completed"
-            ):
+            if event.type == "conversation.item.input_audio_transcription.completed":
                 logger.debug(f"user transcript: {event.transcript}")
-                await self.output_queue.put(
-                    AdditionalOutputs({"role": "user", "content": event.transcript})
-                )
+                await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
 
             if event.type == "response.audio_transcript.done":
                 logger.debug(f"assistant transcript: {event.transcript}")
-                await self.output_queue.put(
-                    AdditionalOutputs(
-                        {"role": "assistant", "content": event.transcript}
-                    )
-                )
+                await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
 
             if event.type == "response.audio.delta":
                 self.deps.head_wobbler.feed(event.delta)
                 self.last_activity_time = asyncio.get_event_loop().time()
-                logger.debug(
-                    "last activity time updated to %s", self.last_activity_time
-                )
+                logger.debug("last activity time updated to %s", self.last_activity_time)
                 await self.output_queue.put(
                     (
                         self.output_sample_rate,
-                        np.frombuffer(
-                            base64.b64decode(event.delta), dtype=np.int16
-                        ).reshape(1, -1),
+                        np.frombuffer(base64.b64decode(event.delta), dtype=np.int16).reshape(1, -1),
                     ),
                 )
 
@@ -154,9 +143,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 args_json_str = info["args_buf"] or "{}"
 
                 try:
-                    tool_result = await dispatch_tool_call(
-                        tool_name, args_json_str, self.deps
-                    )
+                    tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
                     logger.debug("[Tool %s executed]", tool_name)
                     logger.debug("Tool result: %s", tool_result)
                 except Exception as e:
@@ -177,9 +164,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                         {
                             "role": "assistant",
                             "content": json.dumps(tool_result),
-                            "metadata": dict(
-                                title="🛠️ Used tool " + tool_name, status="done"
-                            ),
+                            "metadata": {"title": "🛠️ Used tool " + tool_name, "status": "done"},
                         },
                     )
                 )
@@ -231,11 +216,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
             err = getattr(event, "error", None)
             msg = getattr(err, "message", str(err) if err else "unknown error")
             logger.error("Realtime error: %s (raw=%s)", msg, err)
-            await self.output_queue.put(
-                AdditionalOutputs(
-                    {"role": "assistant", "content": f"[error] {msg}"}
-                )
-            )
+            await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
 
     # Microphone receive
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
@@ -258,9 +239,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
             await self.send_idle_signal(idle_duration)
 
-            self.last_activity_time = (
-                asyncio.get_event_loop().time()
-            )  # avoid repeated resets
+            self.last_activity_time = asyncio.get_event_loop().time()  # avoid repeated resets
 
         return await wait_for_item(self.output_queue)
 
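For context on the audio path changed above: `response.audio.delta` arrives base64-encoded as 16-bit PCM, and the handler now decodes each delta into a `(1, n_samples)` int16 frame before queuing it alongside `self.output_sample_rate`. A minimal standalone sketch of that decoding step (the 24 kHz default below is an assumption for illustration, not taken from the handler):

```python
import base64

import numpy as np


def decode_audio_delta(delta_b64: str, sample_rate: int = 24000) -> tuple[int, np.ndarray]:
    """Decode one base64 PCM16 delta into the (sample_rate, frame) tuple the handler queues."""
    pcm_bytes = base64.b64decode(delta_b64)                          # raw little-endian 16-bit samples
    frame = np.frombuffer(pcm_bytes, dtype=np.int16).reshape(1, -1)  # shape (1, n_samples)
    return sample_rate, frame
```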
src/reachy_mini_conversation_demo/prompts.py CHANGED
@@ -1,6 +1,6 @@
  """Nothing (for ruff)."""

- SESSION_INSTRUCTIONS = r"""
  ### IDENTITY
  You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
  You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.

  """Nothing (for ruff)."""

+ SESSION_INSTRUCTIONS = r"""
  ### IDENTITY
  You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
  You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.
src/reachy_mini_conversation_demo/tools.py CHANGED
@@ -1,17 +1,17 @@
- from __future__ import annotations # noqa: D100
-
  import abc
  import asyncio
  import inspect
- import json
  import logging
- import time
- from dataclasses import dataclass
  from typing import Any, Dict, Literal, Optional

  from reachy_mini import ReachyMini
  from reachy_mini.utils import create_head_pose

  # from reachy_mini_conversation_demo.vision.processors import VisionManager

  logger = logging.getLogger(__name__)
@@ -22,14 +22,14 @@ ENABLE_FACE_RECOGNITION = False
  try:
  from reachy_mini.motion.recorded_move import RecordedMoves
  from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
-
  from reachy_mini_conversation_demo.dance_emotion_moves import (
  DanceQueueMove,
  EmotionQueueMove,
- GotoQueueMove,
  )

  # Initialize recorded moves for emotions
  RECORDED_MOVES = RecordedMoves("pollen-robotics/reachy-mini-emotions-library")
  DANCE_AVAILABLE = True
  EMOTION_AVAILABLE = True
@@ -183,9 +183,7 @@ class MoveHead(Tool):
  current_antennas[1],
  ), # Skip body_yaw
  target_body_yaw=0, # Reset body yaw
- start_body_yaw=current_antennas[
- 0
- ], # body_yaw is first in joint positions
  duration=deps.motion_duration_s,
  )

@@ -236,15 +234,11 @@ class Camera(Tool):

  # Use vision manager for processing if available
  if deps.vision_manager is not None:
- result = await asyncio.to_thread(
- deps.vision_manager.processor.process_image, frame, image_query
- )
  if isinstance(result, dict) and "error" in result:
  return result
  return (
- {"image_description": result}
- if isinstance(result, str)
- else {"error": "vision returned non-string"}
  )
  else:
  # Return base64 encoded image like main_works.py camera tool
@@ -388,8 +382,8 @@ class Dance(Tool):
  "properties": {
  "move": {
  "type": "string",
- "description": """Name of the move; use 'random' or omit for random.
- Here is a list of the available moves:
  simple_nod: A simple, continuous up-and-down nodding motion.
  head_tilt_roll: A continuous side-to-side head roll (ear to shoulder).
  side_to_side_sway: A smooth, side-to-side sway of the entire head.
@@ -436,9 +430,7 @@ class Dance(Tool):
  move_name = random.choice(list(AVAILABLE_MOVES.keys()))

  if move_name not in AVAILABLE_MOVES:
- return {
- "error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"
- }

  # Add dance moves to queue
  movement_manager = deps.movement_manager
@@ -523,9 +515,7 @@ class PlayEmotion(Tool):
  try:
  emotion_names = RECORDED_MOVES.list_moves()
  if emotion_name not in emotion_names:
- return {
- "error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"
- }

  # Add emotion to queue
  movement_manager = deps.movement_manager
@@ -604,9 +594,7 @@ class FaceRecognition(Tool):
  cv2.imwrite(temp_path, frame)

  # Use DeepFace to find face
- results = await asyncio.to_thread(
- DeepFace.find, img_path=temp_path, db_path="./pollen_faces"
- )

  if len(results) == 0:
  return {"error": "Didn't recognize the face"}
@@ -681,9 +669,7 @@ def _safe_load_obj(args_json: str) -> dict[str, Any]:
  return {}


- async def dispatch_tool_call(
- tool_name: str, args_json: str, deps: ToolDependencies
- ) -> Dict[str, Any]:
  """Dispatch a tool call by name with JSON args and dependencies."""
  tool = ALL_TOOLS.get(tool_name)

+ from __future__ import annotations
  import abc
+ import json
+ import time
  import asyncio
  import inspect
  import logging
  from typing import Any, Dict, Literal, Optional
+ from dataclasses import dataclass

  from reachy_mini import ReachyMini
  from reachy_mini.utils import create_head_pose

+
  # from reachy_mini_conversation_demo.vision.processors import VisionManager

  logger = logging.getLogger(__name__)

  try:
  from reachy_mini.motion.recorded_move import RecordedMoves
  from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
  from reachy_mini_conversation_demo.dance_emotion_moves import (
+ GotoQueueMove,
  DanceQueueMove,
  EmotionQueueMove,
  )

  # Initialize recorded moves for emotions
+ # Note: huggingface_hub automatically reads HF_TOKEN from environment variables
  RECORDED_MOVES = RecordedMoves("pollen-robotics/reachy-mini-emotions-library")
  DANCE_AVAILABLE = True
  EMOTION_AVAILABLE = True

  current_antennas[1],
  ), # Skip body_yaw
  target_body_yaw=0, # Reset body yaw
+ start_body_yaw=current_antennas[0], # body_yaw is first in joint positions
  duration=deps.motion_duration_s,
  )

  # Use vision manager for processing if available
  if deps.vision_manager is not None:
+ result = await asyncio.to_thread(deps.vision_manager.processor.process_image, frame, image_query)
  if isinstance(result, dict) and "error" in result:
  return result
  return (
+ {"image_description": result} if isinstance(result, str) else {"error": "vision returned non-string"}
  )
  else:
  # Return base64 encoded image like main_works.py camera tool

  "properties": {
  "move": {
  "type": "string",
+ "description": """Name of the move; use 'random' or omit for random.
+ Here is a list of the available moves:
  simple_nod: A simple, continuous up-and-down nodding motion.
  head_tilt_roll: A continuous side-to-side head roll (ear to shoulder).
  side_to_side_sway: A smooth, side-to-side sway of the entire head.

  move_name = random.choice(list(AVAILABLE_MOVES.keys()))

  if move_name not in AVAILABLE_MOVES:
+ return {"error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"}

  # Add dance moves to queue
  movement_manager = deps.movement_manager

  try:
  emotion_names = RECORDED_MOVES.list_moves()
  if emotion_name not in emotion_names:
+ return {"error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"}

  # Add emotion to queue
  movement_manager = deps.movement_manager

  cv2.imwrite(temp_path, frame)

  # Use DeepFace to find face
+ results = await asyncio.to_thread(DeepFace.find, img_path=temp_path, db_path="./pollen_faces")

  if len(results) == 0:
  return {"error": "Didn't recognize the face"}

  return {}


+ async def dispatch_tool_call(tool_name: str, args_json: str, deps: ToolDependencies) -> Dict[str, Any]:
  """Dispatch a tool call by name with JSON args and dependencies."""
  tool = ALL_TOOLS.get(tool_name)

src/reachy_mini_conversation_demo/utils.py CHANGED
@@ -1,5 +1,5 @@
- import argparse # noqa: D100
  import logging
  import warnings

  from reachy_mini_conversation_demo.camera_worker import CameraWorker
@@ -15,15 +15,9 @@ def parse_args():
  default=None,
  help="Choose head tracker (default: mediapipe)",
  )
- parser.add_argument(
- "--no-camera", default=False, action="store_true", help="Disable camera usage"
- )
- parser.add_argument(
- "--headless", default=False, action="store_true", help="Run in headless mode"
- )
- parser.add_argument(
- "--debug", default=False, action="store_true", help="Enable debug logging"
- )
  return parser.parse_args()


  import logging
+ import argparse
  import warnings

  from reachy_mini_conversation_demo.camera_worker import CameraWorker

  default=None,
  help="Choose head tracker (default: mediapipe)",
  )
+ parser.add_argument("--no-camera", default=False, action="store_true", help="Disable camera usage")
+ parser.add_argument("--headless", default=False, action="store_true", help="Run in headless mode")
+ parser.add_argument("--debug", default=False, action="store_true", help="Enable debug logging")
  return parser.parse_args()
 
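A quick check of the flags collapsed onto single lines above: `argparse` maps `--no-camera` to `args.no_camera`, and all three switches default to `False`. A small sketch reproducing only the three `add_argument` calls from the diff (the rest of the parser is omitted here):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--no-camera", default=False, action="store_true", help="Disable camera usage")
parser.add_argument("--headless", default=False, action="store_true", help="Run in headless mode")
parser.add_argument("--debug", default=False, action="store_true", help="Enable debug logging")

args = parser.parse_args(["--headless", "--debug"])
assert args.headless and args.debug and not args.no_camera  # dashes become underscores on the namespace
```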
src/reachy_mini_conversation_demo/vision/processors.py CHANGED
@@ -1,18 +1,19 @@
- import asyncio # noqa: D100
- import base64
- import logging
  import os
  import sys
- import threading
  import time
- from dataclasses import dataclass
  from typing import Any, Dict

  import cv2
  import numpy as np
  import torch
  from huggingface_hub import snapshot_download
- from transformers import AutoModelForImageTextToText, AutoProcessor

  logger = logging.getLogger(__name__)

@@ -61,9 +62,7 @@ class VisionProcessor:
  def initialize(self) -> bool:
  """Load model and processor onto the selected device."""
  try:
- logger.info(
- f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})"
- )
  self.processor = AutoProcessor.from_pretrained(self.model_path)

  # Select dtype depending on device
@@ -81,9 +80,7 @@ class VisionProcessor:
  model_kwargs["_attn_implementation"] = "flash_attention_2"

  # Load model weights
- self.model = AutoModelForImageTextToText.from_pretrained(
- self.model_path, **model_kwargs
- ).to(self.device)

  self.model.eval()
  self._initialized = True
@@ -138,10 +135,7 @@ class VisionProcessor:
  )

  # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
- inputs = {
- k: (v.to(self.device) if hasattr(v, "to") else v)
- for k, v in inputs.items()
- }

  with torch.no_grad():
  generated_ids = self.model.generate(
@@ -246,9 +240,7 @@ class VisionManager:
  )

  # Only update if we got a valid response
- if description and not description.startswith(
- ("Vision", "Failed", "Error")
- ):
  self._current_description = description
  self._last_processed_time = current_time

@@ -268,18 +260,14 @@ class VisionManager:
  """Get the most recent scene description (thread-safe)."""
  return self._current_description

- async def process_current_frame(
- self, prompt: str = "Describe what you see in detail."
- ) -> Dict[str, Any]:
  """Process current camera frame with custom prompt."""
  try:
  success, frame = self.camera.read()
  if not success or frame is None:
  return {"error": "Failed to capture image from camera"}

- description = await asyncio.to_thread(
- lambda: self.processor.process_image(frame, prompt)
- )

  return {
  "description": description,
@@ -335,9 +323,7 @@ def create_vision_processor(config: VisionConfig):
  return VisionProcessor(config)


- def init_vision(
- camera: cv2.VideoCapture, processor_type: str = "local"
- ) -> VisionManager:
  """Initialize vision manager with the specified processor type."""
  model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

  import os
  import sys
  import time
+ import base64
+ import asyncio
+ import logging
+ import threading
  from typing import Any, Dict
+ from dataclasses import dataclass

  import cv2
  import numpy as np
  import torch
+ from transformers import AutoProcessor, AutoModelForImageTextToText
  from huggingface_hub import snapshot_download
+

  logger = logging.getLogger(__name__)

  def initialize(self) -> bool:
  """Load model and processor onto the selected device."""
  try:
+ logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})")
  self.processor = AutoProcessor.from_pretrained(self.model_path)

  # Select dtype depending on device

  model_kwargs["_attn_implementation"] = "flash_attention_2"

  # Load model weights
+ self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)

  self.model.eval()
  self._initialized = True

  )

  # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
+ inputs = {k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()}

  with torch.no_grad():
  generated_ids = self.model.generate(

  )

  # Only update if we got a valid response
+ if description and not description.startswith(("Vision", "Failed", "Error")):
  self._current_description = description
  self._last_processed_time = current_time

  """Get the most recent scene description (thread-safe)."""
  return self._current_description

+ async def process_current_frame(self, prompt: str = "Describe what you see in detail.") -> Dict[str, Any]:
  """Process current camera frame with custom prompt."""
  try:
  success, frame = self.camera.read()
  if not success or frame is None:
  return {"error": "Failed to capture image from camera"}

+ description = await asyncio.to_thread(lambda: self.processor.process_image(frame, prompt))

  return {
  "description": description,

  return VisionProcessor(config)


+ def init_vision(camera: cv2.VideoCapture, processor_type: str = "local") -> VisionManager:
  """Initialize vision manager with the specified processor type."""
  model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
 
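The vision edits above repeatedly collapse `asyncio.to_thread(...)` calls onto one line; the underlying pattern is unchanged: the blocking SmolVLM2 inference runs in a worker thread so the async audio loop stays responsive. A minimal illustration of that pattern (only `process_image(frame, prompt)` is taken from the diff; the surrounding names are illustrative):

```python
import asyncio


async def describe_frame(processor, frame, prompt: str = "Describe what you see in detail.") -> dict:
    """Run the synchronous VLM call off the event loop, as VisionManager.process_current_frame does."""
    # process_image is CPU/GPU-bound and blocking, so it is offloaded with asyncio.to_thread;
    # awaiting it keeps other coroutines (audio streaming, tool dispatch) running meanwhile.
    description = await asyncio.to_thread(processor.process_image, frame, prompt)
    return {"description": description}
```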
src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py CHANGED
@@ -1,12 +1,12 @@
- from __future__ import annotations # noqa: D100
-
  import logging
- from typing import Optional, Tuple

  import numpy as np
- from huggingface_hub import hf_hub_download
  from supervision import Detections
  from ultralytics import YOLO

  logger = logging.getLogger(__name__)

@@ -94,9 +94,7 @@ class HeadTracker:

  return np.array([norm_x, norm_y], dtype=np.float32)

- def get_eyes(
- self, img: np.ndarray
- ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
  """Get eye positions (approximated from face bbox).

  Note: YOLO only provides face bbox, so we estimate eye positions
@@ -131,20 +129,14 @@ class HeadTracker:
  right_eye_x = bbox[0] + face_width * 0.65

  # Convert to MediaPipe coordinates
- left_eye = np.array(
- [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
- )
- right_eye = np.array(
- [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
- )

  return left_eye, right_eye

  def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
  """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
- if not hasattr(face_landmarks, "_bbox") or not hasattr(
- face_landmarks, "_img_shape"
- ):
  raise ValueError("Face landmarks object missing required attributes")

  bbox = face_landmarks._bbox
@@ -158,12 +150,8 @@ class HeadTracker:
  left_eye_x = bbox[0] + face_width * 0.35
  right_eye_x = bbox[0] + face_width * 0.65

- left_eye = np.array(
- [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
- )
- right_eye = np.array(
- [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
- )

  return left_eye, right_eye

@@ -177,9 +165,7 @@ class HeadTracker:
  left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
  return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))

- def get_head_position(
- self, img: np.ndarray
- ) -> Tuple[Optional[np.ndarray], Optional[float]]:
  """Get head position from face detection.

  Args:

+ from __future__ import annotations
  import logging
+ from typing import Tuple, Optional

  import numpy as np
  from supervision import Detections
  from ultralytics import YOLO
+ from huggingface_hub import hf_hub_download
+

  logger = logging.getLogger(__name__)

  return np.array([norm_x, norm_y], dtype=np.float32)

+ def get_eyes(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
  """Get eye positions (approximated from face bbox).

  Note: YOLO only provides face bbox, so we estimate eye positions

  right_eye_x = bbox[0] + face_width * 0.65

  # Convert to MediaPipe coordinates
+ left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
+ right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)

  return left_eye, right_eye

  def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
  """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
+ if not hasattr(face_landmarks, "_bbox") or not hasattr(face_landmarks, "_img_shape"):
  raise ValueError("Face landmarks object missing required attributes")

  bbox = face_landmarks._bbox

  left_eye_x = bbox[0] + face_width * 0.35
  right_eye_x = bbox[0] + face_width * 0.65

+ left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
+ right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)

  return left_eye, right_eye

  left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
  return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))

+ def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
  """Get head position from face detection.

  Args: