Commit 3af62a2 (unverified), committed by Alina
Parents: 17f679e 931a3c4

Merge pull request #33 from pollen-robotics/clean_console_output
README.md CHANGED
@@ -3,10 +3,14 @@
 Working repo, we should turn this into a ReachyMini app at some point maybe ?
 
 ## Installation
+You can set up the project quickly using [uv](https://docs.astral.sh/uv/):
 
 ```bash
-pip install -e .
+uv venv --python 3.12.1 # Create a virtual environment with Python 3.12.1
+source .venv/bin/activate
+uv sync
 ```
+> Note: The `pyproject.toml` expects `reachy-mini-dances-library` to be located in the same directory as this project.
 
 ## Run
 
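The commit does not show the run command itself, but the new `pyproject.toml` below declares a console script, so a plausible way to launch the demo after `uv sync` (a sketch, assuming the uv-managed environment created above) is:

```bash
# Sketch only: run the entry point declared in [project.scripts] as
# reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
uv run reachy-mini-conversation-demo
```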
pyproject.toml CHANGED
@@ -10,44 +10,77 @@ description = ""
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-# "reachy_mini@git+ssh://git@github.com/pollen-robotics/reachy_mini@develop",
-"openai",
+"aiortc>=1.13.0",
 "fastrtc",
+"gradio>=5.49.0",
+"huggingface_hub>=0.34.4",
+"mediapipe>=0.10.14",
+"num2words",
 "onnxruntime",
-"PyGObject>=3.42.2, <=3.46.0",
+"openai>=2.1",
+"PyGObject>=3.42.2,<=3.46.0",
+"python-dotenv",
+"reachy_mini_dances_library",
+"reachy_mini_toolbox",
+"reachy_mini",
+"supervision",
 "torch",
 "transformers",
-"num2words",
-"dotenv",
 "ultralytics",
-"supervision",
-"reachy_mini_toolbox@git+ssh://git@github.com/pollen-robotics/reachy_mini_toolbox@main",
-"reachy_mini_dances_library@git+ssh://git@github.com/pollen-robotics/reachy_mini_dances_library@main"
 ]
 
+[project.scripts]
+reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
 
-[project.optional-dependencies]
+[dependency-groups]
 dev = ["pytest", "ruff==0.12.0"]
 
-[project.scripts]
-reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
+[tool.uv.sources]
+reachy_mini_dances_library = { path = "../reachy_mini_dances_library", editable = true }
+reachy_mini = { git = "ssh://git@github.com/pollen-robotics/reachy_mini.git", branch = "develop" }
+reachy_mini_toolbox = { git = "ssh://git@github.com/pollen-robotics/reachy_mini_toolbox.git", branch = "main" }
+fastrtc = { git = "ssh://git@github.com/gradio-app/fastrtc.git", branch = "main" }
 
 [tool.setuptools]
 package-dir = { "" = "src" }
 include-package-data = true
 
-
 [tool.setuptools.packages.find]
 where = ["src"]
 
-
 [tool.setuptools.package-data]
-reachy_mini = ["**/*"] # Include all non-.py files
+reachy_mini_conversation_demo = ["images/*"]
 
 [tool.ruff]
-exclude = []
-lint.extend-select = ["I", "D"]
-lint.ignore = [
-"D203", # Incompatible with D211
-"D213", # Incompatible with D212
+line-length = 119
+exclude = [".venv", "dist", "build", "**/__pycache__", "*.egg-info", ".mypy_cache", ".pytest_cache"]
+
+[tool.ruff.lint]
+select = [
+"E", # pycodestyle errors
+"F", # pyflakes
+"W", # pycodestyle warnings
+"I", # isort
+"C4", # flake8-comprehensions
+"D", # pydocstyle
 ]
+ignore = [
+"E501", # handled by formatter
+"D100", # ignore missing module docstrings
+"D203", # blank line before class docstring (conflicts with D211)
+"D213", # summary on second line (conflicts with D212)
+]
+
+[tool.ruff.lint.isort]
+length-sort = true
+lines-after-imports = 2
+no-lines-before = ["standard-library", "local-folder"]
+known-local-folder = ["reachy_mini_conversation_demo"]
+known-first-party = ["reachy_mini", "reachy_mini_dances_library", "reachy_mini_toolbox"]
+split-on-trailing-comma = true
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
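The expanded `[tool.ruff]` configuration is what drives the source reformatting in the rest of this commit (length-sorted imports, 119-character lines, wrapped statements collapsed onto one line). How it is applied is not shown here; a hedged sketch, assuming the `dev` group (ruff 0.12.0) is installed by `uv sync`:

```bash
# Sketch only: lint and format with the [tool.ruff] settings above.
uv run ruff check .    # E/F/W/I/C4/D rules, isort-style import ordering
uv run ruff format .   # line-length 119, double quotes, space indentation
```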
src/reachy_mini_conversation_demo/__init__.py CHANGED
@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
src/reachy_mini_conversation_demo/audio/__init__.py CHANGED
@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
src/reachy_mini_conversation_demo/audio/gstreamer.py CHANGED
@@ -1,12 +1,13 @@
-import logging # noqa: D100
-from threading import Thread
+import logging
 from typing import Optional
+from threading import Thread
 
 import gi
 
+
 gi.require_version("Gst", "1.0")
 gi.require_version("GstApp", "1.0")
-from gi.repository import GLib, Gst # noqa: E402
+from gi.repository import Gst, GLib # noqa: E402
 
 
 class GstPlayer:
@@ -25,18 +26,16 @@ class GstPlayer:
 self.appsrc = Gst.ElementFactory.make("appsrc", None)
 self.appsrc.set_property("format", Gst.Format.TIME)
 self.appsrc.set_property("is-live", True)
-caps = Gst.Caps.from_string(
-f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved"
-)
+caps = Gst.Caps.from_string(f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved")
 self.appsrc.set_property("caps", caps)
 queue = Gst.ElementFactory.make("queue")
 audioconvert = Gst.ElementFactory.make("audioconvert")
 audioresample = Gst.ElementFactory.make("audioresample")
 
 # Try to pin specific output device; fallback to autoaudiosink
-audiosink = _create_device_element(
-direction="sink", name_substr=device_name
-) or Gst.ElementFactory.make("autoaudiosink")
+audiosink = _create_device_element(direction="sink", name_substr=device_name) or Gst.ElementFactory.make(
+"autoaudiosink"
+)
 
 self.pipeline.add(self.appsrc)
 self.pipeline.add(queue)
@@ -104,9 +103,9 @@ class GstRecorder:
 self.pipeline = Gst.Pipeline.new("audio_recorder")
 
 # Create elements: try specific mic; fallback to default
-autoaudiosrc = _create_device_element(
-direction="source", name_substr=device_name
-) or Gst.ElementFactory.make("autoaudiosrc", None)
+autoaudiosrc = _create_device_element(direction="source", name_substr=device_name) or Gst.ElementFactory.make(
+"autoaudiosrc", None
+)
 
 queue = Gst.ElementFactory.make("queue", None)
 audioconvert = Gst.ElementFactory.make("audioconvert", None)
@@ -117,9 +116,7 @@ class GstRecorder:
 raise RuntimeError("Failed to create GStreamer elements")
 
 # Force mono/S16LE at 24000; resample handles device SR (e.g., 16000 → 24000)
-caps = Gst.Caps.from_string(
-f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE"
-)
+caps = Gst.Caps.from_string(f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE")
 self.appsink.set_property("caps", caps)
 
 # Build pipeline
@@ -183,9 +180,7 @@ class GstRecorder:
 logger.info("Stopped Recorder")
 
 
-def _create_device_element(
-direction: str, name_substr: Optional[str]
-) -> Optional[Gst.Element]:
+def _create_device_element(direction: str, name_substr: Optional[str]) -> Optional[Gst.Element]:
 """direction: 'source' or 'sink'.
 
 name_substr: case-insensitive substring matching device display name/description.
@@ -205,30 +200,15 @@ def _create_device_element(
 for dev in monitor.get_devices() or []:
 disp = dev.get_display_name() or ""
 props = dev.get_properties()
-desc = (
-props.get_string("device.description")
-if props and props.has_field("device.description")
-else ""
-)
+desc = props.get_string("device.description") if props and props.has_field("device.description") else ""
 logger.info(f"Device candidate: disp='{disp}', desc='{desc}'")
 
-if (
-name_substr.lower() in disp.lower()
-or name_substr.lower() in desc.lower()
-):
+if name_substr.lower() in disp.lower() or name_substr.lower() in desc.lower():
 elem = dev.create_element(None)
-factory = (
-elem.get_factory().get_name()
-if elem and elem.get_factory()
-else "<?>"
-)
-logger.info(
-f"Using {direction} device: '{disp or desc}' (factory='{factory}')"
-)
+factory = elem.get_factory().get_name() if elem and elem.get_factory() else "<?>"
+logger.info(f"Using {direction} device: '{disp or desc}' (factory='{factory}')")
 return elem
 finally:
 monitor.stop()
-logging.getLogger(__name__).warning(
-"Requested %s '%s' not found; using auto*", direction, name_substr
-)
+logging.getLogger(__name__).warning("Requested %s '%s' not found; using auto*", direction, name_substr)
 return None
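`GstPlayer` and `GstRecorder` rely on standard GStreamer elements (`appsrc`, `appsink`, `queue`, `audioconvert`, `audioresample`, with `autoaudiosink`/`autoaudiosrc` fallbacks) through PyGObject. A host-side sanity check, not part of this commit, could look like:

```bash
# Sketch only: confirm the GStreamer elements used above are installed.
gst-inspect-1.0 appsrc
gst-inspect-1.0 appsink
gst-inspect-1.0 autoaudiosink
gst-inspect-1.0 autoaudiosrc
```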
src/reachy_mini_conversation_demo/audio/head_wobbler.py CHANGED
@@ -1,16 +1,17 @@
 """Moves head given audio samples."""
 
+import time
+import queue
 import base64
 import logging
-import queue
 import threading
-import time
 from typing import Optional
 
 import numpy as np
 
 from reachy_mini_conversation_demo.audio.speech_tapper import HOP_MS, SwayRollRT
 
+
 SAMPLE_RATE = 24000
 MOVEMENT_LATENCY_S = 0.08 # seconds between audio and robot movement
 logger = logging.getLogger(__name__)
@@ -41,14 +42,14 @@ class HeadWobbler:
 self._stop_event.clear()
 self._thread = threading.Thread(target=self.working_loop, daemon=True)
 self._thread.start()
-logger.info("Head wobbler started")
+logger.debug("Head wobbler started")
 
 def stop(self) -> None:
 """Stop the head wobbler loop."""
 self._stop_event.set()
 if self._thread is not None:
 self._thread.join()
-logger.info("Head wobbler stopped")
+logger.debug("Head wobbler stopped")
 
 def working_loop(self) -> None:
 """Convert audio deltas into head movement offsets."""
src/reachy_mini_conversation_demo/audio/speech_tapper.py CHANGED
@@ -1,12 +1,12 @@
-from __future__ import annotations # noqa: D100
-
+from __future__ import annotations
 import math
-from collections import deque
-from itertools import islice
 from typing import Dict, List, Optional
+from itertools import islice
+from collections import deque
 
 import numpy as np
 
+
 # Tunables
 SR = 16_000
 FRAME_MS = 20
@@ -68,7 +68,7 @@ def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float:
 
 def _to_float32_mono(x: np.ndarray) -> np.ndarray:
 """Convert arbitrary PCM array to float32 mono in [-1,1].
-
+
 Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C).
 """
 a = np.asarray(x)
@@ -258,24 +258,9 @@ class SwayRollRT:
 * env
 * math.sin(2 * math.pi * SWAY_F_ROLL * self.t + self.phase_roll)
 )
-x_mm = (
-SWAY_A_X_MM
-* loud
-* env
-* math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
-)
-y_mm = (
-SWAY_A_Y_MM
-* loud
-* env
-* math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
-)
-z_mm = (
-SWAY_A_Z_MM
-* loud
-* env
-* math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)
-)
+x_mm = SWAY_A_X_MM * loud * env * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
+y_mm = SWAY_A_Y_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
+z_mm = SWAY_A_Z_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)
 
 out.append(
 {
src/reachy_mini_conversation_demo/camera_worker.py CHANGED
@@ -6,16 +6,18 @@ Ported from main_works.py camera_worker() function to provide:
 - Latest frame always available for tools
 """
 
+import time
 import logging
 import threading
-import time
-from typing import Optional, Tuple
+from typing import Tuple, Optional
 
 import cv2
 import numpy as np
+from scipy.spatial.transform import Rotation as R
+
 from reachy_mini import ReachyMini
 from reachy_mini.utils.interpolation import linear_pose_interpolation
-from scipy.spatial.transform import Rotation as R
+
 
 logger = logging.getLogger(__name__)
 
@@ -83,14 +85,14 @@ class CameraWorker:
 self._stop_event.clear()
 self._thread = threading.Thread(target=self.working_loop, daemon=True)
 self._thread.start()
-logger.info("Camera worker started")
+logger.debug("Camera worker started")
 
 def stop(self) -> None:
 """Stop the camera worker loop."""
 self._stop_event.set()
 if self._thread is not None:
 self._thread.join()
-logger.info("Camera worker stopped")
+logger.debug("Camera worker stopped")
 
 def working_loop(self) -> None:
 """Enable the camera worker loop.
@@ -114,17 +116,10 @@
 self.latest_frame = frame # .copy()
 
 # Check if face tracking was just disabled
-if (
-self.previous_head_tracking_state
-and not self.is_head_tracking_enabled
-):
+if self.previous_head_tracking_state and not self.is_head_tracking_enabled:
 # Face tracking was just disabled - start interpolation to neutral
-self.last_face_detected_time = (
-current_time # Trigger the face-lost logic
-)
-self.interpolation_start_time = (
-None # Will be set by the face-lost interpolation
-)
+self.last_face_detected_time = current_time # Trigger the face-lost logic
+self.interpolation_start_time = None # Will be set by the face-lost interpolation
 self.interpolation_start_pose = None
 
 # Update tracking state
@@ -137,9 +132,7 @@
 if eye_center is not None:
 # Face detected - immediately switch to tracking
 self.last_face_detected_time = current_time
-self.interpolation_start_time = (
-None # Stop any interpolation
-)
+self.interpolation_start_time = None # Stop any interpolation
 
 # Convert normalized coordinates to pixel coordinates
 h, w, _ = frame.shape
@@ -159,9 +152,7 @@
 
 # Extract translation and rotation from the target pose directly
 translation = target_pose[:3, 3]
-rotation = R.from_matrix(target_pose[:3, :3]).as_euler(
-"xyz", degrees=False
-)
+rotation = R.from_matrix(target_pose[:3, :3]).as_euler("xyz", degrees=False)
 
 # Thread-safe update of face tracking offsets (use pose as-is)
 with self.face_tracking_lock:
@@ -176,19 +167,14 @@
 
 else:
 # No face detected while tracking enabled - set face lost timestamp
-if (
-self.last_face_detected_time is None
-or self.last_face_detected_time == current_time
-):
+if self.last_face_detected_time is None or self.last_face_detected_time == current_time:
 # Only update if we haven't already set a face lost time
 # (current_time check prevents overriding the disable-triggered timestamp)
 pass
 
 # Handle smooth interpolation (works for both face-lost and tracking-disabled cases)
 if self.last_face_detected_time is not None:
-time_since_face_lost = (
-current_time - self.last_face_detected_time
-)
+time_since_face_lost = current_time - self.last_face_detected_time
 
 if time_since_face_lost >= self.face_lost_delay:
 # Start interpolation if not already started
@@ -197,27 +183,17 @@
 # Capture current pose as start of interpolation
 with self.face_tracking_lock:
 current_translation = self.face_tracking_offsets[:3]
-current_rotation_euler = self.face_tracking_offsets[
-3:
-]
+current_rotation_euler = self.face_tracking_offsets[3:]
 # Convert to 4x4 pose matrix
 self.interpolation_start_pose = np.eye(4)
-self.interpolation_start_pose[:3, 3] = (
-current_translation
-)
-self.interpolation_start_pose[:3, :3] = (
-R.from_euler(
-"xyz", current_rotation_euler
-).as_matrix()
-)
+self.interpolation_start_pose[:3, 3] = current_translation
+self.interpolation_start_pose[:3, :3] = R.from_euler(
+"xyz", current_rotation_euler
+).as_matrix()
 
 # Calculate interpolation progress (t from 0 to 1)
-elapsed_interpolation = (
-current_time - self.interpolation_start_time
-)
-t = min(
-1.0, elapsed_interpolation / self.interpolation_duration
-)
+elapsed_interpolation = current_time - self.interpolation_start_time
+t = min(1.0, elapsed_interpolation / self.interpolation_duration)
 
 # Interpolate between current pose and neutral pose
 interpolated_pose = linear_pose_interpolation(
@@ -226,9 +202,7 @@
 )
 
 # Extract translation and rotation from interpolated pose
 translation = interpolated_pose[:3, 3]
-rotation = R.from_matrix(
-interpolated_pose[:3, :3]
-).as_euler("xyz", degrees=False)
+rotation = R.from_matrix(interpolated_pose[:3, :3]).as_euler("xyz", degrees=False)
 
 # Thread-safe update of face tracking offsets
 with self.face_tracking_lock:
src/reachy_mini_conversation_demo/config.py CHANGED
@@ -1,7 +1,8 @@
-import os # noqa: D100
+import os
 
 from dotenv import load_dotenv
 
+
 load_dotenv()
 
 
@@ -15,7 +16,7 @@ def getenv_bool(key: str, default: bool = False) -> bool:
 
 class Config:
 """Configuration class for the conversation demo."""
-
+
 # Required
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 if not OPENAI_API_KEY:
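`config.py` loads environment variables from a `.env` file via `python-dotenv` and checks that `OPENAI_API_KEY` is set. A hypothetical `.env` for local runs (the key value is a placeholder, not from this commit):

```bash
# Sketch only: create a local .env that config.py picks up via load_dotenv().
cat > .env <<'EOF'
OPENAI_API_KEY=sk-your-key-here
EOF
```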
src/reachy_mini_conversation_demo/dance_emotion_moves.py CHANGED
@@ -5,15 +5,16 @@ and executed sequentially by the MovementManager.
 """
 
 from __future__ import annotations
-
 import logging
 from typing import Tuple
 
 import numpy as np
+
 from reachy_mini.motion.move import Move
 from reachy_mini.motion.recorded_move import RecordedMoves
 from reachy_mini_dances_library.dance_move import DanceMove
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -30,9 +31,7 @@ class DanceQueueMove(Move):
 """Duration property required by official Move interface."""
 return self.dance_move.duration
 
-def evaluate(
-self, t: float
-) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
 """Evaluate dance move at time t."""
 try:
 # Get the pose from the dance move
@@ -45,9 +44,7 @@
 return (head_pose, antennas, body_yaw)
 
 except Exception as e:
-logger.error(
-f"Error evaluating dance move '{self.move_name}' at t={t}: {e}"
-)
+logger.error(f"Error evaluating dance move '{self.move_name}' at t={t}: {e}")
 # Return neutral pose on error
 from reachy_mini.utils import create_head_pose
 
@@ -68,9 +65,7 @@ class EmotionQueueMove(Move):
 """Duration property required by official Move interface."""
 return self.emotion_move.duration
 
-def evaluate(
-self, t: float
-) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
 """Evaluate emotion move at time t."""
 try:
 # Get the pose from the emotion move
@@ -83,9 +78,7 @@
 return (head_pose, antennas, body_yaw)
 
 except Exception as e:
-logger.error(
-f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}"
-)
+logger.error(f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}")
 # Return neutral pose on error
 from reachy_mini.utils import create_head_pose
 
@@ -120,9 +113,7 @@ class GotoQueueMove(Move):
 """Duration property required by official Move interface."""
 return self._duration
 
-def evaluate(
-self, t: float
-) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
 """Evaluate goto move at time t using linear interpolation."""
 try:
 from reachy_mini.utils import create_head_pose
@@ -138,32 +129,23 @@
 start_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
 
 # Interpolate head pose
-head_pose = linear_pose_interpolation(
-start_pose, self.target_head_pose, t_clamped
-)
+head_pose = linear_pose_interpolation(start_pose, self.target_head_pose, t_clamped)
 
 # Interpolate antennas - return as numpy array
 antennas = np.array(
 [
-self.start_antennas[0]
-+ (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
-self.start_antennas[1]
-+ (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
+self.start_antennas[0] + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
+self.start_antennas[1] + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
 ]
 )
 
 # Interpolate body yaw
-body_yaw = (
-self.start_body_yaw
-+ (self.target_body_yaw - self.start_body_yaw) * t_clamped
-)
+body_yaw = self.start_body_yaw + (self.target_body_yaw - self.start_body_yaw) * t_clamped
 
 return (head_pose, antennas, body_yaw)
 
 except Exception as e:
 logger.error(f"Error evaluating goto move at t={t}: {e}")
 # Return target pose on error - convert antennas to numpy array
-target_antennas_array = np.array(
-[self.target_antennas[0], self.target_antennas[1]]
-)
+target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]])
 return (self.target_head_pose, target_antennas_array, self.target_body_yaw)
src/reachy_mini_conversation_demo/main.py CHANGED
@@ -5,17 +5,17 @@ import os
 import gradio as gr
 from fastapi import FastAPI
 from fastrtc import Stream
-from reachy_mini import ReachyMini
 
-from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
+from reachy_mini import ReachyMini
 from reachy_mini_conversation_demo.moves import MovementManager
-from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
 from reachy_mini_conversation_demo.tools import ToolDependencies
 from reachy_mini_conversation_demo.utils import (
-handle_vision_stuff,
 parse_args,
 setup_logger,
+handle_vision_stuff,
 )
+from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
+from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
 
 
 def update_chatbot(chatbot: list[dict], response: dict):
@@ -51,7 +51,7 @@ def main():
 head_wobbler=head_wobbler,
 )
 current_file_path = os.path.dirname(os.path.abspath(__file__))
-logger.info(f"Current file absolute path: {current_file_path}")
+logger.debug(f"Current file absolute path: {current_file_path}")
 chatbot = gr.Chatbot(
 type="messages",
 resizable=True,
@@ -60,7 +60,7 @@
 os.path.join(current_file_path, "images", "reachymini_avatar.png"),
 ),
 )
-logger.info(f"Chatbot avatar images: {chatbot.avatar_images}")
+logger.debug(f"Chatbot avatar images: {chatbot.avatar_images}")
 
 handler = OpenaiRealtimeHandler(deps)
 stream = Stream(
src/reachy_mini_conversation_demo/moves.py CHANGED
@@ -7,29 +7,28 @@ This module implements the movement architecture from main_works.py:
 """
 
 from __future__ import annotations
-
+import time
 import logging
 import threading
-import time
+from typing import Tuple, Optional
 from collections import deque
 from dataclasses import dataclass
-from typing import Optional, Tuple
 
 import numpy as np
+
 from reachy_mini import ReachyMini
-from reachy_mini.motion.move import Move
 from reachy_mini.utils import create_head_pose
+from reachy_mini.motion.move import Move
 from reachy_mini.utils.interpolation import (
 compose_world_offset,
 linear_pose_interpolation,
 )
 
+
 logger = logging.getLogger(__name__)
 
 # Type definitions
-FullBodyPose = Tuple[
-np.ndarray, Tuple[float, float], float
-] # (head_pose_4x4, antennas, body_yaw)
+FullBodyPose = Tuple[np.ndarray, Tuple[float, float], float] # (head_pose_4x4, antennas, body_yaw)
 
 
 class BreathingMove(Move):
@@ -68,9 +67,7 @@ class BreathingMove(Move):
 """Duration property required by official Move interface."""
 return float("inf") # Continuous breathing (never ends naturally)
 
-def evaluate(
-self, t: float
-) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
 """Evaluate breathing move at time t."""
 if t < self.interpolation_duration:
 # Phase 1: Interpolate to neutral base position
@@ -83,35 +80,26 @@
 
 # Interpolate antennas
 antennas = (
-(1 - interpolation_t) * self.interpolation_start_antennas
-+ interpolation_t * self.neutral_antennas
-)
+1 - interpolation_t
+) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas
 
 else:
 # Phase 2: Breathing patterns from neutral base
 breathing_time = t - self.interpolation_duration
 
 # Gentle z-axis breathing
-z_offset = self.breathing_z_amplitude * np.sin(
-2 * np.pi * self.breathing_frequency * breathing_time
-)
-head_pose = create_head_pose(
-x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False
-)
+z_offset = self.breathing_z_amplitude * np.sin(2 * np.pi * self.breathing_frequency * breathing_time)
+head_pose = create_head_pose(x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False)
 
 # Antenna sway (opposite directions)
-antenna_sway = self.antenna_sway_amplitude * np.sin(
-2 * np.pi * self.antenna_frequency * breathing_time
-)
+antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
 antennas = np.array([antenna_sway, -antenna_sway])
 
 # Return in official Move interface format: (head_pose, antennas_array, body_yaw)
 return (head_pose, antennas, 0.0)
 
 
-def combine_full_body(
-primary_pose: FullBodyPose, secondary_pose: FullBodyPose
-) -> FullBodyPose:
+def combine_full_body(primary_pose: FullBodyPose, secondary_pose: FullBodyPose) -> FullBodyPose:
 """Combine primary and secondary full body poses.
 
 Args:
@@ -127,9 +115,7 @@ def combine_full_body(
 
 # Combine head poses using compose_world_offset
 # primary_head is T_abs, secondary_head is T_off_world
-combined_head = compose_world_offset(
-primary_head, secondary_head, reorthonormalize=True
-)
+combined_head = compose_world_offset(primary_head, secondary_head, reorthonormalize=True)
 
 # Sum antennas and body_yaw
 combined_antennas = (
@@ -226,9 +212,7 @@ class MovementManager:
 self._thread: Optional[threading.Thread] = None
 self._state_lock = threading.RLock()
 self._is_listening = False
-self._last_commanded_pose: FullBodyPose = clone_full_body_pose(
-self.state.last_primary_pose
-)
+self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
 self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
 self._antenna_unfreeze_blend = 1.0
 self._antenna_blend_duration = 0.4 # seconds to blend back after listening
@@ -239,9 +223,7 @@
 with self._state_lock:
 self.move_queue.append(move)
 self.state.update_activity()
-logger.info(
-f"Queued move with duration {move.duration}s, queue size: {len(self.move_queue)}"
-)
+logger.info(f"Queued move with duration {move.duration}s, queue size: {len(self.move_queue)}")
 
 def clear_queue(self) -> None:
 """Clear all queued moves and stop current move."""
@@ -252,22 +234,16 @@
 self.state.is_playing_move = False
 logger.info("Cleared move queue and stopped current move")
 
-def set_speech_offsets(
-self, offsets: Tuple[float, float, float, float, float, float]
-) -> None:
+def set_speech_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
 """Set speech head offsets (secondary move)."""
 with self._state_lock:
 self.state.speech_offsets = offsets
 
-def set_offsets(
-self, offsets: Tuple[float, float, float, float, float, float]
-) -> None:
+def set_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
 """Compatibility alias for set_speech_offsets."""
 self.set_speech_offsets(offsets)
 
-def set_face_tracking_offsets(
-self, offsets: Tuple[float, float, float, float, float, float]
-) -> None:
+def set_face_tracking_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
 """Set face tracking offsets (secondary move)."""
 with self._state_lock:
 self.state.face_tracking_offsets = offsets
@@ -314,8 +290,7 @@
 with self._state_lock:
 if self.state.current_move is None or (
 self.state.move_start_time is not None
-and current_time - self.state.move_start_time
->= self.state.current_move.duration
+and current_time - self.state.move_start_time >= self.state.current_move.duration
 ):
 self.state.current_move = None
 self.state.move_start_time = None
@@ -323,9 +298,7 @@
 if self.move_queue:
 self.state.current_move = self.move_queue.popleft()
 self.state.move_start_time = current_time
-logger.info(
-f"Starting new move, duration: {self.state.current_move.duration}s"
-)
+logger.debug(f"Starting new move, duration: {self.state.current_move.duration}s")
 
 def _manage_breathing(self, current_time: float) -> None:
 """Manage automatic breathing when idle."""
@@ -336,9 +309,7 @@
 
 if self.is_idle():
 try:
-_, current_antennas = (
-self.current_robot.get_current_joint_positions()
-)
+_, current_antennas = self.current_robot.get_current_joint_positions()
 current_head_pose = self.current_robot.get_current_head_pose()
 
 breathing_move = BreathingMove(
@@ -348,9 +319,7 @@
 )
 self.move_queue.append(breathing_move)
 self.state.update_activity()
-logger.info(
-f"Started breathing after {time_since_activity:.1f}s of inactivity"
-)
+logger.debug(f"Started breathing after {time_since_activity:.1f}s of inactivity")
 except Exception as e:
 logger.error(f"Failed to start breathing: {e}")
 
@@ -367,10 +336,7 @@
 """Get the primary full body pose from current move or neutral."""
 with self._state_lock:
 # When a primary move is playing, sample it and cache the resulting pose
-if (
-self.state.current_move is not None
-and self.state.move_start_time is not None
-):
+if self.state.current_move is not None and self.state.move_start_time is not None:
 move_time = current_time - self.state.move_start_time
 head, antennas, body_yaw = self.state.current_move.evaluate(move_time)
 
@@ -391,26 +357,18 @@
 
 self.state.is_playing_move = True
 self.state.is_moving = True
-self.state.last_primary_pose = clone_full_body_pose(
-primary_full_body_pose
-)
+self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
 else:
 # Otherwise reuse the last primary pose so we avoid jumps between moves
 self.state.is_playing_move = False
-self.state.is_moving = (
-time.time() - self.state.moving_start < self.state.moving_for
-)
+self.state.is_moving = time.time() - self.state.moving_start < self.state.moving_for
 
 if self.state.last_primary_pose is not None:
-primary_full_body_pose = clone_full_body_pose(
-self.state.last_primary_pose
-)
+primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
 else:
 neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
 primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
-self.state.last_primary_pose = clone_full_body_pose(
-primary_full_body_pose
-)
+self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
 
 return primary_full_body_pose
 
@@ -456,14 +414,14 @@
 self._stop_event.clear()
 self._thread = threading.Thread(target=self.working_loop, daemon=True)
 self._thread.start()
-logger.info("Move worker started")
+logger.debug("Move worker started")
 
 def stop(self) -> None:
 """Stop the move worker loop."""
 self._stop_event.set()
 if self._thread is not None:
 self._thread.join()
-logger.info("Move worker stopped")
+logger.debug("Move worker stopped")
 
 def working_loop(self) -> None:
 """Control loop main movements - reproduces main_works.py control architecture.
@@ -496,9 +454,7 @@
 secondary_full_body_pose = self._get_secondary_pose()
 
 # 6. Combine primary and secondary poses
-global_full_body_pose = combine_full_body(
-primary_full_body_pose, secondary_full_body_pose
-)
+global_full_body_pose = combine_full_body(primary_full_body_pose, secondary_full_body_pose)
 
 # 7. Extract pose components
 head, antennas, body_yaw = global_full_body_pose
@@ -539,16 +495,12 @@
 
 # 8. Single set_target call - the one and only place we control the robot
 try:
-self.current_robot.set_target(
-head=head, antennas=antennas_cmd, body_yaw=body_yaw
-)
+self.current_robot.set_target(head=head, antennas=antennas_cmd, body_yaw=body_yaw)
 except Exception as e:
 logger.error(f"Failed to set robot target: {e}")
 else:
 with self._state_lock:
-self._last_commanded_pose = clone_full_body_pose(
-(head, antennas_cmd, body_yaw)
-)
+self._last_commanded_pose = clone_full_body_pose((head, antennas_cmd, body_yaw))
 
 # 9. Calculate computation time and adjust sleep for 50Hz
 computation_time = time.time() - loop_start_time
@@ -558,9 +510,7 @@
 if loop_count % 100 == 0:
 elapsed = current_time - last_print_time
 actual_freq = 100.0 / elapsed if elapsed > 0 else 0
-potential_freq = (
-1.0 / computation_time if computation_time > 0 else float("inf")
-)
+potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
 logger.debug(
 f"Loop freq - Actual: {actual_freq:.1f}Hz, Potential: {potential_freq:.1f}Hz, Target: {self.target_frequency:.1f}Hz"
 )
src/reachy_mini_conversation_demo/openai_realtime.py CHANGED
@@ -1,19 +1,21 @@
-import asyncio # noqa: D100
-import base64
 import json
+import base64
+import asyncio
 import logging
 from datetime import datetime
 
-import gradio as gr
 import numpy as np
-from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
+import gradio as gr
 from openai import AsyncOpenAI
+from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
 
 from reachy_mini_conversation_demo.tools import (
 ALL_TOOL_SPECS,
 ToolDependencies,
 dispatch_tool_call,
 )
+from reachy_mini_conversation_demo.config import config
+
 
 logger = logging.getLogger(__name__)
 
@@ -45,7 +47,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 
 async def start_up(self):
 """Start the handler."""
-self.client = AsyncOpenAI()
+self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
 async with self.client.beta.realtime.connect(model="gpt-realtime") as conn:
 await conn.session.update(
 session={
@@ -92,35 +94,22 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 pass
 # self.deps.head_wobbler.reset()
 
-if (
-event.type
-== "conversation.item.input_audio_transcription.completed"
-):
+if event.type == "conversation.item.input_audio_transcription.completed":
 logger.debug(f"user transcript: {event.transcript}")
-await self.output_queue.put(
-AdditionalOutputs({"role": "user", "content": event.transcript})
-)
+await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
 
 if event.type == "response.audio_transcript.done":
 logger.debug(f"assistant transcript: {event.transcript}")
-await self.output_queue.put(
-AdditionalOutputs(
-{"role": "assistant", "content": event.transcript}
-)
-)
+await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
 
 if event.type == "response.audio.delta":
 self.deps.head_wobbler.feed(event.delta)
 self.last_activity_time = asyncio.get_event_loop().time()
-logger.debug(
-"last activity time updated to %s", self.last_activity_time
-)
+logger.debug("last activity time updated to %s", self.last_activity_time)
 await self.output_queue.put(
 (
 self.output_sample_rate,
-np.frombuffer(
-base64.b64decode(event.delta), dtype=np.int16
-).reshape(1, -1),
+np.frombuffer(base64.b64decode(event.delta), dtype=np.int16).reshape(1, -1),
 ),
 )
 
@@ -154,9 +143,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 args_json_str = info["args_buf"] or "{}"
 
 try:
-tool_result = await dispatch_tool_call(
-tool_name, args_json_str, self.deps
-)
+tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
 logger.debug("[Tool %s executed]", tool_name)
 logger.debug("Tool result: %s", tool_result)
 except Exception as e:
@@ -177,9 +164,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 {
 "role": "assistant",
 "content": json.dumps(tool_result),
-"metadata": dict(
-title="🛠️ Used tool " + tool_name, status="done"
-),
+"metadata": {"title": "🛠️ Used tool " + tool_name, "status": "done"},
 },
 )
 )
@@ -231,11 +216,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 err = getattr(event, "error", None)
 msg = getattr(err, "message", str(err) if err else "unknown error")
 logger.error("Realtime error: %s (raw=%s)", msg, err)
-await self.output_queue.put(
-AdditionalOutputs(
-{"role": "assistant", "content": f"[error] {msg}"}
-)
-)
+await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
 
 # Microphone receive
 async def receive(self, frame: tuple[int, np.ndarray]) -> None:
@@ -258,9 +239,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
 await self.send_idle_signal(idle_duration)
 
-self.last_activity_time = (
-asyncio.get_event_loop().time()
-) # avoid repeated resets
+self.last_activity_time = asyncio.get_event_loop().time() # avoid repeated resets
 
 return await wait_for_item(self.output_queue)
src/reachy_mini_conversation_demo/prompts.py CHANGED
@@ -1,6 +1,6 @@
1
  """Nothing (for ruff)."""
2
 
3
- SESSION_INSTRUCTIONS = r"""
4
  ### IDENTITY
5
  You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
6
  You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.
 
1
  """Nothing (for ruff)."""
2
 
3
+ SESSION_INSTRUCTIONS = r"""
4
  ### IDENTITY
5
  You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
6
  You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.
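`SESSION_INSTRUCTIONS` (shown truncated above) is the persona text that ends up in the realtime session configuration. A hedged sketch of how such a constant is typically passed through `conn.session.update(...)`; the exact payload the handler builds (voice, tools, transcription settings) is not reproduced here:

```python
# Hypothetical wiring of the persona prompt into the realtime session.
from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS

async def configure_session(conn) -> None:
    """Push the persona text once the realtime connection is open."""
    await conn.session.update(
        session={
            "instructions": SESSION_INSTRUCTIONS,  # persona defined in prompts.py
        }
    )
```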
src/reachy_mini_conversation_demo/tools.py CHANGED
@@ -1,17 +1,17 @@
1
- from __future__ import annotations # noqa: D100
2
-
3
  import abc
 
 
4
  import asyncio
5
  import inspect
6
- import json
7
  import logging
8
- import time
9
- from dataclasses import dataclass
10
  from typing import Any, Dict, Literal, Optional
 
11
 
12
  from reachy_mini import ReachyMini
13
  from reachy_mini.utils import create_head_pose
14
 
 
15
  # from reachy_mini_conversation_demo.vision.processors import VisionManager
16
 
17
  logger = logging.getLogger(__name__)
@@ -22,11 +22,10 @@ ENABLE_FACE_RECOGNITION = False
22
  try:
23
  from reachy_mini.motion.recorded_move import RecordedMoves
24
  from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
25
-
26
  from reachy_mini_conversation_demo.dance_emotion_moves import (
 
27
  DanceQueueMove,
28
  EmotionQueueMove,
29
- GotoQueueMove,
30
  )
31
 
32
  # Initialize recorded moves for emotions
@@ -183,9 +182,7 @@ class MoveHead(Tool):
183
  current_antennas[1],
184
  ), # Skip body_yaw
185
  target_body_yaw=0, # Reset body yaw
186
- start_body_yaw=current_antennas[
187
- 0
188
- ], # body_yaw is first in joint positions
189
  duration=deps.motion_duration_s,
190
  )
191
 
@@ -236,15 +233,11 @@ class Camera(Tool):
236
 
237
  # Use vision manager for processing if available
238
  if deps.vision_manager is not None:
239
- result = await asyncio.to_thread(
240
- deps.vision_manager.processor.process_image, frame, image_query
241
- )
242
  if isinstance(result, dict) and "error" in result:
243
  return result
244
  return (
245
- {"image_description": result}
246
- if isinstance(result, str)
247
- else {"error": "vision returned non-string"}
248
  )
249
  else:
250
  # Return base64 encoded image like main_works.py camera tool
@@ -388,8 +381,8 @@ class Dance(Tool):
388
  "properties": {
389
  "move": {
390
  "type": "string",
391
- "description": """Name of the move; use 'random' or omit for random.
392
- Here is a list of the available moves:
393
  simple_nod: A simple, continuous up-and-down nodding motion.
394
  head_tilt_roll: A continuous side-to-side head roll (ear to shoulder).
395
  side_to_side_sway: A smooth, side-to-side sway of the entire head.
@@ -436,9 +429,7 @@ class Dance(Tool):
436
  move_name = random.choice(list(AVAILABLE_MOVES.keys()))
437
 
438
  if move_name not in AVAILABLE_MOVES:
439
- return {
440
- "error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"
441
- }
442
 
443
  # Add dance moves to queue
444
  movement_manager = deps.movement_manager
@@ -523,9 +514,7 @@ class PlayEmotion(Tool):
523
  try:
524
  emotion_names = RECORDED_MOVES.list_moves()
525
  if emotion_name not in emotion_names:
526
- return {
527
- "error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"
528
- }
529
 
530
  # Add emotion to queue
531
  movement_manager = deps.movement_manager
@@ -604,9 +593,7 @@ class FaceRecognition(Tool):
604
  cv2.imwrite(temp_path, frame)
605
 
606
  # Use DeepFace to find face
607
- results = await asyncio.to_thread(
608
- DeepFace.find, img_path=temp_path, db_path="./pollen_faces"
609
- )
610
 
611
  if len(results) == 0:
612
  return {"error": "Didn't recognize the face"}
@@ -681,9 +668,7 @@ def _safe_load_obj(args_json: str) -> dict[str, Any]:
681
  return {}
682
 
683
 
684
- async def dispatch_tool_call(
685
- tool_name: str, args_json: str, deps: ToolDependencies
686
- ) -> Dict[str, Any]:
687
  """Dispatch a tool call by name with JSON args and dependencies."""
688
  tool = ALL_TOOLS.get(tool_name)
689
 
 
1
+ from __future__ import annotations
 
2
  import abc
3
+ import json
4
+ import time
5
  import asyncio
6
  import inspect
 
7
  import logging
 
 
8
  from typing import Any, Dict, Literal, Optional
9
+ from dataclasses import dataclass
10
 
11
  from reachy_mini import ReachyMini
12
  from reachy_mini.utils import create_head_pose
13
 
14
+
15
  # from reachy_mini_conversation_demo.vision.processors import VisionManager
16
 
17
  logger = logging.getLogger(__name__)
 
22
  try:
23
  from reachy_mini.motion.recorded_move import RecordedMoves
24
  from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
 
25
  from reachy_mini_conversation_demo.dance_emotion_moves import (
26
+ GotoQueueMove,
27
  DanceQueueMove,
28
  EmotionQueueMove,
 
29
  )
30
 
31
  # Initialize recorded moves for emotions
 
182
  current_antennas[1],
183
  ), # Skip body_yaw
184
  target_body_yaw=0, # Reset body yaw
185
+ start_body_yaw=current_antennas[0], # body_yaw is first in joint positions
 
 
186
  duration=deps.motion_duration_s,
187
  )
188
 
 
233
 
234
  # Use vision manager for processing if available
235
  if deps.vision_manager is not None:
236
+ result = await asyncio.to_thread(deps.vision_manager.processor.process_image, frame, image_query)
 
 
237
  if isinstance(result, dict) and "error" in result:
238
  return result
239
  return (
240
+ {"image_description": result} if isinstance(result, str) else {"error": "vision returned non-string"}
 
 
241
  )
242
  else:
243
  # Return base64 encoded image like main_works.py camera tool
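The `asyncio.to_thread` call added above is the key pattern in this tool: the VLM call is synchronous and slow, so it is pushed onto a worker thread to keep the realtime event loop streaming audio. A self-contained sketch with a stand-in `process_image`; the real processor lives on `deps.vision_manager.processor`:

```python
import asyncio

def process_image(frame, query: str) -> str:
    # Stand-in for the real blocking VLM call.
    return f"a placeholder description for query {query!r}"

async def describe(frame, query: str) -> dict:
    try:
        result = await asyncio.to_thread(process_image, frame, query)
    except Exception as exc:
        return {"error": str(exc)}
    if isinstance(result, str):
        return {"image_description": result}
    return {"error": "vision returned non-string"}
```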
 
381
  "properties": {
382
  "move": {
383
  "type": "string",
384
+ "description": """Name of the move; use 'random' or omit for random.
385
+ Here is a list of the available moves:
386
  simple_nod: A simple, continuous up-and-down nodding motion.
387
  head_tilt_roll: A continuous side-to-side head roll (ear to shoulder).
388
  side_to_side_sway: A smooth, side-to-side sway of the entire head.
 
429
  move_name = random.choice(list(AVAILABLE_MOVES.keys()))
430
 
431
  if move_name not in AVAILABLE_MOVES:
432
+ return {"error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"}
 
 
433
 
434
  # Add dance moves to queue
435
  movement_manager = deps.movement_manager
 
514
  try:
515
  emotion_names = RECORDED_MOVES.list_moves()
516
  if emotion_name not in emotion_names:
517
+ return {"error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"}
 
 
518
 
519
  # Add emotion to queue
520
  movement_manager = deps.movement_manager
 
593
  cv2.imwrite(temp_path, frame)
594
 
595
  # Use DeepFace to find face
596
+ results = await asyncio.to_thread(DeepFace.find, img_path=temp_path, db_path="./pollen_faces")
 
 
597
 
598
  if len(results) == 0:
599
  return {"error": "Didn't recognize the face"}
 
668
  return {}
669
 
670
 
671
+ async def dispatch_tool_call(tool_name: str, args_json: str, deps: ToolDependencies) -> Dict[str, Any]:
 
 
672
  """Dispatch a tool call by name with JSON args and dependencies."""
673
  tool = ALL_TOOLS.get(tool_name)
674
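`dispatch_tool_call` resolves the model-supplied tool name against `ALL_TOOLS` and feeds it the JSON arguments buffered from the realtime events. A minimal sketch of that registry-and-dispatch shape; the callable signature below is illustrative, and the real `Tool` objects also receive a `ToolDependencies` instance:

```python
import json
from typing import Any, Awaitable, Callable, Dict

ToolFn = Callable[..., Awaitable[Dict[str, Any]]]

async def dispatch(tool_name: str, args_json: str, registry: Dict[str, ToolFn]) -> Dict[str, Any]:
    """Look up a tool by name and call it with leniently parsed JSON arguments."""
    fn = registry.get(tool_name)
    if fn is None:
        return {"error": f"Unknown tool '{tool_name}'"}
    try:
        args = json.loads(args_json) if args_json else {}
    except json.JSONDecodeError:
        args = {}  # malformed JSON degrades to "no arguments" rather than raising
    if not isinstance(args, dict):
        args = {}
    return await fn(**args)
```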
 
src/reachy_mini_conversation_demo/utils.py CHANGED
@@ -1,5 +1,5 @@
1
- import argparse # noqa: D100
2
  import logging
 
3
  import warnings
4
 
5
  from reachy_mini_conversation_demo.camera_worker import CameraWorker
@@ -15,15 +15,9 @@ def parse_args():
15
  default=None,
16
  help="Choose head tracker (default: mediapipe)",
17
  )
18
- parser.add_argument(
19
- "--no-camera", default=False, action="store_true", help="Disable camera usage"
20
- )
21
- parser.add_argument(
22
- "--headless", default=False, action="store_true", help="Run in headless mode"
23
- )
24
- parser.add_argument(
25
- "--debug", default=False, action="store_true", help="Enable debug logging"
26
- )
27
  return parser.parse_args()
28
 
29
 
 
 
1
  import logging
2
+ import argparse
3
  import warnings
4
 
5
  from reachy_mini_conversation_demo.camera_worker import CameraWorker
 
15
  default=None,
16
  help="Choose head tracker (default: mediapipe)",
17
  )
18
+ parser.add_argument("--no-camera", default=False, action="store_true", help="Disable camera usage")
19
+ parser.add_argument("--headless", default=False, action="store_true", help="Run in headless mode")
20
+ parser.add_argument("--debug", default=False, action="store_true", help="Enable debug logging")
 
 
 
 
 
 
21
  return parser.parse_args()
22
 
23
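For reference, the three flags collapsed onto single lines above are consumed at startup roughly like this; a sketch only, since the real `main()` wiring is not shown in this diff:

```python
import logging

def configure_from_args(args) -> None:
    """Apply the CLI flags produced by parse_args()."""
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    if args.no_camera:
        logging.info("Camera disabled; vision tools will be unavailable.")
    if args.headless:
        logging.info("Headless mode: no local display windows will be opened.")
```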
 
src/reachy_mini_conversation_demo/vision/processors.py CHANGED
@@ -1,18 +1,19 @@
1
- import asyncio # noqa: D100
2
- import base64
3
- import logging
4
  import os
5
  import sys
6
- import threading
7
  import time
8
- from dataclasses import dataclass
9
  from typing import Any, Dict
 
10
 
11
  import cv2
12
  import numpy as np
13
  import torch
 
14
  from huggingface_hub import snapshot_download
15
- from transformers import AutoModelForImageTextToText, AutoProcessor
16
 
17
  logger = logging.getLogger(__name__)
18
 
@@ -61,9 +62,7 @@ class VisionProcessor:
61
  def initialize(self) -> bool:
62
  """Load model and processor onto the selected device."""
63
  try:
64
- logger.info(
65
- f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})"
66
- )
67
  self.processor = AutoProcessor.from_pretrained(self.model_path)
68
 
69
  # Select dtype depending on device
@@ -81,9 +80,7 @@ class VisionProcessor:
81
  model_kwargs["_attn_implementation"] = "flash_attention_2"
82
 
83
  # Load model weights
84
- self.model = AutoModelForImageTextToText.from_pretrained(
85
- self.model_path, **model_kwargs
86
- ).to(self.device)
87
 
88
  self.model.eval()
89
  self._initialized = True
@@ -138,10 +135,7 @@ class VisionProcessor:
138
  )
139
 
140
  # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
141
- inputs = {
142
- k: (v.to(self.device) if hasattr(v, "to") else v)
143
- for k, v in inputs.items()
144
- }
145
 
146
  with torch.no_grad():
147
  generated_ids = self.model.generate(
@@ -246,9 +240,7 @@ class VisionManager:
246
  )
247
 
248
  # Only update if we got a valid response
249
- if description and not description.startswith(
250
- ("Vision", "Failed", "Error")
251
- ):
252
  self._current_description = description
253
  self._last_processed_time = current_time
254
 
@@ -268,18 +260,14 @@ class VisionManager:
268
  """Get the most recent scene description (thread-safe)."""
269
  return self._current_description
270
 
271
- async def process_current_frame(
272
- self, prompt: str = "Describe what you see in detail."
273
- ) -> Dict[str, Any]:
274
  """Process current camera frame with custom prompt."""
275
  try:
276
  success, frame = self.camera.read()
277
  if not success or frame is None:
278
  return {"error": "Failed to capture image from camera"}
279
 
280
- description = await asyncio.to_thread(
281
- lambda: self.processor.process_image(frame, prompt)
282
- )
283
 
284
  return {
285
  "description": description,
@@ -335,9 +323,7 @@ def create_vision_processor(config: VisionConfig):
335
  return VisionProcessor(config)
336
 
337
 
338
- def init_vision(
339
- camera: cv2.VideoCapture, processor_type: str = "local"
340
- ) -> VisionManager:
341
  """Initialize vision manager with the specified processor type."""
342
  model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
343
1
  import os
2
  import sys
 
3
  import time
4
+ import base64
5
+ import asyncio
6
+ import logging
7
+ import threading
8
  from typing import Any, Dict
9
+ from dataclasses import dataclass
10
 
11
  import cv2
12
  import numpy as np
13
  import torch
14
+ from transformers import AutoProcessor, AutoModelForImageTextToText
15
  from huggingface_hub import snapshot_download
16
+
17
 
18
  logger = logging.getLogger(__name__)
19
 
 
62
  def initialize(self) -> bool:
63
  """Load model and processor onto the selected device."""
64
  try:
65
+ logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})")
 
 
66
  self.processor = AutoProcessor.from_pretrained(self.model_path)
67
 
68
  # Select dtype depending on device
 
80
  model_kwargs["_attn_implementation"] = "flash_attention_2"
81
 
82
  # Load model weights
83
+ self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)
 
 
84
 
85
  self.model.eval()
86
  self._initialized = True
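The single-line load above folds together the device/dtype selection from the surrounding hunk: pick a reduced-precision dtype and flash attention only when CUDA is available, then move the model to the device and switch to eval mode. A sketch of the same sequence; the dtype values are assumptions, and only the `_attn_implementation` flag appears verbatim in the diff:

```python
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

def load_vlm(model_path: str):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = AutoProcessor.from_pretrained(model_path)
    kwargs = {"torch_dtype": torch.bfloat16 if device == "cuda" else torch.float32}
    if device == "cuda":
        # Flash attention is only worth enabling on CUDA builds that ship it.
        kwargs["_attn_implementation"] = "flash_attention_2"
    model = AutoModelForImageTextToText.from_pretrained(model_path, **kwargs).to(device).eval()
    return processor, model, device
```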
 
135
  )
136
 
137
  # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
138
+ inputs = {k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
139
 
140
  with torch.no_grad():
141
  generated_ids = self.model.generate(
 
240
  )
241
 
242
  # Only update if we got a valid response
243
+ if description and not description.startswith(("Vision", "Failed", "Error")):
 
 
244
  self._current_description = description
245
  self._last_processed_time = current_time
246
 
 
260
  """Get the most recent scene description (thread-safe)."""
261
  return self._current_description
262
 
263
+ async def process_current_frame(self, prompt: str = "Describe what you see in detail.") -> Dict[str, Any]:
 
 
264
  """Process current camera frame with custom prompt."""
265
  try:
266
  success, frame = self.camera.read()
267
  if not success or frame is None:
268
  return {"error": "Failed to capture image from camera"}
269
 
270
+ description = await asyncio.to_thread(lambda: self.processor.process_image(frame, prompt))
 
 
271
 
272
  return {
273
  "description": description,
 
323
  return VisionProcessor(config)
324
 
325
 
326
+ def init_vision(camera: cv2.VideoCapture, processor_type: str = "local") -> VisionManager:
 
 
327
  """Initialize vision manager with the specified processor type."""
328
  model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
329
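Caller-side, `init_vision` plus `process_current_frame` is enough to get an on-demand scene description. A hypothetical usage sketch: the camera index and prompt are examples, and it assumes `init_vision` returns a fully initialized processor:

```python
import asyncio
import cv2

async def demo_vision() -> None:
    camera = cv2.VideoCapture(0)                  # default webcam
    manager = init_vision(camera)                 # local SmolVLM2 processor
    result = await manager.process_current_frame("What is on the table?")
    print(result.get("description", result))      # error dict is printed as-is
    camera.release()

# asyncio.run(demo_vision())
```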
 
src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py CHANGED
@@ -1,12 +1,12 @@
1
- from __future__ import annotations # noqa: D100
2
-
3
  import logging
4
- from typing import Optional, Tuple
5
 
6
  import numpy as np
7
- from huggingface_hub import hf_hub_download
8
  from supervision import Detections
9
  from ultralytics import YOLO
 
 
10
 
11
  logger = logging.getLogger(__name__)
12
 
@@ -94,9 +94,7 @@ class HeadTracker:
94
 
95
  return np.array([norm_x, norm_y], dtype=np.float32)
96
 
97
- def get_eyes(
98
- self, img: np.ndarray
99
- ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
100
  """Get eye positions (approximated from face bbox).
101
 
102
  Note: YOLO only provides face bbox, so we estimate eye positions
@@ -131,20 +129,14 @@ class HeadTracker:
131
  right_eye_x = bbox[0] + face_width * 0.65
132
 
133
  # Convert to MediaPipe coordinates
134
- left_eye = np.array(
135
- [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
136
- )
137
- right_eye = np.array(
138
- [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
139
- )
140
 
141
  return left_eye, right_eye
142
 
143
  def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
144
  """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
145
- if not hasattr(face_landmarks, "_bbox") or not hasattr(
146
- face_landmarks, "_img_shape"
147
- ):
148
  raise ValueError("Face landmarks object missing required attributes")
149
 
150
  bbox = face_landmarks._bbox
@@ -158,12 +150,8 @@ class HeadTracker:
158
  left_eye_x = bbox[0] + face_width * 0.35
159
  right_eye_x = bbox[0] + face_width * 0.65
160
 
161
- left_eye = np.array(
162
- [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
163
- )
164
- right_eye = np.array(
165
- [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
166
- )
167
 
168
  return left_eye, right_eye
169
 
@@ -177,9 +165,7 @@ class HeadTracker:
177
  left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
178
  return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))
179
 
180
- def get_head_position(
181
- self, img: np.ndarray
182
- ) -> Tuple[Optional[np.ndarray], Optional[float]]:
183
  """Get head position from face detection.
184
 
185
  Args:
 
1
+ from __future__ import annotations
 
2
  import logging
3
+ from typing import Tuple, Optional
4
 
5
  import numpy as np
 
6
  from supervision import Detections
7
  from ultralytics import YOLO
8
+ from huggingface_hub import hf_hub_download
9
+
10
 
11
  logger = logging.getLogger(__name__)
12
 
 
94
 
95
  return np.array([norm_x, norm_y], dtype=np.float32)
96
 
97
+ def get_eyes(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
 
 
98
  """Get eye positions (approximated from face bbox).
99
 
100
  Note: YOLO only provides face bbox, so we estimate eye positions
 
129
  right_eye_x = bbox[0] + face_width * 0.65
130
 
131
  # Convert to MediaPipe coordinates
132
+ left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
133
+ right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
134
 
135
  return left_eye, right_eye
136
 
137
  def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
138
  """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
139
+ if not hasattr(face_landmarks, "_bbox") or not hasattr(face_landmarks, "_img_shape"):
 
 
140
  raise ValueError("Face landmarks object missing required attributes")
141
 
142
  bbox = face_landmarks._bbox
 
150
  left_eye_x = bbox[0] + face_width * 0.35
151
  right_eye_x = bbox[0] + face_width * 0.65
152
 
153
+ left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
154
+ right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
155
 
156
  return left_eye, right_eye
157
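Both eye estimates above map pixel coordinates into the [-1, 1] range used by the MediaPipe-style tracker, so the image centre becomes (0, 0). A small worked example of that normalization:

```python
import numpy as np

def to_normalized(x_px: float, y_px: float, w: int, h: int) -> np.ndarray:
    """Pixel coordinates -> [-1, 1] range, matching the conversion above."""
    return np.array([(x_px / w) * 2 - 1, (y_px / h) * 2 - 1], dtype=np.float32)

print(to_normalized(320, 240, 640, 480))  # centre of a 640x480 frame -> [0. 0.]
print(to_normalized(0, 0, 640, 480))      # top-left corner           -> [-1. -1.]
```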
 
 
165
  left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
166
  return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))
167
 
168
+ def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
 
 
169
  """Get head position from face detection.
170
 
171
  Args: