Alina Lozovskaya committed on
Commit 426e6c4 · 1 Parent(s): 8383ea5

Apply Ruff

src/reachy_mini_conversation_demo/__init__.py CHANGED
@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
src/reachy_mini_conversation_demo/audio/__init__.py CHANGED
@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
src/reachy_mini_conversation_demo/audio/gstreamer.py CHANGED
@@ -25,18 +25,16 @@ class GstPlayer:
         self.appsrc = Gst.ElementFactory.make("appsrc", None)
         self.appsrc.set_property("format", Gst.Format.TIME)
         self.appsrc.set_property("is-live", True)
-        caps = Gst.Caps.from_string(
-            f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved"
-        )
+        caps = Gst.Caps.from_string(f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved")
         self.appsrc.set_property("caps", caps)
         queue = Gst.ElementFactory.make("queue")
         audioconvert = Gst.ElementFactory.make("audioconvert")
         audioresample = Gst.ElementFactory.make("audioresample")

         # Try to pin specific output device; fallback to autoaudiosink
-        audiosink = _create_device_element(
-            direction="sink", name_substr=device_name
-        ) or Gst.ElementFactory.make("autoaudiosink")
+        audiosink = _create_device_element(direction="sink", name_substr=device_name) or Gst.ElementFactory.make(
+            "autoaudiosink"
+        )

         self.pipeline.add(self.appsrc)
         self.pipeline.add(queue)
@@ -104,9 +102,9 @@ class GstRecorder:
         self.pipeline = Gst.Pipeline.new("audio_recorder")

         # Create elements: try specific mic; fallback to default
-        autoaudiosrc = _create_device_element(
-            direction="source", name_substr=device_name
-        ) or Gst.ElementFactory.make("autoaudiosrc", None)
+        autoaudiosrc = _create_device_element(direction="source", name_substr=device_name) or Gst.ElementFactory.make(
+            "autoaudiosrc", None
+        )

         queue = Gst.ElementFactory.make("queue", None)
         audioconvert = Gst.ElementFactory.make("audioconvert", None)
@@ -117,9 +115,7 @@ class GstRecorder:
             raise RuntimeError("Failed to create GStreamer elements")

         # Force mono/S16LE at 24000; resample handles device SR (e.g., 16000 → 24000)
-        caps = Gst.Caps.from_string(
-            f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE"
-        )
+        caps = Gst.Caps.from_string(f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE")
         self.appsink.set_property("caps", caps)

         # Build pipeline
@@ -183,9 +179,7 @@ class GstRecorder:
         logger.info("Stopped Recorder")


-def _create_device_element(
-    direction: str, name_substr: Optional[str]
-) -> Optional[Gst.Element]:
+def _create_device_element(direction: str, name_substr: Optional[str]) -> Optional[Gst.Element]:
     """direction: 'source' or 'sink'.

     name_substr: case-insensitive substring matching device display name/description.
@@ -205,30 +199,15 @@ def _create_device_element(
         for dev in monitor.get_devices() or []:
             disp = dev.get_display_name() or ""
             props = dev.get_properties()
-            desc = (
-                props.get_string("device.description")
-                if props and props.has_field("device.description")
-                else ""
-            )
+            desc = props.get_string("device.description") if props and props.has_field("device.description") else ""
             logger.info(f"Device candidate: disp='{disp}', desc='{desc}'")

-            if (
-                name_substr.lower() in disp.lower()
-                or name_substr.lower() in desc.lower()
-            ):
+            if name_substr.lower() in disp.lower() or name_substr.lower() in desc.lower():
                 elem = dev.create_element(None)
-                factory = (
-                    elem.get_factory().get_name()
-                    if elem and elem.get_factory()
-                    else "<?>"
-                )
-                logger.info(
-                    f"Using {direction} device: '{disp or desc}' (factory='{factory}')"
-                )
+                factory = elem.get_factory().get_name() if elem and elem.get_factory() else "<?>"
+                logger.info(f"Using {direction} device: '{disp or desc}' (factory='{factory}')")
                 return elem
     finally:
         monitor.stop()
-    logging.getLogger(__name__).warning(
-        "Requested %s '%s' not found; using auto*", direction, name_substr
-    )
+    logging.getLogger(__name__).warning("Requested %s '%s' not found; using auto*", direction, name_substr)
     return None
src/reachy_mini_conversation_demo/audio/speech_tapper.py CHANGED
@@ -68,7 +68,7 @@ def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float:

 def _to_float32_mono(x: np.ndarray) -> np.ndarray:
     """Convert arbitrary PCM array to float32 mono in [-1,1].
-
+
     Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C).
     """
     a = np.asarray(x)
@@ -258,24 +258,9 @@ class SwayRollRT:
             * env
             * math.sin(2 * math.pi * SWAY_F_ROLL * self.t + self.phase_roll)
         )
-        x_mm = (
-            SWAY_A_X_MM
-            * loud
-            * env
-            * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
-        )
-        y_mm = (
-            SWAY_A_Y_MM
-            * loud
-            * env
-            * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
-        )
-        z_mm = (
-            SWAY_A_Z_MM
-            * loud
-            * env
-            * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)
-        )
+        x_mm = SWAY_A_X_MM * loud * env * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
+        y_mm = SWAY_A_Y_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
+        z_mm = SWAY_A_Z_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)

         out.append(
             {
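Note on the hunk above: the diff only shows the docstring of _to_float32_mono. A minimal sketch of what such a PCM-to-float32-mono helper can look like (an illustration under assumptions, not the project's actual implementation):

# Hypothetical helper mirroring the docstring above; the real _to_float32_mono may differ.
import numpy as np


def to_float32_mono_sketch(x: np.ndarray) -> np.ndarray:
    """Convert PCM of shape (N,), (1,N), (N,1), (C,N) or (N,C) to float32 mono in [-1, 1]."""
    a = np.asarray(x)
    if np.issubdtype(a.dtype, np.integer):
        # Integer PCM (e.g. int16): scale by the dtype's full-scale value.
        full_scale = float(np.iinfo(a.dtype).max)
        a = a.astype(np.float32) / full_scale
    else:
        a = a.astype(np.float32)
    if a.ndim == 2:
        # Treat the shorter axis as the channel axis and average it away.
        channel_axis = 0 if a.shape[0] <= a.shape[1] else 1
        a = a.mean(axis=channel_axis)
    return np.clip(a.reshape(-1), -1.0, 1.0)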
src/reachy_mini_conversation_demo/camera_worker.py CHANGED
@@ -114,17 +114,10 @@ class CameraWorker:
                 self.latest_frame = frame  # .copy()

                 # Check if face tracking was just disabled
-                if (
-                    self.previous_head_tracking_state
-                    and not self.is_head_tracking_enabled
-                ):
+                if self.previous_head_tracking_state and not self.is_head_tracking_enabled:
                     # Face tracking was just disabled - start interpolation to neutral
-                    self.last_face_detected_time = (
-                        current_time  # Trigger the face-lost logic
-                    )
-                    self.interpolation_start_time = (
-                        None  # Will be set by the face-lost interpolation
-                    )
+                    self.last_face_detected_time = current_time  # Trigger the face-lost logic
+                    self.interpolation_start_time = None  # Will be set by the face-lost interpolation
                     self.interpolation_start_pose = None

                 # Update tracking state
@@ -137,9 +130,7 @@ class CameraWorker:
                 if eye_center is not None:
                     # Face detected - immediately switch to tracking
                     self.last_face_detected_time = current_time
-                    self.interpolation_start_time = (
-                        None  # Stop any interpolation
-                    )
+                    self.interpolation_start_time = None  # Stop any interpolation

                     # Convert normalized coordinates to pixel coordinates
                     h, w, _ = frame.shape
@@ -159,9 +150,7 @@ class CameraWorker:

                     # Extract translation and rotation from the target pose directly
                     translation = target_pose[:3, 3]
-                    rotation = R.from_matrix(target_pose[:3, :3]).as_euler(
-                        "xyz", degrees=False
-                    )
+                    rotation = R.from_matrix(target_pose[:3, :3]).as_euler("xyz", degrees=False)

                     # Thread-safe update of face tracking offsets (use pose as-is)
                     with self.face_tracking_lock:
@@ -176,19 +165,14 @@ class CameraWorker:

                 else:
                     # No face detected while tracking enabled - set face lost timestamp
-                    if (
-                        self.last_face_detected_time is None
-                        or self.last_face_detected_time == current_time
-                    ):
+                    if self.last_face_detected_time is None or self.last_face_detected_time == current_time:
                         # Only update if we haven't already set a face lost time
                         # (current_time check prevents overriding the disable-triggered timestamp)
                         pass

                 # Handle smooth interpolation (works for both face-lost and tracking-disabled cases)
                 if self.last_face_detected_time is not None:
-                    time_since_face_lost = (
-                        current_time - self.last_face_detected_time
-                    )
+                    time_since_face_lost = current_time - self.last_face_detected_time

                     if time_since_face_lost >= self.face_lost_delay:
                         # Start interpolation if not already started
@@ -197,27 +181,17 @@ class CameraWorker:
                             # Capture current pose as start of interpolation
                             with self.face_tracking_lock:
                                 current_translation = self.face_tracking_offsets[:3]
-                                current_rotation_euler = self.face_tracking_offsets[
-                                    3:
-                                ]
+                                current_rotation_euler = self.face_tracking_offsets[3:]
                                 # Convert to 4x4 pose matrix
                                 self.interpolation_start_pose = np.eye(4)
-                                self.interpolation_start_pose[:3, 3] = (
-                                    current_translation
-                                )
-                                self.interpolation_start_pose[:3, :3] = (
-                                    R.from_euler(
-                                        "xyz", current_rotation_euler
-                                    ).as_matrix()
-                                )
+                                self.interpolation_start_pose[:3, 3] = current_translation
+                                self.interpolation_start_pose[:3, :3] = R.from_euler(
+                                    "xyz", current_rotation_euler
+                                ).as_matrix()

                         # Calculate interpolation progress (t from 0 to 1)
-                        elapsed_interpolation = (
-                            current_time - self.interpolation_start_time
-                        )
-                        t = min(
-                            1.0, elapsed_interpolation / self.interpolation_duration
-                        )
+                        elapsed_interpolation = current_time - self.interpolation_start_time
+                        t = min(1.0, elapsed_interpolation / self.interpolation_duration)

                         # Interpolate between current pose and neutral pose
                         interpolated_pose = linear_pose_interpolation(
@@ -226,9 +200,7 @@ class CameraWorker:

                         # Extract translation and rotation from interpolated pose
                         translation = interpolated_pose[:3, 3]
-                        rotation = R.from_matrix(
-                            interpolated_pose[:3, :3]
-                        ).as_euler("xyz", degrees=False)
+                        rotation = R.from_matrix(interpolated_pose[:3, :3]).as_euler("xyz", degrees=False)

                         # Thread-safe update of face tracking offsets
                         with self.face_tracking_lock:
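The hunks above rely on linear_pose_interpolation to ease the head back toward neutral; the helper itself is not part of this diff. A minimal sketch of such a function, assuming it lerps the translation and slerps the rotation with SciPy (the project's utility may be implemented differently):

# Hypothetical sketch; reachy_mini's linear_pose_interpolation may not match this exactly.
import numpy as np
from scipy.spatial.transform import Rotation as R, Slerp


def linear_pose_interpolation_sketch(start: np.ndarray, end: np.ndarray, t: float) -> np.ndarray:
    """Interpolate between two 4x4 homogeneous poses for t in [0, 1]."""
    t = float(np.clip(t, 0.0, 1.0))
    out = np.eye(4)
    # Translation: straight linear blend.
    out[:3, 3] = (1.0 - t) * start[:3, 3] + t * end[:3, 3]
    # Rotation: spherical linear interpolation between the two orientations.
    key_rotations = R.from_matrix(np.stack([start[:3, :3], end[:3, :3]]))
    out[:3, :3] = Slerp([0.0, 1.0], key_rotations)(t).as_matrix()
    return out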
src/reachy_mini_conversation_demo/config.py CHANGED
@@ -15,7 +15,7 @@ def getenv_bool(key: str, default: bool = False) -> bool:

 class Config:
     """Configuration class for the conversation demo."""
-
+
     # Required
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
     if not OPENAI_API_KEY:
src/reachy_mini_conversation_demo/dance_emotion_moves.py CHANGED
@@ -30,9 +30,7 @@ class DanceQueueMove(Move):
         """Duration property required by official Move interface."""
         return self.dance_move.duration

-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate dance move at time t."""
         try:
             # Get the pose from the dance move
@@ -45,9 +43,7 @@ class DanceQueueMove(Move):
             return (head_pose, antennas, body_yaw)

         except Exception as e:
-            logger.error(
-                f"Error evaluating dance move '{self.move_name}' at t={t}: {e}"
-            )
+            logger.error(f"Error evaluating dance move '{self.move_name}' at t={t}: {e}")
             # Return neutral pose on error
             from reachy_mini.utils import create_head_pose

@@ -68,9 +64,7 @@ class EmotionQueueMove(Move):
         """Duration property required by official Move interface."""
         return self.emotion_move.duration

-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate emotion move at time t."""
         try:
             # Get the pose from the emotion move
@@ -83,9 +77,7 @@ class EmotionQueueMove(Move):
             return (head_pose, antennas, body_yaw)

         except Exception as e:
-            logger.error(
-                f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}"
-            )
+            logger.error(f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}")
             # Return neutral pose on error
             from reachy_mini.utils import create_head_pose

@@ -120,9 +112,7 @@ class GotoQueueMove(Move):
         """Duration property required by official Move interface."""
         return self._duration

-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate goto move at time t using linear interpolation."""
         try:
             from reachy_mini.utils import create_head_pose
@@ -138,32 +128,23 @@ class GotoQueueMove(Move):
             start_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)

             # Interpolate head pose
-            head_pose = linear_pose_interpolation(
-                start_pose, self.target_head_pose, t_clamped
-            )
+            head_pose = linear_pose_interpolation(start_pose, self.target_head_pose, t_clamped)

             # Interpolate antennas - return as numpy array
             antennas = np.array(
                 [
-                    self.start_antennas[0]
-                    + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
-                    self.start_antennas[1]
-                    + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
+                    self.start_antennas[0] + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
+                    self.start_antennas[1] + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
                 ]
             )

             # Interpolate body yaw
-            body_yaw = (
-                self.start_body_yaw
-                + (self.target_body_yaw - self.start_body_yaw) * t_clamped
-            )
+            body_yaw = self.start_body_yaw + (self.target_body_yaw - self.start_body_yaw) * t_clamped

             return (head_pose, antennas, body_yaw)

         except Exception as e:
             logger.error(f"Error evaluating goto move at t={t}: {e}")
             # Return target pose on error - convert antennas to numpy array
-            target_antennas_array = np.array(
-                [self.target_antennas[0], self.target_antennas[1]]
-            )
+            target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]])
             return (self.target_head_pose, target_antennas_array, self.target_body_yaw)
src/reachy_mini_conversation_demo/moves.py CHANGED
@@ -27,9 +27,7 @@ from reachy_mini.utils.interpolation import (
 logger = logging.getLogger(__name__)

 # Type definitions
-FullBodyPose = Tuple[
-    np.ndarray, Tuple[float, float], float
-]  # (head_pose_4x4, antennas, body_yaw)
+FullBodyPose = Tuple[np.ndarray, Tuple[float, float], float]  # (head_pose_4x4, antennas, body_yaw)


 class BreathingMove(Move):
@@ -68,9 +66,7 @@ class BreathingMove(Move):
         """Duration property required by official Move interface."""
         return float("inf")  # Continuous breathing (never ends naturally)

-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate breathing move at time t."""
         if t < self.interpolation_duration:
             # Phase 1: Interpolate to neutral base position
@@ -83,35 +79,26 @@ class BreathingMove(Move):

             # Interpolate antennas
             antennas = (
-                (1 - interpolation_t) * self.interpolation_start_antennas
-                + interpolation_t * self.neutral_antennas
-            )
+                1 - interpolation_t
+            ) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas

         else:
             # Phase 2: Breathing patterns from neutral base
             breathing_time = t - self.interpolation_duration

             # Gentle z-axis breathing
-            z_offset = self.breathing_z_amplitude * np.sin(
-                2 * np.pi * self.breathing_frequency * breathing_time
-            )
-            head_pose = create_head_pose(
-                x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False
-            )
+            z_offset = self.breathing_z_amplitude * np.sin(2 * np.pi * self.breathing_frequency * breathing_time)
+            head_pose = create_head_pose(x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False)

             # Antenna sway (opposite directions)
-            antenna_sway = self.antenna_sway_amplitude * np.sin(
-                2 * np.pi * self.antenna_frequency * breathing_time
-            )
+            antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
             antennas = np.array([antenna_sway, -antenna_sway])

         # Return in official Move interface format: (head_pose, antennas_array, body_yaw)
         return (head_pose, antennas, 0.0)


-def combine_full_body(
-    primary_pose: FullBodyPose, secondary_pose: FullBodyPose
-) -> FullBodyPose:
+def combine_full_body(primary_pose: FullBodyPose, secondary_pose: FullBodyPose) -> FullBodyPose:
     """Combine primary and secondary full body poses.

     Args:
@@ -127,9 +114,7 @@ def combine_full_body(

     # Combine head poses using compose_world_offset
     # primary_head is T_abs, secondary_head is T_off_world
-    combined_head = compose_world_offset(
-        primary_head, secondary_head, reorthonormalize=True
-    )
+    combined_head = compose_world_offset(primary_head, secondary_head, reorthonormalize=True)

     # Sum antennas and body_yaw
     combined_antennas = (
@@ -226,9 +211,7 @@ class MovementManager:
         self._thread: Optional[threading.Thread] = None
         self._state_lock = threading.RLock()
         self._is_listening = False
-        self._last_commanded_pose: FullBodyPose = clone_full_body_pose(
-            self.state.last_primary_pose
-        )
+        self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
         self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
         self._antenna_unfreeze_blend = 1.0
         self._antenna_blend_duration = 0.4  # seconds to blend back after listening
@@ -239,9 +222,7 @@ class MovementManager:
         with self._state_lock:
             self.move_queue.append(move)
             self.state.update_activity()
-            logger.info(
-                f"Queued move with duration {move.duration}s, queue size: {len(self.move_queue)}"
-            )
+            logger.info(f"Queued move with duration {move.duration}s, queue size: {len(self.move_queue)}")

     def clear_queue(self) -> None:
         """Clear all queued moves and stop current move."""
@@ -252,22 +233,16 @@ class MovementManager:
             self.state.is_playing_move = False
         logger.info("Cleared move queue and stopped current move")

-    def set_speech_offsets(
-        self, offsets: Tuple[float, float, float, float, float, float]
-    ) -> None:
+    def set_speech_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
         """Set speech head offsets (secondary move)."""
         with self._state_lock:
             self.state.speech_offsets = offsets

-    def set_offsets(
-        self, offsets: Tuple[float, float, float, float, float, float]
-    ) -> None:
+    def set_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
         """Compatibility alias for set_speech_offsets."""
         self.set_speech_offsets(offsets)

-    def set_face_tracking_offsets(
-        self, offsets: Tuple[float, float, float, float, float, float]
-    ) -> None:
+    def set_face_tracking_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
         """Set face tracking offsets (secondary move)."""
         with self._state_lock:
             self.state.face_tracking_offsets = offsets
@@ -314,8 +289,7 @@ class MovementManager:
         with self._state_lock:
             if self.state.current_move is None or (
                 self.state.move_start_time is not None
-                and current_time - self.state.move_start_time
-                >= self.state.current_move.duration
+                and current_time - self.state.move_start_time >= self.state.current_move.duration
             ):
                 self.state.current_move = None
                 self.state.move_start_time = None
@@ -323,9 +297,7 @@ class MovementManager:
                 if self.move_queue:
                     self.state.current_move = self.move_queue.popleft()
                     self.state.move_start_time = current_time
-                    logger.debug(
-                        f"Starting new move, duration: {self.state.current_move.duration}s"
-                    )
+                    logger.debug(f"Starting new move, duration: {self.state.current_move.duration}s")

     def _manage_breathing(self, current_time: float) -> None:
         """Manage automatic breathing when idle."""
@@ -336,9 +308,7 @@ class MovementManager:

         if self.is_idle():
             try:
-                _, current_antennas = (
-                    self.current_robot.get_current_joint_positions()
-                )
+                _, current_antennas = self.current_robot.get_current_joint_positions()
                 current_head_pose = self.current_robot.get_current_head_pose()

                 breathing_move = BreathingMove(
@@ -348,9 +318,7 @@ class MovementManager:
                 )
                 self.move_queue.append(breathing_move)
                 self.state.update_activity()
-                logger.debug(
-                    f"Started breathing after {time_since_activity:.1f}s of inactivity"
-                )
+                logger.debug(f"Started breathing after {time_since_activity:.1f}s of inactivity")
             except Exception as e:
                 logger.error(f"Failed to start breathing: {e}")

@@ -367,10 +335,7 @@ class MovementManager:
         """Get the primary full body pose from current move or neutral."""
         with self._state_lock:
             # When a primary move is playing, sample it and cache the resulting pose
-            if (
-                self.state.current_move is not None
-                and self.state.move_start_time is not None
-            ):
+            if self.state.current_move is not None and self.state.move_start_time is not None:
                 move_time = current_time - self.state.move_start_time
                 head, antennas, body_yaw = self.state.current_move.evaluate(move_time)

@@ -391,26 +356,18 @@ class MovementManager:

                 self.state.is_playing_move = True
                 self.state.is_moving = True
-                self.state.last_primary_pose = clone_full_body_pose(
-                    primary_full_body_pose
-                )
+                self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
             else:
                 # Otherwise reuse the last primary pose so we avoid jumps between moves
                 self.state.is_playing_move = False
-                self.state.is_moving = (
-                    time.time() - self.state.moving_start < self.state.moving_for
-                )
+                self.state.is_moving = time.time() - self.state.moving_start < self.state.moving_for

                 if self.state.last_primary_pose is not None:
-                    primary_full_body_pose = clone_full_body_pose(
-                        self.state.last_primary_pose
-                    )
+                    primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
                 else:
                     neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
                     primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
-                    self.state.last_primary_pose = clone_full_body_pose(
-                        primary_full_body_pose
-                    )
+                    self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)

         return primary_full_body_pose

@@ -496,9 +453,7 @@ class MovementManager:
             secondary_full_body_pose = self._get_secondary_pose()

             # 6. Combine primary and secondary poses
-            global_full_body_pose = combine_full_body(
-                primary_full_body_pose, secondary_full_body_pose
-            )
+            global_full_body_pose = combine_full_body(primary_full_body_pose, secondary_full_body_pose)

             # 7. Extract pose components
             head, antennas, body_yaw = global_full_body_pose
@@ -539,16 +494,12 @@ class MovementManager:

             # 8. Single set_target call - the one and only place we control the robot
             try:
-                self.current_robot.set_target(
-                    head=head, antennas=antennas_cmd, body_yaw=body_yaw
-                )
+                self.current_robot.set_target(head=head, antennas=antennas_cmd, body_yaw=body_yaw)
             except Exception as e:
                 logger.error(f"Failed to set robot target: {e}")
             else:
                 with self._state_lock:
-                    self._last_commanded_pose = clone_full_body_pose(
-                        (head, antennas_cmd, body_yaw)
-                    )
+                    self._last_commanded_pose = clone_full_body_pose((head, antennas_cmd, body_yaw))

             # 9. Calculate computation time and adjust sleep for 50Hz
             computation_time = time.time() - loop_start_time
@@ -558,9 +509,7 @@ class MovementManager:
             if loop_count % 100 == 0:
                 elapsed = current_time - last_print_time
                 actual_freq = 100.0 / elapsed if elapsed > 0 else 0
-                potential_freq = (
-                    1.0 / computation_time if computation_time > 0 else float("inf")
-                )
+                potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
                 logger.debug(
                     f"Loop freq - Actual: {actual_freq:.1f}Hz, Potential: {potential_freq:.1f}Hz, Target: {self.target_frequency:.1f}Hz"
                 )
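combine_full_body above treats the primary head pose as an absolute transform and the secondary one as a world-frame offset, composed by compose_world_offset with reorthonormalize=True; antennas and body_yaw are simply summed. A sketch of what such a composition could look like, assuming the world-frame offset multiplies on the left and the rotation block is projected back onto SO(3) via SVD (an assumption for illustration, not reachy_mini's actual implementation):

# Hypothetical sketch of a world-frame pose composition with SVD re-orthonormalization.
import numpy as np


def compose_world_offset_sketch(t_abs: np.ndarray, t_off_world: np.ndarray, reorthonormalize: bool = True) -> np.ndarray:
    """Apply a 4x4 world-frame offset to a 4x4 absolute pose."""
    combined = t_off_world @ t_abs  # offset expressed in the world frame acts on the left
    if reorthonormalize:
        # Project the rotation block back onto SO(3) to undo accumulated numerical drift.
        u, _, vt = np.linalg.svd(combined[:3, :3])
        if np.linalg.det(u @ vt) < 0:  # guard against a reflection
            u[:, -1] *= -1
        combined[:3, :3] = u @ vt
    return combined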
src/reachy_mini_conversation_demo/openai_realtime.py CHANGED
@@ -93,35 +93,22 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                     pass
                     # self.deps.head_wobbler.reset()

-                if (
-                    event.type
-                    == "conversation.item.input_audio_transcription.completed"
-                ):
+                if event.type == "conversation.item.input_audio_transcription.completed":
                     logger.debug(f"user transcript: {event.transcript}")
-                    await self.output_queue.put(
-                        AdditionalOutputs({"role": "user", "content": event.transcript})
-                    )
+                    await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))

                 if event.type == "response.audio_transcript.done":
                     logger.debug(f"assistant transcript: {event.transcript}")
-                    await self.output_queue.put(
-                        AdditionalOutputs(
-                            {"role": "assistant", "content": event.transcript}
-                        )
-                    )
+                    await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))

                 if event.type == "response.audio.delta":
                     self.deps.head_wobbler.feed(event.delta)
                     self.last_activity_time = asyncio.get_event_loop().time()
-                    logger.debug(
-                        "last activity time updated to %s", self.last_activity_time
-                    )
+                    logger.debug("last activity time updated to %s", self.last_activity_time)
                     await self.output_queue.put(
                         (
                             self.output_sample_rate,
-                            np.frombuffer(
-                                base64.b64decode(event.delta), dtype=np.int16
-                            ).reshape(1, -1),
+                            np.frombuffer(base64.b64decode(event.delta), dtype=np.int16).reshape(1, -1),
                         ),
                     )

@@ -155,9 +142,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                     args_json_str = info["args_buf"] or "{}"

                     try:
-                        tool_result = await dispatch_tool_call(
-                            tool_name, args_json_str, self.deps
-                        )
+                        tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
                         logger.debug("[Tool %s executed]", tool_name)
                         logger.debug("Tool result: %s", tool_result)
                     except Exception as e:
@@ -178,9 +163,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                             {
                                 "role": "assistant",
                                 "content": json.dumps(tool_result),
-                                "metadata": dict(
-                                    title="🛠️ Used tool " + tool_name, status="done"
-                                ),
+                                "metadata": dict(title="🛠️ Used tool " + tool_name, status="done"),
                             },
                         )
                     )
@@ -232,11 +215,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                     err = getattr(event, "error", None)
                     msg = getattr(err, "message", str(err) if err else "unknown error")
                     logger.error("Realtime error: %s (raw=%s)", msg, err)
-                    await self.output_queue.put(
-                        AdditionalOutputs(
-                            {"role": "assistant", "content": f"[error] {msg}"}
-                        )
-                    )
+                    await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))

     # Microphone receive
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
@@ -259,9 +238,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
        if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
            await self.send_idle_signal(idle_duration)

-            self.last_activity_time = (
-                asyncio.get_event_loop().time()
-            )  # avoid repeated resets
+            self.last_activity_time = asyncio.get_event_loop().time()  # avoid repeated resets

        return await wait_for_item(self.output_queue)

src/reachy_mini_conversation_demo/tools.py CHANGED
@@ -183,9 +183,7 @@ class MoveHead(Tool):
                 current_antennas[1],
             ),  # Skip body_yaw
             target_body_yaw=0,  # Reset body yaw
-            start_body_yaw=current_antennas[
-                0
-            ],  # body_yaw is first in joint positions
+            start_body_yaw=current_antennas[0],  # body_yaw is first in joint positions
             duration=deps.motion_duration_s,
         )

@@ -236,15 +234,11 @@ class Camera(Tool):

         # Use vision manager for processing if available
         if deps.vision_manager is not None:
-            result = await asyncio.to_thread(
-                deps.vision_manager.processor.process_image, frame, image_query
-            )
+            result = await asyncio.to_thread(deps.vision_manager.processor.process_image, frame, image_query)
             if isinstance(result, dict) and "error" in result:
                 return result
             return (
-                {"image_description": result}
-                if isinstance(result, str)
-                else {"error": "vision returned non-string"}
+                {"image_description": result} if isinstance(result, str) else {"error": "vision returned non-string"}
             )
         else:
             # Return base64 encoded image like main_works.py camera tool
@@ -436,9 +430,7 @@ class Dance(Tool):
            move_name = random.choice(list(AVAILABLE_MOVES.keys()))

        if move_name not in AVAILABLE_MOVES:
-            return {
-                "error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"
-            }
+            return {"error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"}

        # Add dance moves to queue
        movement_manager = deps.movement_manager
@@ -523,9 +515,7 @@ class PlayEmotion(Tool):
        try:
            emotion_names = RECORDED_MOVES.list_moves()
            if emotion_name not in emotion_names:
-                return {
-                    "error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"
-                }
+                return {"error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"}

            # Add emotion to queue
            movement_manager = deps.movement_manager
@@ -604,9 +594,7 @@ class FaceRecognition(Tool):
            cv2.imwrite(temp_path, frame)

            # Use DeepFace to find face
-            results = await asyncio.to_thread(
-                DeepFace.find, img_path=temp_path, db_path="./pollen_faces"
-            )
+            results = await asyncio.to_thread(DeepFace.find, img_path=temp_path, db_path="./pollen_faces")

            if len(results) == 0:
                return {"error": "Didn't recognize the face"}
@@ -681,9 +669,7 @@ def _safe_load_obj(args_json: str) -> dict[str, Any]:
    return {}


-async def dispatch_tool_call(
-    tool_name: str, args_json: str, deps: ToolDependencies
-) -> Dict[str, Any]:
+async def dispatch_tool_call(tool_name: str, args_json: str, deps: ToolDependencies) -> Dict[str, Any]:
    """Dispatch a tool call by name with JSON args and dependencies."""
    tool = ALL_TOOLS.get(tool_name)

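dispatch_tool_call above resolves the tool by name from ALL_TOOLS and runs it with JSON-decoded arguments. A minimal sketch of that registry-and-dispatch pattern, using hypothetical stand-in names (EchoTool, REGISTRY, dispatch_sketch) rather than the project's actual classes:

# Hypothetical stand-ins for the project's tool registry; illustration only.
import json
from typing import Any, Dict


class EchoTool:
    async def run(self, deps: Any, **kwargs: Any) -> Dict[str, Any]:
        # A trivial tool that just echoes its arguments back.
        return {"echo": kwargs}


REGISTRY: Dict[str, Any] = {"echo": EchoTool()}


async def dispatch_sketch(tool_name: str, args_json: str, deps: Any) -> Dict[str, Any]:
    tool = REGISTRY.get(tool_name)
    if tool is None:
        return {"error": f"Unknown tool '{tool_name}'"}
    try:
        args = json.loads(args_json) if args_json else {}
    except json.JSONDecodeError:
        args = {}
    return await tool.run(deps, **args)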
src/reachy_mini_conversation_demo/utils.py CHANGED
@@ -15,15 +15,9 @@ def parse_args():
         default=None,
         help="Choose head tracker (default: mediapipe)",
     )
-    parser.add_argument(
-        "--no-camera", default=False, action="store_true", help="Disable camera usage"
-    )
-    parser.add_argument(
-        "--headless", default=False, action="store_true", help="Run in headless mode"
-    )
-    parser.add_argument(
-        "--debug", default=False, action="store_true", help="Enable debug logging"
-    )
+    parser.add_argument("--no-camera", default=False, action="store_true", help="Disable camera usage")
+    parser.add_argument("--headless", default=False, action="store_true", help="Run in headless mode")
+    parser.add_argument("--debug", default=False, action="store_true", help="Enable debug logging")
     return parser.parse_args()

src/reachy_mini_conversation_demo/vision/processors.py CHANGED
@@ -61,9 +61,7 @@ class VisionProcessor:
     def initialize(self) -> bool:
         """Load model and processor onto the selected device."""
         try:
-            logger.info(
-                f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})"
-            )
+            logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})")
             self.processor = AutoProcessor.from_pretrained(self.model_path)

             # Select dtype depending on device
@@ -81,9 +79,7 @@ class VisionProcessor:
                 model_kwargs["_attn_implementation"] = "flash_attention_2"

             # Load model weights
-            self.model = AutoModelForImageTextToText.from_pretrained(
-                self.model_path, **model_kwargs
-            ).to(self.device)
+            self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)

             self.model.eval()
             self._initialized = True
@@ -138,10 +134,7 @@ class VisionProcessor:
         )

         # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
-        inputs = {
-            k: (v.to(self.device) if hasattr(v, "to") else v)
-            for k, v in inputs.items()
-        }
+        inputs = {k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()}

         with torch.no_grad():
             generated_ids = self.model.generate(
@@ -246,9 +239,7 @@ class VisionManager:
            )

            # Only update if we got a valid response
-            if description and not description.startswith(
-                ("Vision", "Failed", "Error")
-            ):
+            if description and not description.startswith(("Vision", "Failed", "Error")):
                self._current_description = description
                self._last_processed_time = current_time

@@ -268,18 +259,14 @@ class VisionManager:
        """Get the most recent scene description (thread-safe)."""
        return self._current_description

-    async def process_current_frame(
-        self, prompt: str = "Describe what you see in detail."
-    ) -> Dict[str, Any]:
+    async def process_current_frame(self, prompt: str = "Describe what you see in detail.") -> Dict[str, Any]:
        """Process current camera frame with custom prompt."""
        try:
            success, frame = self.camera.read()
            if not success or frame is None:
                return {"error": "Failed to capture image from camera"}

-            description = await asyncio.to_thread(
-                lambda: self.processor.process_image(frame, prompt)
-            )
+            description = await asyncio.to_thread(lambda: self.processor.process_image(frame, prompt))

            return {
                "description": description,
@@ -335,9 +322,7 @@ def create_vision_processor(config: VisionConfig):
    return VisionProcessor(config)


-def init_vision(
-    camera: cv2.VideoCapture, processor_type: str = "local"
-) -> VisionManager:
+def init_vision(camera: cv2.VideoCapture, processor_type: str = "local") -> VisionManager:
    """Initialize vision manager with the specified processor type."""
    model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py CHANGED
@@ -94,9 +94,7 @@ class HeadTracker:

         return np.array([norm_x, norm_y], dtype=np.float32)

-    def get_eyes(
-        self, img: np.ndarray
-    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
+    def get_eyes(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
         """Get eye positions (approximated from face bbox).

         Note: YOLO only provides face bbox, so we estimate eye positions
@@ -131,20 +129,14 @@ class HeadTracker:
         right_eye_x = bbox[0] + face_width * 0.65

         # Convert to MediaPipe coordinates
-        left_eye = np.array(
-            [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
-        )
-        right_eye = np.array(
-            [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
-        )
+        left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
+        right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)

         return left_eye, right_eye

     def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
         """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
-        if not hasattr(face_landmarks, "_bbox") or not hasattr(
-            face_landmarks, "_img_shape"
-        ):
+        if not hasattr(face_landmarks, "_bbox") or not hasattr(face_landmarks, "_img_shape"):
             raise ValueError("Face landmarks object missing required attributes")

         bbox = face_landmarks._bbox
@@ -158,12 +150,8 @@ class HeadTracker:
         left_eye_x = bbox[0] + face_width * 0.35
         right_eye_x = bbox[0] + face_width * 0.65

-        left_eye = np.array(
-            [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
-        )
-        right_eye = np.array(
-            [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
-        )
+        left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
+        right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)

         return left_eye, right_eye

@@ -177,9 +165,7 @@ class HeadTracker:
         left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
         return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))

-    def get_head_position(
-        self, img: np.ndarray
-    ) -> Tuple[Optional[np.ndarray], Optional[float]]:
+    def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
         """Get head position from face detection.

         Args: