Alina Lozovskaya committed
Commit: 426e6c4
Parent(s): 8383ea5

Apply Ruff
Files changed:
- src/reachy_mini_conversation_demo/__init__.py +1 -1
- src/reachy_mini_conversation_demo/audio/__init__.py +1 -1
- src/reachy_mini_conversation_demo/audio/gstreamer.py +14 -35
- src/reachy_mini_conversation_demo/audio/speech_tapper.py +4 -19
- src/reachy_mini_conversation_demo/camera_worker.py +15 -43
- src/reachy_mini_conversation_demo/config.py +1 -1
- src/reachy_mini_conversation_demo/dance_emotion_moves.py +10 -29
- src/reachy_mini_conversation_demo/moves.py +27 -78
- src/reachy_mini_conversation_demo/openai_realtime.py +9 -32
- src/reachy_mini_conversation_demo/tools.py +7 -21
- src/reachy_mini_conversation_demo/utils.py +3 -9
- src/reachy_mini_conversation_demo/vision/processors.py +7 -22
- src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py +7 -21
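The diffs below are the output of Ruff's formatter: multi-line calls, conditions, and signatures are collapsed onto single lines where they fit, with no behavioral change. The exact invocation and line length are not recorded in the commit; the result is consistent with something like `ruff format src/` (and possibly `ruff check --fix src/`) using a `line-length` around 110-120 under `[tool.ruff]` in pyproject.toml -- an assumption, not confirmed by the diff itself.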
src/reachy_mini_conversation_demo/__init__.py
CHANGED

@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
src/reachy_mini_conversation_demo/audio/__init__.py
CHANGED

@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
src/reachy_mini_conversation_demo/audio/gstreamer.py
CHANGED

@@ -25,18 +25,16 @@ class GstPlayer:
         self.appsrc = Gst.ElementFactory.make("appsrc", None)
         self.appsrc.set_property("format", Gst.Format.TIME)
         self.appsrc.set_property("is-live", True)
-        caps = Gst.Caps.from_string(
-            f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved"
-        )
+        caps = Gst.Caps.from_string(f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved")
         self.appsrc.set_property("caps", caps)
         queue = Gst.ElementFactory.make("queue")
         audioconvert = Gst.ElementFactory.make("audioconvert")
         audioresample = Gst.ElementFactory.make("audioresample")

         # Try to pin specific output device; fallback to autoaudiosink
-        audiosink = _create_device_element(
-            direction="sink", name_substr=device_name
-        ) or Gst.ElementFactory.make("autoaudiosink")
+        audiosink = _create_device_element(direction="sink", name_substr=device_name) or Gst.ElementFactory.make(
+            "autoaudiosink"
+        )

         self.pipeline.add(self.appsrc)
         self.pipeline.add(queue)

@@ -104,9 +102,9 @@ class GstRecorder:
         self.pipeline = Gst.Pipeline.new("audio_recorder")

         # Create elements: try specific mic; fallback to default
-        autoaudiosrc = _create_device_element(
-            direction="source", name_substr=device_name
-        ) or Gst.ElementFactory.make("autoaudiosrc", None)
+        autoaudiosrc = _create_device_element(direction="source", name_substr=device_name) or Gst.ElementFactory.make(
+            "autoaudiosrc", None
+        )

         queue = Gst.ElementFactory.make("queue", None)
         audioconvert = Gst.ElementFactory.make("audioconvert", None)

@@ -117,9 +115,7 @@ class GstRecorder:
             raise RuntimeError("Failed to create GStreamer elements")

         # Force mono/S16LE at 24000; resample handles device SR (e.g., 16000 → 24000)
-        caps = Gst.Caps.from_string(
-            f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE"
-        )
+        caps = Gst.Caps.from_string(f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE")
         self.appsink.set_property("caps", caps)

         # Build pipeline

@@ -183,9 +179,7 @@ class GstRecorder:
         logger.info("Stopped Recorder")


-def _create_device_element(
-    direction: str, name_substr: Optional[str]
-) -> Optional[Gst.Element]:
+def _create_device_element(direction: str, name_substr: Optional[str]) -> Optional[Gst.Element]:
     """direction: 'source' or 'sink'.

     name_substr: case-insensitive substring matching device display name/description.

@@ -205,30 +199,15 @@ def _create_device_element(
         for dev in monitor.get_devices() or []:
             disp = dev.get_display_name() or ""
             props = dev.get_properties()
-            desc = (
-                props.get_string("device.description")
-                if props and props.has_field("device.description")
-                else ""
-            )
+            desc = props.get_string("device.description") if props and props.has_field("device.description") else ""
             logger.info(f"Device candidate: disp='{disp}', desc='{desc}'")

-            if (
-                name_substr.lower() in disp.lower()
-                or name_substr.lower() in desc.lower()
-            ):
+            if name_substr.lower() in disp.lower() or name_substr.lower() in desc.lower():
                 elem = dev.create_element(None)
-                factory = (
-                    elem.get_factory().get_name()
-                    if elem and elem.get_factory()
-                    else "<?>"
-                )
-                logger.info(
-                    f"Using {direction} device: '{disp or desc}' (factory='{factory}')"
-                )
+                factory = elem.get_factory().get_name() if elem and elem.get_factory() else "<?>"
+                logger.info(f"Using {direction} device: '{disp or desc}' (factory='{factory}')")
                 return elem
     finally:
         monitor.stop()
-    logging.getLogger(__name__).warning(
-        "Requested %s '%s' not found; using auto*", direction, name_substr
-    )
+    logging.getLogger(__name__).warning("Requested %s '%s' not found; using auto*", direction, name_substr)
     return None
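For context on the helper reformatted above: `_create_device_element` scans a GStreamer device monitor for a sink/source whose display name or description contains `name_substr`, and the callers fall back to `autoaudiosink`/`autoaudiosrc` when nothing matches. A minimal standalone sketch of that pattern, assuming PyGObject and GStreamer 1.0 are installed (the function name here is illustrative, not the demo's API):

import gi

gi.require_version("Gst", "1.0")
from gi.repository import Gst

Gst.init(None)


def pick_audio_sink(name_substr=None):
    # Prefer a sink whose display name contains name_substr; otherwise use autoaudiosink.
    if name_substr:
        monitor = Gst.DeviceMonitor.new()
        monitor.add_filter("Audio/Sink", None)
        monitor.start()
        try:
            for dev in monitor.get_devices() or []:
                if name_substr.lower() in (dev.get_display_name() or "").lower():
                    return dev.create_element(None)  # element bound to that device
        finally:
            monitor.stop()
    return Gst.ElementFactory.make("autoaudiosink", None)

The demo's version adds candidate logging and a 'source'/'sink' switch on top of this lookup-with-fallback idea.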
src/reachy_mini_conversation_demo/audio/speech_tapper.py
CHANGED

@@ -68,7 +68,7 @@ def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float:

 def _to_float32_mono(x: np.ndarray) -> np.ndarray:
     """Convert arbitrary PCM array to float32 mono in [-1,1].
-
+
     Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C).
     """
     a = np.asarray(x)

@@ -258,24 +258,9 @@ class SwayRollRT:
                 * env
                 * math.sin(2 * math.pi * SWAY_F_ROLL * self.t + self.phase_roll)
             )
-            x_mm = (
-                SWAY_A_X_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
-            )
-            y_mm = (
-                SWAY_A_Y_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
-            )
-            z_mm = (
-                SWAY_A_Z_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)
-            )
+            x_mm = SWAY_A_X_MM * loud * env * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
+            y_mm = SWAY_A_Y_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
+            z_mm = SWAY_A_Z_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)

             out.append(
                 {
src/reachy_mini_conversation_demo/camera_worker.py
CHANGED

@@ -114,17 +114,10 @@ class CameraWorker:
             self.latest_frame = frame  # .copy()

             # Check if face tracking was just disabled
-            if (
-                self.previous_head_tracking_state
-                and not self.is_head_tracking_enabled
-            ):
+            if self.previous_head_tracking_state and not self.is_head_tracking_enabled:
                 # Face tracking was just disabled - start interpolation to neutral
-                self.last_face_detected_time = (
-                    current_time  # Trigger the face-lost logic
-                )
-                self.interpolation_start_time = (
-                    None  # Will be set by the face-lost interpolation
-                )
+                self.last_face_detected_time = current_time  # Trigger the face-lost logic
+                self.interpolation_start_time = None  # Will be set by the face-lost interpolation
                 self.interpolation_start_pose = None

             # Update tracking state

@@ -137,9 +130,7 @@ class CameraWorker:
             if eye_center is not None:
                 # Face detected - immediately switch to tracking
                 self.last_face_detected_time = current_time
-                self.interpolation_start_time = (
-                    None  # Stop any interpolation
-                )
+                self.interpolation_start_time = None  # Stop any interpolation

                 # Convert normalized coordinates to pixel coordinates
                 h, w, _ = frame.shape

@@ -159,9 +150,7 @@ class CameraWorker:

                 # Extract translation and rotation from the target pose directly
                 translation = target_pose[:3, 3]
-                rotation = R.from_matrix(target_pose[:3, :3]).as_euler(
-                    "xyz", degrees=False
-                )
+                rotation = R.from_matrix(target_pose[:3, :3]).as_euler("xyz", degrees=False)

                 # Thread-safe update of face tracking offsets (use pose as-is)
                 with self.face_tracking_lock:

@@ -176,19 +165,14 @@ class CameraWorker:

             else:
                 # No face detected while tracking enabled - set face lost timestamp
-                if (
-                    self.last_face_detected_time is None
-                    or self.last_face_detected_time == current_time
-                ):
+                if self.last_face_detected_time is None or self.last_face_detected_time == current_time:
                     # Only update if we haven't already set a face lost time
                     # (current_time check prevents overriding the disable-triggered timestamp)
                     pass

             # Handle smooth interpolation (works for both face-lost and tracking-disabled cases)
             if self.last_face_detected_time is not None:
-                time_since_face_lost = (
-                    current_time - self.last_face_detected_time
-                )
+                time_since_face_lost = current_time - self.last_face_detected_time

                 if time_since_face_lost >= self.face_lost_delay:
                     # Start interpolation if not already started

@@ -197,27 +181,17 @@ class CameraWorker:
                     # Capture current pose as start of interpolation
                     with self.face_tracking_lock:
                         current_translation = self.face_tracking_offsets[:3]
-                        current_rotation_euler = self.face_tracking_offsets[
-                            3:
-                        ]
+                        current_rotation_euler = self.face_tracking_offsets[3:]
                     # Convert to 4x4 pose matrix
                     self.interpolation_start_pose = np.eye(4)
-                    self.interpolation_start_pose[:3, 3] = (
-                        current_translation
-                    )
-                    self.interpolation_start_pose[:3, :3] = (
-                        R.from_euler(
-                            "xyz", current_rotation_euler
-                        ).as_matrix()
-                    )
+                    self.interpolation_start_pose[:3, 3] = current_translation
+                    self.interpolation_start_pose[:3, :3] = R.from_euler(
+                        "xyz", current_rotation_euler
+                    ).as_matrix()

                     # Calculate interpolation progress (t from 0 to 1)
-                    elapsed_interpolation = (
-                        current_time - self.interpolation_start_time
-                    )
-                    t = min(
-                        1.0, elapsed_interpolation / self.interpolation_duration
-                    )
+                    elapsed_interpolation = current_time - self.interpolation_start_time
+                    t = min(1.0, elapsed_interpolation / self.interpolation_duration)

                     # Interpolate between current pose and neutral pose
                     interpolated_pose = linear_pose_interpolation(

@@ -226,9 +200,7 @@ class CameraWorker:

                     # Extract translation and rotation from interpolated pose
                     translation = interpolated_pose[:3, 3]
-                    rotation = R.from_matrix(
-                        interpolated_pose[:3, :3]
-                    ).as_euler("xyz", degrees=False)
+                    rotation = R.from_matrix(interpolated_pose[:3, :3]).as_euler("xyz", degrees=False)

                     # Thread-safe update of face tracking offsets
                     with self.face_tracking_lock:
src/reachy_mini_conversation_demo/config.py
CHANGED

@@ -15,7 +15,7 @@ def getenv_bool(key: str, default: bool = False) -> bool:

 class Config:
     """Configuration class for the conversation demo."""
-
+
     # Required
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
     if not OPENAI_API_KEY:
src/reachy_mini_conversation_demo/dance_emotion_moves.py
CHANGED

@@ -30,9 +30,7 @@ class DanceQueueMove(Move):
         """Duration property required by official Move interface."""
         return self.dance_move.duration

-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate dance move at time t."""
         try:
             # Get the pose from the dance move

@@ -45,9 +43,7 @@ class DanceQueueMove(Move):
             return (head_pose, antennas, body_yaw)

         except Exception as e:
-            logger.error(
-                f"Error evaluating dance move '{self.move_name}' at t={t}: {e}"
-            )
+            logger.error(f"Error evaluating dance move '{self.move_name}' at t={t}: {e}")
             # Return neutral pose on error
             from reachy_mini.utils import create_head_pose

@@ -68,9 +64,7 @@ class EmotionQueueMove(Move):
         """Duration property required by official Move interface."""
         return self.emotion_move.duration

-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate emotion move at time t."""
         try:
             # Get the pose from the emotion move

@@ -83,9 +77,7 @@ class EmotionQueueMove(Move):
             return (head_pose, antennas, body_yaw)

         except Exception as e:
-            logger.error(
-                f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}"
-            )
+            logger.error(f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}")
             # Return neutral pose on error
             from reachy_mini.utils import create_head_pose

@@ -120,9 +112,7 @@ class GotoQueueMove(Move):
         """Duration property required by official Move interface."""
         return self._duration

-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate goto move at time t using linear interpolation."""
         try:
             from reachy_mini.utils import create_head_pose

@@ -138,32 +128,23 @@ class GotoQueueMove(Move):
             start_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)

             # Interpolate head pose
-            head_pose = linear_pose_interpolation(
-                start_pose, self.target_head_pose, t_clamped
-            )
+            head_pose = linear_pose_interpolation(start_pose, self.target_head_pose, t_clamped)

             # Interpolate antennas - return as numpy array
             antennas = np.array(
                 [
-                    self.start_antennas[0]
-                    + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
-                    self.start_antennas[1]
-                    + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
+                    self.start_antennas[0] + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
+                    self.start_antennas[1] + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
                 ]
             )

             # Interpolate body yaw
-            body_yaw = (
-                self.start_body_yaw
-                + (self.target_body_yaw - self.start_body_yaw) * t_clamped
-            )
+            body_yaw = self.start_body_yaw + (self.target_body_yaw - self.start_body_yaw) * t_clamped

             return (head_pose, antennas, body_yaw)

         except Exception as e:
             logger.error(f"Error evaluating goto move at t={t}: {e}")
             # Return target pose on error - convert antennas to numpy array
-            target_antennas_array = np.array(
-                [self.target_antennas[0], self.target_antennas[1]]
-            )
+            target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]])
             return (self.target_head_pose, target_antennas_array, self.target_body_yaw)
src/reachy_mini_conversation_demo/moves.py
CHANGED

@@ -27,9 +27,7 @@ from reachy_mini.utils.interpolation import (
 logger = logging.getLogger(__name__)

 # Type definitions
-FullBodyPose = Tuple[
-    np.ndarray, Tuple[float, float], float
-]  # (head_pose_4x4, antennas, body_yaw)
+FullBodyPose = Tuple[np.ndarray, Tuple[float, float], float]  # (head_pose_4x4, antennas, body_yaw)


 class BreathingMove(Move):

@@ -68,9 +66,7 @@ class BreathingMove(Move):
         """Duration property required by official Move interface."""
         return float("inf")  # Continuous breathing (never ends naturally)

-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate breathing move at time t."""
         if t < self.interpolation_duration:
             # Phase 1: Interpolate to neutral base position

@@ -83,35 +79,26 @@ class BreathingMove(Move):

             # Interpolate antennas
             antennas = (
-                (1 - interpolation_t) * self.interpolation_start_antennas
-                + interpolation_t * self.neutral_antennas
-            )
+                1 - interpolation_t
+            ) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas

         else:
             # Phase 2: Breathing patterns from neutral base
             breathing_time = t - self.interpolation_duration

             # Gentle z-axis breathing
-            z_offset = self.breathing_z_amplitude * np.sin(
-                2 * np.pi * self.breathing_frequency * breathing_time
-            )
-            head_pose = create_head_pose(
-                x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False
-            )
+            z_offset = self.breathing_z_amplitude * np.sin(2 * np.pi * self.breathing_frequency * breathing_time)
+            head_pose = create_head_pose(x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False)

             # Antenna sway (opposite directions)
-            antenna_sway = self.antenna_sway_amplitude * np.sin(
-                2 * np.pi * self.antenna_frequency * breathing_time
-            )
+            antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
             antennas = np.array([antenna_sway, -antenna_sway])

         # Return in official Move interface format: (head_pose, antennas_array, body_yaw)
         return (head_pose, antennas, 0.0)


-def combine_full_body(
-    primary_pose: FullBodyPose, secondary_pose: FullBodyPose
-) -> FullBodyPose:
+def combine_full_body(primary_pose: FullBodyPose, secondary_pose: FullBodyPose) -> FullBodyPose:
     """Combine primary and secondary full body poses.

     Args:

@@ -127,9 +114,7 @@ def combine_full_body(

     # Combine head poses using compose_world_offset
     # primary_head is T_abs, secondary_head is T_off_world
-    combined_head = compose_world_offset(
-        primary_head, secondary_head, reorthonormalize=True
-    )
+    combined_head = compose_world_offset(primary_head, secondary_head, reorthonormalize=True)

     # Sum antennas and body_yaw
     combined_antennas = (

@@ -226,9 +211,7 @@ class MovementManager:
         self._thread: Optional[threading.Thread] = None
         self._state_lock = threading.RLock()
         self._is_listening = False
-        self._last_commanded_pose: FullBodyPose = clone_full_body_pose(
-            self.state.last_primary_pose
-        )
+        self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
         self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
         self._antenna_unfreeze_blend = 1.0
         self._antenna_blend_duration = 0.4  # seconds to blend back after listening

@@ -239,9 +222,7 @@ class MovementManager:
         with self._state_lock:
             self.move_queue.append(move)
             self.state.update_activity()
-            logger.info(
-                f"Queued move with duration {move.duration}s, queue size: {len(self.move_queue)}"
-            )
+            logger.info(f"Queued move with duration {move.duration}s, queue size: {len(self.move_queue)}")

     def clear_queue(self) -> None:
         """Clear all queued moves and stop current move."""

@@ -252,22 +233,16 @@ class MovementManager:
             self.state.is_playing_move = False
             logger.info("Cleared move queue and stopped current move")

-    def set_speech_offsets(
-        self, offsets: Tuple[float, float, float, float, float, float]
-    ) -> None:
+    def set_speech_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
         """Set speech head offsets (secondary move)."""
         with self._state_lock:
             self.state.speech_offsets = offsets

-    def set_offsets(
-        self, offsets: Tuple[float, float, float, float, float, float]
-    ) -> None:
+    def set_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
         """Compatibility alias for set_speech_offsets."""
         self.set_speech_offsets(offsets)

-    def set_face_tracking_offsets(
-        self, offsets: Tuple[float, float, float, float, float, float]
-    ) -> None:
+    def set_face_tracking_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
         """Set face tracking offsets (secondary move)."""
         with self._state_lock:
             self.state.face_tracking_offsets = offsets

@@ -314,8 +289,7 @@ class MovementManager:
         with self._state_lock:
             if self.state.current_move is None or (
                 self.state.move_start_time is not None
-                and current_time - self.state.move_start_time
-                >= self.state.current_move.duration
+                and current_time - self.state.move_start_time >= self.state.current_move.duration
             ):
                 self.state.current_move = None
                 self.state.move_start_time = None

@@ -323,9 +297,7 @@ class MovementManager:
             if self.move_queue:
                 self.state.current_move = self.move_queue.popleft()
                 self.state.move_start_time = current_time
-                logger.debug(
-                    f"Starting new move, duration: {self.state.current_move.duration}s"
-                )
+                logger.debug(f"Starting new move, duration: {self.state.current_move.duration}s")

     def _manage_breathing(self, current_time: float) -> None:
         """Manage automatic breathing when idle."""

@@ -336,9 +308,7 @@ class MovementManager:

         if self.is_idle():
             try:
-                _, current_antennas = (
-                    self.current_robot.get_current_joint_positions()
-                )
+                _, current_antennas = self.current_robot.get_current_joint_positions()
                 current_head_pose = self.current_robot.get_current_head_pose()

                 breathing_move = BreathingMove(

@@ -348,9 +318,7 @@ class MovementManager:
                 )
                 self.move_queue.append(breathing_move)
                 self.state.update_activity()
-                logger.debug(
-                    f"Started breathing after {time_since_activity:.1f}s of inactivity"
-                )
+                logger.debug(f"Started breathing after {time_since_activity:.1f}s of inactivity")
             except Exception as e:
                 logger.error(f"Failed to start breathing: {e}")

@@ -367,10 +335,7 @@ class MovementManager:
         """Get the primary full body pose from current move or neutral."""
         with self._state_lock:
             # When a primary move is playing, sample it and cache the resulting pose
-            if (
-                self.state.current_move is not None
-                and self.state.move_start_time is not None
-            ):
+            if self.state.current_move is not None and self.state.move_start_time is not None:
                 move_time = current_time - self.state.move_start_time
                 head, antennas, body_yaw = self.state.current_move.evaluate(move_time)

@@ -391,26 +356,18 @@ class MovementManager:

                 self.state.is_playing_move = True
                 self.state.is_moving = True
-                self.state.last_primary_pose = clone_full_body_pose(
-                    primary_full_body_pose
-                )
+                self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
             else:
                 # Otherwise reuse the last primary pose so we avoid jumps between moves
                 self.state.is_playing_move = False
-                self.state.is_moving = (
-                    time.time() - self.state.moving_start < self.state.moving_for
-                )
+                self.state.is_moving = time.time() - self.state.moving_start < self.state.moving_for

                 if self.state.last_primary_pose is not None:
-                    primary_full_body_pose = clone_full_body_pose(
-                        self.state.last_primary_pose
-                    )
+                    primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
                 else:
                     neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
                     primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
-                    self.state.last_primary_pose = clone_full_body_pose(
-                        primary_full_body_pose
-                    )
+                    self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)

         return primary_full_body_pose

@@ -496,9 +453,7 @@ class MovementManager:
             secondary_full_body_pose = self._get_secondary_pose()

             # 6. Combine primary and secondary poses
-            global_full_body_pose = combine_full_body(
-                primary_full_body_pose, secondary_full_body_pose
-            )
+            global_full_body_pose = combine_full_body(primary_full_body_pose, secondary_full_body_pose)

             # 7. Extract pose components
             head, antennas, body_yaw = global_full_body_pose

@@ -539,16 +494,12 @@ class MovementManager:

             # 8. Single set_target call - the one and only place we control the robot
             try:
-                self.current_robot.set_target(
-                    head=head, antennas=antennas_cmd, body_yaw=body_yaw
-                )
+                self.current_robot.set_target(head=head, antennas=antennas_cmd, body_yaw=body_yaw)
             except Exception as e:
                 logger.error(f"Failed to set robot target: {e}")
             else:
                 with self._state_lock:
-                    self._last_commanded_pose = clone_full_body_pose(
-                        (head, antennas_cmd, body_yaw)
-                    )
+                    self._last_commanded_pose = clone_full_body_pose((head, antennas_cmd, body_yaw))

             # 9. Calculate computation time and adjust sleep for 50Hz
             computation_time = time.time() - loop_start_time

@@ -558,9 +509,7 @@ class MovementManager:
             if loop_count % 100 == 0:
                 elapsed = current_time - last_print_time
                 actual_freq = 100.0 / elapsed if elapsed > 0 else 0
-                potential_freq = (
-                    1.0 / computation_time if computation_time > 0 else float("inf")
-                )
+                potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
                 logger.debug(
                     f"Loop freq - Actual: {actual_freq:.1f}Hz, Potential: {potential_freq:.1f}Hz, Target: {self.target_frequency:.1f}Hz"
                 )
src/reachy_mini_conversation_demo/openai_realtime.py
CHANGED

@@ -93,35 +93,22 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 pass
                 # self.deps.head_wobbler.reset()

-            if (
-                event.type
-                == "conversation.item.input_audio_transcription.completed"
-            ):
+            if event.type == "conversation.item.input_audio_transcription.completed":
                 logger.debug(f"user transcript: {event.transcript}")
-                await self.output_queue.put(
-                    AdditionalOutputs({"role": "user", "content": event.transcript})
-                )
+                await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))

             if event.type == "response.audio_transcript.done":
                 logger.debug(f"assistant transcript: {event.transcript}")
-                await self.output_queue.put(
-                    AdditionalOutputs(
-                        {"role": "assistant", "content": event.transcript}
-                    )
-                )
+                await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))

             if event.type == "response.audio.delta":
                 self.deps.head_wobbler.feed(event.delta)
                 self.last_activity_time = asyncio.get_event_loop().time()
-                logger.debug(
-                    "last activity time updated to %s", self.last_activity_time
-                )
+                logger.debug("last activity time updated to %s", self.last_activity_time)
                 await self.output_queue.put(
                     (
                         self.output_sample_rate,
-                        np.frombuffer(
-                            base64.b64decode(event.delta), dtype=np.int16
-                        ).reshape(1, -1),
+                        np.frombuffer(base64.b64decode(event.delta), dtype=np.int16).reshape(1, -1),
                     ),
                 )

@@ -155,9 +142,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 args_json_str = info["args_buf"] or "{}"

                 try:
-                    tool_result = await dispatch_tool_call(
-                        tool_name, args_json_str, self.deps
-                    )
+                    tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
                     logger.debug("[Tool %s executed]", tool_name)
                     logger.debug("Tool result: %s", tool_result)
                 except Exception as e:

@@ -178,9 +163,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                         {
                             "role": "assistant",
                             "content": json.dumps(tool_result),
-                            "metadata": dict(
-                                title="🛠️ Used tool " + tool_name, status="done"
-                            ),
+                            "metadata": dict(title="🛠️ Used tool " + tool_name, status="done"),
                         },
                     )
                 )

@@ -232,11 +215,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 err = getattr(event, "error", None)
                 msg = getattr(err, "message", str(err) if err else "unknown error")
                 logger.error("Realtime error: %s (raw=%s)", msg, err)
-                await self.output_queue.put(
-                    AdditionalOutputs(
-                        {"role": "assistant", "content": f"[error] {msg}"}
-                    )
-                )
+                await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))

     # Microphone receive
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:

@@ -259,9 +238,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
             await self.send_idle_signal(idle_duration)

-            self.last_activity_time = (
-                asyncio.get_event_loop().time()
-            )  # avoid repeated resets
+            self.last_activity_time = asyncio.get_event_loop().time()  # avoid repeated resets

         return await wait_for_item(self.output_queue)
src/reachy_mini_conversation_demo/tools.py
CHANGED

@@ -183,9 +183,7 @@ class MoveHead(Tool):
                 current_antennas[1],
             ),  # Skip body_yaw
             target_body_yaw=0,  # Reset body yaw
-            start_body_yaw=current_antennas[
-                0
-            ],  # body_yaw is first in joint positions
+            start_body_yaw=current_antennas[0],  # body_yaw is first in joint positions
             duration=deps.motion_duration_s,
         )

@@ -236,15 +234,11 @@ class Camera(Tool):

         # Use vision manager for processing if available
         if deps.vision_manager is not None:
-            result = await asyncio.to_thread(
-                deps.vision_manager.processor.process_image, frame, image_query
-            )
+            result = await asyncio.to_thread(deps.vision_manager.processor.process_image, frame, image_query)
             if isinstance(result, dict) and "error" in result:
                 return result
             return (
-                {"image_description": result}
-                if isinstance(result, str)
-                else {"error": "vision returned non-string"}
+                {"image_description": result} if isinstance(result, str) else {"error": "vision returned non-string"}
             )
         else:
             # Return base64 encoded image like main_works.py camera tool

@@ -436,9 +430,7 @@ class Dance(Tool):
         move_name = random.choice(list(AVAILABLE_MOVES.keys()))

         if move_name not in AVAILABLE_MOVES:
-            return {
-                "error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"
-            }
+            return {"error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"}

         # Add dance moves to queue
         movement_manager = deps.movement_manager

@@ -523,9 +515,7 @@ class PlayEmotion(Tool):
         try:
             emotion_names = RECORDED_MOVES.list_moves()
             if emotion_name not in emotion_names:
-                return {
-                    "error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"
-                }
+                return {"error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"}

             # Add emotion to queue
             movement_manager = deps.movement_manager

@@ -604,9 +594,7 @@ class FaceRecognition(Tool):
             cv2.imwrite(temp_path, frame)

             # Use DeepFace to find face
-            results = await asyncio.to_thread(
-                DeepFace.find, img_path=temp_path, db_path="./pollen_faces"
-            )
+            results = await asyncio.to_thread(DeepFace.find, img_path=temp_path, db_path="./pollen_faces")

             if len(results) == 0:
                 return {"error": "Didn't recognize the face"}

@@ -681,9 +669,7 @@ def _safe_load_obj(args_json: str) -> dict[str, Any]:
     return {}


-async def dispatch_tool_call(
-    tool_name: str, args_json: str, deps: ToolDependencies
-) -> Dict[str, Any]:
+async def dispatch_tool_call(tool_name: str, args_json: str, deps: ToolDependencies) -> Dict[str, Any]:
     """Dispatch a tool call by name with JSON args and dependencies."""
     tool = ALL_TOOLS.get(tool_name)
src/reachy_mini_conversation_demo/utils.py
CHANGED

@@ -15,15 +15,9 @@ def parse_args():
         default=None,
         help="Choose head tracker (default: mediapipe)",
     )
-    parser.add_argument(
-        "--no-camera", default=False, action="store_true", help="Disable camera usage"
-    )
-    parser.add_argument(
-        "--headless", default=False, action="store_true", help="Run in headless mode"
-    )
-    parser.add_argument(
-        "--debug", default=False, action="store_true", help="Enable debug logging"
-    )
+    parser.add_argument("--no-camera", default=False, action="store_true", help="Disable camera usage")
+    parser.add_argument("--headless", default=False, action="store_true", help="Run in headless mode")
+    parser.add_argument("--debug", default=False, action="store_true", help="Enable debug logging")
     return parser.parse_args()

src/reachy_mini_conversation_demo/vision/processors.py
CHANGED

@@ -61,9 +61,7 @@ class VisionProcessor:
     def initialize(self) -> bool:
         """Load model and processor onto the selected device."""
         try:
-            logger.info(
-                f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})"
-            )
+            logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})")
             self.processor = AutoProcessor.from_pretrained(self.model_path)

             # Select dtype depending on device

@@ -81,9 +79,7 @@ class VisionProcessor:
                 model_kwargs["_attn_implementation"] = "flash_attention_2"

             # Load model weights
-            self.model = AutoModelForImageTextToText.from_pretrained(
-                self.model_path, **model_kwargs
-            ).to(self.device)
+            self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)

             self.model.eval()
             self._initialized = True

@@ -138,10 +134,7 @@ class VisionProcessor:
         )

         # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
-        inputs = {
-            k: (v.to(self.device) if hasattr(v, "to") else v)
-            for k, v in inputs.items()
-        }
+        inputs = {k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()}

         with torch.no_grad():
             generated_ids = self.model.generate(

@@ -246,9 +239,7 @@ class VisionManager:
                 )

                 # Only update if we got a valid response
-                if description and not description.startswith(
-                    ("Vision", "Failed", "Error")
-                ):
+                if description and not description.startswith(("Vision", "Failed", "Error")):
                     self._current_description = description
                     self._last_processed_time = current_time

@@ -268,18 +259,14 @@ class VisionManager:
         """Get the most recent scene description (thread-safe)."""
         return self._current_description

-    async def process_current_frame(
-        self, prompt: str = "Describe what you see in detail."
-    ) -> Dict[str, Any]:
+    async def process_current_frame(self, prompt: str = "Describe what you see in detail.") -> Dict[str, Any]:
         """Process current camera frame with custom prompt."""
         try:
             success, frame = self.camera.read()
             if not success or frame is None:
                 return {"error": "Failed to capture image from camera"}

-            description = await asyncio.to_thread(
-                lambda: self.processor.process_image(frame, prompt)
-            )
+            description = await asyncio.to_thread(lambda: self.processor.process_image(frame, prompt))

             return {
                 "description": description,

@@ -335,9 +322,7 @@ def create_vision_processor(config: VisionConfig):
     return VisionProcessor(config)


-def init_vision(
-    camera: cv2.VideoCapture, processor_type: str = "local"
-) -> VisionManager:
+def init_vision(camera: cv2.VideoCapture, processor_type: str = "local") -> VisionManager:
     """Initialize vision manager with the specified processor type."""
     model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py
CHANGED

@@ -94,9 +94,7 @@ class HeadTracker:

         return np.array([norm_x, norm_y], dtype=np.float32)

-    def get_eyes(
-        self, img: np.ndarray
-    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
+    def get_eyes(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
         """Get eye positions (approximated from face bbox).

         Note: YOLO only provides face bbox, so we estimate eye positions

@@ -131,20 +129,14 @@ class HeadTracker:
         right_eye_x = bbox[0] + face_width * 0.65

         # Convert to MediaPipe coordinates
-        left_eye = np.array(
-            [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
-        )
-        right_eye = np.array(
-            [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
-        )
+        left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
+        right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)

         return left_eye, right_eye

     def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
         """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
-        if not hasattr(face_landmarks, "_bbox") or not hasattr(
-            face_landmarks, "_img_shape"
-        ):
+        if not hasattr(face_landmarks, "_bbox") or not hasattr(face_landmarks, "_img_shape"):
             raise ValueError("Face landmarks object missing required attributes")

         bbox = face_landmarks._bbox

@@ -158,12 +150,8 @@ class HeadTracker:
         left_eye_x = bbox[0] + face_width * 0.35
         right_eye_x = bbox[0] + face_width * 0.65

-        left_eye = np.array(
-            [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
-        )
-        right_eye = np.array(
-            [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
-        )
+        left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
+        right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)

         return left_eye, right_eye

@@ -177,9 +165,7 @@ class HeadTracker:
         left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
         return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))

-    def get_head_position(
-        self, img: np.ndarray
-    ) -> Tuple[Optional[np.ndarray], Optional[float]]:
+    def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
         """Get head position from face detection.

         Args:
|