Alina Lozovskaya committed
Commit 1d985fe · Parent(s): 69601f9

Apply Ruff for better formatting
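The commit message does not record the exact invocation. For a formatting-only change like this one, a typical run (assuming Ruff's default configuration and the src/ layout shown below) would be:

    ruff format src/
    ruff check --fix src/

ruff format rewrites files in place; ruff check --fix additionally applies safe lint autofixes (such as import sorting, when those rules are enabled).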
src/reachy_mini_conversation_demo/audio_sway.py
CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import asyncio
 import base64
-from dataclasses import dataclass
+from dataclasses import dataclass
 from typing import Callable, Optional, Tuple
 
 import numpy as np
src/reachy_mini_conversation_demo/main.py
CHANGED
@@ -100,7 +100,9 @@ elif HEAD_TRACKING and SIM:
 else:
     logger.warning("Head tracking disabled")
 
-movement_manager = MovementManager(current_robot=current_robot, head_tracker=head_tracker, camera=camera)
+movement_manager = MovementManager(
+    current_robot=current_robot, head_tracker=head_tracker, camera=camera
+)
 robot_is_speaking = asyncio.Event()
 speaking_queue = asyncio.Queue()
 
@@ -410,7 +412,7 @@ async def control_mic_loop():
             audio_sync.on_response_completed()
             await asyncio.sleep(0)
             continue
-
+
         await asyncio.sleep(block_time)
 
 
@@ -423,7 +425,7 @@ async def main():
     recorder.record()
     player = GstPlayer()
     player.play()
-
+
     movement_manager.set_neutral()
     logger.info("Starting main audio loop. You can start to speak")
 
@@ -432,12 +434,16 @@ async def main():
         asyncio.create_task(emit_loop(player, openai), name="emit"),
         asyncio.create_task(receive_loop(recorder, openai), name="recv"),
         asyncio.create_task(control_mic_loop(), name="mic-mute"),
-        asyncio.create_task(movement_manager.enable(stop_event=stop_event), name="move"),
+        asyncio.create_task(
+            movement_manager.enable(stop_event=stop_event), name="move"
+        ),
     ]
 
     if vision_manager:
         tasks.append(
-            asyncio.create_task(vision_manager.enable(stop_event=stop_event), name="vision"),
+            asyncio.create_task(
+                vision_manager.enable(stop_event=stop_event), name="vision"
+            ),
         )
 
     try:
@@ -445,7 +451,7 @@ async def main():
     except asyncio.CancelledError:
         logger.info("Shutting down")
         stop_event.set()
-
+
         if camera:
             camera.release()
 
@@ -453,9 +459,10 @@ async def main():
         movement_manager.set_neutral()
         recorder.stop()
         player.stop()
-
+
         current_robot.client.disconnect()
         logger.info("Stopped, robot disconected")
 
+
 if __name__ == "__main__":
     asyncio.run(main())
src/reachy_mini_conversation_demo/movement.py
CHANGED
@@ -15,7 +15,12 @@ logger = logging.getLogger(__name__)
 
 
 class MovementManager:
-    def __init__(self, current_robot: ReachyMini, head_tracker: HeadTracker | None, camera: cv2.VideoCapture | None):
+    def __init__(
+        self,
+        current_robot: ReachyMini,
+        head_tracker: HeadTracker | None,
+        camera: cv2.VideoCapture | None,
+    ):
         self.current_robot = current_robot
         self.head_tracker = head_tracker
         self.camera = camera
@@ -25,21 +30,20 @@
         self.moving_start = time.monotonic()
         self.moving_for = 0.0
         self.speech_head_offsets = [0.0] * 6
-        self.movement_loop_sleep = 0.05
+        self.movement_loop_sleep = 0.05  # seconds
 
     def set_offsets(self, offsets: list[float]) -> None:
         """Used by AudioSync callback to update speech offsets"""
         self.speech_head_offsets = list(offsets)
 
     def set_neutral(self) -> None:
-        """Set neutral robot position"""
+        """Set neutral robot position"""
         self.speech_head_offsets = [0.0] * 6
         self.current_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
         self.current_robot.set_target(head=self.current_head_pose, antennas=(0.0, 0.0))
 
     def reset_head_pose(self) -> None:
         self.current_head_pose = np.eye(4)
-
 
     async def enable(self, stop_event: threading.Event) -> None:
         logger.info("Starting head movement loop")
@@ -56,7 +60,9 @@
                 logger.warning("Camera read failed")
                 last_log_ts = current_time
             else:
-                eye_center, _ = self.head_tracker.get_head_position(im)  # as [-1, 1]
+                eye_center, _ = self.head_tracker.get_head_position(
+                    im
+                )  # as [-1, 1]
 
                 if eye_center is not None:
                     # Rescale target position into IMAGE_SIZE coordinates
@@ -68,22 +74,20 @@
                     # Bounds checking
                     eye_center = np.clip(eye_center, [0, 0], [w - 1, h - 1])
 
-                    current_head_pose = (
-                        self.current_robot.look_at_image(
-                            *eye_center, duration=0.0, apply=False
-                        )
+                    current_head_pose = self.current_robot.look_at_image(
+                        *eye_center, duration=0.0, apply=False
                     )
 
                     self.current_head_pose = current_head_pose
                 # Pose calculation
                 try:
-                    current_x, current_y, current_z = self.current_head_pose[
-                        :3, 3
-                    ]
+                    current_x, current_y, current_z = self.current_head_pose[:3, 3]
 
-                    current_roll, current_pitch, current_yaw = scipy.spatial.transform.Rotation.from_matrix(
-                        self.current_head_pose[:3, :3]
-                    ).as_euler("xyz", degrees=False)
+                    current_roll, current_pitch, current_yaw = (
+                        scipy.spatial.transform.Rotation.from_matrix(
+                            self.current_head_pose[:3, :3]
+                        ).as_euler("xyz", degrees=False)
+                    )
 
                     if debug_frame_count % 50 == 0:
                         logger.debug(
@@ -102,15 +106,11 @@
                 except Exception as e:
                     logger.exception("Invalid pose; resetting")
                     self.reset_head_pose()
-                    current_x, current_y, current_z = self.current_head_pose[
-                        :3, 3
-                    ]
+                    current_x, current_y, current_z = self.current_head_pose[:3, 3]
                     current_roll = current_pitch = current_yaw = 0.0
 
             # Movement check
-            is_moving = (
-                time.monotonic() - self.moving_start < self.moving_for
-            )
+            is_moving = time.monotonic() - self.moving_start < self.moving_for
 
             if debug_frame_count % 50 == 0:
                 logger.debug(f"Robot moving: {is_moving}")
@@ -133,9 +133,7 @@
                     logger.debug(
                         "Final head pose with offsets: %s", head_pose[:3, 3]
                     )
-                    logger.debug(
-                        "Speech offsets: %s", self.speech_head_offsets
-                    )
+                    logger.debug("Speech offsets: %s", self.speech_head_offsets)
 
             self.current_robot.set_target(head=head_pose, antennas=(0.0, 0.0))
 
@@ -146,5 +144,5 @@
                 logger.debug("Failed to set robot target: %s", e)
 
             await asyncio.sleep(self.movement_loop_sleep)
-
+
         logger.info("Exited head movement loop")
src/reachy_mini_conversation_demo/vision.py
CHANGED
@@ -239,9 +239,10 @@ class VisionManager:
                 if current_time - self._last_processed_time >= self.vision_interval:
                     success, frame = await asyncio.to_thread(self.camera.read)
                     if success and frame is not None:
-                        description = await asyncio.to_thread(
-                            lambda: self.processor.process_image(
-                                frame, "Briefly describe what you see in one sentence.")
+                        description = await asyncio.to_thread(
+                            lambda: self.processor.process_image(
+                                frame, "Briefly describe what you see in one sentence."
+                            )
                         )
 
                         # Only update if we got a valid response
@@ -259,7 +260,7 @@
 
             except Exception as e:
                 logger.exception("Vision processing loop error")
-                await asyncio.sleep(5.0)
+                await asyncio.sleep(5.0)  # Longer sleep on error
 
         logger.info(f"Vision loop finished")
 
@@ -275,8 +276,10 @@
             success, frame = self.camera.read()
             if not success or frame is None:
                 return {"error": "Failed to capture image from camera"}
-
-            description = await asyncio.to_thread(lambda: self.processor.process_image(frame, prompt))
+
+            description = await asyncio.to_thread(
+                lambda: self.processor.process_image(frame, prompt)
+            )
 
             return {
                 "description": description,
@@ -287,8 +290,7 @@
         except Exception as e:
             logger.exception("Failed to process current frame")
             return {"error": f"Frame processing failed: {str(e)}"}
-
-
+
     async def get_status(self) -> Dict[str, Any]:
         """Get comprehensive status information"""
         return {
@@ -303,18 +305,14 @@
         }
 
 
-
-def init_camera(camera_index = 0, simulation=True):
-
+def init_camera(camera_index=0, simulation=True):
     api_preference = cv2.CAP_AVFOUNDATION if sys.platform == "darwin" else 0
 
     if simulation:
         # Default build-in camera in SIM
         # TODO: please, test on Linux and Windows
         # TODO simulation in find_camera
-        camera = cv2.VideoCapture(
-            0, api_preference
-        )
+        camera = cv2.VideoCapture(0, api_preference)
     else:
         # TODO handle macos in find_camera
         if sys.platform == "darwin":
@@ -328,7 +326,6 @@ def init_camera(camera_index = 0, simulation=True):
 def init_vision(camera: cv2.VideoCapture) -> VisionManager:
     model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
 
-
     cache_dir = os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface"))
 
     try:
@@ -362,7 +359,7 @@ def init_vision(camera: cv2.VideoCapture) -> VisionManager:
 
     device_info = vision_manager.processor.get_model_info()
     logger.info(
-        f"Vision processing enabled: {device_info['model_path']} on {device_info['device']} memory: {device_info.get('gpu_memory', 'N/A')}"
+        f"Vision processing enabled: {device_info['model_path']} on {device_info['device']} memory: {device_info.get('gpu_memory', 'N/A')}",
     )
 
     return vision_manager