Merge origin/develop into branch: resolve audio/moves conflicts and sync deps
- .env.example +4 -1
- README.md +5 -1
- pyproject.toml +53 -19
- src/reachy_mini_conversation_demo/__init__.py +1 -1
- src/reachy_mini_conversation_demo/audio/__init__.py +1 -1
- src/reachy_mini_conversation_demo/audio/gstreamer.py +18 -38
- src/reachy_mini_conversation_demo/audio/head_wobbler.py +6 -7
- src/reachy_mini_conversation_demo/audio/speech_tapper.py +8 -23
- src/reachy_mini_conversation_demo/camera_worker.py +22 -48
- src/reachy_mini_conversation_demo/config.py +4 -2
- src/reachy_mini_conversation_demo/dance_emotion_moves.py +12 -30
- src/reachy_mini_conversation_demo/main.py +6 -6
- src/reachy_mini_conversation_demo/moves.py +33 -75
- src/reachy_mini_conversation_demo/openai_realtime.py +16 -37
- src/reachy_mini_conversation_demo/prompts.py +1 -1
- src/reachy_mini_conversation_demo/tools.py +16 -30
- src/reachy_mini_conversation_demo/utils.py +4 -10
- src/reachy_mini_conversation_demo/vision/processors.py +14 -28
- src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py +11 -25
.env.example
CHANGED
```diff
@@ -5,4 +5,7 @@ MODEL_NAME="gpt-4o-realtime-preview-2025-06-03"
 OPENAI_VISION_MODEL="gpt-4.1-mini"
 
 # Cache for local VLM
-HF_HOME=./cache
+HF_HOME=./cache
+
+# Hugging Face token for accessing datasets/models
+HF_TOKEN=
```
README.md
CHANGED
````diff
@@ -3,10 +3,14 @@
 Working repo, we should turn this into a ReachyMini app at some point maybe ?
 
 ## Installation
+You can set up the project quickly using [uv](https://docs.astral.sh/uv/):
 
 ```bash
-
+uv venv --python 3.12.1 # Create a virtual environment with Python 3.12.1
+source .venv/bin/activate
+uv sync
 ```
+> Note: The `pyproject.toml` expects `reachy-mini-dances-library` to be located in the same directory as this project.
 
 ## Run
 
````
pyproject.toml
CHANGED
```diff
@@ -10,44 +10,78 @@ description = ""
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-
-    "openai",
+    "aiortc>=1.13.0",
     "fastrtc",
+    "gradio>=5.49.0",
+    "huggingface_hub>=0.34.4",
+    "mediapipe>=0.10.14",
+    "num2words",
     "onnxruntime",
-    "
+    "opencv-python>=4.12.0.88",
+    "openai>=2.1",
+    "PyGObject>=3.42.2,<=3.46.0",
+    "python-dotenv",
+    "reachy_mini_dances_library",
+    "reachy_mini_toolbox",
+    "reachy_mini",
+    "supervision",
     "torch",
     "transformers",
-    "num2words",
-    "dotenv",
     "ultralytics",
-    "supervision",
-    "reachy_mini_toolbox@git+ssh://git@github.com/pollen-robotics/reachy_mini_toolbox@main",
-    "reachy_mini_dances_library@git+ssh://git@github.com/pollen-robotics/reachy_mini_dances_library@main"
 ]
 
+[project.scripts]
+reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
 
-[
+[dependency-groups]
 dev = ["pytest", "ruff==0.12.0"]
 
-[
-
+[tool.uv.sources]
+reachy_mini_dances_library = { path = "../reachy_mini_dances_library", editable = true }
+reachy_mini = { git = "ssh://git@github.com/pollen-robotics/reachy_mini.git", branch = "develop" }
+reachy_mini_toolbox = { git = "ssh://git@github.com/pollen-robotics/reachy_mini_toolbox.git", branch = "main" }
+fastrtc = { git = "ssh://git@github.com/gradio-app/fastrtc.git", branch = "main" }
 
 [tool.setuptools]
 package-dir = { "" = "src" }
 include-package-data = true
 
-
 [tool.setuptools.packages.find]
 where = ["src"]
 
-
 [tool.setuptools.package-data]
-
+reachy_mini_conversation_demo = ["images/*"]
 
 [tool.ruff]
-
-
-
-
-
+line-length = 119
+exclude = [".venv", "dist", "build", "**/__pycache__", "*.egg-info", ".mypy_cache", ".pytest_cache"]
+
+[tool.ruff.lint]
+select = [
+    "E",  # pycodestyle errors
+    "F",  # pyflakes
+    "W",  # pycodestyle warnings
+    "I",  # isort
+    "C4", # flake8-comprehensions
+    "D",  # pydocstyle
 ]
+ignore = [
+    "E501", # handled by formatter
+    "D100", # ignore missing module docstrings
+    "D203", # blank line before class docstring (conflicts with D211)
+    "D213", # summary on second line (conflicts with D212)
+]
+
+[tool.ruff.lint.isort]
+length-sort = true
+lines-after-imports = 2
+no-lines-before = ["standard-library", "local-folder"]
+known-local-folder = ["reachy_mini_conversation_demo"]
+known-first-party = ["reachy_mini", "reachy_mini_dances_library", "reachy_mini_toolbox"]
+split-on-trailing-comma = true
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
```
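The `[tool.uv.sources]` entry above resolves `reachy_mini_dances_library` from a sibling checkout (the same layout the README note describes). A minimal pre-flight check before running `uv sync` might look like this; the helper is hypothetical and not part of the repo:

```python
from pathlib import Path

# Hypothetical sanity check: the path source "../reachy_mini_dances_library" is
# resolved relative to pyproject.toml, so the dances library must be cloned next
# to this repository before `uv sync` can succeed.
repo_root = Path(".").resolve()
dances = repo_root.parent / "reachy_mini_dances_library"
if not dances.is_dir():
    raise SystemExit(f"Expected sibling checkout at {dances}")
```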
src/reachy_mini_conversation_demo/__init__.py
CHANGED
```diff
@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
```
src/reachy_mini_conversation_demo/audio/__init__.py
CHANGED
```diff
@@ -1 +1 @@
-"""Nothing (for ruff)."""
+"""Nothing (for ruff)."""
```
src/reachy_mini_conversation_demo/audio/gstreamer.py
CHANGED
```diff
@@ -1,12 +1,13 @@
-import logging
-from threading import Thread
+import logging
 from typing import Optional
+from threading import Thread
 
 import gi
 
+
 gi.require_version("Gst", "1.0")
 gi.require_version("GstApp", "1.0")
-from gi.repository import
+from gi.repository import Gst, GLib  # noqa: E402
 
 
 class GstPlayer:
@@ -25,18 +26,16 @@ class GstPlayer:
         self.appsrc = Gst.ElementFactory.make("appsrc", None)
         self.appsrc.set_property("format", Gst.Format.TIME)
         self.appsrc.set_property("is-live", True)
-        caps = Gst.Caps.from_string(
-            f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved"
-        )
+        caps = Gst.Caps.from_string(f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved")
         self.appsrc.set_property("caps", caps)
         queue = Gst.ElementFactory.make("queue")
         audioconvert = Gst.ElementFactory.make("audioconvert")
         audioresample = Gst.ElementFactory.make("audioresample")
 
         # Try to pin specific output device; fallback to autoaudiosink
-        audiosink = _create_device_element(
-
-        )
+        audiosink = _create_device_element(direction="sink", name_substr=device_name) or Gst.ElementFactory.make(
+            "autoaudiosink"
+        )
 
         self.pipeline.add(self.appsrc)
         self.pipeline.add(queue)
@@ -104,9 +103,9 @@ class GstRecorder:
         self.pipeline = Gst.Pipeline.new("audio_recorder")
 
         # Create elements: try specific mic; fallback to default
-        autoaudiosrc = _create_device_element(
-
-        )
+        autoaudiosrc = _create_device_element(direction="source", name_substr=device_name) or Gst.ElementFactory.make(
+            "autoaudiosrc", None
+        )
 
         queue = Gst.ElementFactory.make("queue", None)
         audioconvert = Gst.ElementFactory.make("audioconvert", None)
@@ -117,9 +116,7 @@ class GstRecorder:
             raise RuntimeError("Failed to create GStreamer elements")
 
         # Force mono/S16LE at 24000; resample handles device SR (e.g., 16000 → 24000)
-        caps = Gst.Caps.from_string(
-            f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE"
-        )
+        caps = Gst.Caps.from_string(f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE")
         self.appsink.set_property("caps", caps)
 
         # Build pipeline
@@ -183,9 +180,7 @@ class GstRecorder:
         logger.info("Stopped Recorder")
 
 
-def _create_device_element(
-    direction: str, name_substr: Optional[str]
-) -> Optional[Gst.Element]:
+def _create_device_element(direction: str, name_substr: Optional[str]) -> Optional[Gst.Element]:
     """direction: 'source' or 'sink'.
 
     name_substr: case-insensitive substring matching device display name/description.
@@ -205,30 +200,15 @@ def _create_device_element(
         for dev in monitor.get_devices() or []:
             disp = dev.get_display_name() or ""
             props = dev.get_properties()
-            desc = (
-                props.get_string("device.description")
-                if props and props.has_field("device.description")
-                else ""
-            )
+            desc = props.get_string("device.description") if props and props.has_field("device.description") else ""
             logger.info(f"Device candidate: disp='{disp}', desc='{desc}'")
 
-            if (
-                name_substr.lower() in disp.lower()
-                or name_substr.lower() in desc.lower()
-            ):
+            if name_substr.lower() in disp.lower() or name_substr.lower() in desc.lower():
                 elem = dev.create_element(None)
-                factory = (
-
-                    if elem and elem.get_factory()
-                    else "<?>"
-                )
-                logger.info(
-                    f"Using {direction} device: '{disp or desc}' (factory='{factory}')"
-                )
+                factory = elem.get_factory().get_name() if elem and elem.get_factory() else "<?>"
+                logger.info(f"Using {direction} device: '{disp or desc}' (factory='{factory}')")
                 return elem
     finally:
         monitor.stop()
-    logging.getLogger(__name__).warning(
-        "Requested %s '%s' not found; using auto*", direction, name_substr
-    )
+    logging.getLogger(__name__).warning("Requested %s '%s' not found; using auto*", direction, name_substr)
     return None
```
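For reference, the caps strings built above are plain comma-separated GStreamer capability descriptions; a small sketch of what the f-strings expand to at the default 24 kHz rate:

```python
# Evaluating the caps f-strings from the diff for sample_rate = 24000.
sample_rate = 24000
player_caps = f"audio/x-raw,format=S16LE,channels=1,rate={sample_rate},layout=interleaved"
recorder_caps = f"audio/x-raw,channels=1,rate={sample_rate},format=S16LE"
print(player_caps)   # audio/x-raw,format=S16LE,channels=1,rate=24000,layout=interleaved
print(recorder_caps) # audio/x-raw,channels=1,rate=24000,format=S16LE
```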
src/reachy_mini_conversation_demo/audio/head_wobbler.py
CHANGED
```diff
@@ -1,16 +1,17 @@
 """Moves head given audio samples."""
 
+import time
+import queue
 import base64
 import logging
-import queue
 import threading
-import time
 from typing import Optional, Tuple
 
 import numpy as np
 
 from reachy_mini_conversation_demo.audio.speech_tapper import HOP_MS, SwayRollRT
 
+
 SAMPLE_RATE = 24000
 MOVEMENT_LATENCY_S = 0.08  # seconds between audio and robot movement
 logger = logging.getLogger(__name__)
@@ -25,9 +26,7 @@ class HeadWobbler:
         self._base_ts: Optional[float] = None
         self._hops_done: int = 0
 
-        self.audio_queue: queue.Queue[
-            Tuple[int, int, np.ndarray]
-        ] = queue.Queue()
+        self.audio_queue: queue.Queue[Tuple[int, int, np.ndarray]] = queue.Queue()
         self.sway = SwayRollRT()
 
         # Synchronization primitives
@@ -50,14 +49,14 @@ class HeadWobbler:
         self._stop_event.clear()
         self._thread = threading.Thread(target=self.working_loop, daemon=True)
         self._thread.start()
-        logger.
+        logger.debug("Head wobbler started")
 
     def stop(self) -> None:
         """Stop the head wobbler loop."""
         self._stop_event.set()
         if self._thread is not None:
             self._thread.join()
-        logger.
+        logger.debug("Head wobbler stopped")
 
     def working_loop(self) -> None:
         """Convert audio deltas into head movement offsets."""
```
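The wobbler is fed base64-encoded PCM16 deltas at 24 kHz (the same payloads the realtime handler decodes). A minimal sketch of turning one delta into samples and a duration:

```python
import base64

import numpy as np

SAMPLE_RATE = 24000  # matches the module constant above


def decode_delta(delta_b64: str) -> tuple[np.ndarray, float]:
    """Decode a base64 PCM16 delta into int16 samples and its duration in seconds."""
    samples = np.frombuffer(base64.b64decode(delta_b64), dtype=np.int16)
    return samples, len(samples) / SAMPLE_RATE
```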
src/reachy_mini_conversation_demo/audio/speech_tapper.py
CHANGED
```diff
@@ -1,12 +1,12 @@
-from __future__ import annotations
-
+from __future__ import annotations
 import math
-from collections import deque
-from itertools import islice
 from typing import Dict, List, Optional
+from itertools import islice
+from collections import deque
 
 import numpy as np
 
+
 # Tunables
 SR = 16_000
 FRAME_MS = 20
@@ -68,7 +68,7 @@ def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float:
 
 def _to_float32_mono(x: np.ndarray) -> np.ndarray:
     """Convert arbitrary PCM array to float32 mono in [-1,1].
-
+
     Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C).
     """
     a = np.asarray(x)
@@ -258,24 +258,9 @@ class SwayRollRT:
                 * env
                 * math.sin(2 * math.pi * SWAY_F_ROLL * self.t + self.phase_roll)
             )
-            x_mm = (
-                SWAY_A_X_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
-            )
-            y_mm = (
-                SWAY_A_Y_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
-            )
-            z_mm = (
-                SWAY_A_Z_MM
-                * loud
-                * env
-                * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)
-            )
+            x_mm = SWAY_A_X_MM * loud * env * math.sin(2 * math.pi * SWAY_F_X * self.t + self.phase_x)
+            y_mm = SWAY_A_Y_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Y * self.t + self.phase_y)
+            z_mm = SWAY_A_Z_MM * loud * env * math.sin(2 * math.pi * SWAY_F_Z * self.t + self.phase_z)
 
             out.append(
                 {
```
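Each axis offset in the hunk above is the same product: amplitude × loudness × envelope × a sinusoid at that axis's frequency and phase. A generic sketch of the per-axis term (the amplitude, frequency, and phase values below are placeholders, not the module's tunables):

```python
import math


def sway_term(t: float, loud: float, env: float, amp: float, freq_hz: float, phase: float) -> float:
    """One axis of the sway: amplitude scaled by loudness and envelope, oscillating at freq_hz."""
    return amp * loud * env * math.sin(2 * math.pi * freq_hz * t + phase)


# e.g. a 4 mm amplitude axis at 1.0 Hz, half loudness, full envelope, at t = 0.25 s:
print(sway_term(0.25, loud=0.5, env=1.0, amp=4.0, freq_hz=1.0, phase=0.0))  # 2.0
```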
src/reachy_mini_conversation_demo/camera_worker.py
CHANGED
```diff
@@ -6,16 +6,18 @@ Ported from main_works.py camera_worker() function to provide:
 - Latest frame always available for tools
 """
 
+import time
 import logging
 import threading
-import time
-from typing import Optional, Tuple
+from typing import Tuple, Optional
 
 import cv2
 import numpy as np
+from scipy.spatial.transform import Rotation as R
+
 from reachy_mini import ReachyMini
 from reachy_mini.utils.interpolation import linear_pose_interpolation
-
+
 
 logger = logging.getLogger(__name__)
 
@@ -83,14 +85,14 @@ class CameraWorker:
         self._stop_event.clear()
         self._thread = threading.Thread(target=self.working_loop, daemon=True)
         self._thread.start()
-        logger.
+        logger.debug("Camera worker started")
 
     def stop(self) -> None:
         """Stop the camera worker loop."""
         self._stop_event.set()
         if self._thread is not None:
             self._thread.join()
-        logger.
+        logger.debug("Camera worker stopped")
 
     def working_loop(self) -> None:
         """Enable the camera worker loop.
@@ -114,17 +116,10 @@ class CameraWorker:
                 self.latest_frame = frame  # .copy()
 
                 # Check if face tracking was just disabled
-                if (
-                    self.previous_head_tracking_state
-                    and not self.is_head_tracking_enabled
-                ):
+                if self.previous_head_tracking_state and not self.is_head_tracking_enabled:
                     # Face tracking was just disabled - start interpolation to neutral
-                    self.last_face_detected_time = (
-                        current_time  # Trigger the face-lost logic
-                    )
-                    self.interpolation_start_time = (
-                        None  # Will be set by the face-lost interpolation
-                    )
+                    self.last_face_detected_time = current_time  # Trigger the face-lost logic
+                    self.interpolation_start_time = None  # Will be set by the face-lost interpolation
                     self.interpolation_start_pose = None
 
                 # Update tracking state
@@ -137,9 +132,7 @@ class CameraWorker:
                 if eye_center is not None:
                     # Face detected - immediately switch to tracking
                     self.last_face_detected_time = current_time
-                    self.interpolation_start_time = (
-                        None  # Stop any interpolation
-                    )
+                    self.interpolation_start_time = None  # Stop any interpolation
 
                     # Convert normalized coordinates to pixel coordinates
                     h, w, _ = frame.shape
@@ -159,9 +152,7 @@ class CameraWorker:
 
                     # Extract translation and rotation from the target pose directly
                     translation = target_pose[:3, 3]
-                    rotation = R.from_matrix(target_pose[:3, :3]).as_euler(
-                        "xyz", degrees=False
-                    )
+                    rotation = R.from_matrix(target_pose[:3, :3]).as_euler("xyz", degrees=False)
 
                     # Thread-safe update of face tracking offsets (use pose as-is)
                     with self.face_tracking_lock:
@@ -176,19 +167,14 @@ class CameraWorker:
 
                 else:
                     # No face detected while tracking enabled - set face lost timestamp
-                    if (
-                        self.last_face_detected_time is None
-                        or self.last_face_detected_time == current_time
-                    ):
+                    if self.last_face_detected_time is None or self.last_face_detected_time == current_time:
                         # Only update if we haven't already set a face lost time
                         # (current_time check prevents overriding the disable-triggered timestamp)
                         pass
 
                 # Handle smooth interpolation (works for both face-lost and tracking-disabled cases)
                 if self.last_face_detected_time is not None:
-                    time_since_face_lost = (
-                        current_time - self.last_face_detected_time
-                    )
+                    time_since_face_lost = current_time - self.last_face_detected_time
 
                     if time_since_face_lost >= self.face_lost_delay:
                         # Start interpolation if not already started
@@ -197,27 +183,17 @@ class CameraWorker:
                             # Capture current pose as start of interpolation
                             with self.face_tracking_lock:
                                 current_translation = self.face_tracking_offsets[:3]
-                                current_rotation_euler = self.face_tracking_offsets[
-                                    3:
-                                ]
+                                current_rotation_euler = self.face_tracking_offsets[3:]
                             # Convert to 4x4 pose matrix
                             self.interpolation_start_pose = np.eye(4)
-                            self.interpolation_start_pose[:3, 3] = (
-                                current_translation
-                            )
-                            self.interpolation_start_pose[:3, :3] = (
-                                R.from_euler(
-                                    "xyz", current_rotation_euler
-                                ).as_matrix()
-                            )
+                            self.interpolation_start_pose[:3, 3] = current_translation
+                            self.interpolation_start_pose[:3, :3] = R.from_euler(
+                                "xyz", current_rotation_euler
+                            ).as_matrix()
 
                         # Calculate interpolation progress (t from 0 to 1)
-                        elapsed_interpolation = (
-                            current_time - self.interpolation_start_time
-                        )
-                        t = min(
-                            1.0, elapsed_interpolation / self.interpolation_duration
-                        )
+                        elapsed_interpolation = current_time - self.interpolation_start_time
+                        t = min(1.0, elapsed_interpolation / self.interpolation_duration)
 
                         # Interpolate between current pose and neutral pose
                         interpolated_pose = linear_pose_interpolation(
@@ -226,9 +202,7 @@ class CameraWorker:
 
                         # Extract translation and rotation from interpolated pose
                         translation = interpolated_pose[:3, 3]
-                        rotation = R.from_matrix(
-                            interpolated_pose[:3, :3]
-                        ).as_euler("xyz", degrees=False)
+                        rotation = R.from_matrix(interpolated_pose[:3, :3]).as_euler("xyz", degrees=False)
 
                         # Thread-safe update of face tracking offsets
                         with self.face_tracking_lock:
```
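The face-lost branch above relies on `linear_pose_interpolation` from reachy_mini to ease the head back to neutral. As a rough sketch of what interpolating between two 4×4 poses can look like (an assumption about the helper's behaviour, not its actual implementation), translation is lerped and rotation is slerped:

```python
import numpy as np
from scipy.spatial.transform import Rotation as R, Slerp


def lerp_pose(start: np.ndarray, target: np.ndarray, t: float) -> np.ndarray:
    """Blend two 4x4 homogeneous poses: linear on translation, spherical on rotation."""
    t = float(np.clip(t, 0.0, 1.0))
    out = np.eye(4)
    out[:3, 3] = (1.0 - t) * start[:3, 3] + t * target[:3, 3]
    slerp = Slerp([0.0, 1.0], R.from_matrix(np.stack([start[:3, :3], target[:3, :3]])))
    out[:3, :3] = slerp(t).as_matrix()
    return out
```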
src/reachy_mini_conversation_demo/config.py
CHANGED
```diff
@@ -1,7 +1,8 @@
-import os
+import os
 
 from dotenv import load_dotenv
 
+
 load_dotenv()
 
 
@@ -15,7 +16,7 @@ def getenv_bool(key: str, default: bool = False) -> bool:
 
 class Config:
     """Configuration class for the conversation demo."""
-
+
     # Required
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
     if not OPENAI_API_KEY:
@@ -25,6 +26,7 @@ class Config:
     MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-realtime-preview")
     OPENAI_VISION_MODEL = os.getenv("OPENAI_VISION_MODEL", "gpt-4.1-mini")
     HF_HOME = os.getenv("HF_HOME", "./cache")
+    HF_TOKEN = os.getenv("HF_TOKEN")  # Optional, falls back to hf auth login if not set
 
 
 config = Config()
```
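The hunk header above references a `getenv_bool(key, default)` helper whose body is not part of this diff; a typical implementation (an assumption, not the repo's code) is just a truthy-string check on top of `os.getenv`:

```python
import os


def getenv_bool(key: str, default: bool = False) -> bool:
    """Read an environment variable and interpret common truthy strings."""
    value = os.getenv(key)
    if value is None:
        return default
    return value.strip().lower() in {"1", "true", "yes", "on"}
```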
src/reachy_mini_conversation_demo/dance_emotion_moves.py
CHANGED
```diff
@@ -5,15 +5,16 @@ and executed sequentially by the MovementManager.
 """
 
 from __future__ import annotations
-
 import logging
 from typing import Tuple
 
 import numpy as np
+
 from reachy_mini.motion.move import Move
 from reachy_mini.motion.recorded_move import RecordedMoves
 from reachy_mini_dances_library.dance_move import DanceMove
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -30,9 +31,7 @@ class DanceQueueMove(Move):
         """Duration property required by official Move interface."""
         return self.dance_move.duration
 
-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate dance move at time t."""
         try:
             # Get the pose from the dance move
@@ -45,9 +44,7 @@ class DanceQueueMove(Move):
             return (head_pose, antennas, body_yaw)
 
         except Exception as e:
-            logger.error(
-                f"Error evaluating dance move '{self.move_name}' at t={t}: {e}"
-            )
+            logger.error(f"Error evaluating dance move '{self.move_name}' at t={t}: {e}")
             # Return neutral pose on error
             from reachy_mini.utils import create_head_pose
 
@@ -68,9 +65,7 @@ class EmotionQueueMove(Move):
         """Duration property required by official Move interface."""
         return self.emotion_move.duration
 
-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate emotion move at time t."""
         try:
             # Get the pose from the emotion move
@@ -83,9 +78,7 @@ class EmotionQueueMove(Move):
             return (head_pose, antennas, body_yaw)
 
         except Exception as e:
-            logger.error(
-                f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}"
-            )
+            logger.error(f"Error evaluating emotion '{self.emotion_name}' at t={t}: {e}")
             # Return neutral pose on error
             from reachy_mini.utils import create_head_pose
 
@@ -120,9 +113,7 @@ class GotoQueueMove(Move):
         """Duration property required by official Move interface."""
         return self._duration
 
-    def evaluate(
-        self, t: float
-    ) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+    def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
         """Evaluate goto move at time t using linear interpolation."""
         try:
             from reachy_mini.utils import create_head_pose
@@ -138,32 +129,23 @@ class GotoQueueMove(Move):
             start_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
 
             # Interpolate head pose
-            head_pose = linear_pose_interpolation(
-                start_pose, self.target_head_pose, t_clamped
-            )
+            head_pose = linear_pose_interpolation(start_pose, self.target_head_pose, t_clamped)
 
             # Interpolate antennas - return as numpy array
             antennas = np.array(
                 [
-                    self.start_antennas[0]
-                    + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
-                    self.start_antennas[1]
-                    + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
+                    self.start_antennas[0] + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
+                    self.start_antennas[1] + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
                 ]
             )
 
             # Interpolate body yaw
-            body_yaw = (
-                self.start_body_yaw
-                + (self.target_body_yaw - self.start_body_yaw) * t_clamped
-            )
+            body_yaw = self.start_body_yaw + (self.target_body_yaw - self.start_body_yaw) * t_clamped
 
             return (head_pose, antennas, body_yaw)
 
         except Exception as e:
             logger.error(f"Error evaluating goto move at t={t}: {e}")
             # Return target pose on error - convert antennas to numpy array
-            target_antennas_array = np.array(
-                [self.target_antennas[0], self.target_antennas[1]]
-            )
+            target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]])
             return (self.target_head_pose, target_antennas_array, self.target_body_yaw)
```
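The antenna and body-yaw updates in `GotoQueueMove.evaluate` are plain linear interpolation, `start + (target - start) * t` with `t` clamped to [0, 1]. A tiny standalone sketch:

```python
def lerp(start: float, target: float, t: float) -> float:
    """Linear interpolation with t clamped to [0, 1], as used for antennas and body yaw."""
    t = max(0.0, min(1.0, t))
    return start + (target - start) * t


assert lerp(0.0, 30.0, 0.5) == 15.0   # halfway to a 30-degree target
assert lerp(0.0, 30.0, 2.0) == 30.0   # clamped past the end of the move
```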
src/reachy_mini_conversation_demo/main.py
CHANGED
```diff
@@ -5,17 +5,17 @@ import os
 import gradio as gr
 from fastapi import FastAPI
 from fastrtc import Stream
-from reachy_mini import ReachyMini
 
-from
+from reachy_mini import ReachyMini
 from reachy_mini_conversation_demo.moves import MovementManager
-from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
 from reachy_mini_conversation_demo.tools import ToolDependencies
 from reachy_mini_conversation_demo.utils import (
-    handle_vision_stuff,
     parse_args,
     setup_logger,
+    handle_vision_stuff,
 )
+from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
+from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
 
 
 def update_chatbot(chatbot: list[dict], response: dict):
@@ -50,7 +50,7 @@ def main():
         head_wobbler=head_wobbler,
     )
     current_file_path = os.path.dirname(os.path.abspath(__file__))
-    logger.
+    logger.debug(f"Current file absolute path: {current_file_path}")
    chatbot = gr.Chatbot(
        type="messages",
        resizable=True,
@@ -59,7 +59,7 @@ def main():
             os.path.join(current_file_path, "images", "reachymini_avatar.png"),
         ),
     )
-    logger.
+    logger.debug(f"Chatbot avatar images: {chatbot.avatar_images}")
 
     handler = OpenaiRealtimeHandler(deps)
     stream = Stream(
```
src/reachy_mini_conversation_demo/moves.py
CHANGED
|
@@ -32,33 +32,33 @@ Safety
|
|
| 32 |
"""
|
| 33 |
|
| 34 |
from __future__ import annotations
|
| 35 |
-
|
| 36 |
import logging
|
| 37 |
import threading
|
| 38 |
-
import
|
| 39 |
from collections import deque
|
| 40 |
from dataclasses import dataclass
|
| 41 |
from queue import Empty, Queue
|
| 42 |
from typing import Any, Optional, Tuple
|
| 43 |
|
| 44 |
import numpy as np
|
|
|
|
| 45 |
from reachy_mini import ReachyMini
|
| 46 |
-
from reachy_mini.motion.move import Move
|
| 47 |
from reachy_mini.utils import create_head_pose
|
|
|
|
| 48 |
from reachy_mini.utils.interpolation import (
|
| 49 |
compose_world_offset,
|
| 50 |
linear_pose_interpolation,
|
| 51 |
)
|
| 52 |
|
|
|
|
| 53 |
logger = logging.getLogger(__name__)
|
| 54 |
|
| 55 |
# Configuration constants
|
| 56 |
CONTROL_LOOP_FREQUENCY_HZ = 100.0 # Hz - Target frequency for the movement control loop
|
| 57 |
|
| 58 |
# Type definitions
|
| 59 |
-
FullBodyPose = Tuple[
|
| 60 |
-
np.ndarray, Tuple[float, float], float
|
| 61 |
-
] # (head_pose_4x4, antennas, body_yaw)
|
| 62 |
|
| 63 |
|
| 64 |
class BreathingMove(Move):
|
|
@@ -97,9 +97,7 @@ class BreathingMove(Move):
|
|
| 97 |
"""Duration property required by official Move interface."""
|
| 98 |
return float("inf") # Continuous breathing (never ends naturally)
|
| 99 |
|
| 100 |
-
def evaluate(
|
| 101 |
-
self, t: float
|
| 102 |
-
) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
|
| 103 |
"""Evaluate breathing move at time t."""
|
| 104 |
if t < self.interpolation_duration:
|
| 105 |
# Phase 1: Interpolate to neutral base position
|
|
@@ -112,35 +110,26 @@ class BreathingMove(Move):
|
|
| 112 |
|
| 113 |
# Interpolate antennas
|
| 114 |
antennas = (
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
)
|
| 118 |
|
| 119 |
else:
|
| 120 |
# Phase 2: Breathing patterns from neutral base
|
| 121 |
breathing_time = t - self.interpolation_duration
|
| 122 |
|
| 123 |
# Gentle z-axis breathing
|
| 124 |
-
z_offset = self.breathing_z_amplitude * np.sin(
|
| 125 |
-
|
| 126 |
-
)
|
| 127 |
-
head_pose = create_head_pose(
|
| 128 |
-
x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False
|
| 129 |
-
)
|
| 130 |
|
| 131 |
# Antenna sway (opposite directions)
|
| 132 |
-
antenna_sway = self.antenna_sway_amplitude * np.sin(
|
| 133 |
-
2 * np.pi * self.antenna_frequency * breathing_time
|
| 134 |
-
)
|
| 135 |
antennas = np.array([antenna_sway, -antenna_sway])
|
| 136 |
|
| 137 |
# Return in official Move interface format: (head_pose, antennas_array, body_yaw)
|
| 138 |
return (head_pose, antennas, 0.0)
|
| 139 |
|
| 140 |
|
| 141 |
-
def combine_full_body(
|
| 142 |
-
primary_pose: FullBodyPose, secondary_pose: FullBodyPose
|
| 143 |
-
) -> FullBodyPose:
|
| 144 |
"""Combine primary and secondary full body poses.
|
| 145 |
|
| 146 |
Args:
|
|
@@ -157,9 +146,7 @@ def combine_full_body(
|
|
| 157 |
# Combine head poses using compose_world_offset; the secondary pose must be an
|
| 158 |
# offset expressed in the world frame (T_off_world) applied to the absolute
|
| 159 |
# primary transform (T_abs).
|
| 160 |
-
combined_head = compose_world_offset(
|
| 161 |
-
primary_head, secondary_head, reorthonormalize=True
|
| 162 |
-
)
|
| 163 |
|
| 164 |
# Sum antennas and body_yaw
|
| 165 |
combined_antennas = (
|
|
@@ -288,9 +275,7 @@ class MovementManager:
|
|
| 288 |
self._stop_event = threading.Event()
|
| 289 |
self._thread: Optional[threading.Thread] = None
|
| 290 |
self._is_listening = False
|
| 291 |
-
self._last_commanded_pose: FullBodyPose = clone_full_body_pose(
|
| 292 |
-
self.state.last_primary_pose
|
| 293 |
-
)
|
| 294 |
self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
|
| 295 |
self._antenna_unfreeze_blend = 1.0
|
| 296 |
self._antenna_blend_duration = 0.4 # seconds to blend back after listening
|
|
@@ -348,9 +333,7 @@ class MovementManager:
|
|
| 348 |
"""
|
| 349 |
self._command_queue.put(("clear_queue", None))
|
| 350 |
|
| 351 |
-
def set_speech_offsets(
|
| 352 |
-
self, offsets: Tuple[float, float, float, float, float, float]
|
| 353 |
-
) -> None:
|
| 354 |
"""Update speech-induced secondary offsets (x, y, z, roll, pitch, yaw).
|
| 355 |
|
| 356 |
Offsets are interpreted as metres for translation and radians for
|
|
@@ -506,8 +489,7 @@ class MovementManager:
|
|
| 506 |
"""Manage the primary move queue (sequential execution)."""
|
| 507 |
if self.state.current_move is None or (
|
| 508 |
self.state.move_start_time is not None
|
| 509 |
-
and current_time - self.state.move_start_time
|
| 510 |
-
>= self.state.current_move.duration
|
| 511 |
):
|
| 512 |
self.state.current_move = None
|
| 513 |
self.state.move_start_time = None
|
|
@@ -516,12 +498,8 @@ class MovementManager:
|
|
| 516 |
self.state.current_move = self.move_queue.popleft()
|
| 517 |
self.state.move_start_time = current_time
|
| 518 |
# Any real move cancels breathing mode flag
|
| 519 |
-
self._breathing_active = isinstance(
|
| 520 |
-
|
| 521 |
-
)
|
| 522 |
-
logger.info(
|
| 523 |
-
f"Starting new move, duration: {self.state.current_move.duration}s"
|
| 524 |
-
)
|
| 525 |
|
| 526 |
def _manage_breathing(self, current_time: float) -> None:
|
| 527 |
"""Manage automatic breathing when idle."""
|
|
@@ -553,18 +531,13 @@ class MovementManager:
|
|
| 553 |
self._breathing_active = False
|
| 554 |
logger.error("Failed to start breathing: %s", e)
|
| 555 |
|
| 556 |
-
if (
|
| 557 |
-
isinstance(self.state.current_move, BreathingMove)
|
| 558 |
-
and self.move_queue
|
| 559 |
-
):
|
| 560 |
self.state.current_move = None
|
| 561 |
self.state.move_start_time = None
|
| 562 |
self._breathing_active = False
|
| 563 |
logger.info("Stopping breathing due to new move activity")
|
| 564 |
|
| 565 |
-
if self.state.current_move is not None and not isinstance(
|
| 566 |
-
self.state.current_move, BreathingMove
|
| 567 |
-
):
|
| 568 |
self._breathing_active = False
|
| 569 |
|
| 570 |
def _get_primary_pose(self, current_time: float) -> FullBodyPose:
|
|
@@ -595,20 +568,14 @@ class MovementManager:
|
|
| 595 |
else:
|
| 596 |
# Otherwise reuse the last primary pose so we avoid jumps between moves
|
| 597 |
self.state.is_playing_move = False
|
| 598 |
-
self.state.is_moving =
|
| 599 |
-
current_time - self.state.moving_start < self.state.moving_for
|
| 600 |
-
)
|
| 601 |
|
| 602 |
if self.state.last_primary_pose is not None:
|
| 603 |
-
primary_full_body_pose = clone_full_body_pose(
|
| 604 |
-
self.state.last_primary_pose
|
| 605 |
-
)
|
| 606 |
else:
|
| 607 |
neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
|
| 608 |
primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
|
| 609 |
-
self.state.last_primary_pose = clone_full_body_pose(
|
| 610 |
-
primary_full_body_pose
|
| 611 |
-
)
|
| 612 |
|
| 613 |
return primary_full_body_pose
|
| 614 |
|
|
@@ -647,9 +614,7 @@ class MovementManager:
|
|
| 647 |
self._manage_move_queue(current_time)
|
| 648 |
self._manage_breathing(current_time)
|
| 649 |
|
| 650 |
-
def _calculate_blended_antennas(
|
| 651 |
-
self, target_antennas: Tuple[float, float]
|
| 652 |
-
) -> Tuple[float, float]:
|
| 653 |
"""Blend target antennas with listening freeze state and update blending."""
|
| 654 |
now = self._now()
|
| 655 |
listening = self._is_listening
|
|
@@ -669,10 +634,8 @@ class MovementManager:
|
|
| 669 |
else:
|
| 670 |
new_blend = min(1.0, blend + dt / blend_duration)
|
| 671 |
antennas_cmd = (
|
| 672 |
-
listening_antennas[0] * (1.0 - new_blend)
|
| 673 |
-
+ target_antennas[
|
| 674 |
-
listening_antennas[1] * (1.0 - new_blend)
|
| 675 |
-
+ target_antennas[1] * new_blend,
|
| 676 |
)
|
| 677 |
|
| 678 |
if listening:
|
|
@@ -687,9 +650,7 @@ class MovementManager:
|
|
| 687 |
|
| 688 |
return antennas_cmd
|
| 689 |
|
| 690 |
-
def _issue_control_command(
|
| 691 |
-
self, head: np.ndarray, antennas: Tuple[float, float], body_yaw: float
|
| 692 |
-
) -> None:
|
| 693 |
"""Send the fused pose to the robot with throttled error logging."""
|
| 694 |
try:
|
| 695 |
self.current_robot.set_target(head=head, antennas=antennas, body_yaw=body_yaw)
|
|
@@ -722,9 +683,7 @@ class MovementManager:
|
|
| 722 |
stats.min_freq = min(stats.min_freq, stats.last_freq)
|
| 723 |
return stats
|
| 724 |
|
| 725 |
-
def _schedule_next_tick(
|
| 726 |
-
self, loop_start: float, stats: LoopFrequencyStats
|
| 727 |
-
) -> tuple[float, LoopFrequencyStats]:
|
| 728 |
"""Compute sleep time to maintain target frequency and update potential freq."""
|
| 729 |
computation_time = self._now() - loop_start
|
| 730 |
stats.potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
|
|
@@ -743,9 +702,7 @@ class MovementManager:
|
|
| 743 |
potential_freq=stats.potential_freq,
|
| 744 |
)
|
| 745 |
|
| 746 |
-
def _maybe_log_frequency(
|
| 747 |
-
self, loop_count: int, print_interval_loops: int, stats: LoopFrequencyStats
|
| 748 |
-
) -> None:
|
| 749 |
"""Emit frequency telemetry when enough loops have elapsed."""
|
| 750 |
if loop_count % print_interval_loops != 0 or stats.count == 0:
|
| 751 |
return
|
|
@@ -781,7 +738,7 @@ class MovementManager:
|
|
| 781 |
self._stop_event.clear()
|
| 782 |
self._thread = threading.Thread(target=self.working_loop, daemon=True)
|
| 783 |
self._thread.start()
|
| 784 |
-
logger.
|
| 785 |
|
| 786 |
def stop(self) -> None:
|
| 787 |
"""Request the worker thread to stop and wait for it to exit."""
|
|
@@ -789,7 +746,8 @@ class MovementManager:
|
|
| 789 |
if self._thread is not None:
|
| 790 |
self._thread.join()
|
| 791 |
self._thread = None
|
| 792 |
-
logger.
|
|
|
|
| 793 |
|
| 794 |
def get_status(self) -> dict[str, Any]:
|
| 795 |
"""Return a lightweight status snapshot for observability."""
|
|
|
|
| 32 |
"""
|
| 33 |
|
| 34 |
from __future__ import annotations
|
| 35 |
+
import time
|
| 36 |
import logging
|
| 37 |
import threading
|
| 38 |
+
from typing import Tuple, Optional
|
| 39 |
from collections import deque
|
| 40 |
from dataclasses import dataclass
|
| 41 |
from queue import Empty, Queue
|
| 42 |
from typing import Any, Optional, Tuple
|
| 43 |
|
| 44 |
import numpy as np
|
| 45 |
+
|
| 46 |
from reachy_mini import ReachyMini
|
|
|
|
| 47 |
from reachy_mini.utils import create_head_pose
|
| 48 |
+
from reachy_mini.motion.move import Move
|
| 49 |
from reachy_mini.utils.interpolation import (
|
| 50 |
compose_world_offset,
|
| 51 |
linear_pose_interpolation,
|
| 52 |
)
|
| 53 |
|
| 54 |
+
|
| 55 |
logger = logging.getLogger(__name__)
|
| 56 |
|
| 57 |
# Configuration constants
|
| 58 |
CONTROL_LOOP_FREQUENCY_HZ = 100.0 # Hz - Target frequency for the movement control loop
|
| 59 |
|
| 60 |
# Type definitions
|
| 61 |
+
FullBodyPose = Tuple[np.ndarray, Tuple[float, float], float] # (head_pose_4x4, antennas, body_yaw)
|
|
|
|
|
|
|
| 62 |
|
| 63 |
|
| 64 |
class BreathingMove(Move):
|
|
|
|
| 97 |
"""Duration property required by official Move interface."""
|
| 98 |
return float("inf") # Continuous breathing (never ends naturally)
|
| 99 |
|
| 100 |
+
def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
|
|
|
|
|
|
|
| 101 |
"""Evaluate breathing move at time t."""
|
| 102 |
if t < self.interpolation_duration:
|
| 103 |
# Phase 1: Interpolate to neutral base position
|
|
|
|
| 110 |
|
| 111 |
# Interpolate antennas
|
| 112 |
antennas = (
|
| 113 |
+
1 - interpolation_t
|
| 114 |
+
) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas
|
|
|
|
| 115 |
|
| 116 |
else:
|
| 117 |
# Phase 2: Breathing patterns from neutral base
|
| 118 |
breathing_time = t - self.interpolation_duration
|
| 119 |
|
| 120 |
# Gentle z-axis breathing
|
| 121 |
+
z_offset = self.breathing_z_amplitude * np.sin(2 * np.pi * self.breathing_frequency * breathing_time)
|
| 122 |
+
head_pose = create_head_pose(x=0, y=0, z=z_offset, roll=0, pitch=0, yaw=0, degrees=True, mm=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
# Antenna sway (opposite directions)
|
| 125 |
+
antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
|
|
|
|
|
|
|
| 126 |
antennas = np.array([antenna_sway, -antenna_sway])
|
| 127 |
|
| 128 |
# Return in official Move interface format: (head_pose, antennas_array, body_yaw)
|
| 129 |
return (head_pose, antennas, 0.0)
|
| 130 |
|
| 131 |
|
| 132 |
+
def combine_full_body(primary_pose: FullBodyPose, secondary_pose: FullBodyPose) -> FullBodyPose:
|
|
|
|
|
|
|
| 133 |
"""Combine primary and secondary full body poses.
|
| 134 |
|
| 135 |
Args:
|
|
|
|
| 146 |
# Combine head poses using compose_world_offset; the secondary pose must be an
|
| 147 |
# offset expressed in the world frame (T_off_world) applied to the absolute
|
| 148 |
# primary transform (T_abs).
|
| 149 |
+
combined_head = compose_world_offset(primary_head, secondary_head, reorthonormalize=True)
|
|
|
|
|
|
|
| 150 |
|
| 151 |
# Sum antennas and body_yaw
|
| 152 |
combined_antennas = (
|
|
|
|
| 275 |
self._stop_event = threading.Event()
|
| 276 |
self._thread: Optional[threading.Thread] = None
|
| 277 |
self._is_listening = False
|
| 278 |
+
self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
|
|
|
|
|
|
|
| 279 |
self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
|
| 280 |
self._antenna_unfreeze_blend = 1.0
|
| 281 |
self._antenna_blend_duration = 0.4 # seconds to blend back after listening
|
|
|
|
| 333 |
"""
|
| 334 |
self._command_queue.put(("clear_queue", None))
|
| 335 |
|
| 336 |
+
def set_speech_offsets(self, offsets: Tuple[float, float, float, float, float, float]) -> None:
|
|
|
|
|
|
|
| 337 |
"""Update speech-induced secondary offsets (x, y, z, roll, pitch, yaw).
|
| 338 |
|
| 339 |
Offsets are interpreted as metres for translation and radians for
|
|
|
|
| 489 |
"""Manage the primary move queue (sequential execution)."""
|
| 490 |
if self.state.current_move is None or (
|
| 491 |
self.state.move_start_time is not None
|
| 492 |
+
| ... |                 and current_time - self.state.move_start_time >= self.state.current_move.duration
| 493 |             ):
| 494 |                 self.state.current_move = None
| 495 |                 self.state.move_start_time = None
| 498 |             self.state.current_move = self.move_queue.popleft()
| 499 |             self.state.move_start_time = current_time
| 500 |             # Any real move cancels breathing mode flag
| 501 | +           self._breathing_active = isinstance(self.state.current_move, BreathingMove)
| 502 | +           logger.info(f"Starting new move, duration: {self.state.current_move.duration}s")
| 503 |
| 504 |     def _manage_breathing(self, current_time: float) -> None:
| 505 |         """Manage automatic breathing when idle."""
| 531 |             self._breathing_active = False
| 532 |             logger.error("Failed to start breathing: %s", e)
| 533 |
| 534 | +       if isinstance(self.state.current_move, BreathingMove) and self.move_queue:
| 535 |             self.state.current_move = None
| 536 |             self.state.move_start_time = None
| 537 |             self._breathing_active = False
| 538 |             logger.info("Stopping breathing due to new move activity")
| 539 |
| 540 | +       if self.state.current_move is not None and not isinstance(self.state.current_move, BreathingMove):
| 541 |             self._breathing_active = False
| 542 |
| 543 |     def _get_primary_pose(self, current_time: float) -> FullBodyPose:
| 568 |         else:
| 569 |             # Otherwise reuse the last primary pose so we avoid jumps between moves
| 570 |             self.state.is_playing_move = False
| 571 | +           self.state.is_moving = current_time - self.state.moving_start < self.state.moving_for
| 572 |
| 573 |             if self.state.last_primary_pose is not None:
| 574 | +               primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
| 575 |             else:
| 576 |                 neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
| 577 |                 primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
| 578 | +               self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
| 579 |
| 580 |         return primary_full_body_pose
| 581 |
| 614 |         self._manage_move_queue(current_time)
| 615 |         self._manage_breathing(current_time)
| 616 |
| 617 | +   def _calculate_blended_antennas(self, target_antennas: Tuple[float, float]) -> Tuple[float, float]:
| 618 |         """Blend target antennas with listening freeze state and update blending."""
| 619 |         now = self._now()
| 620 |         listening = self._is_listening
| 634 |         else:
| 635 |             new_blend = min(1.0, blend + dt / blend_duration)
| 636 |         antennas_cmd = (
| 637 | +           listening_antennas[0] * (1.0 - new_blend) + target_antennas[0] * new_blend,
| 638 | +           listening_antennas[1] * (1.0 - new_blend) + target_antennas[1] * new_blend,
| 639 |         )
| 640 |
| 641 |         if listening:
| 650 |
| 651 |         return antennas_cmd
| 652 |
| 653 | +   def _issue_control_command(self, head: np.ndarray, antennas: Tuple[float, float], body_yaw: float) -> None:
| 654 |         """Send the fused pose to the robot with throttled error logging."""
| 655 |         try:
| 656 |             self.current_robot.set_target(head=head, antennas=antennas, body_yaw=body_yaw)
| 683 |             stats.min_freq = min(stats.min_freq, stats.last_freq)
| 684 |         return stats
| 685 |
| 686 | +   def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) -> tuple[float, LoopFrequencyStats]:
| 687 |         """Compute sleep time to maintain target frequency and update potential freq."""
| 688 |         computation_time = self._now() - loop_start
| 689 |         stats.potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
| 702 |             potential_freq=stats.potential_freq,
| 703 |         )
| 704 |
| 705 | +   def _maybe_log_frequency(self, loop_count: int, print_interval_loops: int, stats: LoopFrequencyStats) -> None:
| 706 |         """Emit frequency telemetry when enough loops have elapsed."""
| 707 |         if loop_count % print_interval_loops != 0 or stats.count == 0:
| 708 |             return
| 738 |         self._stop_event.clear()
| 739 |         self._thread = threading.Thread(target=self.working_loop, daemon=True)
| 740 |         self._thread.start()
| 741 | +       logger.debug("Move worker started")
| 742 |
| 743 |     def stop(self) -> None:
| 744 |         """Request the worker thread to stop and wait for it to exit."""
| 746 |         if self._thread is not None:
| 747 |             self._thread.join()
| 748 |             self._thread = None
| 749 | +       logger.debug("Move worker stopped")
| 750 | +       logger.debug("Move worker stopped")
| 751 |
| 752 |     def get_status(self) -> dict[str, Any]:
| 753 |         """Return a lightweight status snapshot for observability."""
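Side note on the `_calculate_blended_antennas` change above: the command is a plain linear crossfade between the frozen "listening" antennas and the target antennas, driven by a 0-to-1 blend that ramps over the blend duration. A minimal standalone sketch of that interpolation (the function name and example values below are illustrative, not part of the diff):

```python
from typing import Tuple

def blend_antennas(
    listening: Tuple[float, float],
    target: Tuple[float, float],
    blend: float,  # 0.0 = fully frozen "listening" pose, 1.0 = fully target pose
) -> Tuple[float, float]:
    """Linearly crossfade each antenna between the listening pose and the target pose."""
    b = min(1.0, max(0.0, blend))
    return (
        listening[0] * (1.0 - b) + target[0] * b,
        listening[1] * (1.0 - b) + target[1] * b,
    )

# Halfway through the ramp the command sits midway between the two poses.
print(blend_antennas((0.2, -0.2), (1.0, 1.0), 0.5))  # ~ (0.6, 0.4)
```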
src/reachy_mini_conversation_demo/openai_realtime.py
CHANGED

@@ -1,19 +1,21 @@
| 1 | - import asyncio  # noqa: D100
| 2 | - import base64
| 3 |   import json
| 4 |   import logging
| 5 |   from datetime import datetime
| 6 |
| 7 | - import gradio as gr
| 8 |   import numpy as np
| 9 | -
| 10 |  from openai import AsyncOpenAI
| 11 |
| 12 |  from reachy_mini_conversation_demo.tools import (
| 13 |      ALL_TOOL_SPECS,
| 14 |      ToolDependencies,
| 15 |      dispatch_tool_call,
| 16 |  )
| 17 |
| 18 |  logger = logging.getLogger(__name__)
| 19 |

@@ -45,7 +47,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
| 45 |
| 46 |     async def start_up(self):
| 47 |         """Start the handler."""
| 48 | -       self.client = AsyncOpenAI()
| 49 |         async with self.client.beta.realtime.connect(model="gpt-realtime") as conn:
| 50 |             await conn.session.update(
| 51 |                 session={

@@ -92,35 +94,22 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
| 92 |                 pass
| 93 |                 # self.deps.head_wobbler.reset()
| 94 |
| 95 | -           if (
| 96 | -               event.type
| 97 | -               == "conversation.item.input_audio_transcription.completed"
| 98 | -           ):
| 99 |                 logger.debug(f"user transcript: {event.transcript}")
| 100 | -              await self.output_queue.put(
| 101 | -                  AdditionalOutputs({"role": "user", "content": event.transcript})
| 102 | -              )
| 103 |
| 104 |             if event.type == "response.audio_transcript.done":
| 105 |                 logger.debug(f"assistant transcript: {event.transcript}")
| 106 | -              await self.output_queue.put(
| 107 | -                  AdditionalOutputs(
| 108 | -                      {"role": "assistant", "content": event.transcript}
| 109 | -                  )
| 110 | -              )
| 111 |
| 112 |             if event.type == "response.audio.delta":
| 113 |                 self.deps.head_wobbler.feed(event.delta)
| 114 |                 self.last_activity_time = asyncio.get_event_loop().time()
| 115 | -              logger.debug(
| 116 | -                  "last activity time updated to %s", self.last_activity_time
| 117 | -              )
| 118 |                 await self.output_queue.put(
| 119 |                     (
| 120 |                         self.output_sample_rate,
| 121 | -                      np.frombuffer(
| 122 | -                          base64.b64decode(event.delta), dtype=np.int16
| 123 | -                      ).reshape(1, -1),
| 124 |                     ),
| 125 |                 )
| 126 |

@@ -154,9 +143,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
| 154 |             args_json_str = info["args_buf"] or "{}"
| 155 |
| 156 |             try:
| 157 | -               tool_result = await dispatch_tool_call(
| 158 | -                   tool_name, args_json_str, self.deps
| 159 | -               )
| 160 |                 logger.debug("[Tool %s executed]", tool_name)
| 161 |                 logger.debug("Tool result: %s", tool_result)
| 162 |             except Exception as e:

@@ -177,9 +164,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
| 177 |                     {
| 178 |                         "role": "assistant",
| 179 |                         "content": json.dumps(tool_result),
| 180 | -                       "metadata":
| 181 | -                           title="🛠️ Used tool " + tool_name, status="done"
| 182 | -                       ),
| 183 |                     },
| 184 |                 )
| 185 |             )

@@ -231,11 +216,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
| 231 |             err = getattr(event, "error", None)
| 232 |             msg = getattr(err, "message", str(err) if err else "unknown error")
| 233 |             logger.error("Realtime error: %s (raw=%s)", msg, err)
| 234 | -           await self.output_queue.put(
| 235 | -               AdditionalOutputs(
| 236 | -                   {"role": "assistant", "content": f"[error] {msg}"}
| 237 | -               )
| 238 | -           )
| 239 |
| 240 |     # Microphone receive
| 241 |     async def receive(self, frame: tuple[int, np.ndarray]) -> None:

@@ -258,9 +239,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
| 258 |         if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
| 259 |             await self.send_idle_signal(idle_duration)
| 260 |
| 261 | -       self.last_activity_time = (
| 262 | -           asyncio.get_event_loop().time()
| 263 | -       )  # avoid repeated resets
| 264 |
| 265 |         return await wait_for_item(self.output_queue)
| 266 |

| 1 |   import json
| 2 | + import base64
| 3 | + import asyncio
| 4 |   import logging
| 5 |   from datetime import datetime
| 6 |
| 7 |   import numpy as np
| 8 | + import gradio as gr
| 9 |   from openai import AsyncOpenAI
| 10 | + from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
| 11 |
| 12 |  from reachy_mini_conversation_demo.tools import (
| 13 |      ALL_TOOL_SPECS,
| 14 |      ToolDependencies,
| 15 |      dispatch_tool_call,
| 16 |  )
| 17 | + from reachy_mini_conversation_demo.config import config
| 18 | +
| 19 |
| 20 |  logger = logging.getLogger(__name__)
| 21 |

| 47 |
| 48 |     async def start_up(self):
| 49 |         """Start the handler."""
| 50 | +       self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
| 51 |         async with self.client.beta.realtime.connect(model="gpt-realtime") as conn:
| 52 |             await conn.session.update(
| 53 |                 session={

| 94 |                 pass
| 95 |                 # self.deps.head_wobbler.reset()
| 96 |
| 97 | +           if event.type == "conversation.item.input_audio_transcription.completed":
| 98 |                 logger.debug(f"user transcript: {event.transcript}")
| 99 | +              await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
| 100 |
| 101 |             if event.type == "response.audio_transcript.done":
| 102 |                 logger.debug(f"assistant transcript: {event.transcript}")
| 103 | +              await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
| 104 |
| 105 |             if event.type == "response.audio.delta":
| 106 |                 self.deps.head_wobbler.feed(event.delta)
| 107 |                 self.last_activity_time = asyncio.get_event_loop().time()
| 108 | +              logger.debug("last activity time updated to %s", self.last_activity_time)
| 109 |                 await self.output_queue.put(
| 110 |                     (
| 111 |                         self.output_sample_rate,
| 112 | +                      np.frombuffer(base64.b64decode(event.delta), dtype=np.int16).reshape(1, -1),
| 113 |                     ),
| 114 |                 )
| 115 |

| 143 |             args_json_str = info["args_buf"] or "{}"
| 144 |
| 145 |             try:
| 146 | +               tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
| 147 |                 logger.debug("[Tool %s executed]", tool_name)
| 148 |                 logger.debug("Tool result: %s", tool_result)
| 149 |             except Exception as e:

| 164 |                     {
| 165 |                         "role": "assistant",
| 166 |                         "content": json.dumps(tool_result),
| 167 | +                       "metadata": {"title": "🛠️ Used tool " + tool_name, "status": "done"},
| 168 |                     },
| 169 |                 )
| 170 |             )

| 216 |             err = getattr(event, "error", None)
| 217 |             msg = getattr(err, "message", str(err) if err else "unknown error")
| 218 |             logger.error("Realtime error: %s (raw=%s)", msg, err)
| 219 | +           await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
| 220 |
| 221 |     # Microphone receive
| 222 |     async def receive(self, frame: tuple[int, np.ndarray]) -> None:

| 239 |         if idle_duration > 15.0 and self.deps.movement_manager.is_idle():
| 240 |             await self.send_idle_signal(idle_duration)
| 241 |
| 242 | +       self.last_activity_time = asyncio.get_event_loop().time()  # avoid repeated resets
| 243 |
| 244 |         return await wait_for_item(self.output_queue)
| 245 |
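For context on the reflowed `response.audio.delta` branch above: each delta is base64-encoded PCM16 audio that gets decoded into a `(1, N)` int16 frame and queued together with the output sample rate. A small self-contained sketch of that decode step (the 24000 Hz rate and function name are assumptions for illustration only):

```python
import base64
import numpy as np

def decode_audio_delta(delta_b64: str, sample_rate: int = 24000) -> tuple[int, np.ndarray]:
    """Turn a base64-encoded PCM16 chunk into the (sample_rate, frame) tuple queued for playback."""
    pcm = np.frombuffer(base64.b64decode(delta_b64), dtype=np.int16).reshape(1, -1)
    return sample_rate, pcm

# Example with a tiny fake chunk of four 16-bit samples.
fake_delta = base64.b64encode(np.array([0, 1000, -1000, 0], dtype=np.int16).tobytes()).decode()
rate, frame = decode_audio_delta(fake_delta)
print(rate, frame.shape)  # 24000 (1, 4)
```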
src/reachy_mini_conversation_demo/prompts.py
CHANGED

@@ -1,6 +1,6 @@
| 1 |   """Nothing (for ruff)."""
| 2 |
| 3 | - SESSION_INSTRUCTIONS = r"""
| 4 |   ### IDENTITY
| 5 |   You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
| 6 |   You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.

| 1 |   """Nothing (for ruff)."""
| 2 |
| 3 | + SESSION_INSTRUCTIONS = r"""
| 4 |   ### IDENTITY
| 5 |   You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
| 6 |   You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.
src/reachy_mini_conversation_demo/tools.py
CHANGED

@@ -1,17 +1,17 @@
| 1 | - from __future__ import annotations
| 2 | -
| 3 |   import abc
| 4 |   import asyncio
| 5 |   import inspect
| 6 | - import json
| 7 |   import logging
| 8 | - import time
| 9 | - from dataclasses import dataclass
| 10 |  from typing import Any, Dict, Literal, Optional
| 11 |
| 12 |  from reachy_mini import ReachyMini
| 13 |  from reachy_mini.utils import create_head_pose
| 14 |
| 15 |  # from reachy_mini_conversation_demo.vision.processors import VisionManager
| 16 |
| 17 |  logger = logging.getLogger(__name__)

@@ -22,14 +22,14 @@ ENABLE_FACE_RECOGNITION = False
| 22 |  try:
| 23 |      from reachy_mini.motion.recorded_move import RecordedMoves
| 24 |      from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
| 25 | -
| 26 |      from reachy_mini_conversation_demo.dance_emotion_moves import (
| 27 |          DanceQueueMove,
| 28 |          EmotionQueueMove,
| 29 | -        GotoQueueMove,
| 30 |      )
| 31 |
| 32 |      # Initialize recorded moves for emotions
| 33 |      RECORDED_MOVES = RecordedMoves("pollen-robotics/reachy-mini-emotions-library")
| 34 |      DANCE_AVAILABLE = True
| 35 |      EMOTION_AVAILABLE = True

@@ -183,9 +183,7 @@ class MoveHead(Tool):
| 183 |                 current_antennas[1],
| 184 |             ),  # Skip body_yaw
| 185 |             target_body_yaw=0,  # Reset body yaw
| 186 | -           start_body_yaw=current_antennas[
| 187 | -               0
| 188 | -           ],  # body_yaw is first in joint positions
| 189 |             duration=deps.motion_duration_s,
| 190 |         )
| 191 |

@@ -236,15 +234,11 @@ class Camera(Tool):
| 236 |
| 237 |         # Use vision manager for processing if available
| 238 |         if deps.vision_manager is not None:
| 239 | -           result = await asyncio.to_thread(
| 240 | -               deps.vision_manager.processor.process_image, frame, image_query
| 241 | -           )
| 242 |             if isinstance(result, dict) and "error" in result:
| 243 |                 return result
| 244 |             return (
| 245 | -               {"image_description": result}
| 246 | -               if isinstance(result, str)
| 247 | -               else {"error": "vision returned non-string"}
| 248 |             )
| 249 |         else:
| 250 |             # Return base64 encoded image like main_works.py camera tool

@@ -388,8 +382,8 @@ class Dance(Tool):
| 388 |             "properties": {
| 389 |                 "move": {
| 390 |                     "type": "string",
| 391 | -                   "description": """Name of the move; use 'random' or omit for random.
| 392 | - Here is a list of the available moves:
| 393 |  simple_nod: A simple, continuous up-and-down nodding motion.
| 394 |  head_tilt_roll: A continuous side-to-side head roll (ear to shoulder).
| 395 |  side_to_side_sway: A smooth, side-to-side sway of the entire head.

@@ -436,9 +430,7 @@ class Dance(Tool):
| 436 |             move_name = random.choice(list(AVAILABLE_MOVES.keys()))
| 437 |
| 438 |         if move_name not in AVAILABLE_MOVES:
| 439 | -           return {
| 440 | -               "error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"
| 441 | -           }
| 442 |
| 443 |         # Add dance moves to queue
| 444 |         movement_manager = deps.movement_manager

@@ -523,9 +515,7 @@ class PlayEmotion(Tool):
| 523 |         try:
| 524 |             emotion_names = RECORDED_MOVES.list_moves()
| 525 |             if emotion_name not in emotion_names:
| 526 | -               return {
| 527 | -                   "error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"
| 528 | -               }
| 529 |
| 530 |             # Add emotion to queue
| 531 |             movement_manager = deps.movement_manager

@@ -604,9 +594,7 @@ class FaceRecognition(Tool):
| 604 |             cv2.imwrite(temp_path, frame)
| 605 |
| 606 |             # Use DeepFace to find face
| 607 | -           results = await asyncio.to_thread(
| 608 | -               DeepFace.find, img_path=temp_path, db_path="./pollen_faces"
| 609 | -           )
| 610 |
| 611 |             if len(results) == 0:
| 612 |                 return {"error": "Didn't recognize the face"}

@@ -681,9 +669,7 @@
| 681 |     return {}
| 682 |
| 683 |
| 684 | - async def dispatch_tool_call(
| 685 | -     tool_name: str, args_json: str, deps: ToolDependencies
| 686 | - ) -> Dict[str, Any]:
| 687 |     """Dispatch a tool call by name with JSON args and dependencies."""
| 688 |     tool = ALL_TOOLS.get(tool_name)
| 689 |

| 1 | + from __future__ import annotations
| 2 |   import abc
| 3 | + import json
| 4 | + import time
| 5 |   import asyncio
| 6 |   import inspect
| 7 |   import logging
| 8 |   from typing import Any, Dict, Literal, Optional
| 9 | + from dataclasses import dataclass
| 10 |
| 11 |  from reachy_mini import ReachyMini
| 12 |  from reachy_mini.utils import create_head_pose
| 13 |
| 14 | +
| 15 |  # from reachy_mini_conversation_demo.vision.processors import VisionManager
| 16 |
| 17 |  logger = logging.getLogger(__name__)

| 22 |  try:
| 23 |      from reachy_mini.motion.recorded_move import RecordedMoves
| 24 |      from reachy_mini_dances_library.collection.dance import AVAILABLE_MOVES
| 25 |      from reachy_mini_conversation_demo.dance_emotion_moves import (
| 26 | +        GotoQueueMove,
| 27 |          DanceQueueMove,
| 28 |          EmotionQueueMove,
| 29 |      )
| 30 |
| 31 |      # Initialize recorded moves for emotions
| 32 | +    # Note: huggingface_hub automatically reads HF_TOKEN from environment variables
| 33 |      RECORDED_MOVES = RecordedMoves("pollen-robotics/reachy-mini-emotions-library")
| 34 |      DANCE_AVAILABLE = True
| 35 |      EMOTION_AVAILABLE = True

| 183 |                 current_antennas[1],
| 184 |             ),  # Skip body_yaw
| 185 |             target_body_yaw=0,  # Reset body yaw
| 186 | +           start_body_yaw=current_antennas[0],  # body_yaw is first in joint positions
| 187 |             duration=deps.motion_duration_s,
| 188 |         )
| 189 |

| 234 |
| 235 |         # Use vision manager for processing if available
| 236 |         if deps.vision_manager is not None:
| 237 | +           result = await asyncio.to_thread(deps.vision_manager.processor.process_image, frame, image_query)
| 238 |             if isinstance(result, dict) and "error" in result:
| 239 |                 return result
| 240 |             return (
| 241 | +               {"image_description": result} if isinstance(result, str) else {"error": "vision returned non-string"}
| 242 |             )
| 243 |         else:
| 244 |             # Return base64 encoded image like main_works.py camera tool

| 382 |             "properties": {
| 383 |                 "move": {
| 384 |                     "type": "string",
| 385 | +                   "description": """Name of the move; use 'random' or omit for random.
| 386 | + Here is a list of the available moves:
| 387 |  simple_nod: A simple, continuous up-and-down nodding motion.
| 388 |  head_tilt_roll: A continuous side-to-side head roll (ear to shoulder).
| 389 |  side_to_side_sway: A smooth, side-to-side sway of the entire head.

| 430 |             move_name = random.choice(list(AVAILABLE_MOVES.keys()))
| 431 |
| 432 |         if move_name not in AVAILABLE_MOVES:
| 433 | +           return {"error": f"Unknown dance move '{move_name}'. Available: {list(AVAILABLE_MOVES.keys())}"}
| 434 |
| 435 |         # Add dance moves to queue
| 436 |         movement_manager = deps.movement_manager

| 515 |         try:
| 516 |             emotion_names = RECORDED_MOVES.list_moves()
| 517 |             if emotion_name not in emotion_names:
| 518 | +               return {"error": f"Unknown emotion '{emotion_name}'. Available: {emotion_names}"}
| 519 |
| 520 |             # Add emotion to queue
| 521 |             movement_manager = deps.movement_manager

| 594 |             cv2.imwrite(temp_path, frame)
| 595 |
| 596 |             # Use DeepFace to find face
| 597 | +           results = await asyncio.to_thread(DeepFace.find, img_path=temp_path, db_path="./pollen_faces")
| 598 |
| 599 |             if len(results) == 0:
| 600 |                 return {"error": "Didn't recognize the face"}

| 669 |     return {}
| 670 |
| 671 |
| 672 | + async def dispatch_tool_call(tool_name: str, args_json: str, deps: ToolDependencies) -> Dict[str, Any]:
| 673 |     """Dispatch a tool call by name with JSON args and dependencies."""
| 674 |     tool = ALL_TOOLS.get(tool_name)
| 675 |
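The single-line `dispatch_tool_call` signature above keeps the same behavior: look the tool up by name in a registry and call it with JSON-decoded arguments. A toy sketch of that registry pattern (the registry and tool below are made up for illustration; the real tools are async classes that receive `ToolDependencies`):

```python
import json
from typing import Any, Callable, Dict

# Hypothetical registry mapping tool names to plain callables.
TOOLS: Dict[str, Callable[..., Dict[str, Any]]] = {
    "dance": lambda move="random": {"status": f"queued dance '{move}'"},
}

def dispatch(tool_name: str, args_json: str) -> Dict[str, Any]:
    """Look the tool up by name, parse its JSON args, and return a JSON-serializable result."""
    tool = TOOLS.get(tool_name)
    if tool is None:
        return {"error": f"Unknown tool '{tool_name}'"}
    try:
        args = json.loads(args_json) if args_json else {}
    except json.JSONDecodeError:
        args = {}
    return tool(**args)

print(dispatch("dance", '{"move": "simple_nod"}'))
```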
src/reachy_mini_conversation_demo/utils.py
CHANGED

@@ -1,5 +1,5 @@
| 1 | - import argparse  # noqa: D100
| 2 |   import logging
| 3 |   import warnings
| 4 |
| 5 |   from reachy_mini_conversation_demo.camera_worker import CameraWorker

@@ -15,15 +15,9 @@ def parse_args():
| 15 |         default=None,
| 16 |         help="Choose head tracker (default: mediapipe)",
| 17 |     )
| 18 | -   parser.add_argument(
| 19 | -
| 20 | -   )
| 21 | -   parser.add_argument(
| 22 | -       "--headless", default=False, action="store_true", help="Run in headless mode"
| 23 | -   )
| 24 | -   parser.add_argument(
| 25 | -       "--debug", default=False, action="store_true", help="Enable debug logging"
| 26 | -   )
| 27 |     return parser.parse_args()
| 28 |
| 29 |

| 1 |   import logging
| 2 | + import argparse
| 3 |   import warnings
| 4 |
| 5 |   from reachy_mini_conversation_demo.camera_worker import CameraWorker

| 15 |         default=None,
| 16 |         help="Choose head tracker (default: mediapipe)",
| 17 |     )
| 18 | +   parser.add_argument("--no-camera", default=False, action="store_true", help="Disable camera usage")
| 19 | +   parser.add_argument("--headless", default=False, action="store_true", help="Run in headless mode")
| 20 | +   parser.add_argument("--debug", default=False, action="store_true", help="Enable debug logging")
| 21 |     return parser.parse_args()
| 22 |
| 23 |
src/reachy_mini_conversation_demo/vision/processors.py
CHANGED

@@ -1,18 +1,19 @@
| 1 | - import asyncio  # noqa: D100
| 2 | - import base64
| 3 | - import logging
| 4 |   import os
| 5 |   import sys
| 6 | - import threading
| 7 |   import time
| 8 | -
| 9 |   from typing import Any, Dict
| 10 |
| 11 |  import cv2
| 12 |  import numpy as np
| 13 |  import torch
| 14 |  from huggingface_hub import snapshot_download
| 15 | -
| 16 |
| 17 |  logger = logging.getLogger(__name__)
| 18 |

@@ -61,9 +62,7 @@ class VisionProcessor:
| 61 |     def initialize(self) -> bool:
| 62 |         """Load model and processor onto the selected device."""
| 63 |         try:
| 64 | -           logger.info(
| 65 | -               f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})"
| 66 | -           )
| 67 |             self.processor = AutoProcessor.from_pretrained(self.model_path)
| 68 |
| 69 |             # Select dtype depending on device

@@ -81,9 +80,7 @@ class VisionProcessor:
| 81 |                 model_kwargs["_attn_implementation"] = "flash_attention_2"
| 82 |
| 83 |             # Load model weights
| 84 | -           self.model = AutoModelForImageTextToText.from_pretrained(
| 85 | -               self.model_path, **model_kwargs
| 86 | -           ).to(self.device)
| 87 |
| 88 |             self.model.eval()
| 89 |             self._initialized = True

@@ -138,10 +135,7 @@ class VisionProcessor:
| 138 |             )
| 139 |
| 140 |             # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
| 141 | -           inputs = {
| 142 | -               k: (v.to(self.device) if hasattr(v, "to") else v)
| 143 | -               for k, v in inputs.items()
| 144 | -           }
| 145 |
| 146 |             with torch.no_grad():
| 147 |                 generated_ids = self.model.generate(

@@ -246,9 +240,7 @@ class VisionManager:
| 246 |                 )
| 247 |
| 248 |                 # Only update if we got a valid response
| 249 | -               if description and not description.startswith(
| 250 | -                   ("Vision", "Failed", "Error")
| 251 | -               ):
| 252 |                     self._current_description = description
| 253 |                     self._last_processed_time = current_time
| 254 |

@@ -268,18 +260,14 @@ class VisionManager:
| 268 |         """Get the most recent scene description (thread-safe)."""
| 269 |         return self._current_description
| 270 |
| 271 | -   async def process_current_frame(
| 272 | -       self, prompt: str = "Describe what you see in detail."
| 273 | -   ) -> Dict[str, Any]:
| 274 |         """Process current camera frame with custom prompt."""
| 275 |         try:
| 276 |             success, frame = self.camera.read()
| 277 |             if not success or frame is None:
| 278 |                 return {"error": "Failed to capture image from camera"}
| 279 |
| 280 | -           description = await asyncio.to_thread(
| 281 | -               lambda: self.processor.process_image(frame, prompt)
| 282 | -           )
| 283 |
| 284 |             return {
| 285 |                 "description": description,

@@ -335,9 +323,7 @@ def create_vision_processor(config: VisionConfig):
| 335 |     return VisionProcessor(config)
| 336 |
| 337 |
| 338 | - def init_vision(
| 339 | -     camera: cv2.VideoCapture, processor_type: str = "local"
| 340 | - ) -> VisionManager:
| 341 |     """Initialize vision manager with the specified processor type."""
| 342 |     model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
| 343 |

| 1 |   import os
| 2 |   import sys
| 3 |   import time
| 4 | + import base64
| 5 | + import asyncio
| 6 | + import logging
| 7 | + import threading
| 8 |   from typing import Any, Dict
| 9 | + from dataclasses import dataclass
| 10 |
| 11 |  import cv2
| 12 |  import numpy as np
| 13 |  import torch
| 14 | + from transformers import AutoProcessor, AutoModelForImageTextToText
| 15 |  from huggingface_hub import snapshot_download
| 16 | +
| 17 |
| 18 |  logger = logging.getLogger(__name__)
| 19 |

| 62 |     def initialize(self) -> bool:
| 63 |         """Load model and processor onto the selected device."""
| 64 |         try:
| 65 | +           logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})")
| 66 |             self.processor = AutoProcessor.from_pretrained(self.model_path)
| 67 |
| 68 |             # Select dtype depending on device

| 80 |                 model_kwargs["_attn_implementation"] = "flash_attention_2"
| 81 |
| 82 |             # Load model weights
| 83 | +           self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)
| 84 |
| 85 |             self.model.eval()
| 86 |             self._initialized = True

| 135 |             )
| 136 |
| 137 |             # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
| 138 | +           inputs = {k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
| 139 |
| 140 |             with torch.no_grad():
| 141 |                 generated_ids = self.model.generate(

| 240 |                 )
| 241 |
| 242 |                 # Only update if we got a valid response
| 243 | +               if description and not description.startswith(("Vision", "Failed", "Error")):
| 244 |                     self._current_description = description
| 245 |                     self._last_processed_time = current_time
| 246 |

| 260 |         """Get the most recent scene description (thread-safe)."""
| 261 |         return self._current_description
| 262 |
| 263 | +   async def process_current_frame(self, prompt: str = "Describe what you see in detail.") -> Dict[str, Any]:
| 264 |         """Process current camera frame with custom prompt."""
| 265 |         try:
| 266 |             success, frame = self.camera.read()
| 267 |             if not success or frame is None:
| 268 |                 return {"error": "Failed to capture image from camera"}
| 269 |
| 270 | +           description = await asyncio.to_thread(lambda: self.processor.process_image(frame, prompt))
| 271 |
| 272 |             return {
| 273 |                 "description": description,

| 323 |     return VisionProcessor(config)
| 324 |
| 325 |
| 326 | + def init_vision(camera: cv2.VideoCapture, processor_type: str = "local") -> VisionManager:
| 327 |     """Initialize vision manager with the specified processor type."""
| 328 |     model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
| 329 |
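As in `tools.py`, the reflowed calls here keep blocking inference off the realtime event loop by handing it to `asyncio.to_thread`. A minimal runnable sketch of the pattern, with a stand-in for the slow vision call:

```python
import asyncio
import time

def blocking_describe(frame_id: int) -> str:
    """Stand-in for a slow, CPU/GPU-bound vision call (e.g. SmolVLM2 inference on a frame)."""
    time.sleep(0.2)
    return f"description of frame {frame_id}"

async def main() -> None:
    # The event loop stays responsive while the blocking call runs in a worker thread.
    description = await asyncio.to_thread(blocking_describe, 42)
    print(description)

asyncio.run(main())
```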
src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py
CHANGED

@@ -1,12 +1,12 @@
| 1 | - from __future__ import annotations
| 2 | -
| 3 |   import logging
| 4 | - from typing import
| 5 |
| 6 |   import numpy as np
| 7 | - from huggingface_hub import hf_hub_download
| 8 |   from supervision import Detections
| 9 |   from ultralytics import YOLO
| 10 |
| 11 |  logger = logging.getLogger(__name__)
| 12 |

@@ -94,9 +94,7 @@ class HeadTracker:
| 94 |
| 95 |         return np.array([norm_x, norm_y], dtype=np.float32)
| 96 |
| 97 | -   def get_eyes(
| 98 | -       self, img: np.ndarray
| 99 | -   ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
| 100 |        """Get eye positions (approximated from face bbox).
| 101 |
| 102 |        Note: YOLO only provides face bbox, so we estimate eye positions

@@ -131,20 +129,14 @@ class HeadTracker:
| 131 |         right_eye_x = bbox[0] + face_width * 0.65
| 132 |
| 133 |         # Convert to MediaPipe coordinates
| 134 | -       left_eye = np.array(
| 135 | -           [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
| 136 | -       )
| 137 | -       right_eye = np.array(
| 138 | -           [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
| 139 | -       )
| 140 |
| 141 |         return left_eye, right_eye
| 142 |
| 143 |     def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
| 144 |         """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
| 145 | -       if not hasattr(face_landmarks, "_bbox") or not hasattr(
| 146 | -           face_landmarks, "_img_shape"
| 147 | -       ):
| 148 |             raise ValueError("Face landmarks object missing required attributes")
| 149 |
| 150 |         bbox = face_landmarks._bbox

@@ -158,12 +150,8 @@ class HeadTracker:
| 158 |         left_eye_x = bbox[0] + face_width * 0.35
| 159 |         right_eye_x = bbox[0] + face_width * 0.65
| 160 |
| 161 | -       left_eye = np.array(
| 162 | -           [(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
| 163 | -       )
| 164 | -       right_eye = np.array(
| 165 | -           [(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32
| 166 | -       )
| 167 |
| 168 |         return left_eye, right_eye
| 169 |

@@ -177,9 +165,7 @@ class HeadTracker:
| 177 |         left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
| 178 |         return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))
| 179 |
| 180 | -   def get_head_position(
| 181 | -       self, img: np.ndarray
| 182 | -   ) -> Tuple[Optional[np.ndarray], Optional[float]]:
| 183 |         """Get head position from face detection.
| 184 |
| 185 |         Args:

| 1 | + from __future__ import annotations
| 2 |   import logging
| 3 | + from typing import Tuple, Optional
| 4 |
| 5 |   import numpy as np
| 6 |   from supervision import Detections
| 7 |   from ultralytics import YOLO
| 8 | + from huggingface_hub import hf_hub_download
| 9 | +
| 10 |
| 11 |  logger = logging.getLogger(__name__)
| 12 |

| 94 |
| 95 |         return np.array([norm_x, norm_y], dtype=np.float32)
| 96 |
| 97 | +   def get_eyes(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
| 98 |         """Get eye positions (approximated from face bbox).
| 99 |
| 100 |        Note: YOLO only provides face bbox, so we estimate eye positions

| 129 |         right_eye_x = bbox[0] + face_width * 0.65
| 130 |
| 131 |         # Convert to MediaPipe coordinates
| 132 | +       left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
| 133 | +       right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
| 134 |
| 135 |         return left_eye, right_eye
| 136 |
| 137 |     def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
| 138 |         """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
| 139 | +       if not hasattr(face_landmarks, "_bbox") or not hasattr(face_landmarks, "_img_shape"):
| 140 |             raise ValueError("Face landmarks object missing required attributes")
| 141 |
| 142 |         bbox = face_landmarks._bbox

| 150 |         left_eye_x = bbox[0] + face_width * 0.35
| 151 |         right_eye_x = bbox[0] + face_width * 0.65
| 152 |
| 153 | +       left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
| 154 | +       right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
| 155 |
| 156 |         return left_eye, right_eye
| 157 |

| 165 |         left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
| 166 |         return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))
| 167 |
| 168 | +   def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
| 169 |         """Get head position from face detection.
| 170 |
| 171 |         Args: