Alina Lozovskaya
commited on
Commit
·
69601f9
1
Parent(s):
24c038d
Clean out main.py [wip]
Browse files
src/reachy_mini_conversation_demo/main.py
CHANGED
|
@@ -9,10 +9,7 @@ import sys
|
|
| 9 |
import time
|
| 10 |
import warnings
|
| 11 |
import threading
|
| 12 |
-
from threading import Thread
|
| 13 |
|
| 14 |
-
import cv2
|
| 15 |
-
import gradio as gr
|
| 16 |
import numpy as np
|
| 17 |
from dotenv import load_dotenv
|
| 18 |
from openai import AsyncOpenAI
|
|
@@ -20,11 +17,8 @@ from openai import AsyncOpenAI
|
|
| 20 |
from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
|
| 21 |
from websockets import ConnectionClosedError, ConnectionClosedOK
|
| 22 |
|
| 23 |
-
from reachy_mini.reachy_mini import IMAGE_SIZE
|
| 24 |
from reachy_mini import ReachyMini
|
| 25 |
from reachy_mini.utils import create_head_pose
|
| 26 |
-
from reachy_mini.utils.camera import find_camera
|
| 27 |
-
from scipy.spatial.transform import Rotation
|
| 28 |
|
| 29 |
from reachy_mini_conversation_demo.head_tracker import HeadTracker
|
| 30 |
from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS
|
|
@@ -33,10 +27,10 @@ from reachy_mini_conversation_demo.tools import (
|
|
| 33 |
TOOL_SPECS,
|
| 34 |
dispatch_tool_call,
|
| 35 |
)
|
| 36 |
-
from reachy_mini_conversation_demo.
|
| 37 |
from reachy_mini_conversation_demo.movement import MovementManager
|
| 38 |
from reachy_mini_conversation_demo.gstreamer import GstPlayer, GstRecorder
|
| 39 |
-
from reachy_mini_conversation_demo.vision import VisionManager,
|
| 40 |
|
| 41 |
# env + logging
|
| 42 |
load_dotenv()
|
|
@@ -83,118 +77,19 @@ if not API_KEY:
|
|
| 83 |
masked = (API_KEY[:6] + "..." + API_KEY[-4:]) if len(API_KEY) >= 12 else "<short>"
|
| 84 |
logger.info("OPENAI_API_KEY loaded (prefix): %s", masked)
|
| 85 |
|
| 86 |
-
# HF cache setup (persist between restarts)
|
| 87 |
-
HF_CACHE_DIR = os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface"))
|
| 88 |
-
try:
|
| 89 |
-
os.makedirs(HF_CACHE_DIR, exist_ok=True)
|
| 90 |
-
os.environ["HF_HOME"] = HF_CACHE_DIR
|
| 91 |
-
logger.info("HF_HOME set to %s", HF_CACHE_DIR)
|
| 92 |
-
except Exception as e:
|
| 93 |
-
logger.warning("Failed to prepare HF cache dir %s: %s", HF_CACHE_DIR, e)
|
| 94 |
-
|
| 95 |
# init camera
|
| 96 |
CAMERA_INDEX = int(os.getenv("CAMERA_INDEX", "0"))
|
| 97 |
-
|
| 98 |
-
if SIM:
|
| 99 |
-
# Default build-in camera in SIM
|
| 100 |
-
# TODO: please, test on Linux and Windows
|
| 101 |
-
camera = cv2.VideoCapture(
|
| 102 |
-
0, cv2.CAP_AVFOUNDATION if sys.platform == "darwin" else 0
|
| 103 |
-
)
|
| 104 |
-
else:
|
| 105 |
-
if sys.platform == "darwin":
|
| 106 |
-
camera = cv2.VideoCapture(CAMERA_INDEX, cv2.CAP_AVFOUNDATION)
|
| 107 |
-
if not camera or not camera.isOpened():
|
| 108 |
-
logger.warning(
|
| 109 |
-
"Camera %d failed with AVFoundation; trying default backend",
|
| 110 |
-
CAMERA_INDEX,
|
| 111 |
-
)
|
| 112 |
-
camera = cv2.VideoCapture(CAMERA_INDEX)
|
| 113 |
-
else:
|
| 114 |
-
camera = find_camera()
|
| 115 |
|
| 116 |
# Vision manager initialization with proper error handling
|
| 117 |
vision_manager: VisionManager | None = None
|
| 118 |
|
| 119 |
-
if
|
| 120 |
-
|
| 121 |
-
VISION_ENABLED = False # Disable vision if no camera
|
| 122 |
-
else:
|
| 123 |
-
logger.info(
|
| 124 |
-
"Camera ready (index=%s)%s", 0 if SIM else CAMERA_INDEX, " [SIM]" if SIM else ""
|
| 125 |
-
)
|
| 126 |
-
|
| 127 |
-
# Prefetch SmolVLM2 repo into HF cache (idempotent, fast if already cached)
|
| 128 |
-
try:
|
| 129 |
-
from huggingface_hub import snapshot_download
|
| 130 |
-
model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
|
| 131 |
-
snapshot_download(
|
| 132 |
-
repo_id=model_id,
|
| 133 |
-
repo_type="model",
|
| 134 |
-
cache_dir=os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface")),
|
| 135 |
-
)
|
| 136 |
-
logger.info("Prefetched %s into HF cache (%s)", model_id, os.getenv("HF_HOME"))
|
| 137 |
-
except Exception as e:
|
| 138 |
-
logger.warning("Model prefetch skipped/failed (will load normally): %s", e)
|
| 139 |
-
|
| 140 |
-
# Initialize vision manager if enabled
|
| 141 |
-
if VISION_ENABLED:
|
| 142 |
-
try:
|
| 143 |
-
# Prefetch SmolVLM2 repo into HF cache (idempotent, fast if cached)
|
| 144 |
-
try:
|
| 145 |
-
from huggingface_hub import snapshot_download
|
| 146 |
-
model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
|
| 147 |
-
snapshot_download(
|
| 148 |
-
repo_id=model_id,
|
| 149 |
-
repo_type="model",
|
| 150 |
-
cache_dir=os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface")),
|
| 151 |
-
)
|
| 152 |
-
logger.info("Prefetched %s into HF cache (%s)", model_id, os.getenv("HF_HUB_CACHE"))
|
| 153 |
-
except Exception as e:
|
| 154 |
-
logger.warning("Model prefetch skipped/failed (will load normally): %s", e)
|
| 155 |
-
|
| 156 |
-
# Configure LLM processing
|
| 157 |
-
vision_config = VisionConfig(
|
| 158 |
-
model_path="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
| 159 |
-
vision_interval=5.0,
|
| 160 |
-
max_new_tokens=64,
|
| 161 |
-
temperature=0.7,
|
| 162 |
-
jpeg_quality=85,
|
| 163 |
-
max_retries=3,
|
| 164 |
-
retry_delay=1.0,
|
| 165 |
-
device_preference="auto",
|
| 166 |
-
)
|
| 167 |
-
|
| 168 |
-
logger.info("Initializing SmolVLM2 vision processor (HF_HOME=%s)", os.getenv("HF_HOME"))
|
| 169 |
-
vision_manager = VisionManager(camera, vision_config)
|
| 170 |
-
|
| 171 |
-
device_info = vision_manager.processor.get_model_info()
|
| 172 |
-
logger.info(
|
| 173 |
-
"Vision processing enabled: %s on %s (GPU: %s)",
|
| 174 |
-
device_info["model_path"], device_info["device"], device_info.get("gpu_memory", "N/A"),
|
| 175 |
-
)
|
| 176 |
-
|
| 177 |
-
except Exception as e:
|
| 178 |
-
logger.error("Failed to initialize vision manager: %s", e)
|
| 179 |
-
logger.error("Vision processing will be disabled")
|
| 180 |
-
vision_manager = None
|
| 181 |
-
VISION_ENABLED = False
|
| 182 |
-
|
| 183 |
-
# Log final vision status
|
| 184 |
-
if VISION_ENABLED and vision_manager:
|
| 185 |
-
logger.info("Vision system ready - local SmolVLM2 processing enabled")
|
| 186 |
-
else:
|
| 187 |
-
logger.warning(
|
| 188 |
-
"Vision system disabled - robot will operate without visual understanding"
|
| 189 |
-
)
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
# Constants
|
| 193 |
-
BACKOFF_START_S = 1.0
|
| 194 |
-
BACKOFF_MAX_S = 30.0
|
| 195 |
|
| 196 |
# hardware / IO
|
| 197 |
current_robot = ReachyMini()
|
|
|
|
| 198 |
head_tracker: HeadTracker = None
|
| 199 |
|
| 200 |
if HEAD_TRACKING and not SIM:
|
|
@@ -239,6 +134,10 @@ class OpenAIRealtimeHandler(AsyncStreamHandler):
|
|
| 239 |
self._started_audio = False
|
| 240 |
self._connection_ready = False
|
| 241 |
self._speech_start_time = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
def copy(self):
|
| 244 |
return OpenAIRealtimeHandler()
|
|
@@ -252,7 +151,7 @@ class OpenAIRealtimeHandler(AsyncStreamHandler):
|
|
| 252 |
logger.info("Realtime start_up: creating AsyncOpenAI client...")
|
| 253 |
self.client = AsyncOpenAI(api_key=API_KEY)
|
| 254 |
|
| 255 |
-
|
| 256 |
while not self._stop:
|
| 257 |
try:
|
| 258 |
async with self.client.beta.realtime.connect(
|
|
@@ -308,7 +207,7 @@ class OpenAIRealtimeHandler(AsyncStreamHandler):
|
|
| 308 |
)
|
| 309 |
|
| 310 |
logger.info("Realtime event loop started with improved VAD")
|
| 311 |
-
|
| 312 |
|
| 313 |
async for event in rt_connection:
|
| 314 |
event_type = getattr(event, "type", None)
|
|
@@ -426,10 +325,10 @@ class OpenAIRealtimeHandler(AsyncStreamHandler):
|
|
| 426 |
self._connection_ready = False
|
| 427 |
|
| 428 |
# Exponential backoff
|
| 429 |
-
delay = min(
|
| 430 |
logger.info("Reconnect in %.1fs…", delay)
|
| 431 |
await asyncio.sleep(delay)
|
| 432 |
-
|
| 433 |
|
| 434 |
async def receive(self, frame: bytes) -> None:
|
| 435 |
"""Mic frames from fastrtc."""
|
|
|
|
| 9 |
import time
|
| 10 |
import warnings
|
| 11 |
import threading
|
|
|
|
| 12 |
|
|
|
|
|
|
|
| 13 |
import numpy as np
|
| 14 |
from dotenv import load_dotenv
|
| 15 |
from openai import AsyncOpenAI
|
|
|
|
| 17 |
from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
|
| 18 |
from websockets import ConnectionClosedError, ConnectionClosedOK
|
| 19 |
|
|
|
|
| 20 |
from reachy_mini import ReachyMini
|
| 21 |
from reachy_mini.utils import create_head_pose
|
|
|
|
|
|
|
| 22 |
|
| 23 |
from reachy_mini_conversation_demo.head_tracker import HeadTracker
|
| 24 |
from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS
|
|
|
|
| 27 |
TOOL_SPECS,
|
| 28 |
dispatch_tool_call,
|
| 29 |
)
|
| 30 |
+
from reachy_mini_conversation_demo.audio_sway import AudioSync, AudioConfig, pcm_to_b64
|
| 31 |
from reachy_mini_conversation_demo.movement import MovementManager
|
| 32 |
from reachy_mini_conversation_demo.gstreamer import GstPlayer, GstRecorder
|
| 33 |
+
from reachy_mini_conversation_demo.vision import VisionManager, init_vision, init_camera
|
| 34 |
|
| 35 |
# env + logging
|
| 36 |
load_dotenv()
|
|
|
|
| 77 |
masked = (API_KEY[:6] + "..." + API_KEY[-4:]) if len(API_KEY) >= 12 else "<short>"
|
| 78 |
logger.info("OPENAI_API_KEY loaded (prefix): %s", masked)
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
# init camera
|
| 81 |
CAMERA_INDEX = int(os.getenv("CAMERA_INDEX", "0"))
|
| 82 |
+
camera = init_camera(camera_index=CAMERA_INDEX, simulation=SIM)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
# Vision manager initialization with proper error handling
|
| 85 |
vision_manager: VisionManager | None = None
|
| 86 |
|
| 87 |
+
if camera and camera.isOpened() and VISION_ENABLED:
|
| 88 |
+
vision_manager = init_vision(camera=camera)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
# hardware / IO
|
| 91 |
current_robot = ReachyMini()
|
| 92 |
+
|
| 93 |
head_tracker: HeadTracker = None
|
| 94 |
|
| 95 |
if HEAD_TRACKING and not SIM:
|
|
|
|
| 134 |
self._started_audio = False
|
| 135 |
self._connection_ready = False
|
| 136 |
self._speech_start_time = 0.0
|
| 137 |
+
# backoff managment for retry
|
| 138 |
+
self._backoff_start = 1.0
|
| 139 |
+
self._backoff_max = 16.0
|
| 140 |
+
self._backoff = self._backoff_start
|
| 141 |
|
| 142 |
def copy(self):
|
| 143 |
return OpenAIRealtimeHandler()
|
|
|
|
| 151 |
logger.info("Realtime start_up: creating AsyncOpenAI client...")
|
| 152 |
self.client = AsyncOpenAI(api_key=API_KEY)
|
| 153 |
|
| 154 |
+
self._backoff = self._backoff_start
|
| 155 |
while not self._stop:
|
| 156 |
try:
|
| 157 |
async with self.client.beta.realtime.connect(
|
|
|
|
| 207 |
)
|
| 208 |
|
| 209 |
logger.info("Realtime event loop started with improved VAD")
|
| 210 |
+
self._backoff = self._backoff_start
|
| 211 |
|
| 212 |
async for event in rt_connection:
|
| 213 |
event_type = getattr(event, "type", None)
|
|
|
|
| 325 |
self._connection_ready = False
|
| 326 |
|
| 327 |
# Exponential backoff
|
| 328 |
+
delay = min(self._backoff, self._backoff_max) + random.uniform(0, 0.5)
|
| 329 |
logger.info("Reconnect in %.1fs…", delay)
|
| 330 |
await asyncio.sleep(delay)
|
| 331 |
+
self._backoff = min(self._backoff * 2.0, self._backoff_max)
|
| 332 |
|
| 333 |
async def receive(self, frame: bytes) -> None:
|
| 334 |
"""Mic frames from fastrtc."""
|