Alina Lozovskaya committed
Commit 69601f9 · 1 Parent(s): 24c038d

Clean out main.py [wip]
src/reachy_mini_conversation_demo/main.py CHANGED
@@ -9,10 +9,7 @@ import sys
 import time
 import warnings
 import threading
-from threading import Thread

-import cv2
-import gradio as gr
 import numpy as np
 from dotenv import load_dotenv
 from openai import AsyncOpenAI
@@ -20,11 +17,8 @@ from openai import AsyncOpenAI
 from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
 from websockets import ConnectionClosedError, ConnectionClosedOK

-from reachy_mini.reachy_mini import IMAGE_SIZE
 from reachy_mini import ReachyMini
 from reachy_mini.utils import create_head_pose
-from reachy_mini.utils.camera import find_camera
-from scipy.spatial.transform import Rotation

 from reachy_mini_conversation_demo.head_tracker import HeadTracker
 from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS
@@ -33,10 +27,10 @@ from reachy_mini_conversation_demo.tools import (
     TOOL_SPECS,
     dispatch_tool_call,
 )
-from reachy_mini_conversation_demo.audio import AudioSync, AudioConfig, pcm_to_b64
+from reachy_mini_conversation_demo.audio_sway import AudioSync, AudioConfig, pcm_to_b64
 from reachy_mini_conversation_demo.movement import MovementManager
 from reachy_mini_conversation_demo.gstreamer import GstPlayer, GstRecorder
-from reachy_mini_conversation_demo.vision import VisionManager, VisionConfig
+from reachy_mini_conversation_demo.vision import VisionManager, init_vision, init_camera

 # env + logging
 load_dotenv()
@@ -83,118 +77,19 @@ if not API_KEY:
 masked = (API_KEY[:6] + "..." + API_KEY[-4:]) if len(API_KEY) >= 12 else "<short>"
 logger.info("OPENAI_API_KEY loaded (prefix): %s", masked)

-# HF cache setup (persist between restarts)
-HF_CACHE_DIR = os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface"))
-try:
-    os.makedirs(HF_CACHE_DIR, exist_ok=True)
-    os.environ["HF_HOME"] = HF_CACHE_DIR
-    logger.info("HF_HOME set to %s", HF_CACHE_DIR)
-except Exception as e:
-    logger.warning("Failed to prepare HF cache dir %s: %s", HF_CACHE_DIR, e)
-
 # init camera
 CAMERA_INDEX = int(os.getenv("CAMERA_INDEX", "0"))
-
-if SIM:
-    # Default build-in camera in SIM
-    # TODO: please, test on Linux and Windows
-    camera = cv2.VideoCapture(
-        0, cv2.CAP_AVFOUNDATION if sys.platform == "darwin" else 0
-    )
-else:
-    if sys.platform == "darwin":
-        camera = cv2.VideoCapture(CAMERA_INDEX, cv2.CAP_AVFOUNDATION)
-        if not camera or not camera.isOpened():
-            logger.warning(
-                "Camera %d failed with AVFoundation; trying default backend",
-                CAMERA_INDEX,
-            )
-            camera = cv2.VideoCapture(CAMERA_INDEX)
-    else:
-        camera = find_camera()
+camera = init_camera(camera_index=CAMERA_INDEX, simulation=SIM)

 # Vision manager initialization with proper error handling
 vision_manager: VisionManager | None = None

-if not camera or not camera.isOpened():
-    logger.error("Camera failed to open (index=%s)", 0 if SIM else CAMERA_INDEX)
-    VISION_ENABLED = False  # Disable vision if no camera
-else:
-    logger.info(
-        "Camera ready (index=%s)%s", 0 if SIM else CAMERA_INDEX, " [SIM]" if SIM else ""
-    )
-
-    # Prefetch SmolVLM2 repo into HF cache (idempotent, fast if already cached)
-    try:
-        from huggingface_hub import snapshot_download
-        model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
-        snapshot_download(
-            repo_id=model_id,
-            repo_type="model",
-            cache_dir=os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface")),
-        )
-        logger.info("Prefetched %s into HF cache (%s)", model_id, os.getenv("HF_HOME"))
-    except Exception as e:
-        logger.warning("Model prefetch skipped/failed (will load normally): %s", e)
-
-# Initialize vision manager if enabled
-if VISION_ENABLED:
-    try:
-        # Prefetch SmolVLM2 repo into HF cache (idempotent, fast if cached)
-        try:
-            from huggingface_hub import snapshot_download
-            model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
-            snapshot_download(
-                repo_id=model_id,
-                repo_type="model",
-                cache_dir=os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface")),
-            )
-            logger.info("Prefetched %s into HF cache (%s)", model_id, os.getenv("HF_HUB_CACHE"))
-        except Exception as e:
-            logger.warning("Model prefetch skipped/failed (will load normally): %s", e)
-
-        # Configure LLM processing
-        vision_config = VisionConfig(
-            model_path="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
-            vision_interval=5.0,
-            max_new_tokens=64,
-            temperature=0.7,
-            jpeg_quality=85,
-            max_retries=3,
-            retry_delay=1.0,
-            device_preference="auto",
-        )
-
-        logger.info("Initializing SmolVLM2 vision processor (HF_HOME=%s)", os.getenv("HF_HOME"))
-        vision_manager = VisionManager(camera, vision_config)
-
-        device_info = vision_manager.processor.get_model_info()
-        logger.info(
-            "Vision processing enabled: %s on %s (GPU: %s)",
-            device_info["model_path"], device_info["device"], device_info.get("gpu_memory", "N/A"),
-        )
-
-    except Exception as e:
-        logger.error("Failed to initialize vision manager: %s", e)
-        logger.error("Vision processing will be disabled")
-        vision_manager = None
-        VISION_ENABLED = False
-
-# Log final vision status
-if VISION_ENABLED and vision_manager:
-    logger.info("Vision system ready - local SmolVLM2 processing enabled")
-else:
-    logger.warning(
-        "Vision system disabled - robot will operate without visual understanding"
-    )
-
-
-# Constants
-BACKOFF_START_S = 1.0
-BACKOFF_MAX_S = 30.0
+if camera and camera.isOpened() and VISION_ENABLED:
+    vision_manager = init_vision(camera=camera)

 # hardware / IO
 current_robot = ReachyMini()
+
 head_tracker: HeadTracker = None

 if HEAD_TRACKING and not SIM:
@@ -239,6 +134,10 @@ class OpenAIRealtimeHandler(AsyncStreamHandler):
         self._started_audio = False
         self._connection_ready = False
         self._speech_start_time = 0.0
+        # backoff managment for retry
+        self._backoff_start = 1.0
+        self._backoff_max = 16.0
+        self._backoff = self._backoff_start

     def copy(self):
         return OpenAIRealtimeHandler()
@@ -252,7 +151,7 @@ class OpenAIRealtimeHandler(AsyncStreamHandler):
         logger.info("Realtime start_up: creating AsyncOpenAI client...")
         self.client = AsyncOpenAI(api_key=API_KEY)

-        backoff = BACKOFF_START_S
+        self._backoff = self._backoff_start
         while not self._stop:
             try:
                 async with self.client.beta.realtime.connect(
@@ -308,7 +207,7 @@ class OpenAIRealtimeHandler(AsyncStreamHandler):
                     )

                     logger.info("Realtime event loop started with improved VAD")
-                    backoff = BACKOFF_START_S
+                    self._backoff = self._backoff_start

                     async for event in rt_connection:
                         event_type = getattr(event, "type", None)
@@ -426,10 +325,10 @@ class OpenAIRealtimeHandler(AsyncStreamHandler):
                 self._connection_ready = False

                 # Exponential backoff
-                delay = min(backoff, BACKOFF_MAX_S) + random.uniform(0, 0.5)
+                delay = min(self._backoff, self._backoff_max) + random.uniform(0, 0.5)
                 logger.info("Reconnect in %.1fs…", delay)
                 await asyncio.sleep(delay)
-                backoff = min(backoff * 2.0, BACKOFF_MAX_S)
+                self._backoff = min(self._backoff * 2.0, self._backoff_max)

     async def receive(self, frame: bytes) -> None:
         """Mic frames from fastrtc."""