Alina Lozovskaya committed
Commit adbcb04 · 1 Parent(s): 722c064

Fix local vision

.env.example CHANGED
@@ -1,6 +1,9 @@
 OPENAI_API_KEY=
 MODEL_NAME="gpt-realtime"
 
+# Local vision model
+LOCAL_VISION_MODEL=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
 # Cache for local VLM
 HF_HOME=./cache
 
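For context: the updated processors.py below reads these values through a shared config object (from reachy_mini_conversation_demo.config import config). That module is not part of this commit, so the following is only a minimal sketch of what it plausibly looks like, assuming a python-dotenv style loader; the field names LOCAL_VISION_MODEL and HF_HOME are taken from the diff, everything else is an assumption.

# Hypothetical sketch of reachy_mini_conversation_demo/config.py (not in this commit).
import os
from dataclasses import dataclass

from dotenv import load_dotenv  # assumption: python-dotenv reads .env

load_dotenv()  # pull .env entries into the process environment


@dataclass(frozen=True)
class Config:
    OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY", "")
    MODEL_NAME: str = os.getenv("MODEL_NAME", "gpt-realtime")
    LOCAL_VISION_MODEL: str = os.getenv(
        "LOCAL_VISION_MODEL", "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
    )
    HF_HOME: str = os.getenv("HF_HOME", "./cache")


config = Config()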
src/reachy_mini_conversation_demo/vision/processors.py CHANGED
@@ -1,11 +1,10 @@
 import os
-import sys
 import time
 import base64
 import asyncio
 import logging
 import threading
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 from dataclasses import dataclass
 
 import cv2
@@ -14,6 +13,8 @@ import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from huggingface_hub import snapshot_download
 
+from reachy_mini_conversation_demo.config import config
+
 
 logger = logging.getLogger(__name__)
 
@@ -22,11 +23,9 @@ logger = logging.getLogger(__name__)
 class VisionConfig:
     """Configuration for vision processing."""
 
-    processor_type: str = "local"
-    model_path: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+    model_path: str = config.LOCAL_VISION_MODEL
     vision_interval: float = 5.0
     max_new_tokens: int = 64
-    temperature: float = 0.7
     jpeg_quality: int = 85
     max_retries: int = 3
     retry_delay: float = 1.0
@@ -36,17 +35,17 @@ class VisionConfig:
 class VisionProcessor:
     """Handles SmolVLM2 model loading and inference."""
 
-    def __init__(self, config: VisionConfig = None):
+    def __init__(self, vision_config: VisionConfig = None):
         """Initialize the vision processor."""
-        self.config = config or VisionConfig()
-        self.model_path = self.config.model_path
+        self.vision_config = vision_config or VisionConfig()
+        self.model_path = self.vision_config.model_path
         self.device = self._determine_device()
         self.processor = None
         self.model = None
         self._initialized = False
 
     def _determine_device(self) -> str:
-        pref = self.config.device_preference
+        pref = self.vision_config.device_preference
         if pref == "cpu":
             return "cpu"
         if pref == "cuda":
@@ -61,7 +60,7 @@ class VisionProcessor:
     def initialize(self) -> bool:
         """Load model and processor onto the selected device."""
         try:
-            logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={os.getenv('HF_HOME')})")
+            logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
             self.processor = AutoProcessor.from_pretrained(self.model_path)
 
             # Select dtype depending on device
@@ -98,13 +97,13 @@ class VisionProcessor:
         if not self._initialized:
             return "Vision model not initialized"
 
-        for attempt in range(self.config.max_retries):
+        for attempt in range(self.vision_config.max_retries):
             try:
                 # Convert to JPEG bytes
                 success, jpeg_buffer = cv2.imencode(
                     ".jpg",
                     cv2_image,
-                    [cv2.IMWRITE_JPEG_QUALITY, self.config.jpeg_quality],
+                    [cv2.IMWRITE_JPEG_QUALITY, self.vision_config.jpeg_quality],
                 )
                 if not success:
                     return "Failed to encode image"
@@ -140,7 +139,7 @@ class VisionProcessor:
                 generated_ids = self.model.generate(
                     **inputs,
                     do_sample=False,
-                    max_new_tokens=self.config.max_new_tokens,
+                    max_new_tokens=self.vision_config.max_new_tokens,
                     pad_token_id=self.processor.tokenizer.eos_token_id,
                 )
 
@@ -165,17 +164,17 @@ class VisionProcessor:
                 logger.error(f"CUDA OOM on attempt {attempt + 1}: {e}")
                 if self.device == "cuda":
                     torch.cuda.empty_cache()
-                if attempt < self.config.max_retries - 1:
-                    time.sleep(self.config.retry_delay * (attempt + 1))
+                if attempt < self.vision_config.max_retries - 1:
+                    time.sleep(self.vision_config.retry_delay * (attempt + 1))
                 else:
                     return "GPU out of memory - vision processing failed"
 
             except Exception as e:
                 logger.error(f"Vision processing failed (attempt {attempt + 1}): {e}")
-                if attempt < self.config.max_retries - 1:
-                    time.sleep(self.config.retry_delay)
+                if attempt < self.vision_config.max_retries - 1:
+                    time.sleep(self.vision_config.retry_delay)
                 else:
-                    return f"Vision processing error after {self.config.max_retries} attempts"
+                    return f"Vision processing error after {self.vision_config.max_retries} attempts"
 
     def _extract_response(self, full_text: str) -> str:
         """Extract the assistant's response from the full generated text."""
@@ -194,7 +193,6 @@ class VisionProcessor:
     def get_model_info(self) -> Dict[str, Any]:
         """Get information about the loaded model."""
         return {
-            "processor_type": "local",
             "initialized": self._initialized,
            "device": self.device,
             "model_path": self.model_path,
@@ -208,14 +206,13 @@ class VisionProcessor:
 class VisionManager:
     """Manages periodic vision processing and scene understanding."""
 
-    def __init__(self, camera, config: VisionConfig = None):
+    def __init__(self, camera, vision_config: VisionConfig = None):
         """Initialize vision manager with camera and configuration."""
         self.camera = camera
-        self.config = config or VisionConfig()
-        self.vision_interval = self.config.vision_interval
-        self.processor = create_vision_processor(self.config)  # Use factory function
+        self.vision_config = vision_config or VisionConfig()
+        self.vision_interval = self.vision_config.vision_interval
+        self.processor = VisionProcessor(self.vision_config)
 
-        self._current_description = ""
         self._last_processed_time = 0
 
         # Initialize processor
@@ -230,8 +227,8 @@ class VisionManager:
             current_time = time.time()
 
             if current_time - self._last_processed_time >= self.vision_interval:
-                success, frame = await asyncio.to_thread(self.camera.read)
-                if success and frame is not None:
+                frame = self.camera.get_latest_frame()
+                if frame is not None:
                     description = await asyncio.to_thread(
                         lambda: self.processor.process_image(
                             frame, "Briefly describe what you see in one sentence."
@@ -240,7 +237,6 @@ class VisionManager:
 
                     # Only update if we got a valid response
                     if description and not description.startswith(("Vision", "Failed", "Error")):
-                        self._current_description = description
                         self._last_processed_time = current_time
 
                     logger.info(f"Vision update: {description}")
@@ -255,29 +251,6 @@ class VisionManager:
 
         logger.info("Vision loop finished")
 
-    async def get_current_description(self) -> str:
-        """Get the most recent scene description (thread-safe)."""
-        return self._current_description
-
-    async def process_current_frame(self, prompt: str = "Describe what you see in detail.") -> Dict[str, Any]:
-        """Process current camera frame with custom prompt."""
-        try:
-            success, frame = self.camera.read()
-            if not success or frame is None:
-                return {"error": "Failed to capture image from camera"}
-
-            description = await asyncio.to_thread(lambda: self.processor.process_image(frame, prompt))
-
-            return {
-                "description": description,
-                "timestamp": time.time(),
-                "prompt": prompt,
-            }
-
-        except Exception as e:
-            logger.exception("Failed to process current frame")
-            return {"error": f"Frame processing failed: {str(e)}"}
-
     async def get_status(self) -> Dict[str, Any]:
         """Get comprehensive status information."""
         return {
@@ -285,84 +258,59 @@ class VisionManager:
             "processor_info": self.processor.get_model_info(),
             "config": {
                 "interval": self.vision_interval,
-                "processor_type": self.config.processor_type,
             },
         }
 
 
-def init_camera(camera_index=0, simulation=True):
-    """Initialize camera (real or simulated)."""
-    api_preference = cv2.CAP_AVFOUNDATION if sys.platform == "darwin" else 0
-
-    if simulation:
-        # Default build-in camera in SIM
-        # TODO: please, test on Linux and Windows
-        camera = cv2.VideoCapture(0, api_preference)
-    else:
-        # TODO handle macos properly
-        if sys.platform == "darwin":
-            camera = cv2.VideoCapture(camera_index, cv2.CAP_AVFOUNDATION)
-        else:
-            camera = cv2.VideoCapture(camera_index)
+def initialize_vision_manager(camera_worker) -> Optional[VisionManager]:
+    """Initialize vision manager with model download and configuration.
 
-    return camera
-
-
-def create_vision_processor(config: VisionConfig):
-    """Create the appropriate vision processor (factory)."""
-    if config.processor_type == "openai":
-        try:
-            from .openai_vision import OpenAIVisionProcessor
+    Args:
+        camera_worker: CameraWorker instance for frame capture
+    Returns:
+        VisionManager instance or None if initialization fails
 
-            return OpenAIVisionProcessor(config)
-        except ImportError:
-            logger.error("OpenAI vision processor not available, falling back to local")
-            return VisionProcessor(config)
-    else:
-        return VisionProcessor(config)
+    """
+    try:
+        model_id = config.LOCAL_VISION_MODEL
+        cache_dir = os.path.expanduser(config.HF_HOME)
 
+        # Prepare cache directory
+        os.makedirs(cache_dir, exist_ok=True)
+        os.environ["HF_HOME"] = cache_dir
+        logger.info("HF_HOME set to %s", cache_dir)
 
-def init_vision(camera: cv2.VideoCapture, processor_type: str = "local") -> VisionManager:
-    """Initialize vision manager with the specified processor type."""
-    model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
-
-    cache_dir = os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface"))
-
-    # Only download model if using local processor
-    if processor_type == "local":
-        try:
-            os.makedirs(cache_dir, exist_ok=True)
-            os.environ["HF_HOME"] = cache_dir
-            logger.info("HF_HOME set to %s", cache_dir)
-        except Exception as e:
-            logger.warning("Failed to prepare HF cache dir %s: %s", cache_dir, e)
-            return None
-
+        # Download model to cache
+        logger.info(f"Downloading vision model {model_id} to cache...")
         snapshot_download(
            repo_id=model_id,
             repo_type="model",
             cache_dir=cache_dir,
         )
-        logger.info(f"Prefetched model_id={model_id} into cache_dir={cache_dir}")
-
-    # Configure vision processing
-    vision_config = VisionConfig(
-        processor_type=processor_type,
-        model_path=model_id,
-        vision_interval=5.0,
-        max_new_tokens=64,
-        temperature=0.7,
-        jpeg_quality=85,
-        max_retries=3,
-        retry_delay=1.0,
-        device_preference="auto",
-    )
-
-    vision_manager = VisionManager(camera, vision_config)
-
-    device_info = vision_manager.processor.get_model_info()
-    logger.info(
-        f"Vision processing enabled: {device_info.get('model_path', device_info.get('processor_type'))} on {device_info.get('device', 'API')}",
-    )
-
-    return vision_manager
+        logger.info(f"Model {model_id} downloaded to {cache_dir}")
+
+        # Configure vision processing
+        vision_config = VisionConfig(
+            model_path=model_id,
+            vision_interval=5.0,
+            max_new_tokens=64,
+            jpeg_quality=85,
+            max_retries=3,
+            retry_delay=1.0,
+            device_preference="auto",
+        )
+
+        # Initialize vision manager
+        vision_manager = VisionManager(camera_worker, vision_config)
+
+        # Log device info
+        device_info = vision_manager.processor.get_model_info()
+        logger.info(
+            f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}"
        )
+
+        return vision_manager
+
+    except Exception as e:
+        logger.error(f"Failed to initialize vision manager: {e}")
        return None
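Usage note: the substantive fix here is that VisionManager now pulls frames from the camera worker's get_latest_frame() instead of the blocking camera.read(), and camera setup moved out of this module (init_camera was removed). The CameraWorker class is outside this diff; below is a minimal sketch of how the new entry point could be wired up, with a stand-in worker. Only get_latest_frame() and initialize_vision_manager() come from the diff; everything else is an assumption.

# Sketch only: CameraWorker is a hypothetical stand-in. The diff implies just
# its contract: get_latest_frame() returns the newest frame, or None if absent.
import threading

import cv2

from reachy_mini_conversation_demo.vision.processors import initialize_vision_manager


class CameraWorker:
    """Grabs frames in a background thread, retaining only the latest one."""

    def __init__(self, index: int = 0):
        self._cap = cv2.VideoCapture(index)
        self._frame = None
        self._lock = threading.Lock()
        threading.Thread(target=self._loop, daemon=True).start()

    def _loop(self) -> None:
        # Continuously overwrite the stored frame with the newest capture
        while self._cap.isOpened():
            ok, frame = self._cap.read()
            if ok:
                with self._lock:
                    self._frame = frame

    def get_latest_frame(self):
        with self._lock:
            return self._frame


if __name__ == "__main__":
    manager = initialize_vision_manager(CameraWorker())
    if manager is None:
        raise SystemExit("Vision manager failed to initialize")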