Alina Lozovskaya committed
Commit 436b1d6 · 1 Parent(s): a150e03

Add vision folder with local or openai vision options

src/reachy_mini_conversation_demo/main.py CHANGED
@@ -11,22 +11,32 @@ from reachy_mini import ReachyMini
 from reachy_mini.utils import create_head_pose
 
 from reachy_mini_conversation_demo.config import config
-from reachy_mini_conversation_demo.head_tracker import HeadTracker
+from reachy_mini_conversation_demo.vision.head_tracker import HeadTracker
 from reachy_mini_conversation_demo.openai_realtime import OpenAIRealtimeHandler
 from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS
 from reachy_mini_conversation_demo.tools import (
     ToolDependencies,
 )
-from reachy_mini_conversation_demo.audio_sway import AudioSync, AudioConfig
+from reachy_mini_conversation_demo.audio.audio_sway import AudioSync, AudioConfig
 from reachy_mini_conversation_demo.movement import MovementManager
-from reachy_mini_conversation_demo.gstreamer import GstPlayer, GstRecorder
-from reachy_mini_conversation_demo.vision import VisionManager, init_vision, init_camera
+from reachy_mini_conversation_demo.audio.gstreamer import GstPlayer, GstRecorder
+from reachy_mini_conversation_demo.vision.processors import (
+    VisionManager,
+    init_vision,
+    init_camera,
+)
 
 # Command-line arguments
 parser = argparse.ArgumentParser(description="Reachy Mini Conversation Demo")
 parser.add_argument("--sim", action="store_true", help="Run in simulation mode")
 parser.add_argument("--vision", action="store_true", help="Enable vision")
 parser.add_argument("--head-tracking", action="store_true", help="Enable head tracking")
+parser.add_argument(
+    "--vision-provider",
+    choices=["openai", "local"],
+    default="local",
+    help="Choose vision provider (default: local)",
+)
 parser.add_argument("--debug", action="store_true", help="Enable debug logging")
 args = parser.parse_args()
 
@@ -135,7 +145,9 @@ async def loop():
 
     vision_manager: VisionManager | None = None
     if camera and camera.isOpened() and VISION_ENABLED:
-        vision_manager = init_vision(camera=camera)
+        processor_type = args.vision_provider
+        vision_manager = init_vision(camera=camera, processor_type=processor_type)
+        logger.info(f"Vision processor type: {processor_type}")
 
     current_robot = ReachyMini()
 
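Not part of the commit: a minimal sketch of how the new flag reaches the vision factory, with hand-fed, illustrative argument values (argparse exposes --vision-provider as args.vision_provider, which loop() forwards to init_vision):

# Sketch only: the flag-to-kwarg path added above, using a hypothetical argv.
import argparse

parser = argparse.ArgumentParser(description="Reachy Mini Conversation Demo")
parser.add_argument("--vision", action="store_true", help="Enable vision")
parser.add_argument("--vision-provider", choices=["openai", "local"], default="local")
args = parser.parse_args(["--vision", "--vision-provider", "openai"])
print(args.vision_provider)  # "openai"; loop() passes this to init_vision(...)
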
src/reachy_mini_conversation_demo/movement.py CHANGED
@@ -9,7 +9,7 @@ import cv2
 from reachy_mini import ReachyMini
 from reachy_mini.reachy_mini import IMAGE_SIZE
 from reachy_mini.utils import create_head_pose
-from reachy_mini_conversation_demo.head_tracker import HeadTracker
+from reachy_mini_conversation_demo.vision.head_tracker import HeadTracker
 
 logger = logging.getLogger(__name__)
 
src/reachy_mini_conversation_demo/openai_realtime.py CHANGED
@@ -19,7 +19,7 @@ from reachy_mini_conversation_demo.tools import (
     ALL_TOOL_SPECS,
     dispatch_tool_call,
 )
-from reachy_mini_conversation_demo.audio_sway import AudioSync, pcm_to_b64
+from reachy_mini_conversation_demo.audio.audio_sway import AudioSync, pcm_to_b64
 from reachy_mini_conversation_demo.config import config
 
 logger = logging.getLogger(__name__)
src/reachy_mini_conversation_demo/tools.py CHANGED
@@ -15,7 +15,7 @@ from typing import Any, Dict, Literal, Optional
 import cv2
 import numpy as np
 
-from reachy_mini_conversation_demo.vision import VisionManager
+from reachy_mini_conversation_demo.vision.processors import VisionManager
 from reachy_mini_conversation_demo.movement import MovementManager
 
 logger = logging.getLogger(__name__)
src/reachy_mini_conversation_demo/vision/__init__.py ADDED
File without changes
src/reachy_mini_conversation_demo/{head_tracker.py → vision/head_tracker.py} RENAMED
File without changes
src/reachy_mini_conversation_demo/vision/openai_vision.py ADDED
@@ -0,0 +1,97 @@
+import base64
+import logging
+import os
+import cv2
+from openai import OpenAI
+from .processors import VisionConfig
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIVisionProcessor:
+    def __init__(self, config: VisionConfig = None):
+        self.config = config or VisionConfig()
+        self._initialized = False
+        self.client = None
+
+    def initialize(self):
+        """Initialize OpenAI client with proper error handling"""
+        try:
+            api_key = os.getenv("OPENAI_API_KEY")
+            if not api_key:
+                logger.error("OPENAI_API_KEY not found in environment variables")
+                return False
+
+            self.client = OpenAI(api_key=api_key)
+
+            # Smoke test the API/key
+            try:
+                _ = self.client.models.list()
+                self._initialized = True
+                logger.info("OpenAI Vision processor initialized successfully")
+                return True
+            except Exception as e:
+                logger.error(f"Failed to connect to OpenAI API: {e}")
+                return False
+
+        except Exception as e:
+            logger.error(f"Failed to initialize OpenAI Vision processor: {e}")
+            return False
+
+    def process_image(
+        self, cv2_image, prompt="Briefly describe what you see in one sentence."
+    ):
+        """Process image using OpenAI (Responses API) with retry logic"""
+        if not self._initialized:
+            return "OpenAI Vision processor not initialized"
+
+        for attempt in range(self.config.max_retries):
+            try:
+                # Convert image to base64
+                rgb_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
+                ok, jpeg_buffer = cv2.imencode(
+                    ".jpg",
+                    rgb_image,
+                    [cv2.IMWRITE_JPEG_QUALITY, self.config.jpeg_quality],
+                )
+                if not ok:
+                    return "Failed to encode image"
+                image_base64 = base64.b64encode(jpeg_buffer.tobytes()).decode("utf-8")
+
+                # Responses API with input_image
+                response = self.client.responses.create(
+                    model=self.config.openai_model,  # e.g., gpt-4.1 or gpt-4.1-mini
+                    input=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "input_text", "text": prompt},
+                                {
+                                    "type": "input_image",
+                                    "image_url": f"data:image/jpeg;base64,{image_base64}",
+                                },
+                            ],
+                        }
+                    ],
+                    max_output_tokens=300,
+                )
+
+                # Unified text accessor
+                text = (response.output_text or "").strip()
+                return text if text else "No response"
+
+            except Exception as e:
+                logger.error(f"OpenAI Vision API error (attempt {attempt + 1}): {e}")
+                if attempt < self.config.max_retries - 1:
+                    import time
+
+                    time.sleep(self.config.retry_delay)
+                else:
+                    return f"OpenAI Vision processing failed after {self.config.max_retries} attempts"
+
+    def get_model_info(self):
+        return {
+            "processor_type": "openai",
+            "initialized": self._initialized,
+            "model": self.config.openai_model,
+        }
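Not part of the commit: a minimal standalone sketch of driving the new OpenAIVisionProcessor on a single frame, assuming OPENAI_API_KEY is set and a webcam is available at index 0.

# Sketch: one-shot description of a webcam frame via the OpenAI-backed processor.
import cv2

from reachy_mini_conversation_demo.vision.openai_vision import OpenAIVisionProcessor
from reachy_mini_conversation_demo.vision.processors import VisionConfig

camera = cv2.VideoCapture(0)
ok, frame = camera.read()  # BGR frame, as elsewhere in the demo
camera.release()

processor = OpenAIVisionProcessor(VisionConfig(processor_type="openai"))
if ok and processor.initialize():
    # Returns a short text description, or an error string on failure.
    print(processor.process_image(frame, prompt="What is in front of the robot?"))
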
src/reachy_mini_conversation_demo/{vision.py → vision/processors.py} RENAMED
@@ -21,6 +21,8 @@ logger = logging.getLogger(__name__)
 class VisionConfig:
     """Configuration for vision processing"""
 
+    processor_type: str = "local"
+    openai_model: str = os.getenv("OPENAI_VISION_MODEL", "gpt-4.1-mini")
     model_path: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
     vision_interval: float = 5.0
     max_new_tokens: int = 64
@@ -66,7 +68,7 @@ class VisionProcessor:
         if self.device == "cuda":
             dtype = torch.bfloat16
         elif self.device == "mps":
-            dtype = torch.float16  # best for MPS
+            dtype = torch.float32  # best for MPS
         else:
             dtype = torch.float32
 
@@ -100,13 +102,10 @@ class VisionProcessor:
 
         for attempt in range(self.config.max_retries):
             try:
-                # Convert CV2 BGR to RGB
-                rgb_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
-
                 # Convert to JPEG bytes
                 success, jpeg_buffer = cv2.imencode(
                     ".jpg",
-                    rgb_image,
+                    cv2_image,
                     [cv2.IMWRITE_JPEG_QUALITY, self.config.jpeg_quality],
                 )
                 if not success:
@@ -136,20 +135,17 @@ class VisionProcessor:
                     return_tensors="pt",
                 )
 
-                # move to device with proper dtype
-                if self.device == "cuda":
-                    inputs = inputs.to(self.device, dtype=torch.bfloat16)
-                elif self.device == "mps":
-                    inputs = inputs.to(self.device, dtype=torch.float16)
-                else:
-                    inputs = inputs.to(self.device, dtype=torch.float32)
+                # Move tensors to device WITHOUT forcing dtype (keeps input_ids as torch.long)
+                inputs = {
+                    k: (v.to(self.device) if hasattr(v, "to") else v)
+                    for k, v in inputs.items()
+                }
 
                 with torch.no_grad():
                     generated_ids = self.model.generate(
                         **inputs,
-                        do_sample=True if self.config.temperature > 0 else False,
+                        do_sample=False,
                         max_new_tokens=self.config.max_new_tokens,
-                        temperature=self.config.temperature,
                         pad_token_id=self.processor.tokenizer.eos_token_id,
                     )
 
@@ -203,6 +199,7 @@ class VisionProcessor:
     def get_model_info(self) -> Dict[str, Any]:
         """Get information about the loaded model"""
         return {
+            "processor_type": "local",
            "initialized": self._initialized,
            "device": self.device,
            "model_path": self.model_path,
@@ -220,7 +217,7 @@ class VisionManager:
         self.camera = camera
         self.config = config or VisionConfig()
         self.vision_interval = self.config.vision_interval
-        self.processor = VisionProcessor(self.config)
+        self.processor = create_vision_processor(self.config)  # Use factory function
 
         self._current_description = ""
         self._last_processed_time = 0
@@ -294,13 +291,11 @@ class VisionManager:
     async def get_status(self) -> Dict[str, Any]:
         """Get comprehensive status information"""
         return {
-            "running": self._running,
             "last_processed": self._last_processed_time,
            "processor_info": self.processor.get_model_info(),
            "config": {
                "interval": self.vision_interval,
-                "model_path": self.config.model_path,
-                "device": self.processor.device,
+                "processor_type": self.config.processor_type,
            },
        }
 
@@ -311,40 +306,58 @@ def init_camera(camera_index=0, simulation=True):
     if simulation:
         # Default build-in camera in SIM
         # TODO: please, test on Linux and Windows
-        # TODO simulation in find_camera
         camera = cv2.VideoCapture(0, api_preference)
     else:
-        # TODO handle macos in find_camera
+        # TODO handle macos properly
         if sys.platform == "darwin":
             camera = cv2.VideoCapture(camera_index, cv2.CAP_AVFOUNDATION)
         else:
-            camera = find_camera()
+            camera = cv2.VideoCapture(camera_index)
 
     return camera
 
 
-def init_vision(camera: cv2.VideoCapture) -> VisionManager:
+def create_vision_processor(config: VisionConfig):
+    """Factory function to create the appropriate vision processor"""
+    if config.processor_type == "openai":
+        try:
+            from .openai_vision import OpenAIVisionProcessor
+
+            return OpenAIVisionProcessor(config)
+        except ImportError:
+            logger.error("OpenAI vision processor not available, falling back to local")
+            return VisionProcessor(config)
+    else:
+        return VisionProcessor(config)
+
+
+def init_vision(
+    camera: cv2.VideoCapture, processor_type: str = "local"
+) -> VisionManager:
     model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
 
     cache_dir = os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface"))
 
-    try:
-        os.makedirs(cache_dir, exist_ok=True)
-        os.environ["HF_HOME"] = cache_dir
-        logger.info("HF_HOME set to %s", cache_dir)
-    except Exception as e:
-        logger.warning("Failed to prepare HF cache dir %s: %s", cache_dir, e)
-        return
-
-    snapshot_download(
-        repo_id=model_id,
-        repo_type="model",
-        cache_dir=cache_dir,
-    )
-    logger.info(f"Prefetched model_id={model_id} into cache_dir={cache_dir}")
+    # Only download model if using local processor
+    if processor_type == "local":
+        try:
+            os.makedirs(cache_dir, exist_ok=True)
+            os.environ["HF_HOME"] = cache_dir
+            logger.info("HF_HOME set to %s", cache_dir)
+        except Exception as e:
+            logger.warning("Failed to prepare HF cache dir %s: %s", cache_dir, e)
+            return None
+
+        snapshot_download(
+            repo_id=model_id,
+            repo_type="model",
+            cache_dir=cache_dir,
+        )
+        logger.info(f"Prefetched model_id={model_id} into cache_dir={cache_dir}")
 
-    # Configure VLLM processing
+    # Configure vision processing
     vision_config = VisionConfig(
+        processor_type=processor_type,
         model_path=model_id,
         vision_interval=5.0,
        max_new_tokens=64,
@@ -359,7 +372,7 @@ def init_vision(camera: cv2.VideoCapture) -> VisionManager:
 
     device_info = vision_manager.processor.get_model_info()
     logger.info(
-        f"Vision processing enabled: {device_info['model_path']} on {device_info['device']}",
+        f"Vision processing enabled: {device_info.get('model_path', device_info.get('processor_type'))} on {device_info.get('device', 'API')}",
    )
 
    return vision_manager
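
Not part of the commit: a hedged end-to-end sketch of the reworked module-level API (init_camera plus the processor factory behind init_vision), using only names introduced in this diff; the camera index and provider choice are illustrative.

# Sketch: build a VisionManager with either provider through the new factory path.
from reachy_mini_conversation_demo.vision.processors import init_camera, init_vision

camera = init_camera(camera_index=0, simulation=True)
if camera.isOpened():
    # "local" prefetches SmolVLM2 into the HF cache; "openai" skips the download
    # and calls the hosted API instead (requires OPENAI_API_KEY).
    vision_manager = init_vision(camera=camera, processor_type="local")
    if vision_manager is not None:
        print(vision_manager.processor.get_model_info())
camera.release()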