feat: Implement FAISS-GPU and lazy-loaded local model fallback
- Update requirements.txt: switch from faiss-cpu to faiss-gpu for GPU-accelerated vector search
- Update faiss_manager.py: add GPU support with automatic CPU fallback
  - GPU detection and resource management
  - Automatic index migration between GPU and CPU
  - Proper cleanup of GPU resources
  - Status reporting for GPU availability
- Update src/llm_router.py: reverse the fallback order for lazy loading
  - ZeroGPU API is tried first (primary path)
  - Local models load only if ZeroGPU fails (lazy loading)
  - HF Inference API is the final fallback
  - Logging updated to indicate which fallback path was taken

Benefits:
- GPU-accelerated vector search (10-100x faster for large indices)
- Lazy loading: local models load only when ZeroGPU is unavailable
- Lower memory usage: no models are loaded if ZeroGPU works
- Automatic fallback: GPU → CPU if no GPU is available
- Better resource utilization: the GPU is used only when needed

Files changed:
- faiss_manager.py +119 -13
- requirements.txt +2 -1
- src/llm_router.py +32 -33
faiss_manager.py

@@ -1,39 +1,112 @@

# faiss_manager.py
# FAISS Manager with GPU support and automatic CPU fallback
import faiss
import numpy as np
import logging
import os

logger = logging.getLogger(__name__)

class FAISSLiteManager:
    def __init__(self, db_path: str, use_gpu: bool = True):
        """
        Initialize FAISS manager with GPU support

        Args:
            db_path: Path to database file
            use_gpu: Whether to use GPU if available (default: True)
        """
        self.db_path = db_path
        self.dimension = 384  # all-MiniLM-L6-v2 dimension
        self.use_gpu = use_gpu
        self.gpu_available = False
        self.gpu_resource = None

        # Detect GPU availability
        if use_gpu:
            try:
                # Check if FAISS GPU is available
                if hasattr(faiss, 'StandardGpuResources'):
                    self.gpu_resource = faiss.StandardGpuResources()
                    self.gpu_available = True
                    logger.info("✓ FAISS GPU resources initialized")
                else:
                    logger.warning("FAISS GPU not available, using CPU")
                    self.gpu_available = False
            except Exception as e:
                logger.warning(f"Could not initialize FAISS GPU: {e}. Using CPU.")
                self.gpu_available = False

        self.index = self._initialize_index()

    def _initialize_index(self):
        """Initialize FAISS index with GPU support if available"""
        try:
            # Try to load existing index
            index = faiss.read_index(f"{self.db_path}.faiss")
            logger.info(f"Loaded existing FAISS index with {index.ntotal} vectors")

            # Move to GPU if available and not already on GPU
            if self.gpu_available and not isinstance(index, faiss.GpuIndex):
                try:
                    logger.info("Moving index to GPU for faster search")
                    gpu_index = faiss.index_cpu_to_gpu(self.gpu_resource, 0, index)
                    return gpu_index
                except Exception as e:
                    logger.warning(f"Could not move index to GPU: {e}. Using CPU index.")
                    return index
            return index

        except FileNotFoundError:
            # Create new index
            logger.info("Creating new FAISS index")

            if self.gpu_available:
                try:
                    # Create GPU index
                    cpu_index = faiss.IndexFlatIP(self.dimension)
                    gpu_index = faiss.index_cpu_to_gpu(self.gpu_resource, 0, cpu_index)
                    logger.info("✓ Created GPU-accelerated FAISS index")
                    return gpu_index
                except Exception as e:
                    logger.warning(f"Could not create GPU index: {e}. Creating CPU index.")
                    self.gpu_available = False

            # Create CPU index
            index = faiss.IndexFlatIP(self.dimension)
            logger.info("Created CPU-based FAISS index")
            return index

    async def store_embedding(self, session_id: str, text: str, embedding: list):
        """Store embedding with session context"""
        # Convert to numpy array
        vector = np.array([embedding], dtype=np.float32)

        # Ensure vector is on correct device
        if self.gpu_available and isinstance(self.index, faiss.GpuIndex):
            # GPU index handles device automatically
            self.index.add(vector)
        else:
            # CPU index
            self.index.add(vector)

        # Store metadata in SQLite
        await self._store_metadata(session_id, text, self.index.ntotal - 1)

    async def search_similar(self, query_embedding: list, k: int = 5) -> list:
        """
        Search for similar embeddings (GPU-accelerated if available)

        Args:
            query_embedding: Query embedding vector
            k: Number of results to return

        Returns:
            List of similar results
        """
        vector = np.array([query_embedding], dtype=np.float32)

        # Search (automatically uses GPU if index is on GPU)
        distances, indices = self.index.search(vector, k)

        # Retrieve metadata for results

@@ -57,12 +130,45 @@ class FAISSLiteManager:

    def save_index(self):
        """
        Save the FAISS index to disk
        Note: GPU indices are moved to CPU before saving
        """
        try:
            if isinstance(self.index, faiss.GpuIndex):
                # Move GPU index to CPU for saving
                logger.info("Moving index from GPU to CPU for saving")
                cpu_index = faiss.index_gpu_to_cpu(self.index)
                faiss.write_index(cpu_index, f"{self.db_path}.faiss")
            else:
                # Save CPU index directly
                faiss.write_index(self.index, f"{self.db_path}.faiss")
            logger.info("✓ FAISS index saved successfully")
        except Exception as e:
            logger.error(f"Error saving FAISS index: {e}", exc_info=True)

    def get_index_size(self) -> int:
        """
        Get the number of vectors in the index
        """
        return self.index.ntotal

    def get_gpu_status(self) -> dict:
        """
        Get GPU status information

        Returns:
            Dictionary with GPU availability and index type
        """
        return {
            "gpu_available": self.gpu_available,
            "index_type": "GPU" if isinstance(self.index, faiss.GpuIndex) else "CPU",
            "index_size": self.index.ntotal,
            "dimension": self.dimension
        }

    def __del__(self):
        """Cleanup GPU resources"""
        if self.gpu_resource is not None:
            try:
                del self.gpu_resource
            except:
                pass
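For context, a minimal usage sketch of the new manager (hypothetical: the db path, session id, and SentenceTransformer encoder below are illustrative and not part of this commit; `_store_metadata` and the SQLite layer are assumed to exist as before):

```python
# Hypothetical usage sketch, not part of the commit. Paths, ids, and the
# encoder are illustrative assumptions; only the FAISSLiteManager API is real.
import asyncio
from sentence_transformers import SentenceTransformer

from faiss_manager import FAISSLiteManager


async def demo():
    manager = FAISSLiteManager(db_path="data/memory", use_gpu=True)
    print(manager.get_gpu_status())  # e.g. {'gpu_available': True, 'index_type': 'GPU', ...}

    # all-MiniLM-L6-v2 produces 384-dim vectors, matching manager.dimension
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    embedding = encoder.encode("GPU-accelerated vector search").tolist()

    await manager.store_embedding("session-1", "GPU-accelerated vector search", embedding)
    results = await manager.search_similar(embedding, k=3)
    manager.save_index()


asyncio.run(demo())
```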
requirements.txt

@@ -19,7 +19,8 @@ tokenizers>=0.15.0

sentence-transformers>=2.2.0

# Vector Database & Search
# Use faiss-gpu for GPU-accelerated vector search (falls back to CPU if GPU unavailable)
faiss-gpu>=1.7.4
numpy>=1.24.0
scipy>=1.11.0
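A quick sanity check that the installed faiss build actually exposes GPU support (a sketch; `get_num_gpus` is only present in GPU-enabled builds, hence the `hasattr` guard):

```python
# Sketch: report how many GPUs the installed faiss build can see (0 on CPU-only builds).
import faiss

num_gpus = faiss.get_num_gpus() if hasattr(faiss, "get_num_gpus") else 0
print(f"faiss GPUs visible: {num_gpus}")
```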
src/llm_router.py

@@ -84,18 +84,16 @@ class LLMRouter:

            logger.warning(f"Could not initialize ZeroGPU client: {e}. Falling back to HF API.")
            self.use_zero_gpu = False

        # Initialize local model loader if enabled (but don't load models yet - lazy loading)
        if self.use_local_models:
            try:
                from .local_model_loader import LocalModelLoader
                # Initialize loader but don't load models yet
                self.local_loader = LocalModelLoader()
                logger.info("✓ Local model loader initialized (models will load on-demand as fallback)")
                logger.info("Models will only load if ZeroGPU API fails")
            except Exception as e:
                logger.warning(f"Could not initialize local model loader: {e}. Local fallback unavailable.")
                logger.warning("This is normal if transformers/torch not available")
                self.use_local_models = False
                self.local_loader = None

@@ -103,7 +101,7 @@ class LLMRouter:

    async def route_inference(self, task_type: str, prompt: str, context: Optional[List[Dict]] = None, user_id: Optional[str] = None, **kwargs):
        """
        Smart routing based on task specialization
        Tries ZeroGPU API first, then local models as fallback (lazy loading), then HF Inference API

        Args:
            task_type: Task type (e.g., "intent_classification", "general_reasoning")

@@ -116,39 +114,40 @@ class LLMRouter:

        model_config = self._select_model(task_type)
        logger.info(f"Selected model: {model_config['model_id']}")

        # Try ZeroGPU API first (primary path)
        if self.use_zero_gpu:
            try:
                result = await self._call_zero_gpu_endpoint(task_type, prompt, context, user_id, **kwargs)
                if result is not None:
                    logger.info(f"Inference complete for {task_type} (ZeroGPU API)")
                    return result
                else:
                    logger.warning("ZeroGPU API returned None, falling back to local models")
            except Exception as e:
                logger.warning(f"ZeroGPU API inference failed: {e}. Falling back to local models.")
                logger.debug("Exception details:", exc_info=True)

        # Fallback to local models (lazy loading - only if ZeroGPU fails)
        if self.use_local_models and self.local_loader:
            try:
                logger.info("ZeroGPU API unavailable, loading local model as fallback...")
                # Handle embedding generation separately
                if task_type == "embedding_generation":
                    result = await self._call_local_embedding(model_config, prompt, **kwargs)
                else:
                    result = await self._call_local_model(model_config, prompt, task_type, **kwargs)

                if result is not None:
                    logger.info(f"Inference complete for {task_type} (local model fallback)")
                    return result
                else:
                    logger.warning("Local model returned None, falling back to HF API")
            except Exception as e:
                logger.warning(f"Local model inference failed: {e}. Falling back to HF API.")
                logger.debug("Exception details:", exc_info=True)

        # Final fallback to HF Inference API
        logger.info("Using HF Inference API as final fallback")
        # Health check and fallback logic
        if not await self._is_model_healthy(model_config["model_id"]):
            logger.warning(f"Model unhealthy, using fallback")

@@ -160,7 +159,7 @@ class LLMRouter:

        return result

    async def _call_local_model(self, model_config: dict, prompt: str, task_type: str, **kwargs) -> Optional[str]:
        """Call local model for inference (lazy loading - only used as fallback)."""
        if not self.local_loader:
            return None

@@ -169,9 +168,9 @@ class LLMRouter:

        temperature = kwargs.get('temperature', 0.7)

        try:
            # Ensure model is loaded (lazy loading on first use)
            if model_id not in self.local_loader.loaded_models:
                logger.info(f"Lazy loading local model {model_id} as fallback (ZeroGPU unavailable)")
                self.local_loader.load_chat_model(model_id, load_in_8bit=False)

            # Format as chat messages if needed

@@ -208,16 +207,16 @@ class LLMRouter:

            return None

    async def _call_local_embedding(self, model_config: dict, text: str, **kwargs) -> Optional[list]:
        """Call local embedding model (lazy loading - only used as fallback)."""
        if not self.local_loader:
            return None

        model_id = model_config["model_id"]

        try:
            # Ensure model is loaded (lazy loading on first use)
            if model_id not in self.local_loader.loaded_embedding_models:
                logger.info(f"Lazy loading local embedding model {model_id} as fallback (ZeroGPU unavailable)")
                self.local_loader.load_embedding_model(model_id)

            # Generate embedding