feat: Add ZeroGPU Chat API integration
- Add ZeroGPU API client (zero_gpu_client.py) with JWT authentication and auto-refresh
- Update LLM router to support ZeroGPU API as inference provider
- Add ZeroGPU configuration to config.py (enabled via USE_ZERO_GPU env var)
- Add task type mapping for ZeroGPU API (general, reasoning, classification, embedding)
- Update app.py and flask_api_standalone.py to pass ZeroGPU config to LLM router
- Implement fallback chain: Local models -> ZeroGPU API -> HF Inference API
- Add comprehensive integration review documentation
The ZeroGPU API provides:
- Built-in user management and authentication
- Comprehensive server-side logging and audit trail
- Task-based routing (general, reasoning, classification, embedding)
- Rich metadata (tokens, timing, quality metrics)
- Rate limiting and security features
Configuration:
- Set USE_ZERO_GPU=true to enable
- Set ZERO_GPU_API_URL, ZERO_GPU_EMAIL, ZERO_GPU_PASSWORD
- Falls back to HF API if ZeroGPU unavailable or disabled
- ZEROGPU_API_INTEGRATION_REVIEW.md +829 -0
- ZEROGPU_API_REVIEW_SUMMARY.md +179 -0
- app.py +16 -1
- config.py +6 -0
- flask_api_standalone.py +16 -1
- src/llm_router.py +159 -5
- src/models_config.py +7 -0
- zero_gpu_client.py +219 -0
**ZEROGPU_API_INTEGRATION_REVIEW.md** (new file, +829 lines)
# ZeroGPU Chat API Integration Review

**Date:** 2025-01-07
**Reviewer:** AI Assistant
**Purpose:** Comprehensive review of the ZeroGPU Chat API documentation for replacing the HF/Novita Inference endpoints

---

## Executive Summary

The ZeroGPU Chat API provides a comprehensive replacement for the Hugging Face Inference API, with significant advantages:

- ✅ **Built-in user management and authentication** (JWT-based)
- ✅ **Comprehensive audit logging** (all requests logged server-side)
- ✅ **Multi-task support** (general, reasoning, classification, embedding)
- ✅ **Rate limiting and security features**
- ✅ **Better integration patterns** for multi-agent systems

**Key Integration Points:**

1. Replace the HF endpoint calls in `llm_router.py` with the ZeroGPU `/chat` endpoint
2. Implement the JWT authentication flow (login → access token → refresh)
3. Map current task types to ZeroGPU task types
4. Leverage the API's built-in logging instead of local logging
5. Update user management to use the API's user system, or maintain a dual system

---

## 1. API Documentation Review

### 1.1 Endpoint Comparison

#### Current System (HF Inference API)

```python
# Current: llm_router.py
api_url = "https://router.huggingface.co/v1/chat/completions"
headers = {"Authorization": f"Bearer {self.hf_token}"}
payload = {
    "model": model_id,
    "messages": [{"role": "user", "content": prompt}],
    "max_tokens": max_tokens,
    "temperature": temperature
}
```

#### ZeroGPU Chat API

```python
# New: ZeroGPU API
api_url = "http://your-pod-ip:8000/chat"
headers = {"Authorization": f"Bearer {access_token}"}
payload = {
    "message": prompt,
    "task": "general",  # or "reasoning", "classification", "embedding"
    "context": [...],   # Optional conversation history
    "max_tokens": max_tokens,
    "temperature": temperature,
    "system_prompt": "..."
}
```

**Key Differences:**

- ✅ **Task-based routing** instead of model selection
- ✅ **Context support** built in (conversation history)
- ✅ **System prompts** supported natively
- ✅ **Authentication** via JWT tokens (not API keys)
- ⚠️ **Different payload structure** (a single `message` plus `context` vs. a `messages` array); a conversion sketch follows below

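To make the payload difference concrete, here is a minimal sketch of converting an OpenAI-style `messages` array into the ZeroGPU `message` + `context` shape. The split of "last user message becomes `message`, everything before it becomes `context`" is an assumption about intended usage, not something the excerpted documentation states explicitly:

```python
from typing import Any, Dict, List

def messages_to_zero_gpu_payload(messages: List[Dict[str, str]],
                                 **gen_params: Any) -> Dict[str, Any]:
    """Convert an OpenAI-style messages array to a ZeroGPU /chat payload.

    Assumption: the final user message becomes `message` and all prior
    messages are passed as `context` (conversation history).
    """
    if not messages:
        raise ValueError("messages must not be empty")
    *history, last = messages
    payload: Dict[str, Any] = {
        "message": last["content"],
        "task": "general",
        **gen_params,  # e.g. max_tokens, temperature
    }
    if history:
        payload["context"] = [
            {"role": m.get("role", "user"), "content": m.get("content", "")}
            for m in history
        ]
    return payload
```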
### 1.2 Task Type Mapping

**Current System Task Types:**

```python
# From models_config.py and llm_router.py
task_types = {
    "intent_classification": "classification_specialist",
    "embedding_generation": "embedding_specialist",
    "safety_check": "safety_checker",
    "general_reasoning": "reasoning_primary",
    "response_synthesis": "reasoning_primary"
}
```

**ZeroGPU Task Types:**

```python
# From the ZeroGPU API documentation
zero_gpu_tasks = {
    "general": "General-purpose chat and Q&A",
    "reasoning": "Complex reasoning and problem-solving",
    "classification": "Text classification tasks",
    "embedding": "Text embeddings (vector representations)"
}
```

**Recommended Mapping:**

```python
TASK_MAPPING = {
    "intent_classification": "classification",
    "embedding_generation": "embedding",
    "safety_check": "general",        # Or create a custom safety endpoint
    "general_reasoning": "reasoning",
    "response_synthesis": "general"   # Or "reasoning" for complex synthesis
}
```

### 1.3 Authentication Flow

**Current System:**
- Uses the HF token directly in headers
- No user management
- No token refresh needed

**ZeroGPU API:**
- Requires user registration/login
- JWT access tokens (15-minute expiry)
- Refresh tokens (7-day expiry)
- User approval workflow

**Integration Strategy** (a login/refresh sketch follows below):

1. **Option A: Service Account** (recommended for single-tenant)
   - Create one service account for the application
   - Use that account for all API calls
   - Simpler, but all usage is tracked under one user

2. **Option B: Per-User Accounts** (multi-tenant)
   - Map each application user to a ZeroGPU user
   - Track usage per user
   - More complex, but better for multi-tenant scenarios

3. **Option C: Hybrid** (recommended for migration)
   - Use a service account initially
   - Migrate to per-user accounts gradually
   - Maintain a user mapping table

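Given the 15-minute access-token expiry, it is worth refreshing proactively rather than only reacting to 401s. A minimal sketch, reusing the `/login` and `/refresh` endpoints shown in the client of section 4.1.1; the 60-second safety margin is an arbitrary choice, not from the docs:

```python
import time
import requests

class TokenManager:
    """Keeps a ZeroGPU access token fresh by refreshing shortly before expiry."""

    ACCESS_TTL = 15 * 60   # 15-minute access-token expiry per the API docs
    MARGIN = 60            # refresh 60s early (assumption, not from the docs)

    def __init__(self, base_url: str, email: str, password: str):
        self.base_url = base_url.rstrip("/")
        resp = requests.post(f"{self.base_url}/login",
                             json={"email": email, "password": password},
                             timeout=30)
        resp.raise_for_status()
        data = resp.json()
        self.access_token = data["access_token"]
        self.refresh_token = data["refresh_token"]
        self.expires_at = time.time() + self.ACCESS_TTL

    def get_token(self) -> str:
        """Return a valid access token, refreshing it if close to expiry."""
        if time.time() >= self.expires_at - self.MARGIN:
            resp = requests.post(f"{self.base_url}/refresh",
                                 headers={"X-Refresh-Token": self.refresh_token},
                                 timeout=30)
            resp.raise_for_status()
            data = resp.json()
            self.access_token = data["access_token"]
            self.refresh_token = data["refresh_token"]
            self.expires_at = time.time() + self.ACCESS_TTL
        return self.access_token
```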
### 1.4 Response Structure Comparison

**Current HF API Response:**

```python
{
    "choices": [{
        "message": {
            "content": "response text"
        }
    }]
}
```

**ZeroGPU API Response:**

```python
{
    "response": "response text",
    "task": "general",
    "model_used": "mistralai/Mistral-7B-Instruct-v0.2",
    "tokens_used": {
        "input": 15,
        "output": 8,
        "total": 23
    },
    "inference_metrics": {
        "inference_duration": 0.45,
        "total_duration": 0.52,
        "tokens_per_second": 17.78
    },
    "confidence_scores": {...},
    "quality_metrics": {...},
    "performance_metrics": {...},
    "audit_info": {
        "timestamp": "2024-01-01T12:00:00",
        "user_id": 1,
        "model_name": "...",
        "task": "general",
        "generation_parameters": {...},
        "compliance": {
            "logged": true,
            "retention_days": 90,
            "audit_enabled": true
        }
    }
}
```

**Advantages** (a defensive-parsing sketch follows):

- ✅ **Rich metadata** (tokens, timing, quality metrics)
- ✅ **Audit trail** built in
- ✅ **Performance metrics** included
- ✅ **Compliance information** for logging

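Since the response carries several metadata blocks, callers should extract defensively rather than assume every field is present. A minimal sketch using the field names from the structure above; which blocks are optional is an assumption:

```python
import logging
from typing import Any, Dict, Optional

logger = logging.getLogger(__name__)

def extract_response(payload: Dict[str, Any]) -> Optional[str]:
    """Pull the generated text and log-worthy metadata out of a /chat reply."""
    text = payload.get("response")
    if not text or not text.strip():
        return None  # treat empty output as failure so callers can fall back
    tokens = payload.get("tokens_used", {})
    metrics = payload.get("inference_metrics", {})
    logger.info("total tokens: %s, duration: %.2fs",
                tokens.get("total", 0), metrics.get("inference_duration", 0.0))
    return text
```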
---

## 2. Data Storage Analysis

### 2.1 Current System Data Storage

**Database Schema (from database_schema.sql and context_manager.py):**

```sql
-- Sessions
CREATE TABLE sessions (
    session_id TEXT PRIMARY KEY,
    user_id TEXT DEFAULT 'Test_Any',
    created_at TIMESTAMP,
    last_activity TIMESTAMP,
    context_data TEXT,
    user_metadata TEXT
);

-- Interactions
CREATE TABLE interactions (
    interaction_id TEXT PRIMARY KEY,
    session_id TEXT REFERENCES sessions(session_id),
    user_input TEXT,
    context_snapshot TEXT,
    created_at TIMESTAMP
);

-- User Contexts
CREATE TABLE user_contexts (
    user_id TEXT PRIMARY KEY,
    persona_summary TEXT,
    updated_at TIMESTAMP
);
```

**Current Logging:**
- Application-level logging to files/console
- Database storage for sessions/interactions
- No centralized audit trail
- No built-in compliance logging

### 2.2 ZeroGPU API Data Storage

**API-Side Storage (from the documentation):**

The API provides comprehensive server-side logging:
- ✅ **All inference requests logged** with a full audit trail
- ✅ **User activity tracking** (usage stats endpoint)
- ✅ **Request/response logging** with timestamps
- ✅ **Compliance logging** (90-day retention mentioned)
- ✅ **Performance metrics** stored
- ✅ **Token usage tracking** per user

**What the API Stores:**

1. **User Accounts** (email, mobile, approval status)
2. **Inference Logs** (all `/chat` requests)
   - User ID
   - Timestamp
   - Model used
   - Task type
   - Generation parameters
   - Tokens used
   - Performance metrics
   - Request/response content (likely)
3. **Usage Statistics** (aggregated per user)
   - Total requests
   - Total tokens
   - Requests by task
   - Average inference time

**What Your System Should Still Store:**

1. **Session Management** (conversation continuity)
   - Session IDs
   - Conversation history
   - Context summaries
2. **User Preferences** (application-specific)
   - UI preferences
   - Response speed settings
   - Context mode preferences
3. **Application State** (non-API data)
   - Agent traces
   - Reasoning chains
   - Custom metadata

### 2.3 Data Synchronization Strategy

**Recommended Approach:**

1. **Dual Storage Pattern:**

   ```
   Application DB (SQLite)        ZeroGPU API
   ├── Sessions                   ├── User Accounts
   ├── Interactions               ├── Inference Logs
   ├── User Contexts              ├── Usage Statistics
   └── User Preferences           └── Audit Trail
   ```

2. **Data Flow:**
   - **Read from API:** user info, usage stats
   - **Write to API:** inference requests (auto-logged)
   - **Read from local DB:** session history, preferences
   - **Write to local DB:** session management, app state

3. **Migration Considerations** (a mapping-table sketch follows below):
   - **User IDs:** the API generates its own user IDs
   - **Email as Key:** use email for user lookups (stable identifier)
   - **Session Mapping:** maintain a mapping: `local_session_id → api_user_id`
   - **Historical Data:** keep existing sessions in the local DB

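A minimal sketch of that mapping layer, using the optional `api_user_mapping` table proposed in section 4.2; the helper names are illustrative, not part of the existing codebase:

```python
import sqlite3
from typing import Optional

def save_user_mapping(db_path: str, local_user_id: str,
                      api_user_id: int, api_email: str) -> None:
    """Record which ZeroGPU account a local user corresponds to."""
    with sqlite3.connect(db_path) as conn:
        conn.execute(
            """INSERT INTO api_user_mapping (local_user_id, api_user_id, api_email)
               VALUES (?, ?, ?)
               ON CONFLICT(local_user_id) DO UPDATE
               SET api_user_id = excluded.api_user_id,
                   api_email = excluded.api_email""",
            (local_user_id, api_user_id, api_email),
        )

def lookup_api_user(db_path: str, local_user_id: str) -> Optional[int]:
    """Resolve a local user ID to its ZeroGPU user ID, if mapped."""
    with sqlite3.connect(db_path) as conn:
        row = conn.execute(
            "SELECT api_user_id FROM api_user_mapping WHERE local_user_id = ?",
            (local_user_id,),
        ).fetchone()
    return row[0] if row else None
```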
---

## 3. User Logging Capabilities Analysis

### 3.1 API-Provided Logging Features

#### 3.1.1 Automatic Request Logging

**Every `/chat` request is automatically logged with:**

- ✅ User ID
- ✅ Timestamp
- ✅ Model name
- ✅ Task type
- ✅ Generation parameters (max_tokens, temperature, etc.)
- ✅ Context information (has_context, context_messages count)
- ✅ Compliance flags (logged, retention_days, audit_enabled)

**From API Response:**

```json
"audit_info": {
    "timestamp": "2024-01-01T12:00:00",
    "user_id": 1,
    "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
    "task": "general",
    "generation_parameters": {
        "max_tokens": 512,
        "temperature": 0.7,
        "has_context": true,
        "context_messages": 2
    },
    "compliance": {
        "logged": true,
        "retention_days": 90,
        "audit_enabled": true
    }
}
```

#### 3.1.2 Usage Statistics Endpoint

**`GET /usage/stats` provides aggregated logging:**

```json
{
    "user_id": 1,
    "period_days": 30,
    "total_requests": 150,
    "total_tokens": 45000,
    "total_inference_time": 125.5,
    "requests_by_task": {
        "general": 100,
        "reasoning": 30,
        "classification": 20
    },
    "tokens_by_task": {
        "general": 30000,
        "reasoning": 10000,
        "classification": 5000
    },
    "average_tokens_per_request": 300,
    "average_inference_time": 0.84
}
```

**Capabilities** (a fetch sketch follows below):

- ✅ **Per-user statistics** (requires authentication)
- ✅ **Time-period filtering** (days parameter)
- ✅ **Task breakdown** (requests and tokens by task)
- ✅ **Performance metrics** (average inference time)
- ✅ **Token usage tracking** (input/output/total)

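The hybrid-logging code in section 3.3 calls a `get_usage_stats()` helper that the client in section 4.1.1 does not yet define. A minimal sketch against the endpoint above; passing the period as a `days` query parameter is an assumption based on the "Time-period filtering (days parameter)" note:

```python
import requests

def get_usage_stats(base_url: str, access_token: str, days: int = 30) -> dict:
    """Fetch aggregated per-user usage statistics from GET /usage/stats."""
    response = requests.get(
        f"{base_url.rstrip('/')}/usage/stats",
        headers={"Authorization": f"Bearer {access_token}"},
        params={"days": days},  # assumed query-parameter name
        timeout=30,
    )
    response.raise_for_status()
    return response.json()  # e.g. total_requests, total_tokens, requests_by_task
```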
#### 3.1.3 Admin Logging Endpoints

**Admin endpoints provide additional logging:**

- `GET /admin/all-users` - all user accounts
- `GET /admin/pending-users` - pending approvals
- User approval/deactivation actions are logged

#### 3.1.4 Rate Limiting Headers

**Every response includes rate-limit information:**

```
X-RateLimit-Limit: 60
X-RateLimit-Remaining: 45
X-RateLimit-Reset: 1704067200
```

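Clients can use these headers to throttle themselves before the server starts rejecting requests. A minimal sketch; treating `X-RateLimit-Reset` as a Unix timestamp is an assumption its value above suggests, but the excerpted docs do not state outright:

```python
import time
import requests

def post_with_rate_limit(url: str, headers: dict, payload: dict) -> requests.Response:
    """POST, and if the rate-limit budget is exhausted, sleep until the reset time."""
    response = requests.post(url, json=payload, headers=headers, timeout=30)
    remaining = int(response.headers.get("X-RateLimit-Remaining", 1))
    if remaining == 0:
        reset_at = float(response.headers.get("X-RateLimit-Reset", 0))
        wait = max(0.0, reset_at - time.time())
        time.sleep(wait)  # pause until the window resets before the next call
    return response
```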
### 3.2 What's NOT Logged by the API (Needs Local Logging)

**The API does NOT provide:**

1. **Request/Response Content** (message text, response text)
   - The API logs metadata but may not store full content
   - **Action:** continue local logging for full conversation history
2. **Agent Traces** (your multi-agent system specifics)
   - The API doesn't know about your agent architecture
   - **Action:** keep agent trace logging in the local DB
3. **Reasoning Chains** (chain of thought)
   - Application-specific reasoning data
   - **Action:** store in the local interactions table
4. **Context Summaries** (user persona, session context)
   - Application-level context management
   - **Action:** continue using the local context_manager

### 3.3 Recommended Logging Strategy

**Hybrid Logging Approach** (the `local_db` helpers below are illustrative):

```python
# 1. The API handles inference logging (automatic)
response = zero_gpu_client.chat(
    message=user_input,
    task="general",
    context=conversation_context
)
# The API automatically logs: user_id, timestamp, model, task, params, metrics

# 2. The application handles application-specific logging
local_db.save_interaction(
    session_id=session_id,
    user_input=user_input,
    response=response["response"],
    agent_trace=agent_trace,               # Your system's agent data
    reasoning_data=reasoning_data,         # Your system's reasoning
    api_audit_info=response["audit_info"]  # Link to the API log
)

# 3. Periodic sync for usage stats
usage_stats = zero_gpu_client.get_usage_stats(days=30)
local_db.update_usage_cache(user_id, usage_stats)
```

**Benefits:**

- ✅ **The API handles compliance** (audit trail, retention)
- ✅ **The application handles context** (conversation continuity)
- ✅ **Reduced local logging** (no need to log inference details)
- ✅ **Better separation** (API concerns vs. application concerns)

---

## 4. Integration Requirements

### 4.1 Code Changes Required

#### 4.1.1 Create a ZeroGPU API Client

**New file: `zero_gpu_client.py`**

```python
import requests
from typing import Dict, Any

class ZeroGPUChatClient:
    def __init__(self, base_url: str, email: str, password: str):
        self.base_url = base_url.rstrip('/')
        self.access_token = None
        self.refresh_token = None
        self.login(email, password)

    def login(self, email: str, password: str):
        """Log in and store the access/refresh token pair."""
        response = requests.post(
            f"{self.base_url}/login",
            json={"email": email, "password": password},
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
        self.access_token = data["access_token"]
        self.refresh_token = data["refresh_token"]

    def refresh_access_token(self):
        """Exchange the refresh token for a new access token."""
        response = requests.post(
            f"{self.base_url}/refresh",
            headers={"X-Refresh-Token": self.refresh_token},
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
        self.access_token = data["access_token"]
        self.refresh_token = data["refresh_token"]

    def chat(self, message: str, task: str = "general", **kwargs) -> Dict[str, Any]:
        """Send a chat message, refreshing and retrying once on 401."""
        url = f"{self.base_url}/chat"
        headers = {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json"
        }

        payload = {
            "message": message,
            "task": task,
            **kwargs
        }

        response = requests.post(url, json=payload, headers=headers, timeout=60)

        if response.status_code == 401:
            # Token expired: refresh and retry once
            self.refresh_access_token()
            headers["Authorization"] = f"Bearer {self.access_token}"
            response = requests.post(url, json=payload, headers=headers, timeout=60)

        response.raise_for_status()
        return response.json()
```

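The router changes below call `zero_gpu_client.wait_for_ready(timeout=10)`, and the retry config in 4.1.3 has `wait_for_ready`/`ready_timeout` knobs, but the client above does not define that method. A minimal sketch of the missing method, to be attached to `ZeroGPUChatClient`; the `/health` path and its response shape are assumptions, so adjust to whatever readiness endpoint the API actually exposes:

```python
import time
import requests

def wait_for_ready(self, timeout: int = 300, interval: float = 2.0) -> bool:
    """Poll the API until it answers, or give up after `timeout` seconds.

    Assumes an unauthenticated health endpoint at /health; replace with
    the real readiness route from the ZeroGPU API documentation.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            response = requests.get(f"{self.base_url}/health", timeout=5)
            if response.status_code == 200:
                return True
        except requests.RequestException:
            pass  # API not reachable yet; keep polling
        time.sleep(interval)
    return False
```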
#### 4.1.2 Update the LLM Router

**Modify: `llm_router.py` or `src/llm_router.py`**

**Current:**

```python
async def _call_hf_endpoint(self, model_config: dict, prompt: str, task_type: str, **kwargs):
    api_url = "https://router.huggingface.co/v1/chat/completions"
    # ... HF API call
```

**New:**

```python
from datetime import datetime   # add at the top of the module
from typing import Dict, List   # add at the top of the module

async def _call_zero_gpu_endpoint(self, task_type: str, prompt: str, context: List[Dict] = None, **kwargs):
    # Map the task type to a ZeroGPU task
    task_mapping = {
        "intent_classification": "classification",
        "embedding_generation": "embedding",
        "general_reasoning": "reasoning",
        "response_synthesis": "general"
    }
    zero_gpu_task = task_mapping.get(task_type, "general")

    # Prepare context if provided
    context_messages = None
    if context:
        context_messages = [
            {
                "role": msg.get("role", "user"),
                "content": msg.get("content", ""),
                "timestamp": msg.get("timestamp", datetime.utcnow().isoformat())
            }
            for msg in context
        ]

    # Call the ZeroGPU API
    response = self.zero_gpu_client.chat(
        message=prompt,
        task=zero_gpu_task,
        context=context_messages,
        max_tokens=kwargs.get('max_tokens', 512),
        temperature=kwargs.get('temperature', 0.7),
        **{k: v for k, v in kwargs.items() if k not in ['max_tokens', 'temperature']}
    )

    return response["response"]
```

#### 4.1.3 Update Configuration

**Modify: `config.py` or create `zero_gpu_config.py`**

```python
import os

ZERO_GPU_CONFIG = {
    "base_url": os.getenv("ZERO_GPU_API_URL", "http://your-pod-ip:8000"),
    "service_account": {
        "email": os.getenv("ZERO_GPU_EMAIL", "service@example.com"),
        "password": os.getenv("ZERO_GPU_PASSWORD", "")
    },
    "task_mapping": {
        "intent_classification": "classification",
        "embedding_generation": "embedding",
        "general_reasoning": "reasoning",
        "response_synthesis": "general",
        "safety_check": "general"
    },
    "retry_config": {
        "max_retries": 3,
        "timeout": 30,
        "wait_for_ready": True,
        "ready_timeout": 300
    }
}
```

### 4.2 Database Schema Updates

**No schema changes are required**, but consider adding:

```sql
-- Optional: track API user mapping
CREATE TABLE IF NOT EXISTS api_user_mapping (
    local_user_id TEXT PRIMARY KEY,
    api_user_id INTEGER,
    api_email TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Optional: cache usage stats
CREATE TABLE IF NOT EXISTS api_usage_cache (
    user_id TEXT PRIMARY KEY,
    stats_json TEXT,
    last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
```

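A minimal sketch of the `update_usage_cache` step referenced in section 3.3, writing into the optional `api_usage_cache` table above; the function name mirrors the earlier pseudocode and is not an existing helper:

```python
import json
import sqlite3

def update_usage_cache(db_path: str, user_id: str, stats: dict) -> None:
    """Cache the latest /usage/stats payload for a user as JSON."""
    with sqlite3.connect(db_path) as conn:
        conn.execute(
            """INSERT INTO api_usage_cache (user_id, stats_json, last_updated)
               VALUES (?, ?, CURRENT_TIMESTAMP)
               ON CONFLICT(user_id) DO UPDATE
               SET stats_json = excluded.stats_json,
                   last_updated = CURRENT_TIMESTAMP""",
            (user_id, json.dumps(stats)),
        )
```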
### 4.3 Environment Variables

**Add to `.env` or the environment:**

```bash
# ZeroGPU API configuration
ZERO_GPU_API_URL=http://your-pod-ip:8000
ZERO_GPU_EMAIL=service@example.com
ZERO_GPU_PASSWORD=your-secure-password

# Optional: fall back to HF if ZeroGPU is unavailable
USE_ZERO_GPU=true
HF_TOKEN=your-hf-token  # Keep as fallback
```

---

## 5. Migration Plan

### 5.1 Phase 1: Setup and Testing (Week 1)
1. Set up a ZeroGPU API instance
2. Create a service account
3. Implement the ZeroGPU client
4. Test the authentication flow
5. Test the basic chat endpoint
6. Verify logging works

### 5.2 Phase 2: Integration (Week 2)
1. Update the LLM router to use ZeroGPU
2. Implement task mapping
3. Add context support
4. Update error handling
5. Test all task types
6. Verify fallback logic

### 5.3 Phase 3: User Management (Week 3)
1. Decide on a user strategy (service account vs. per-user)
2. Implement user mapping if needed
3. Update the user creation flow
4. Test the user approval workflow
5. Migrate existing users (if applicable)

### 5.4 Phase 4: Logging Integration (Week 4)
1. Reduce local inference logging (rely on the API)
2. Keep application-specific logging
3. Implement usage stats sync
4. Test audit trail access
5. Verify compliance requirements

### 5.5 Phase 5: Production Deployment (Week 5)
1. Deploy to staging
2. Load testing
3. Monitor API usage
4. Verify logging completeness
5. Deploy to production
6. Monitor and optimize

---

## 6. Advantages and Considerations

### 6.1 Advantages

1. **Built-in User Management**
   - No need to manage user accounts separately
   - JWT authentication is an industry standard
   - User approval workflow built in

2. **Comprehensive Logging**
   - All requests automatically logged
   - Audit trail for compliance
   - Usage statistics readily available
   - Reduced local logging overhead

3. **Better Task Routing**
   - Task-based instead of model-based
   - The API handles model selection
   - Simpler configuration

4. **Rich Metadata**
   - Performance metrics included
   - Quality scores provided
   - Token usage tracked
   - Inference timing available

5. **Security Features**
   - Prompt injection detection
   - Rate limiting built in
   - Input validation
   - JWT token security

### 6.2 Considerations

1. **Authentication Complexity**
   - Tokens (access + refresh) must be managed
   - Token expiry handling required
   - More complex than an API key

2. **User Management Overhead**
   - User approval workflow (unless auto-approved)
   - User accounts must be maintained
   - Migration complexity if using per-user accounts

3. **API Dependency**
   - Single point of failure
   - Network dependency
   - A fallback strategy is needed

4. **Data Location**
   - Logs stored on the API server
   - The API provider must be trusted
   - May need a data export capability

5. **Cost Considerations**
   - May have a different pricing model
   - Usage tracking helps monitor costs
   - Rate limits may affect throughput

---

## 7. Recommendations

### 7.1 Immediate Actions

1. **✅ Start with a Service Account**
   - Simpler initial integration
   - Faster to implement
   - Can migrate to per-user accounts later

2. **✅ Keep Local Logging Initially**
   - Don't remove local logging immediately
   - Run dual logging during migration
   - Verify API logging completeness
   - Remove local logging after verification

3. **✅ Implement a Fallback**
   - Keep the HF API as a fallback
   - Handle API unavailability gracefully
   - Test fallback scenarios

4. **✅ Test Thoroughly**
   - Test all task types
   - Test the authentication flow
   - Test token refresh
   - Test error scenarios
   - Test rate limiting

### 7.2 Long-Term Strategy

1. **Migrate to Per-User Accounts** (if multi-tenant)
   - Better usage tracking
   - Per-user rate limits
   - Better audit trail

2. **Leverage API Logging**
   - Reduce local logging overhead
   - Use the API for compliance reporting
   - Sync usage stats periodically

3. **Optimize Context Management**
   - Use the API's context parameter
   - Reduce local context storage
   - Leverage the API's context validation

4. **Monitor and Optimize**
   - Track API usage patterns
   - Optimize the task mapping
   - Adjust rate limits if needed
   - Monitor costs

---

## 8. Testing Checklist

### 8.1 Authentication Testing
- [ ] User registration works
- [ ] Login returns valid tokens
- [ ] Token refresh works
- [ ] Expired tokens are handled
- [ ] Invalid tokens are rejected
- [ ] User approval workflow works

### 8.2 API Endpoint Testing
- [ ] `/chat` endpoint works for all task types
- [ ] Context parameter works correctly
- [ ] System prompts work
- [ ] Generation parameters are respected
- [ ] Error handling works
- [ ] Rate limiting works

### 8.3 Logging Verification
- [ ] All requests are logged in the API
- [ ] Usage stats are accurate
- [ ] Audit info is included in responses
- [ ] Token usage is tracked correctly
- [ ] Performance metrics are available

### 8.4 Integration Testing
- [ ] The LLM router uses ZeroGPU
- [ ] Task mapping is correct
- [ ] Context is passed correctly
- [ ] Error handling is graceful
- [ ] Fallback works if the API is unavailable
- [ ] Performance is acceptable

### 8.5 Production Readiness
- [ ] Load testing completed
- [ ] Monitoring in place
- [ ] Alerting configured
- [ ] Documentation updated
- [ ] Team trained
- [ ] Rollback plan ready

---

## 9. Conclusion

The ZeroGPU Chat API is a **strong replacement** for the Hugging Face Inference API, with significant advantages:

**✅ Recommended for Integration:**
- Better user management
- Comprehensive logging
- Rich metadata
- Security features
- Task-based routing

**⚠️ Requires Careful Planning:**
- Authentication complexity
- User management strategy
- Migration planning
- Fallback implementation

**📋 Next Steps:**
1. Review this document with the team
2. Set up a ZeroGPU API instance
3. Create a service account
4. Implement the client library
5. Test the integration
6. Plan the migration timeline

---

**Document Version:** 1.0
**Last Updated:** 2025-01-07
**Status:** Ready for Review
**ZEROGPU_API_REVIEW_SUMMARY.md** (new file, +179 lines)
# ZeroGPU Chat API - Quick Review Summary

## 🎯 Key Findings

### ✅ **API Documentation Quality: Excellent**
- Comprehensive documentation with clear examples
- Well-structured endpoint descriptions
- Good error-handling documentation
- Multi-agent integration guide included

### ✅ **Data Storage: Server-Side Logging**
The API provides **comprehensive server-side logging**:
- ✅ All inference requests automatically logged
- ✅ User activity tracking via the `/usage/stats` endpoint
- ✅ Audit trail with 90-day retention
- ✅ Performance metrics stored
- ✅ Token usage tracking per user

**What This Means:**
- You can **reduce local logging** for inference requests
- The API handles **compliance logging** automatically
- Usage statistics are available via an API endpoint
- You still need local storage for: sessions, agent traces, reasoning chains

### ✅ **User Logging Capabilities: Comprehensive**

#### Automatic Logging (Every Request)

```json
"audit_info": {
    "timestamp": "2024-01-01T12:00:00",
    "user_id": 1,
    "model_name": "...",
    "task": "general",
    "generation_parameters": {...},
    "compliance": {
        "logged": true,
        "retention_days": 90,
        "audit_enabled": true
    }
}
```

#### Usage Statistics Endpoint
- Per-user statistics
- Time-period filtering
- Task breakdown (requests/tokens by task)
- Performance metrics
- Token usage tracking

#### What's NOT Logged by the API
- Request/response content (full text) - may not be stored
- Agent traces (system-specific)
- Reasoning chains (application-specific)
- Context summaries (user persona)

**Recommendation:** Use hybrid logging - the API for inference logs, the local DB for application-specific data.

---

## 🔄 Integration Requirements

### 1. Replace HF Endpoint Calls
**Current:** `https://router.huggingface.co/v1/chat/completions`
**New:** `http://your-pod-ip:8000/chat`

### 2. Implement Authentication
- JWT-based (access token + refresh token)
- Token expiry handling required
- User approval workflow

### 3. Task Type Mapping

```python
TASK_MAPPING = {
    "intent_classification": "classification",
    "embedding_generation": "embedding",
    "general_reasoning": "reasoning",
    "response_synthesis": "general"
}
```

### 4. Update the LLM Router
- Replace `_call_hf_endpoint()` with `_call_zero_gpu_endpoint()`
- Add context parameter support
- Implement token refresh logic (a fallback-order sketch follows below)

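The intended routing order (local models → ZeroGPU → HF) condenses to a simple chain. A minimal sketch of that control flow; the backend callables are stand-ins for the router's real methods, not their actual signatures:

```python
from typing import Callable, List, Optional

def route_with_fallback(prompt: str,
                        backends: List[Callable[[str], Optional[str]]]) -> Optional[str]:
    """Try each backend in order; a None result or an exception falls through."""
    for backend in backends:
        try:
            result = backend(prompt)
            if result is not None:
                return result
        except Exception:
            continue  # this backend failed; try the next one
    return None

# Usage sketch: route_with_fallback(prompt, [call_local, call_zero_gpu, call_hf_api])
```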
---

## 📊 Data Storage Comparison

### Current System
- **Local SQLite:** sessions, interactions, user contexts
- **Local Logging:** application logs, inference logs
- **No centralized audit trail**

### With the ZeroGPU API
- **API Server:** user accounts, inference logs, usage stats, audit trail
- **Local SQLite:** sessions, agent traces, reasoning, preferences
- **Hybrid Approach:** the API handles compliance, local storage handles context

**Storage Strategy:**

```
API Handles:              Local DB Handles:
├── User accounts         ├── Session management
├── Inference logs        ├── Conversation history
├── Usage statistics      ├── Agent traces
├── Audit trail           ├── Reasoning chains
└── Token usage           └── User preferences
```

---

## ⚠️ Key Considerations

### Advantages
1. ✅ Built-in user management (JWT auth)
2. ✅ Comprehensive audit logging
3. ✅ Rich metadata (tokens, timing, quality)
4. ✅ Task-based routing (simpler config)
5. ✅ Security features (rate limiting, prompt injection detection)

### Challenges
1. ⚠️ Authentication complexity (tokens vs. API keys)
2. ⚠️ User management overhead (approval workflow)
3. ⚠️ API dependency (single point of failure)
4. ⚠️ Data location (logs on the API server)
5. ⚠️ Migration complexity (user mapping)

---

## 🚀 Recommended Approach

### Phase 1: Service Account (Start Here)
- Create one service account for the application
- Simpler initial integration
- All usage tracked under one user
- Can migrate to per-user accounts later

### Phase 2: Hybrid Logging
- The API handles inference logging (automatic)
- The local DB handles application-specific data
- Reduce local logging overhead
- Keep agent traces and reasoning locally

### Phase 3: Gradual Migration
- Start with the service account
- Test thoroughly
- Monitor API logging
- Migrate to per-user accounts if needed

---

## 📋 Action Items

1. **Review the integration plan** (`ZEROGPU_API_INTEGRATION_REVIEW.md`)
2. **Set up a ZeroGPU API instance**
3. **Create a service account**
4. **Implement the ZeroGPU client** (`zero_gpu_client.py`)
5. **Update the LLM router** (replace HF calls)
6. **Test the authentication flow**
7. **Test all task types**
8. **Verify logging works**
9. **Implement a fallback** (keep HF as backup)
10. **Deploy to staging**

---

## 📚 Documentation References

- **Full Review:** `ZEROGPU_API_INTEGRATION_REVIEW.md`
- **API Documentation:** the provided ZeroGPU API docs
- **Current API:** `API_QUICK_REFERENCE.md`
- **Current Implementation:** `llm_router.py`, `models_config.py`

---

**Status:** ✅ Ready for Integration
**Priority:** High (replaces HF/Novita endpoints)
**Complexity:** Medium (requires authentication and task mapping)
**app.py** (+16 −1)

```diff
@@ -2024,9 +2024,24 @@ def initialize_orchestrator():
     if not hf_token:
         logger.warning("HF_TOKEN not found in environment")
 
+    # Prepare ZeroGPU config if enabled
+    zero_gpu_config = None
+    try:
+        from config import settings
+        if settings.zero_gpu_enabled and settings.zero_gpu_email and settings.zero_gpu_password:
+            zero_gpu_config = {
+                "enabled": True,
+                "base_url": settings.zero_gpu_base_url,
+                "email": settings.zero_gpu_email,
+                "password": settings.zero_gpu_password
+            }
+            logger.info("ZeroGPU API enabled in configuration")
+    except Exception as e:
+        logger.debug(f"Could not load ZeroGPU config: {e}")
+
     # Initialize LLM Router
     logger.info("Step 1/6: Initializing LLM Router...")
-    llm_router = LLMRouter(hf_token)
+    llm_router = LLMRouter(hf_token, use_local_models=True, zero_gpu_config=zero_gpu_config)
     logger.info("✓ LLM Router initialized")
 
     # Initialize Agents
```
**config.py** (+6 −0)

```diff
@@ -36,6 +36,12 @@ class Settings(BaseSettings):
     log_level: str = os.getenv("LOG_LEVEL", "INFO")
     log_format: str = os.getenv("LOG_FORMAT", "json")
 
+    # ZeroGPU API settings
+    zero_gpu_enabled: bool = os.getenv("USE_ZERO_GPU", "false").lower() == "true"
+    zero_gpu_base_url: str = os.getenv("ZERO_GPU_API_URL", "http://localhost:8000")
+    zero_gpu_email: str = os.getenv("ZERO_GPU_EMAIL", "")
+    zero_gpu_password: str = os.getenv("ZERO_GPU_PASSWORD", "")
+
     class Config:
         env_file = ".env"
```
**flask_api_standalone.py** (+16 −1)

```diff
@@ -55,9 +55,24 @@ def initialize_orchestrator():
     if not hf_token:
         logger.warning("HF_TOKEN not set - API fallback will be used if local models fail")
 
+    # Prepare ZeroGPU config if enabled
+    zero_gpu_config = None
+    try:
+        from config import settings
+        if settings.zero_gpu_enabled and settings.zero_gpu_email and settings.zero_gpu_password:
+            zero_gpu_config = {
+                "enabled": True,
+                "base_url": settings.zero_gpu_base_url,
+                "email": settings.zero_gpu_email,
+                "password": settings.zero_gpu_password
+            }
+            logger.info("ZeroGPU API enabled in configuration")
+    except Exception as e:
+        logger.debug(f"Could not load ZeroGPU config: {e}")
+
     # Initialize LLM Router with local model loading enabled
     logger.info("Initializing LLM Router with local GPU model loading...")
-    llm_router = LLMRouter(hf_token, use_local_models=True)
+    llm_router = LLMRouter(hf_token, use_local_models=True, zero_gpu_config=zero_gpu_config)
 
     logger.info("Initializing Agents...")
     agents = {
```
@@ -1,17 +1,20 @@
|
|
| 1 |
-
# llm_router.py - UPDATED FOR LOCAL GPU MODEL LOADING
|
| 2 |
import logging
|
| 3 |
import asyncio
|
| 4 |
-
|
|
|
|
| 5 |
from .models_config import LLM_CONFIG
|
| 6 |
|
| 7 |
logger = logging.getLogger(__name__)
|
| 8 |
|
| 9 |
class LLMRouter:
|
| 10 |
-
def __init__(self, hf_token, use_local_models: bool = True):
|
| 11 |
self.hf_token = hf_token
|
| 12 |
self.health_status = {}
|
| 13 |
self.use_local_models = use_local_models
|
| 14 |
self.local_loader = None
|
|
|
|
|
|
|
| 15 |
|
| 16 |
logger.info("LLMRouter initialized")
|
| 17 |
if hf_token:
|
|
@@ -19,6 +22,35 @@ class LLMRouter:
|
|
| 19 |
else:
|
| 20 |
logger.warning("No HF token provided")
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
# Initialize local model loader if enabled
|
| 23 |
if self.use_local_models:
|
| 24 |
try:
|
|
@@ -35,10 +67,10 @@ class LLMRouter:
|
|
| 35 |
self.use_local_models = False
|
| 36 |
self.local_loader = None
|
| 37 |
|
| 38 |
-
async def route_inference(self, task_type: str, prompt: str, **kwargs):
|
| 39 |
"""
|
| 40 |
Smart routing based on task specialization
|
| 41 |
-
Tries local models first, falls back to HF Inference API if needed
|
| 42 |
"""
|
| 43 |
logger.info(f"Routing inference for task: {task_type}")
|
| 44 |
model_config = self._select_model(task_type)
|
|
@@ -62,6 +94,19 @@ class LLMRouter:
|
|
| 62 |
logger.warning(f"Local model inference failed: {e}. Falling back to API.")
|
| 63 |
logger.debug("Exception details:", exc_info=True)
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
# Fallback to HF Inference API
|
| 66 |
logger.info("Using HF Inference API")
|
| 67 |
# Health check and fallback logic
|
|
@@ -149,6 +194,115 @@ class LLMRouter:
|
|
| 149 |
logger.error(f"Error calling local embedding model: {e}", exc_info=True)
|
| 150 |
return None
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
def _select_model(self, task_type: str) -> dict:
|
| 153 |
model_map = {
|
| 154 |
"intent_classification": LLM_CONFIG["models"]["classification_specialist"],
|
|
|
|
| 1 |
+
# llm_router.py - UPDATED FOR LOCAL GPU MODEL LOADING + ZEROGPU API
|
| 2 |
import logging
|
| 3 |
import asyncio
|
| 4 |
+
import os
|
| 5 |
+
from typing import Dict, Optional, List
|
| 6 |
from .models_config import LLM_CONFIG
|
| 7 |
|
| 8 |
logger = logging.getLogger(__name__)
|
| 9 |
|
| 10 |
class LLMRouter:
|
| 11 |
+
def __init__(self, hf_token, use_local_models: bool = True, zero_gpu_config: Optional[Dict] = None):
|
| 12 |
self.hf_token = hf_token
|
| 13 |
self.health_status = {}
|
| 14 |
self.use_local_models = use_local_models
|
| 15 |
self.local_loader = None
|
| 16 |
+
self.zero_gpu_client = None
|
| 17 |
+
self.use_zero_gpu = False
|
| 18 |
|
| 19 |
logger.info("LLMRouter initialized")
|
| 20 |
if hf_token:
|
|
|
|
| 22 |
else:
|
| 23 |
logger.warning("No HF token provided")
|
| 24 |
|
| 25 |
+
# Initialize ZeroGPU client if configured
|
| 26 |
+
if zero_gpu_config and zero_gpu_config.get("enabled", False):
|
| 27 |
+
try:
|
| 28 |
+
from zero_gpu_client import ZeroGPUChatClient
|
| 29 |
+
base_url = zero_gpu_config.get("base_url", os.getenv("ZERO_GPU_API_URL", "http://localhost:8000"))
|
| 30 |
+
email = zero_gpu_config.get("email", os.getenv("ZERO_GPU_EMAIL", ""))
|
| 31 |
+
password = zero_gpu_config.get("password", os.getenv("ZERO_GPU_PASSWORD", ""))
|
| 32 |
+
|
| 33 |
+
if email and password:
|
| 34 |
+
self.zero_gpu_client = ZeroGPUChatClient(base_url, email, password)
|
| 35 |
+
self.use_zero_gpu = True
|
| 36 |
+
logger.info("✓ ZeroGPU API client initialized")
|
| 37 |
+
|
| 38 |
+
# Wait for API to be ready (non-blocking, will fallback if not ready)
|
| 39 |
+
try:
|
| 40 |
+
if not self.zero_gpu_client.wait_for_ready(timeout=10):
|
| 41 |
+
logger.warning("ZeroGPU API not ready, will use HF fallback")
|
| 42 |
+
self.use_zero_gpu = False
|
| 43 |
+
except Exception as e:
|
| 44 |
+
logger.warning(f"Could not verify ZeroGPU API readiness: {e}. Will use HF fallback.")
|
| 45 |
+
self.use_zero_gpu = False
|
| 46 |
+
else:
|
| 47 |
+
logger.warning("ZeroGPU enabled but credentials not provided")
|
| 48 |
+
except ImportError:
|
| 49 |
+
logger.warning("zero_gpu_client not available, ZeroGPU disabled")
|
| 50 |
+
except Exception as e:
|
| 51 |
+
logger.warning(f"Could not initialize ZeroGPU client: {e}. Falling back to HF API.")
|
| 52 |
+
self.use_zero_gpu = False
|
| 53 |
+
|
| 54 |
# Initialize local model loader if enabled
|
| 55 |
if self.use_local_models:
|
| 56 |
try:
|
|
|
|
| 67 |
self.use_local_models = False
|
| 68 |
self.local_loader = None
|
| 69 |
|
| 70 |
+
async def route_inference(self, task_type: str, prompt: str, context: Optional[List[Dict]] = None, **kwargs):
|
| 71 |
"""
|
| 72 |
Smart routing based on task specialization
|
| 73 |
+
Tries local models first, then ZeroGPU API, falls back to HF Inference API if needed
|
| 74 |
"""
|
| 75 |
logger.info(f"Routing inference for task: {task_type}")
|
| 76 |
model_config = self._select_model(task_type)
|
|
|
|
| 94 |
logger.warning(f"Local model inference failed: {e}. Falling back to API.")
|
| 95 |
logger.debug("Exception details:", exc_info=True)
|
| 96 |
|
| 97 |
+
# Try ZeroGPU API if enabled
|
| 98 |
+
if self.use_zero_gpu and self.zero_gpu_client:
|
| 99 |
+
try:
|
| 100 |
+
result = await self._call_zero_gpu_endpoint(task_type, prompt, context, **kwargs)
|
| 101 |
+
if result is not None:
|
| 102 |
+
logger.info(f"Inference complete for {task_type} (ZeroGPU API)")
|
| 103 |
+
return result
|
| 104 |
+
else:
|
| 105 |
+
logger.warning("ZeroGPU API returned None, falling back to HF")
|
| 106 |
+
except Exception as e:
|
| 107 |
+
logger.warning(f"ZeroGPU API inference failed: {e}. Falling back to HF API.")
|
| 108 |
+
logger.debug("Exception details:", exc_info=True)
|
| 109 |
+
|
| 110 |
# Fallback to HF Inference API
|
| 111 |
logger.info("Using HF Inference API")
|
| 112 |
# Health check and fallback logic
|
…
             logger.error(f"Error calling local embedding model: {e}", exc_info=True)
             return None

+    async def _call_zero_gpu_endpoint(self, task_type: str, prompt: str, context: Optional[List[Dict]] = None, **kwargs) -> Optional[str]:
+        """
+        Call ZeroGPU API endpoint
+
+        Args:
+            task_type: Task type (e.g., "intent_classification", "general_reasoning")
+            prompt: User prompt/message
+            context: Optional conversation context
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Generated text response, or None if the call failed
+        """
+        if not self.zero_gpu_client:
+            return None
+
+        try:
+            # Map task type to ZeroGPU task
+            task_mapping = LLM_CONFIG.get("zero_gpu_task_mapping", {})
+            zero_gpu_task = task_mapping.get(task_type, "general")
+
+            logger.info(f"Calling ZeroGPU API for task: {task_type} -> {zero_gpu_task}")
+            logger.debug(f"Prompt length: {len(prompt)}")
+            logger.info("=" * 80)
+            logger.info("ZEROGPU API REQUEST:")
+            logger.info("=" * 80)
+            logger.info(f"Task Type: {task_type} -> ZeroGPU Task: {zero_gpu_task}")
+            logger.info(f"Prompt Length: {len(prompt)} characters")
+            logger.info("-" * 40)
+            logger.info("FULL PROMPT CONTENT:")
+            logger.info("-" * 40)
+            logger.info(prompt)
+            logger.info("-" * 40)
+            logger.info("END OF PROMPT")
+            logger.info("=" * 80)
+
+            # Prepare context if provided
+            context_messages = None
+            if context:
+                context_messages = []
+                for msg in context[-50:]:  # Limit to 50 messages as per API
+                    context_messages.append({
+                        "role": msg.get("role", "user"),
+                        "content": msg.get("content", ""),
+                        "timestamp": msg.get("timestamp", "")
+                    })
+
+            # Prepare generation parameters
+            generation_params = {
+                "max_tokens": kwargs.get('max_tokens', 512),
+                "temperature": kwargs.get('temperature', 0.7),
+            }
+
+            # Add optional parameters
+            if 'top_p' in kwargs:
+                generation_params["top_p"] = kwargs['top_p']
+            if 'system_prompt' in kwargs:
+                generation_params["system_prompt"] = kwargs['system_prompt']
+
+            # Call ZeroGPU API
+            response = self.zero_gpu_client.chat(
+                message=prompt,
+                task=zero_gpu_task,
+                context=context_messages,
+                **generation_params
+            )
+
+            # Extract response text
+            if response and "response" in response:
+                generated_text = response["response"]
+
+                if not generated_text or generated_text.strip() == "":
+                    logger.warning("ZeroGPU API returned empty response")
+                    return None
+
+                logger.info(f"ZeroGPU API returned response (length: {len(generated_text)})")
+                logger.info("=" * 80)
+                logger.info("COMPLETE ZEROGPU API RESPONSE:")
+                logger.info("=" * 80)
+                logger.info(f"Task Type: {task_type} -> ZeroGPU Task: {zero_gpu_task}")
+                logger.info(f"Response Length: {len(generated_text)} characters")
+
+                # Log metrics if available
+                if "tokens_used" in response:
+                    tokens = response["tokens_used"]
+                    logger.info(f"Tokens: input={tokens.get('input', 0)}, output={tokens.get('output', 0)}, total={tokens.get('total', 0)}")
+
+                if "inference_metrics" in response:
+                    metrics = response["inference_metrics"]
+                    logger.info(f"Inference Duration: {metrics.get('inference_duration', 0):.2f}s")
+                    logger.info(f"Tokens/Second: {metrics.get('tokens_per_second', 0):.2f}")
+
+                logger.info("-" * 40)
+                logger.info("FULL RESPONSE CONTENT:")
+                logger.info("-" * 40)
+                logger.info(generated_text)
+                logger.info("-" * 40)
+                logger.info("END OF RESPONSE")
+                logger.info("=" * 80)
+
+                return generated_text
+            else:
+                logger.error(f"Unexpected ZeroGPU response format: {response}")
+                return None
+
+        except Exception as e:
+            logger.error(f"Error calling ZeroGPU API: {e}", exc_info=True)
+            return None
+
     def _select_model(self, task_type: str) -> dict:
         model_map = {
             "intent_classification": LLM_CONFIG["models"]["classification_specialist"],
…
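For reference, the parsing branch in `_call_zero_gpu_endpoint` above implies a `/chat` response payload of roughly the following shape. Only the keys the code actually reads are shown, and the values are invented for illustration.

# Assumed /chat response shape; field names come from the code above, values are made up.
example_response = {
    "response": "generated text ...",
    "tokens_used": {"input": 128, "output": 342, "total": 470},
    "inference_metrics": {
        "inference_duration": 3.21,   # seconds
        "tokens_per_second": 106.5,
    },
}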
src/models_config.py
@@ -39,5 +39,12 @@ LLM_CONFIG = {
         "strategy": "task_based_routing",
         "fallback_chain": ["primary", "fallback", "degraded_mode"],
         "load_balancing": "round_robin_with_health_check"
-    }
+    },
+    "zero_gpu_task_mapping": {
+        "intent_classification": "classification",
+        "embedding_generation": "embedding",
+        "safety_check": "general",
+        "general_reasoning": "reasoning",
+        "response_synthesis": "general"
+    }
 }
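The mapping is resolved with a plain dict lookup in `_call_zero_gpu_endpoint`, so any task type not listed falls back to the "general" task. A short sketch (the import path is an assumption):

from src.models_config import LLM_CONFIG  # import path assumed

# Mirrors task_mapping.get(task_type, "general") in src/llm_router.py
task_mapping = LLM_CONFIG["zero_gpu_task_mapping"]

assert task_mapping.get("intent_classification", "general") == "classification"
assert task_mapping.get("general_reasoning", "general") == "reasoning"
assert task_mapping.get("unknown_task", "general") == "general"  # unmapped -> "general"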
zero_gpu_client.py
@@ -0,0 +1,219 @@
+# zero_gpu_client.py
+"""
+ZeroGPU Chat API Client
+Provides authentication and API access to the ZeroGPU Chat API
+"""
+import requests
+import time
+import logging
+from typing import Optional, List, Dict, Any
+from datetime import datetime
+
+logger = logging.getLogger(__name__)
+
+
+class ZeroGPUChatClient:
+    """Client for the ZeroGPU Chat API with automatic token refresh"""
+
+    def __init__(self, base_url: str, email: str, password: str):
+        """
+        Initialize ZeroGPU API client
+
+        Args:
+            base_url: Base URL of the ZeroGPU API (e.g., "http://your-pod-ip:8000")
+            email: User email for authentication
+            password: User password for authentication
+        """
+        self.base_url = base_url.rstrip('/')
+        self.email = email
+        self.password = password
+        self.access_token = None
+        self.refresh_token = None
+        self._last_token_refresh = None
+
+        logger.info(f"Initializing ZeroGPU client for {self.base_url}")
+        self.login(email, password)
+
+    def login(self, email: str, password: str):
+        """Log in and obtain authentication tokens"""
+        try:
+            response = requests.post(
+                f"{self.base_url}/login",
+                json={"email": email, "password": password},
+                timeout=10
+            )
+            response.raise_for_status()
+            data = response.json()
+            self.access_token = data["access_token"]
+            self.refresh_token = data["refresh_token"]
+            self._last_token_refresh = time.time()
+            logger.info("✓ ZeroGPU authentication successful")
+        except requests.exceptions.RequestException as e:
+            logger.error(f"ZeroGPU login failed: {e}")
+            raise
+
+    def refresh_access_token(self):
+        """Refresh the access token using the refresh token"""
+        try:
+            response = requests.post(
+                f"{self.base_url}/refresh",
+                headers={"X-Refresh-Token": self.refresh_token},
+                timeout=10
+            )
+            response.raise_for_status()
+            data = response.json()
+            self.access_token = data["access_token"]
+            self.refresh_token = data.get("refresh_token", self.refresh_token)
+            self._last_token_refresh = time.time()
+            logger.info("✓ ZeroGPU token refreshed")
+        except requests.exceptions.RequestException as e:
+            logger.warning(f"Token refresh failed, attempting re-login: {e}")
+            # Try to re-login if refresh fails
+            self.login(self.email, self.password)
+
+    def _get_headers(self) -> Dict[str, str]:
+        """Get request headers with authentication"""
+        return {
+            "Authorization": f"Bearer {self.access_token}",
+            "Content-Type": "application/json"
+        }
+
+    def _ensure_valid_token(self):
+        """Ensure the access token is valid, refreshing if needed"""
+        # Refresh if the token is more than 10 minutes old (tokens expire in 15 minutes)
+        if self._last_token_refresh and (time.time() - self._last_token_refresh) > 600:
+            self.refresh_access_token()
+
+    def _request(self, method: str, endpoint: str, **kwargs) -> Dict[str, Any]:
+        """
+        Make an authenticated request with auto-retry on 401
+
+        Args:
+            method: HTTP method (GET, POST, etc.)
+            endpoint: API endpoint (e.g., "/chat")
+            **kwargs: Additional arguments for requests.request()
+
+        Returns:
+            Response JSON as a dictionary
+        """
+        url = f"{self.base_url}{endpoint}"
+        self._ensure_valid_token()
+        kwargs.setdefault("headers", {}).update(self._get_headers())
+
+        try:
+            response = requests.request(method, url, **kwargs)
+
+            if response.status_code == 401:
+                # Token expired, refresh and retry once
+                logger.info("Token expired, refreshing...")
+                self.refresh_access_token()
+                kwargs["headers"].update(self._get_headers())
+                response = requests.request(method, url, **kwargs)
+
+            response.raise_for_status()
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            logger.error(f"ZeroGPU API request failed: {e}")
+            if hasattr(e, 'response') and e.response is not None:
+                logger.error(f"Response: {e.response.text}")
+            raise
+
+    def chat(
+        self,
+        message: str,
+        task: str = "general",
+        context: Optional[List[Dict[str, str]]] = None,
+        max_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        system_prompt: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Send a chat message to the ZeroGPU API
+
+        Args:
+            message: User message/prompt
+            task: Task type ("general", "reasoning", "classification", "embedding")
+            context: Optional conversation context (list of message dicts)
+            max_tokens: Maximum tokens to generate
+            temperature: Sampling temperature (0.0-2.0)
+            top_p: Nucleus sampling (0.0-1.0)
+            system_prompt: Optional system prompt
+            **kwargs: Additional generation parameters
+
+        Returns:
+            API response dictionary with response, metrics, and audit info
+        """
+        payload = {
+            "message": message,
+            "task": task,
+            **kwargs
+        }
+
+        if context:
+            payload["context"] = context
+
+        if max_tokens is not None:
+            payload["max_tokens"] = max_tokens
+
+        if temperature is not None:
+            payload["temperature"] = temperature
+
+        if top_p is not None:
+            payload["top_p"] = top_p
+
+        if system_prompt:
+            payload["system_prompt"] = system_prompt
+
+        logger.debug(f"ZeroGPU chat request: task={task}, message_length={len(message)}")
+        return self._request("POST", "/chat", json=payload, timeout=60)
+
+    def get_tasks(self) -> Dict[str, Any]:
+        """Get available tasks and their specifications"""
+        return self._request("GET", "/tasks")
+
+    def get_usage_stats(self, days: int = 30) -> Dict[str, Any]:
+        """Get usage statistics for the authenticated user"""
+        return self._request("GET", f"/usage/stats?days={days}")
+
+    def get_user_info(self) -> Dict[str, Any]:
+        """Get information about the currently authenticated user"""
+        return self._request("GET", "/me")
+
+    def wait_for_ready(self, timeout: int = 300) -> bool:
+        """
+        Wait for the API to be ready (models loaded)
+
+        Args:
+            timeout: Maximum time to wait in seconds
+
+        Returns:
+            True if ready, False on timeout
+        """
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            try:
+                response = requests.get(f"{self.base_url}/ready", timeout=5)
+                if response.status_code == 200:
+                    data = response.json()
+                    if data.get("ready", False):
+                        logger.info("✓ ZeroGPU API is ready")
+                        return True
+            except requests.exceptions.RequestException:
+                pass

+            logger.info("Waiting for ZeroGPU API to be ready...")
+            time.sleep(5)
+
+        logger.warning(f"ZeroGPU API not ready after {timeout} seconds")
+        return False
+
+    def health_check(self) -> bool:
+        """Check if the API is healthy"""
+        try:
+            response = requests.get(f"{self.base_url}/health", timeout=5)
+            return response.status_code == 200
+        except requests.exceptions.RequestException:
+            return False
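Finally, a minimal usage sketch for the client above. The URL and credentials are placeholders (the URL format follows the constructor docstring); login() runs inside __init__, and _request() transparently refreshes the JWT on a 401.

from zero_gpu_client import ZeroGPUChatClient

client = ZeroGPUChatClient(
    base_url="http://your-pod-ip:8000",  # placeholder, as in the docstring
    email="user@example.com",            # placeholder
    password="********",                 # placeholder
)

if client.health_check():
    result = client.chat(
        message="Classify the intent of: 'cancel my subscription'",
        task="classification",
        max_tokens=64,
        temperature=0.0,
    )
    print(result["response"])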