# Source: Hugging Face Space "ianshank/langgraph-mcts-demo" (status: Running)
# File size: 14,119 Bytes
# Revision: 40ee6b4

"""
Production REST API server for LangGraph Multi-Agent MCTS Framework.

Provides:
- OpenAPI/Swagger documentation
- Authentication via API keys
- Rate limiting
- Health and readiness endpoints
- Request validation with Pydantic
- Prometheus metrics exposure
"""

import asyncio
import time
from contextlib import asynccontextmanager
from datetime import datetime, timezone
from typing import Any

from fastapi import Depends, FastAPI, Header, HTTPException, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel, ConfigDict, Field

# Import framework components.
# These come from the project's own ``src`` package. When it cannot be imported
# the module still loads: ``IMPORTS_AVAILABLE`` is set to False and the error
# text is kept in ``import_error`` so the rest of the file can degrade instead
# of crashing.
try:
    from src.adapters.llm import create_client  # noqa: F401
    from src.api.auth import (
        APIKeyAuthenticator,
        ClientInfo,
        RateLimitConfig,
        get_authenticator,
        set_authenticator,
    )
    from src.api.exceptions import (
        AuthenticationError,
        AuthorizationError,  # noqa: F401
        FrameworkError,
        RateLimitError,
        ValidationError,  # noqa: F401
    )
    from src.models.validation import MCTSConfig, QueryInput  # noqa: F401

    IMPORTS_AVAILABLE = True
except ImportError as e:
    IMPORTS_AVAILABLE = False
    import_error = str(e)

    # The names below are evaluated while this module is being imported
    # (exception-handler decorators, ``except`` clauses, parameter
    # annotations). Without placeholders the fallback path would die with a
    # NameError before the server could report that it is not ready.
    # Nothing in this file raises or instantiates the placeholders.
    class FrameworkError(Exception):
        """Placeholder used only when the framework package is not importable."""

    class AuthenticationError(FrameworkError):
        """Placeholder used only when the framework package is not importable."""

    class RateLimitError(FrameworkError):
        """Placeholder used only when the framework package is not importable."""

    class ValidationError(FrameworkError):
        """Placeholder used only when the framework package is not importable."""

    class ClientInfo:
        """Placeholder annotation target used only when imports are unavailable."""

# Prometheus instrumentation is optional: the collectors below are created only
# when ``prometheus_client`` can be imported, and ``PROMETHEUS_AVAILABLE`` tells
# the rest of the module whether they exist.
try:
    from prometheus_client import CONTENT_TYPE_LATEST, Counter, Gauge, Histogram, generate_latest
except ImportError:
    PROMETHEUS_AVAILABLE = False
else:
    PROMETHEUS_AVAILABLE = True

    # Requests served, partitioned by HTTP method, endpoint and status code.
    REQUEST_COUNT = Counter(
        "mcts_requests_total",
        "Total number of requests",
        ["method", "endpoint", "status"],
    )
    # Distribution of request durations in seconds, per method and endpoint.
    REQUEST_LATENCY = Histogram(
        "mcts_request_duration_seconds",
        "Request latency in seconds",
        ["method", "endpoint"],
    )
    # Requests currently in flight.
    ACTIVE_REQUESTS = Gauge(
        "mcts_active_requests",
        "Number of active requests",
    )
    # Errors observed, partitioned by an error-type label.
    ERROR_COUNT = Counter(
        "mcts_errors_total",
        "Total number of errors",
        ["error_type"],
    )


# Request/Response Models
class QueryRequest(BaseModel):
    """Request body for the ``/query`` endpoint.

    The length limits, numeric bounds and ``thread_id`` pattern are declared on
    the fields themselves, so Pydantic enforces them when the body is parsed.
    """

    # Pydantic v2 model configuration. This replaces the nested ``class Config``
    # form, which is deprecated in v2. It supplies the example request shown in
    # the generated OpenAPI schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "query": "Recommend defensive positions for night attack",
                "use_mcts": True,
                "use_rag": True,
                "mcts_iterations": 200,
                "thread_id": "session_123",
            }
        }
    )

    # Required free-text query, 1 to 10000 characters.
    query: str = Field(
        ...,
        min_length=1,
        max_length=10000,
        description="User query to process",
        json_schema_extra={"example": "Recommend defensive positions for night attack scenario"},
    )
    # Feature toggles; both default to enabled.
    use_mcts: bool = Field(default=True, description="Enable MCTS tactical simulation")
    use_rag: bool = Field(default=True, description="Enable RAG context retrieval")
    # Optional iteration override, constrained to 1..10000 when given.
    mcts_iterations: int | None = Field(default=None, ge=1, le=10000, description="Override default MCTS iterations")
    # Optional identifier of at most 100 characters: letters, digits, "_" and "-".
    thread_id: str | None = Field(
        default=None,
        max_length=100,
        pattern=r"^[a-zA-Z0-9_-]+$",
        description="Conversation thread ID for state persistence",
    )


class QueryResponse(BaseModel):
    """Body returned by the ``/query`` endpoint on success."""

    # Answer text.
    response: str = Field(
        ...,
        description="Final synthesized response",
    )
    # Score constrained to the closed interval [0.0, 1.0].
    confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Overall confidence score",
    )
    # Names of the agents involved in producing the answer.
    agents_used: list[str] = Field(
        ...,
        description="List of agents that contributed",
    )
    # Optional; defaults to None when no statistics are supplied.
    mcts_stats: dict[str, Any] | None = Field(
        default=None,
        description="MCTS simulation statistics",
    )
    # Elapsed handling time, in milliseconds.
    processing_time_ms: float = Field(
        ...,
        description="Total processing time in milliseconds",
    )
    # Free-form extras; each instance gets its own empty dict by default.
    metadata: dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata",
    )


class HealthResponse(BaseModel):
    """Body returned by the ``/health`` endpoint."""

    # Short status word.
    status: str = Field(
        ...,
        description="Service status",
    )
    # Time of the check, carried as a string.
    timestamp: str = Field(
        ...,
        description="Current timestamp",
    )
    # Defaults to "1.0.0" when the caller does not set it.
    version: str = Field(
        default="1.0.0",
        description="API version",
    )
    # Time the service has been up, in seconds.
    uptime_seconds: float = Field(
        ...,
        description="Service uptime",
    )


class ReadinessResponse(BaseModel):
    """Body returned by the ``/ready`` endpoint."""

    # Overall verdict.
    ready: bool = Field(
        ...,
        description="Whether service is ready",
    )
    # Maps each check's name to its pass/fail outcome.
    checks: dict[str, bool] = Field(
        ...,
        description="Individual check results",
    )


class ErrorResponse(BaseModel):
    """Schema describing an error body in the API documentation."""

    # Always-present flag; True unless explicitly overridden.
    error: bool = Field(default=True)
    # Stable identifier intended for programmatic handling.
    error_code: str = Field(
        ...,
        description="Machine-readable error code",
    )
    # Explanation intended for people.
    message: str = Field(
        ...,
        description="Human-readable error message",
    )
    # When the error happened, carried as a string.
    timestamp: str = Field(
        ...,
        description="Error timestamp",
    )


# Application startup
# Wall-clock time (seconds since the epoch) captured when this module is
# imported; the health endpoint subtracts it from the current time to report
# uptime.
start_time: float = time.time()
# Slot for the framework object. Nothing in this file assigns it: the only
# initialisation call, in the lifespan handler, is commented out.
framework_instance: Any | None = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup work before the app serves requests and shutdown work after.

    On startup an ``APIKeyAuthenticator`` is created with a single demo key and
    fixed rate limits, then registered through ``set_authenticator``. If the
    framework imports failed (``IMPORTS_AVAILABLE`` is False) those names do not
    exist, so the step is skipped with a warning instead of crashing startup
    with a NameError. The app then still starts and can report that it is not
    ready.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).

    Yields:
        Nothing; control returns to the server while the app is running.
    """
    global framework_instance

    # Startup
    print("Starting MCTS Framework API server...")

    if IMPORTS_AVAILABLE:
        # Initialize authenticator with demo key (replace in production)
        authenticator = APIKeyAuthenticator(
            valid_keys=["demo-api-key-replace-in-production"],
            rate_limit_config=RateLimitConfig(
                requests_per_minute=60,
                requests_per_hour=1000,
                requests_per_day=10000,
            ),
        )
        set_authenticator(authenticator)
    else:
        # ``import_error`` is assigned alongside IMPORTS_AVAILABLE = False.
        print(f"WARNING: framework imports unavailable ({import_error}); authenticator not configured")

    # Initialize framework (lazy loading)
    # framework_instance = create_framework()

    print("API server started successfully")

    yield

    # Shutdown
    print("Shutting down API server...")


# Create FastAPI app
# Markdown shown at the top of the generated API documentation.
_API_DESCRIPTION = """
## Multi-Agent Reasoning API with MCTS Tactical Simulation

This API provides access to a sophisticated multi-agent reasoning framework that combines:
- **HRM Agent**: Hierarchical decomposition of complex queries
- **TRM Agent**: Iterative refinement for response quality
- **MCTS Engine**: Monte Carlo Tree Search for tactical simulation
- **RAG Integration**: Context retrieval from vector stores

### Features
- Secure API key authentication
- Rate limiting per client
- Real-time metrics (Prometheus)
- Distributed tracing (OpenTelemetry)
- Production-grade error handling

### Quick Start
1. Obtain an API key
2. Include `X-API-Key` header in requests
3. Send queries to `/query` endpoint
4. Monitor health via `/health` endpoint
    """

# Tag groups used by the endpoint decorators in this module.
_OPENAPI_TAGS = [
    {"name": "query", "description": "Query processing operations"},
    {"name": "health", "description": "Health and readiness checks"},
    {"name": "metrics", "description": "Observability endpoints"},
]

# Interactive docs are served at /docs and /redoc; startup and shutdown work is
# delegated to ``lifespan``.
app = FastAPI(
    lifespan=lifespan,
    title="LangGraph Multi-Agent MCTS API",
    version="1.0.0",
    description=_API_DESCRIPTION,
    openapi_tags=_OPENAPI_TAGS,
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS middleware
# Clients authenticate with the ``X-API-Key`` request header, not cookies, so
# credentialed cross-origin requests are not needed. Wildcard origins must not
# be combined with ``allow_credentials=True``: that would let any website make
# credentialed calls on behalf of a visitor. Restrict ``allow_origins`` to an
# explicit list before enabling credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure appropriately for production
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Middleware for metrics
def _metrics_endpoint_label(request: Request, status: int) -> str:
    """Return the ``endpoint`` label value to record for *request*.

    Prefers the matched route's path template when one is present in the ASGI
    scope. Requests that matched no route and ended in 404 share one fixed
    label, so arbitrary probed URLs cannot create an unbounded number of time
    series. Anything else falls back to the raw request path.
    """
    # NOTE(review): relies on the router storing the matched route under
    # scope["route"]; when it is absent the fallbacks below apply.
    route = request.scope.get("route")
    template = getattr(route, "path", None)
    if template:
        return template
    if status == 404:
        return "<unmatched>"
    return request.url.path


@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
    """Record Prometheus request metrics around every HTTP request.

    While the request is in flight the active-request gauge is raised. When it
    finishes, whether normally or by raising, the gauge is lowered and the
    request counter and latency histogram are updated. When Prometheus is not
    available the request is passed through untouched.

    Args:
        request: The incoming request.
        call_next: Callable that forwards the request and returns the response.

    Returns:
        The response produced by ``call_next``.
    """
    if not PROMETHEUS_AVAILABLE:
        return await call_next(request)

    ACTIVE_REQUESTS.inc()
    start = time.perf_counter()
    # Assume failure until a response exists. This keeps ``status`` bound in the
    # ``finally`` block for every kind of exception, including cancellation,
    # which is not an ``Exception`` subclass.
    status = 500

    try:
        response = await call_next(request)
        status = response.status_code
        return response
    finally:
        ACTIVE_REQUESTS.dec()
        elapsed = time.perf_counter() - start
        endpoint = _metrics_endpoint_label(request, status)
        REQUEST_COUNT.labels(method=request.method, endpoint=endpoint, status=str(status)).inc()
        REQUEST_LATENCY.labels(method=request.method, endpoint=endpoint).observe(elapsed)


# Authentication dependency
async def verify_api_key(x_api_key: str = Header(..., description="API key for authentication")):
    """Validate the caller's API key and return the authenticator's client info.

    Used as a FastAPI dependency by the protected endpoints.

    Args:
        x_api_key: Value of the required API-key request header.

    Returns:
        Whatever ``authenticator.require_auth`` returns for the key.

    Raises:
        HTTPException: 500 when the framework imports are unavailable; 401 when
            the authenticator raises ``AuthenticationError``; 429 when it raises
            ``RateLimitError``, with a ``Retry-After`` header that defaults to
            60 seconds if the error carries no retry delay.
    """
    if not IMPORTS_AVAILABLE:
        raise HTTPException(status_code=500, detail="Authentication module not available")

    try:
        authenticator = get_authenticator()
        client_info = authenticator.require_auth(x_api_key)
        return client_info
    except AuthenticationError as e:
        if PROMETHEUS_AVAILABLE:
            ERROR_COUNT.labels(error_type="authentication").inc()
        # Chain the original error so it stays visible in tracebacks.
        raise HTTPException(status_code=401, detail=e.user_message) from e
    except RateLimitError as e:
        if PROMETHEUS_AVAILABLE:
            ERROR_COUNT.labels(error_type="rate_limit").inc()
        raise HTTPException(
            status_code=429, detail=e.user_message, headers={"Retry-After": str(e.retry_after_seconds or 60)}
        ) from e


# Exception handlers
@app.exception_handler(FrameworkError)
async def framework_error_handler(request: Request, exc: FrameworkError):
    """Turn a ``FrameworkError`` into an HTTP 500 JSON response.

    The error is counted under its own ``error_code`` when Prometheus is
    available; the response body is whatever ``exc.to_user_response()`` returns.
    """
    if PROMETHEUS_AVAILABLE:
        error_counter = ERROR_COUNT.labels(error_type=exc.error_code)
        error_counter.inc()

    body = exc.to_user_response()
    return JSONResponse(content=body, status_code=500)


@app.exception_handler(ValidationError)
async def validation_error_handler(request: Request, exc: ValidationError):
    """Turn a framework ``ValidationError`` into an HTTP 400 JSON response.

    The error is counted under the "validation" label when Prometheus is
    available; the response body is whatever ``exc.to_user_response()`` returns.
    """
    if PROMETHEUS_AVAILABLE:
        error_counter = ERROR_COUNT.labels(error_type="validation")
        error_counter.inc()

    body = exc.to_user_response()
    return JSONResponse(content=body, status_code=400)


# Endpoints
@app.get("/health", response_model=HealthResponse, tags=["health"])
async def health_check():
    """
    Health check endpoint.

    Returns basic service health status. Use this for load balancer health checks.

    The timestamp is a timezone-aware UTC time in ISO 8601 form, so it carries
    an explicit ``+00:00`` offset. Uptime is the wall-clock time elapsed since
    this module was imported.
    """
    return HealthResponse(
        status="healthy",
        # datetime.utcnow() is deprecated and returns a naive value; use an
        # aware UTC timestamp so the offset is unambiguous.
        timestamp=datetime.now(timezone.utc).isoformat(),
        version="1.0.0",
        uptime_seconds=time.time() - start_time,
    )


@app.get("/ready", response_model=ReadinessResponse, tags=["health"])
async def readiness_check():
    """
    Readiness check endpoint.

    Reports one boolean per dependency check. Only the checks named as critical
    decide the outcome: if any of them is false the endpoint answers 503,
    otherwise it returns the full set of results. Intended for Kubernetes
    readiness probes.
    """
    checks = {
        "imports_available": IMPORTS_AVAILABLE,
        "authenticator_configured": True,
        "llm_client_available": True,  # Would check actual client
        "prometheus_available": PROMETHEUS_AVAILABLE,
    }

    # Checks that must pass for the service to count as ready.
    critical = ("imports_available", "authenticator_configured")
    all_ready = all(checks[name] for name in critical)

    if all_ready:
        return ReadinessResponse(ready=all_ready, checks=checks)

    raise HTTPException(status_code=503, detail="Service not ready")


@app.get("/metrics", tags=["metrics"])
async def prometheus_metrics():
    """
    Prometheus metrics endpoint.

    Serves the output of ``generate_latest()`` with the Prometheus content
    type for scraping, or answers 501 when Prometheus is not available.
    """
    if PROMETHEUS_AVAILABLE:
        payload = generate_latest()
        return Response(content=payload, media_type=CONTENT_TYPE_LATEST)

    raise HTTPException(status_code=501, detail="Prometheus metrics not available")


@app.post(
    "/query",
    response_model=QueryResponse,
    tags=["query"],
    responses={
        401: {"model": ErrorResponse, "description": "Authentication failed"},
        429: {"model": ErrorResponse, "description": "Rate limit exceeded"},
        400: {"model": ErrorResponse, "description": "Invalid input"},
        500: {"model": ErrorResponse, "description": "Internal server error"},
    },
)
async def process_query(request: QueryRequest, client_info: ClientInfo = Depends(verify_api_key)):
    """
    Process a query using the multi-agent MCTS framework.

    This endpoint:
    1. Validates the input query
    2. Optionally retrieves context via RAG
    3. Processes through HRM and TRM agents
    4. Optionally runs MCTS simulation
    5. Synthesizes a final response

    **Authentication**: Requires valid API key in X-API-Key header.

    **Rate Limiting**: Subject to rate limits per client.

    **Current behaviour**: the processing step is a mock. After validation the
    handler waits briefly and returns a canned response that echoes the first
    100 characters of the query. Steps 2-5 above describe the intended
    production pipeline.

    Args:
        request: Parsed request body.
        client_info: Result of the ``verify_api_key`` dependency.

    Returns:
        A ``QueryResponse``; ``mcts_stats`` is populated only when
        ``request.use_mcts`` is true.

    Raises:
        HTTPException: 400 when constructing ``QueryInput`` raises.
    """
    # Named so the module-level ``start_time`` (service start) is not shadowed.
    started_at = time.perf_counter()

    # Validate input using validation models
    # (``mcts_iterations`` is not forwarded to ``QueryInput``.)
    if IMPORTS_AVAILABLE:
        try:
            QueryInput(
                query=request.query,
                use_rag=request.use_rag,
                use_mcts=request.use_mcts,
                thread_id=request.thread_id,
            )
        except Exception as e:
            if PROMETHEUS_AVAILABLE:
                ERROR_COUNT.labels(error_type="validation").inc()
            # Chain the cause so the original failure stays in the traceback.
            raise HTTPException(status_code=400, detail=f"Validation failed: {e!s}") from e

    # Process query (mock implementation for demo)
    # In production, this would call the actual framework
    await asyncio.sleep(0.1)  # Simulate processing

    processing_time = (time.perf_counter() - started_at) * 1000

    # Mock response
    return QueryResponse(
        response=f"Processed query: {request.query[:100]}...",
        confidence=0.85,
        agents_used=["hrm", "trm"] + (["mcts"] if request.use_mcts else []),
        mcts_stats=(
            {
                "iterations": request.mcts_iterations or 100,
                "best_action": "recommended_action",
                "root_visits": request.mcts_iterations or 100,
            }
            if request.use_mcts
            else None
        ),
        processing_time_ms=processing_time,
        metadata={
            "client_id": client_info.client_id,
            "thread_id": request.thread_id,
            "rag_enabled": request.use_rag,
        },
    )


@app.get("/stats", tags=["metrics"])
async def get_stats(client_info: ClientInfo = Depends(verify_api_key)):
    """
    Report usage statistics for the authenticated client.

    The result merges the client's id and roles with the authenticator's
    per-client statistics, then adds the configured rate limits.
    """
    authenticator = get_authenticator()
    stats = authenticator.get_client_stats(client_info.client_id)

    # Keys from ``stats`` are merged after the identity fields, so they win on
    # a name clash; "rate_limits" is written last and always wins.
    summary = {
        "client_id": client_info.client_id,
        "roles": list(client_info.roles),
        **stats,
    }

    limits = authenticator.rate_limit_config
    summary["rate_limits"] = {
        "per_minute": limits.requests_per_minute,
        "per_hour": limits.requests_per_hour,
        "per_day": limits.requests_per_day,
    }
    return summary


# Entry point
# Running this file directly starts uvicorn with the settings below. The app is
# given as an import string rather than as the ``app`` object.
if __name__ == "__main__":
    import uvicorn

    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": False,
        "workers": 4,
        "log_level": "info",
        "access_log": True,
    }
    uvicorn.run("src.api.rest_server:app", **server_options)