Spaces:

ThatsGroes
/

LLM-memory-calculator

Running

App Files Files Community

ThatsGroes committed on Mar 30

Commit

76ba794

1 Parent(s): d16240b

authentication

Browse files

Files changed (1) hide show

app.py +39 -113

app.py CHANGED Viewed

@@ -1,16 +1,8 @@
-from typing import Dict, Union
-from huggingface_hub import get_safetensors_metadata, hf_hub_download
-import argparse
-import sys
-import json
 import gradio as gr
 from typing import Dict, Union
-from huggingface_hub import get_safetensors_metadata, hf_hub_download
 import json
-# Example:
-# python get_gpu_memory.py Qwen/Qwen2.5-7B-Instruct
 # Dictionary mapping dtype strings to their byte sizes
 bytes_per_dtype: Dict[str, float] = {
     "int4": 0.5,
@@ -20,57 +12,8 @@ bytes_per_dtype: Dict[str, float] = {
     "float32": 4,
 }
-def calculate_kv_cache_memory(context_size: int, model_id: str, dtype: str, filename: str="config.json"):
-    """
-    Implements the formula suggested in https://medium.com/@tejaswi_kashyap/memory-optimization-in-llms-leveraging-kv-cache-quantization-for-efficient-inference-94bc3df5faef
-    """
-    try:
-        file_path = hf_hub_download(repo_id=model_id, filename=filename)
-        with open(file_path, 'r') as f:
-          config = json.load(f)
-        keys_to_find = {"num_hidden_layers", "num_key_value_heads", "hidden_size", "num_attention_heads"}
-        config = extract_keys(config, keys_to_find)
-        num_layers = config["num_hidden_layers"]
-        if "num_key_value_heads" in config:
-          num_att_heads = config["num_key_value_heads"]
-        else:
-          num_att_heads = config["num_attention_heads"]
-        dim_att_head = config["hidden_size"] // config["num_attention_heads"]
-        dtype_bytes = bytes_per_dtype[dtype]
-        memory_per_token = num_layers * num_att_heads * dim_att_head * dtype_bytes * 2
-        context_size_memory_footprint_gb = (context_size * memory_per_token) / 1_000_000_000
-        return context_size_memory_footprint_gb
-    except Exception as e:
-        print(f"Error estimating context size: {str(e)}", file=sys.stderr)
-        return None
 def extract_keys(json_obj, keys_to_extract):
-    """
-    Recursively searches for specific keys in a nested JSON object.
-    Args:
-        json_obj (dict or list): The JSON data (parsed as a dictionary or list).
-        keys_to_extract (set): A set of keys to extract values for.
-    Returns:
-        dict: A dictionary with found key-value pairs.
-    """
     extracted_values = {}
     def recursive_search(obj):
         if isinstance(obj, dict):
             for key, value in obj.items():
@@ -80,71 +23,53 @@ def extract_keys(json_obj, keys_to_extract):
         elif isinstance(obj, list):
             for item in obj:
                 recursive_search(item)
     recursive_search(json_obj)
     return extracted_values
-def calculate_model_memory(parameters: float, bytes: float) -> float:
-    """Calculates the GPU memory required for serving a Large Language Model (LLM).
-    This function estimates the GPU memory needed using the formula:
-    M = (P * 4B) / (32 / Q) * 1.18
-    where:
-    - M is the GPU memory in Gigabytes
-    - P is the number of parameters in billions (e.g., 7 for a 7B model)
-    - 4B represents 4 bytes per parameter
-    - 32 represents bits in 4 bytes
-    - Q is the quantization bits (e.g., 16, 8, or 4 bits)
-    - 1.18 represents ~18% overhead for additional GPU memory requirements
-    Args:
-        parameters: Number of model parameters in billions
-        bytes: Number of bytes per parameter based on dtype
-    Returns:
-        Estimated GPU memory required in Gigabytes
-    Examples:
-        >>> calculate_gpu_memory(7, bytes_per_dtype["float16"])
-        13.72
-        >>> calculate_gpu_memory(13, bytes_per_dtype["int8"])
-        12.74
-    """
-    memory = round((parameters * 4) / (32 / (bytes * 8)) * 1.18, 2)
-    return memory
-def get_model_size(model_id: str, dtype: str = "float16") -> Union[float, None]:
-    """Get the estimated GPU memory requirement for a Hugging Face model.
-    Args:
-        model_id: Hugging Face model ID (e.g., "facebook/opt-350m")
-        dtype: Data type for model loading ("float16", "int8", etc.)
-    Returns:
-        Estimated GPU memory in GB, or None if estimation fails
-    Examples:
-        >>> get_model_size("facebook/opt-350m")
-        0.82
-        >>> get_model_size("meta-llama/Llama-2-7b-hf", dtype="int8")
-        6.86
-    """
     try:
-        metadata = get_safetensors_metadata(model_id)
         if not metadata or not metadata.parameter_count:
-            raise ValueError(f"Could not fetch metadata for model: {model_id}")
-        model_parameters = list(metadata.parameter_count.values())[0]
-        model_parameters = int(model_parameters) / 1_000_000_000  # Convert to billions
-        return calculate_model_memory(model_parameters, bytes_per_dtype[dtype])
     except Exception as e:
-        print(f"Error estimating model size: {str(e)}", file=sys.stderr)
-        return None
-def estimate_vram(model_id, dtype, context_size):
     if dtype not in bytes_per_dtype:
         return "Error: Unsupported dtype"
-    model_memory = get_model_size(model_id, dtype)
-    context_memory = calculate_kv_cache_memory(context_size, model_id, dtype)
     if isinstance(model_memory, str) or isinstance(context_memory, str):
         return model_memory if isinstance(model_memory, str) else context_memory
@@ -157,11 +82,12 @@ iface = gr.Interface(
     inputs=[
         gr.Textbox(label="Hugging Face Model ID", value="google/gemma-3-27b-it"),
         gr.Dropdown(choices=list(bytes_per_dtype.keys()), label="Data Type", value="float16"),
-        gr.Number(label="Context Size", value=128000)
     ],
     outputs=gr.Textbox(label="Estimated VRAM Usage"),
     title="LLM GPU VRAM Calculator",
-    description="Estimate the VRAM requirements of a model and context size."
 )
-iface.launch()

 import gradio as gr
 from typing import Dict, Union
+from huggingface_hub import get_safetensors_metadata, hf_hub_download, login
 import json
 # Dictionary mapping dtype strings to their byte sizes
 bytes_per_dtype: Dict[str, float] = {
     "int4": 0.5,
     "float32": 4,
 }
 def extract_keys(json_obj, keys_to_extract):
     extracted_values = {}
     def recursive_search(obj):
         if isinstance(obj, dict):
             for key, value in obj.items():
         elif isinstance(obj, list):
             for item in obj:
                 recursive_search(item)
     recursive_search(json_obj)
     return extracted_values
def calculate_kv_cache_memory(
    context_size: int,
    model_id: str,
    dtype: str,
    token: Union[str, None] = None,
) -> Union[float, str]:
    """Estimate the KV-cache memory footprint in gigabytes.

    Downloads the model's ``config.json`` from the Hugging Face Hub and
    computes::

        context_size * num_layers * kv_heads * head_dim * dtype_bytes * 2 / 1e9

    The factor 2 presumably accounts for storing both keys and values.

    Args:
        context_size: Number of tokens to hold in the cache.
        model_id: Hub repository ID whose ``config.json`` is read.
        dtype: Key into ``bytes_per_dtype`` giving bytes per cached element.
        token: Optional Hub access token, forwarded to ``hf_hub_download``.

    Returns:
        The estimated size in GB (1e9 bytes), or a string starting with
        ``"Error: "`` if anything fails. Callers tell the two apart with
        ``isinstance(result, str)``.
    """
    try:
        file_path = hf_hub_download(repo_id=model_id, filename="config.json", token=token)
        with open(file_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        keys_to_find = {
            "num_hidden_layers",
            "num_key_value_heads",
            "hidden_size",
            "num_attention_heads",
            "head_dim",
        }
        config = extract_keys(config, keys_to_find)

        num_layers = config["num_hidden_layers"]
        num_attention_heads = config["num_attention_heads"]

        # Models without grouped-query attention omit (or null out) the
        # KV-head count; they then use one KV head per attention head.
        num_kv_heads = config.get("num_key_value_heads")
        if num_kv_heads is None:
            num_kv_heads = num_attention_heads

        # Some architectures (e.g. Gemma) declare an explicit head_dim that is
        # NOT hidden_size // num_attention_heads, so prefer it when present.
        # NOTE(review): for multimodal configs, confirm extract_keys returns
        # values from the text model rather than the vision tower.
        dim_att_head = config.get("head_dim")
        if dim_att_head is None:
            dim_att_head = config["hidden_size"] // num_attention_heads

        dtype_bytes = bytes_per_dtype[dtype]
        memory_per_token = num_layers * num_kv_heads * dim_att_head * dtype_bytes * 2
        return (context_size * memory_per_token) / 1_000_000_000
    except Exception as e:
        # Errors are surfaced to the UI as text instead of being raised.
        return f"Error: {e}"
def calculate_model_memory(parameters: float, dtype: str) -> float:
    """Estimate the GPU memory in GB needed to hold the model weights.

    Applies ``M = (P * 4) / (32 / Q) * 1.18`` where ``P`` is the parameter
    count in billions, 4 is bytes per 32-bit parameter, ``Q`` is the number
    of bits per parameter for ``dtype``, and 1.18 adds roughly 18% overhead.

    Args:
        parameters: Number of model parameters, in billions.
        dtype: Key into ``bytes_per_dtype`` selecting the bytes per parameter.

    Returns:
        The estimate in GB, rounded to two decimals.

    Raises:
        KeyError: If ``dtype`` is not a key of ``bytes_per_dtype``.
    """
    # Named to avoid shadowing the builtin ``bytes``.
    bytes_per_param = bytes_per_dtype[dtype]
    bits_per_param = bytes_per_param * 8
    return round((parameters * 4) / (32 / bits_per_param) * 1.18, 2)
def get_model_size(
    model_id: str,
    dtype: str,
    token: Union[str, None] = None,
) -> Union[float, str]:
    """Estimate the weight memory (in GB) of a Hugging Face Hub model.

    Reads the parameter counts from the repository's safetensors metadata
    and passes the total, in billions, to ``calculate_model_memory``.

    Args:
        model_id: Hub repository ID to inspect.
        dtype: Key into ``bytes_per_dtype`` for the dtype the model is served in.
        token: Optional Hub access token, forwarded to
            ``get_safetensors_metadata``.

    Returns:
        The estimated size in GB, or a string starting with ``"Error: "``
        if the metadata is missing or any step fails.
    """
    try:
        metadata = get_safetensors_metadata(model_id, token=token)
        if not metadata or not metadata.parameter_count:
            return "Error: Could not fetch metadata."

        # parameter_count maps each stored dtype to its own parameter count.
        # Sum every bucket so mixed-dtype checkpoints are not under-counted.
        total_parameters = sum(int(count) for count in metadata.parameter_count.values())
        model_parameters = total_parameters / 1_000_000_000  # convert to billions
        return calculate_model_memory(model_parameters, dtype)
    except Exception as e:
        # Errors are surfaced to the UI as text instead of being raised.
        return f"Error: {e}"
+def estimate_vram(model_id, dtype, context_size, hf_token):
+    if hf_token:
+        login(token=hf_token)
     if dtype not in bytes_per_dtype:
         return "Error: Unsupported dtype"
+    model_memory = get_model_size(model_id, dtype, hf_token)
+    context_memory = calculate_kv_cache_memory(context_size, model_id, dtype, hf_token)
     if isinstance(model_memory, str) or isinstance(context_memory, str):
         return model_memory if isinstance(model_memory, str) else context_memory
     inputs=[
         gr.Textbox(label="Hugging Face Model ID", value="google/gemma-3-27b-it"),
         gr.Dropdown(choices=list(bytes_per_dtype.keys()), label="Data Type", value="float16"),
+        gr.Number(label="Context Size", value=128000),
+        gr.Textbox(label="Hugging Face Access Token", type="password", placeholder="Optional - Needed for gated models")
     ],
     outputs=gr.Textbox(label="Estimated VRAM Usage"),
     title="LLM GPU VRAM Calculator",
+    description="Estimate the VRAM requirements of a model and context size. Optionally provide a Hugging Face token for gated models."
 )
+iface.launch()