Update app.py
3bd8cbe verified
diff --git a/app.py b/app.py
index 0000000..1111111 100644
--- a/app.py
+++ b/app.py
@@ -1,16 +1,28 @@
import gradio as gr
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
-import spaces
# Model configuration
MID = "apple/FastVLM-0.5B"
IMAGE_TOKEN_INDEX = -200
# Load model and tokenizer (will be loaded on first GPU allocation)
tok = None
model = None
def load_model():
    global tok, model
    if tok is None or model is None:
        print("Loading model...")
        tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            MID,
-            torch_dtype=torch.float16,
-            device_map="cuda",
-            trust_remote_code=True,
-        )
+        # ---- Pick the device at runtime: CUDA if available, otherwise CPU ----
+        use_cuda = torch.cuda.is_available()
+        device_map = "cuda" if use_cuda else "cpu"
+        # float16 is fine on GPU but often unsupported or slow on CPU; use float32 there
+        dtype = torch.float16 if use_cuda else torch.float32
+
+        model = AutoModelForCausalLM.from_pretrained(
+            MID,
+            torch_dtype=dtype,
+            device_map=device_map,
+            trust_remote_code=True,
+        )
        print("Model loaded successfully!")
    return tok, model
-
-@spaces.GPU(duration=60)
+
+# Removed GPU decorator so CPU Spaces don't request a GPU
def caption_image(image, custom_prompt=None):
@@ -66,16 +78,23 @@ def caption_image(image, custom_prompt=None):
    # Insert IMAGE token id at placeholder position
-    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
-    input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
-    attention_mask = torch.ones_like(input_ids, device=model.device)
+    # Derive device/dtype from model parameters (robust on CPU or GPU)
+    model_device = next(model.parameters()).device
+    model_dtype = next(model.parameters()).dtype
+
+    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype, device=model_device)
+    input_ids = torch.cat([pre_ids.to(model_device), img_tok, post_ids.to(model_device)], dim=1)
+    attention_mask = torch.ones_like(input_ids, device=model_device)
    # Preprocess image using model's vision tower
    px = model.get_vision_tower().image_processor(
        images=image, return_tensors="pt"
    )["pixel_values"]
-    px = px.to(model.device, dtype=model.dtype)
+    px = px.to(model_device, dtype=model_dtype)
    # Generate caption
    with torch.no_grad():
        out = model.generate(
            inputs=input_ids,
            attention_mask=attention_mask,
            images=px,
            max_new_tokens=128,
            do_sample=False,  # Deterministic generation
-            temperature=1.0,
+            # temperature is ignored when do_sample=False
        )
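
For reference, a minimal local smoke test of the updated flow could look like the sketch below. It assumes caption_image returns the decoded caption as a string (the diff is truncated before the return statement) and that "example.jpg" is any RGB test image; neither detail is part of this commit.

# Hypothetical smoke test (not part of the commit) for the CPU/GPU-agnostic path above.
import torch
from PIL import Image

from app import caption_image  # function patched in this commit

if __name__ == "__main__":
    print("CUDA available:", torch.cuda.is_available())
    img = Image.open("example.jpg").convert("RGB")  # placeholder test image
    print(caption_image(img, custom_prompt="Describe this image in one sentence."))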