import os

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces

# Model repo to serve. HF_TOKEN is only needed when the repo is gated or private.
model_name = "scb10x/typhoon2.5-qwen3-4b"
token = os.getenv("HF_TOKEN")

# Load the tokenizer and model once at startup, in fp16 to keep memory usage low.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    token=token,
)

# Move the model to GPU when one is visible; otherwise stay on CPU.
model.to("cuda" if torch.cuda.is_available() else "cpu")


def generate_chat(message):
    messages = [{"role": "user", "content": message}]

    # Build the chat-formatted prompt and move it to the model's device.
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    output_ids = model.generate(
        input_ids,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens so the prompt is not echoed back.
    response_text = tokenizer.decode(
        output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True
    ).strip()

    return response_text


# On ZeroGPU Spaces, @spaces.GPU allocates a GPU for the duration of each call.
@spaces.GPU
def predict(message, history=None):
    return generate_chat(message)


chat_ui = gr.ChatInterface(
    fn=predict,
    title="Typhoon 2.5 ZeroGPU",
)

if __name__ == "__main__":
    chat_ui.launch()