import os

import torch
import gradio as gr
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "scb10x/typhoon2.5-qwen3-4b"
token = os.getenv("HF_TOKEN")  # optional Hugging Face token (for gated/private repos)

tokenizer = AutoTokenizer.from_pretrained(model_name, token=token, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=token,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.to("cuda" if torch.cuda.is_available() else "cpu")


def generate_chat(message):
    # Build a single-turn prompt using the model's chat template.
    messages = [{"role": "user", "content": message}]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    output_ids = model.generate(
        input_ids,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens; slicing off the prompt is more
    # robust than splitting the full decoded string on the user message.
    new_tokens = output_ids[0][input_ids.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


@spaces.GPU
def predict(message, history=None):  # history is not used
    return generate_chat(message)


chat_ui = gr.ChatInterface(
    fn=predict,
    title="Typhoon 2.5 ZeroGPU",
)

if __name__ == "__main__":
    chat_ui.launch()