import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "NousResearch/Hermes-4-14B"

# Load the tokenizer and model once at startup; device_map="auto" places the
# weights on the available GPU(s) via accelerate.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)


@spaces.GPU(duration=120)  # request a ZeroGPU slot for up to 120 s per call
def predict(message, history):
    # `history` arrives in OpenAI-style messages format (see type="messages"
    # below), so the new user turn can be appended directly.
    history.append({"role": "user", "content": message})

    # Format the conversation with the model's chat template and move the
    # token ids to the model's device.
    input_ids = tokenizer.apply_chat_template(
        history, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    with torch.inference_mode():
        output = model.generate(
            input_ids,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            top_k=40,
            repetition_penalty=1.1,
            do_sample=True,
        )

    # Strip the prompt tokens so only the newly generated reply is decoded.
    output_ids = output[0][input_ids.shape[-1]:]
    response = tokenizer.decode(output_ids, skip_special_tokens=True)

    # ChatInterface records the returned reply in the chat history itself,
    # so there is no need to append an assistant message here.
    return response


gr.ChatInterface(
    predict,
    type="messages",  # pass history as a list of {"role", "content"} dicts
    title="Hermes-4-14B Chatbot",
    description="Chat with Hermes-4-14B, a large language model by Nous Research",
    examples=[
        "Hello",
        "Explain quantum computing in simple terms",
        "What is the capital of France?",
    ],
).launch()
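
# Notes on the assumed environment (not pinned by this script): gradio, torch,
# and transformers must be installed, and device_map="auto" additionally
# requires accelerate. The `spaces` import and the @spaces.GPU decorator only
# take effect on Hugging Face ZeroGPU Spaces; for a plain local run on a GPU,
# both can be removed and the file started directly (e.g. `python app.py`,
# assuming the conventional Spaces filename).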