sovetskiysn committed on
Commit
4b050eb
·
1 Parent(s): e96c96e

Add application file

Browse files
Files changed (1) hide show
  1. app.py +40 -0
app.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM
3
+ import gradio as gr
4
+
5
# Hugging Face model id: Sherkala 8B chat model (LLaMA-3.1 based, multilingual).
model_path = "inceptionai/Llama-3.1-Sherkala-8B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# bfloat16 halves memory vs fp32; device_map="auto" lets accelerate place/shard
# the weights across whatever devices are available at load time.
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.bfloat16, device_map="auto"
)

# Override the chat template with the LLaMA-3.1 header/eot format so that
# apply_chat_template produces prompts in the layout the model expects.
tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

# Device that input tensors are moved to before generation (model placement
# itself is handled separately by device_map="auto" above).
device = "cuda" if torch.cuda.is_available() else "cpu"
14
+
15
def chat_fn(user_input):
    """Generate a single-turn chat reply from the Sherkala model.

    Args:
        user_input: The user's message as a plain string.

    Returns:
        The model's decoded reply with special tokens stripped.
    """
    conversation = [{"role": "user", "content": user_input}]
    input_ids = tokenizer.apply_chat_template(
        conversation=conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(device)

    # inference_mode avoids building an autograd graph during generation,
    # cutting memory use. An explicit attention_mask and pad_token_id are
    # supplied so generate() does not have to guess them (avoids the
    # "attention mask not set" warning and ill-defined padding behavior).
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            attention_mask=torch.ones_like(input_ids),
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Slice off the prompt tokens; decode only the newly generated portion.
    response = tokenizer.decode(
        output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True
    )
    return response
32
+
33
# Build a minimal text-in / text-out Gradio UI around chat_fn and start it.
demo = gr.Interface(
    fn=chat_fn,
    inputs="text",
    outputs="text",
    title="Sherkala-8B Chat",
    description="Multilingual LLaMA-3.1 based model (Kazakh, Russian, English)",
    theme="default",
)
demo.launch()