beyoru committed on
Commit
0da087b
·
verified ·
1 Parent(s): 74622a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -51
app.py CHANGED
@@ -1,74 +1,111 @@
1
- import gradio as gr
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
 
 
 
4
 
5
- # --- Load model ---
6
  MODEL_NAME = "beyoru/Qwen3-0.9B-A0.6B"
7
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
 
8
  model = AutoModelForCausalLM.from_pretrained(
9
  MODEL_NAME,
10
- torch_dtype=torch.float16,
 
 
11
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # --- Chat function ---
14
- def chat_fn(message, history, num_ctx, temperature, repeat_penalty, min_p, top_k, top_p, presence_penalty):
15
- if not message.strip():
16
- return ""
17
-
18
- # Tạo context chat từ lịch sử
19
- conversation = ""
20
- for turn in history:
21
- role, content = turn["role"], turn["content"]
22
- if role == "user":
23
- conversation += f"User: {content}\n"
24
- else:
25
- conversation += f"Assistant: {content}\n"
26
- conversation += f"User: {message}\nAssistant:"
27
-
28
- inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=int(num_ctx)).to(model.device)
29
-
30
- outputs = model.generate(
31
  **inputs,
32
- max_new_tokens=2048,
 
33
  temperature=float(temperature),
 
34
  top_p=float(top_p),
35
- top_k=int(top_k),
36
- repetition_penalty=float(repeat_penalty),
37
- do_sample=True,
38
- eos_token_id=tokenizer.eos_token_id
39
  )
40
 
41
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
42
- # Cắt phần trước "Assistant:" để chỉ lấy câu trả lời
43
- if "Assistant:" in response:
44
- response = response.split("Assistant:")[-1].strip()
45
- return response
 
 
 
 
 
 
46
 
47
- # --- Giao diện Gradio ---
48
  with gr.Blocks(fill_height=True, fill_width=True) as app:
49
  with gr.Sidebar():
50
- gr.Markdown("## Qwen3 Playground (Transformers Edition)")
51
- gr.Markdown("Model: **beyoru/Qwen3-0.9B-A0.6B** — chạy trực tiếp bằng Transformers")
52
-
53
- num_ctx = gr.Slider(512, 8192, 2048, 128, label="Context Length (num_ctx)")
54
- temperature = gr.Slider(0.1, 2.0, 0.6, 0.1, label="Temperature")
55
- repeat_penalty = gr.Slider(0.1, 2.0, 1.0, 0.1, label="Repeat Penalty")
56
- min_p = gr.Slider(0.0, 1.0, 0.0, 0.01, label="Min P")
57
- top_k = gr.Slider(0, 100, 20, 1, label="Top K")
58
- top_p = gr.Slider(0.0, 1.0, 0.95, 0.05, label="Top P")
59
- presence_penalty = gr.Slider(0.0, 2.0, 1.5, 0.1, label="Presence Penalty")
 
 
 
 
 
 
 
60
 
61
  gr.ChatInterface(
62
- fn=chat_fn,
63
- additional_inputs=[num_ctx, temperature, repeat_penalty, min_p, top_k, top_p, presence_penalty],
64
- chatbot=gr.Chatbot(label="Transformers | Qwen3 (0.9B-A0.6B)", type="messages", show_copy_button=True),
 
 
 
 
65
  examples=[
66
- ["Introduce yourself."],
67
- ["Explain quantum computers."],
68
- ["Give a summary of World War II."]
69
  ],
70
  cache_examples=False,
71
  show_api=False
72
  )
73
 
74
- app.launch(server_name="0.0.0.0", pwa=True)
 
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
import gradio as gr

# NOTE(review): `os` is imported but not referenced anywhere in this view —
# confirm whether it is needed (e.g. for env vars) before removing.

# Hugging Face Hub repo id of the chat model this Space serves.
MODEL_NAME = "beyoru/Qwen3-0.9B-A0.6B"

print("Loading model...")
# trust_remote_code=True because the repo may ship custom tokenizer/model code.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,  # half the memory of fp32 while keeping fp32's exponent range
    device_map="auto",           # place weights on GPU if available, otherwise CPU
    trust_remote_code=True
)
print("Model loaded.")
18
+
19
+
20
def playground(
    message,
    history,
    max_new_tokens,
    temperature,
    repetition_penalty,
    top_k,
    top_p
):
    """Stream an assistant reply for *message* given the prior chat *history*.

    Generator used as the ``fn`` of ``gr.ChatInterface``: it yields the
    accumulated reply after every decoded chunk so the UI streams text.

    Parameters
    ----------
    message : str
        Latest user turn; blank or non-string input yields "" and returns.
    history : list
        Prior turns. Accepts BOTH Gradio history formats:
        ``(user, assistant)`` tuples and "messages"-style
        ``{"role": ..., "content": ...}`` dicts.
    max_new_tokens, temperature, repetition_penalty, top_k, top_p
        Sampling parameters forwarded to ``model.generate``.
    """
    if not isinstance(message, str) or not message.strip():
        yield ""
        return

    # Normalize history into role/content messages.
    # BUG FIX: the original unconditionally unpacked each entry as a
    # (user, assistant) tuple, which raises/garbles when ChatInterface
    # passes "messages"-format history (a list of role/content dicts).
    conversation = []
    for turn in history:
        if isinstance(turn, dict):
            # messages format: keep non-empty entries as-is
            if turn.get("content"):
                conversation.append(
                    {"role": turn.get("role", "user"), "content": turn["content"]}
                )
        else:
            user_msg, bot_msg = turn
            conversation.append({"role": "user", "content": user_msg})
            if bot_msg:
                conversation.append({"role": "assistant", "content": bot_msg})
    conversation.append({"role": "user", "content": message})

    # Prefer the model's own chat template; fall back to a plain
    # "role: content" transcript for tokenizers without one.
    if hasattr(tokenizer, "apply_chat_template"):
        prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    else:
        prompt = "\n".join(f"{msg['role']}: {msg['content']}" for msg in conversation) + "\nassistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # skip_prompt=True: the streamer emits only newly generated text.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_k=int(top_k) if top_k > 0 else None,  # 0 disables top-k filtering
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        do_sample=temperature > 0,  # greedy decode only at temperature 0
        pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
    )

    # model.generate blocks, so run it on a background thread and consume
    # the streamer from this generator.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text

    thread.join()
72
+
73
 
 
74
# Gradio UI: sidebar with model info + sampling sliders, main area with the
# streaming chat interface backed by `playground`.
with gr.Blocks(fill_height=True, fill_width=True) as app:
    with gr.Sidebar():
        gr.Markdown("## Playground by UltimaX Intelligence")
        # FIX: corrected "Supprot" -> "Support", dropped the stray "." after
        # the colon, and removed an unmatched trailing </p> tag.
        gr.HTML("""
            Runs <b><a href="https://huggingface.co/beyoru/Qwen3-0.9B-A0.6B" target="_blank">
            beyoru/Qwen3-0.9B-A0.6B</a></b> via <b>Hugging Face Transformers</b>.<br><br>
            <b>Support me at:</b><br><br>
            <a href="https://www.buymeacoffee.com/ductransa0g" target="_blank">
            <img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png" alt="Buy Me A Coffee" width="150px">
            </a>
            """)
        gr.Markdown("---")
        gr.Markdown("## Generation Parameters")
        max_new_tokens = gr.Slider(32, 512, value=256, step=32, label="Max New Tokens")
        temperature = gr.Slider(0.1, 2.0, value=0.6, step=0.1, label="Temperature")
        repetition_penalty = gr.Slider(0.1, 2.0, value=1.0, step=0.1, label="Repetition Penalty")
        top_k = gr.Slider(0, 100, value=20, step=1, label="Top K (0 = off)")
        top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.05, label="Top P")

    gr.ChatInterface(
        fn=playground,
        additional_inputs=[max_new_tokens, temperature, repetition_penalty, top_k, top_p],
        chatbot=gr.Chatbot(
            label="Qwen3-0.9B-A0.6B",
            show_copy_button=True,
            allow_tags=["think"],  # render the model's <think>...</think> spans
        ),
        examples=[
            ["Hello who are you?"],
            ["How to solve 2x+1=3."],
            ["Example python code for async"]
        ],
        cache_examples=False,
        show_api=False
    )

# 0.0.0.0 so the server is reachable from outside the container (HF Spaces).
app.launch(server_name="0.0.0.0", pwa=True)