Fraser committed on
Commit f3080ed · 1 Parent(s): b39563c
Files changed (1)
  1. index.html +101 -82
index.html CHANGED
@@ -1,5 +1,4 @@
  <!doctype html>
-
  <html lang="en">
  <head>
  <meta charset="utf-8" />
@@ -50,84 +49,67 @@
  <main>
  <div class="bar">
  <label for="model">Model:</label>
- <!-- FIX: add the missing <select id="model"> and remove stray text node "$1" -->
  <select id="model" class="grow">
- <!-- NEW: Gemma-3-270M from ggml-org (public GGUF) -->
  <option selected value='{"id":"ggml-org/gemma-3-270m-GGUF","file":"gemma-3-270m-Q8_0.gguf","label":"Gemma‑3‑270M Q8_0 (≈292 MB)"}'>Gemma‑3‑270M Q8_0 (≈292 MB)</option>
- <!-- Smallest RAM / fastest (good for phones) -->
  <option value='{"id":"mradermacher/OpenELM-270M-Instruct-GGUF","file":"OpenELM-270M-Instruct.Q3_K_S.gguf","label":"OpenELM‑270M‑Instruct Q3_K_S (≈134 MB)"}'>OpenELM‑270M‑Instruct Q3_K_S (≈134 MB)</option>
  <option value='{"id":"mradermacher/OpenELM-270M-Instruct-GGUF","file":"OpenELM-270M-Instruct.Q4_K_M.gguf","label":"OpenELM‑270M‑Instruct Q4_K_M (≈175 MB)"}'>OpenELM‑270M‑Instruct Q4_K_M (≈175 MB)</option>
-
- ```
- <!-- Good quality while still small -->
- <option value='{"id":"mav23/SmolLM-135M-Instruct-GGUF","file":"smollm-135m-instruct.Q3_K_S.gguf","label":"SmolLM‑135M‑Instruct Q3_K_S (≈88 MB)"}'>SmolLM‑135M‑Instruct Q3_K_S (≈88 MB)</option>
- <option value='{"id":"QuantFactory/SmolLM-360M-Instruct-GGUF","file":"SmolLM-360M-Instruct.Q3_K_S.gguf","label":"SmolLM‑360M‑Instruct Q3_K_S (≈219 MB)"}'>SmolLM‑360M‑Instruct Q3_K_S (≈219 MB)</option>
-
- <!-- Stronger tiny model (bigger, still phone‑possible on high‑end) -->
- <option value='{"id":"Qwen/Qwen2.5-0.5B-Instruct-GGUF","file":"qwen2.5-0.5b-instruct-q3_k_m.gguf","label":"Qwen2.5‑0.5B‑Instruct Q3_K_M (≈432 MB)"}'>Qwen2.5‑0.5B‑Instruct Q3_K_M (≈432 MB)</option>
- <option value='{"id":"Qwen/Qwen2.5-0.5B-Instruct-GGUF","file":"qwen2.5-0.5b-instruct-q4_k_m.gguf","label":"Qwen2.5‑0.5B‑Instruct Q4_K_M (≈491 MB)"}'>Qwen2.5‑0.5B‑Instruct Q4_K_M (≈491 MB)</option>
-
- <!-- Optional: bigger but better; may be too heavy for some phones -->
- <option value='{"id":"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF","file":"tinyllama-1.1b-chat-v1.0.Q3_K_S.gguf","label":"TinyLlama‑1.1B‑Chat Q3_K_S (≈500 MB)"}'>TinyLlama‑1.1B‑Chat Q3_K_S (≈500 MB)</option>
-
- <!-- Your original SmolLM2 360M options (kept) -->
- <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q4_0.gguf","label":"SmolLM2‑360M Q4_0 (≈229 MB)"}'>SmolLM2‑360M Q4_0 (≈229 MB)</option>
- <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q3_K_S.gguf","label":"SmolLM2‑360M Q3_K_S (≈219 MB, faster)"}'>SmolLM2‑360M Q3_K_S (≈219 MB, faster)</option>
- <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q2_K.gguf","label":"SmolLM2‑360M Q2_K (≈200 MB, min RAM / quality drop)"}'>SmolLM2‑360M Q2_K (≈200 MB, min RAM / quality drop)</option>
-
- <!-- Custom (use for Gemma‑3‑270M when a public GGUF exists) -->
- <option value='{"custom":true,"label":"Custom HF GGUF (e.g., Gemma‑3‑270M)"}'>Custom HF GGUF (e.g., Gemma‑3‑270M)</option>
- </select>
-
- <details id="customBox">
- <summary class="tiny">Custom GGUF (paste HF repo + file)</summary>
- <div class="row">
- <label class="tiny">HF repo id</label>
- <input id="customRepo" type="text" placeholder="e.g. google/gemma-3-270m-GGUF (when available)" style="width:280px" />
- <label class="tiny">file</label>
- <input id="customFile" type="text" placeholder="e.g. gemma-3-270m.Q4_0.gguf" style="width:240px" />
  </div>
- <div class="note">Note: official <a href="https://huggingface.co/google/gemma-3-270m" target="_blank" rel="noreferrer">Gemma‑3‑270M</a> is the base HF repo. A ready‑to‑use public GGUF is now available at <a href="https://huggingface.co/ggml-org/gemma-3-270m-GGUF" target="_blank" rel="noreferrer">ggml‑org/gemma‑3‑270m‑GGUF</a> (currently providing <code>gemma-3-270m-Q8_0.gguf</code> ≈292 MB). For maximum speed on low‑RAM phones, the OpenELM‑270M‑Instruct Q3_K_S option above is even lighter, but Gemma‑3‑270M offers strong quality for its size.</div>
- </details>
-
- <div class="row">
- <label>Max new tokens</label>
- <input id="nPredict" type="number" min="1" max="512" step="1" value="128" />
- </div>
- <div class="row">
- <label>Temp</label><input id="temp" type="number" min="0" max="2" step="0.1" value="0.7" style="width:80px" />
- <label>Top‑p</label><input id="topp" type="number" min="0" max="1" step="0.05" value="0.9" style="width:80px" />
- <label>Top‑k</label><input id="topk" type="number" min="1" max="100" step="1" value="40" style="width:80px" />
- </div>
-
- <div class="spacer"></div>
-
- <button id="loadBtn" class="primary">Load model</button>
- <button id="unloadBtn" class="ghost" disabled>Unload</button>
-
- <div class="progress" title="download progress"><i id="prog"></i></div>
- <div id="stats">idle</div>
- </div>
-
- <div id="chat" aria-live="polite"></div>
-
- <form class="inputbar" id="form">
- <textarea id="input" placeholder="Ask me anything…" required></textarea>
- <div class="row" style="flex-direction:column; gap:6px; align-items:flex-end">
- <button id="sendBtn" class="primary">Send</button>
- <button id="stopBtn" type="button" class="ghost" disabled>Stop</button>
- <div class="tiny">Context kept small for mobile perf</div>
- </div>
- </form>
- ```

  </main>

  <script type="module">
- // ——— Fixed imports (pin version + explicit wasm paths) ———
  import { Wllama, LoggerWithoutDebug } from "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/index.js";

- // Provide the wasm URLs directly so there is no "+esm" indirection.
  const CONFIG_PATHS = {
  "single-thread/wllama.wasm": "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/single-thread/wllama.wasm",
  "multi-thread/wllama.wasm" : "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/multi-thread/wllama.wasm",
@@ -160,9 +142,9 @@
  // Keep RAM low for mobile: small context + FP16 V‑cache (WASM safe)
  const LOAD_CONFIG = {
  n_ctx: 768,
- n_batch: 128, // must be >= 64 to satisfy GGML_KQ_MASK_PAD and avoid batch overflow in wasm
- cache_type_k: "q4_0", // int4 K cache: reduces RAM without flash_attn
- cache_type_v: "f16", // IMPORTANT: V cache quant requires flash_attn; not available in WASM
  flash_attn: false,
  progressCallback: ({ loaded, total }) => {
  const pct = (total && total > 0) ? Math.round(loaded / total * 100) : 0;
@@ -172,6 +154,28 @@

  const messages = [ { role: "system", content: sysPrompt } ];

  // ——— UI helpers ———
  const ui = {
  add(role, text) {
@@ -218,6 +222,11 @@
  return parsed;
  }

  async function ensureLoaded() {
  if (loaded) return;
  $prog.style.width = '0%';
@@ -226,7 +235,6 @@
  try {
  await wllama.loadModelFromHF(choice.id, choice.file, LOAD_CONFIG);
  } catch (e) {
- // Common causes: gated repo, missing file, or CORS
  throw new Error(`Load failed for ${choice.id}/${choice.file}. If the repo is gated or lacks CORS, try a public mirror / different quant. Details: ${e?.message || e}`);
  }
  loaded = true;
@@ -267,9 +275,9 @@
  $input.value = '';

  const assistantBubble = ui.add('assistant', '');
- truncateHistoryForMobile(600); // trim harder to reduce initial prompt size vs 768 ctx

- $send.disabled = true; $stop.disabled = true; // will flip to true once stream starts
  aborter = new AbortController();

  const nPredict = parseInt(document.getElementById('nPredict').value, 10);
@@ -281,15 +289,26 @@
  let outText = '';

  try {
- $stop.disabled = false;
- const stream = await wllama.createChatCompletion(messages, {
  stream: true,
  useCache: true,
  nPredict,
  sampling: { temp, top_p, top_k },
  stopTokens: eotToken > 0 ? [eotToken] : undefined,
  abortSignal: aborter.signal
- });

  for await (const chunk of stream) {
  const piece = new TextDecoder().decode(chunk.piece);
@@ -324,11 +343,11 @@
  </script>

  <!--
- What changed:
- FIXED: the crash was caused by a missing <select id="model"> (document.getElementById('model') returned null). Added the select and removed a stray "$1" text node.
- Added explicit label-for association (for accessibility) and set the first option as selected.
- Kept all other logic identical.
  -->
-
  </body>
  </html>
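Taken together, the pinned import, explicit wasm paths, and WASM‑safe cache settings in the removed side reduce to a small standalone load script. A minimal sketch (assumes wllama 2.3.x as pinned above; passing `LoggerWithoutDebug` via the constructor's `logger` option is an assumption inferred from the import, not something this file shows verbatim):

```js
import { Wllama, LoggerWithoutDebug } from "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/index.js";

// Explicit wasm URLs: no "+esm" indirection, and the wasm version cannot drift from index.js.
const CONFIG_PATHS = {
  "single-thread/wllama.wasm": "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/single-thread/wllama.wasm",
  "multi-thread/wllama.wasm":  "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/multi-thread/wllama.wasm",
};

// ASSUMPTION: the logger option is inferred from the LoggerWithoutDebug import.
const wllama = new Wllama(CONFIG_PATHS, { logger: LoggerWithoutDebug });

// WASM-safe KV-cache choices, per the diff's own notes: the K cache can be
// quantized to q4_0, but the V cache must stay f16 because V-cache quantization
// requires flash_attn, which llama.cpp's WASM build lacks; n_batch >= 64 avoids
// the GGML_KQ_MASK_PAD batch-overflow issue.
await wllama.loadModelFromHF("ggml-org/gemma-3-270m-GGUF", "gemma-3-270m-Q8_0.gguf", {
  n_ctx: 768,
  n_batch: 128,
  cache_type_k: "q4_0",
  cache_type_v: "f16",
  flash_attn: false,
});
```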
 
  <!doctype html>
  <html lang="en">
  <head>
  <meta charset="utf-8" />

  <main>
  <div class="bar">
  <label for="model">Model:</label>
  <select id="model" class="grow">
  <option selected value='{"id":"ggml-org/gemma-3-270m-GGUF","file":"gemma-3-270m-Q8_0.gguf","label":"Gemma‑3‑270M Q8_0 (≈292 MB)"}'>Gemma‑3‑270M Q8_0 (≈292 MB)</option>
  <option value='{"id":"mradermacher/OpenELM-270M-Instruct-GGUF","file":"OpenELM-270M-Instruct.Q3_K_S.gguf","label":"OpenELM‑270M‑Instruct Q3_K_S (≈134 MB)"}'>OpenELM‑270M‑Instruct Q3_K_S (≈134 MB)</option>
  <option value='{"id":"mradermacher/OpenELM-270M-Instruct-GGUF","file":"OpenELM-270M-Instruct.Q4_K_M.gguf","label":"OpenELM‑270M‑Instruct Q4_K_M (≈175 MB)"}'>OpenELM‑270M‑Instruct Q4_K_M (≈175 MB)</option>
+ <option value='{"id":"mav23/SmolLM-135M-Instruct-GGUF","file":"smollm-135m-instruct.Q3_K_S.gguf","label":"SmolLM‑135M‑Instruct Q3_K_S (≈88 MB)"}'>SmolLM‑135M‑Instruct Q3_K_S (≈88 MB)</option>
+ <option value='{"id":"QuantFactory/SmolLM-360M-Instruct-GGUF","file":"SmolLM-360M-Instruct.Q3_K_S.gguf","label":"SmolLM‑360M‑Instruct Q3_K_S (≈219 MB)"}'>SmolLM‑360M‑Instruct Q3_K_S (≈219 MB)</option>
+ <option value='{"id":"Qwen/Qwen2.5-0.5B-Instruct-GGUF","file":"qwen2.5-0.5b-instruct-q3_k_m.gguf","label":"Qwen2.5‑0.5B‑Instruct Q3_K_M (≈432 MB)"}'>Qwen2.5‑0.5B‑Instruct Q3_K_M (≈432 MB)</option>
+ <option value='{"id":"Qwen/Qwen2.5-0.5B-Instruct-GGUF","file":"qwen2.5-0.5b-instruct-q4_k_m.gguf","label":"Qwen2.5‑0.5B‑Instruct Q4_K_M (≈491 MB)"}'>Qwen2.5‑0.5B‑Instruct Q4_K_M (≈491 MB)</option>
+ <option value='{"id":"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF","file":"tinyllama-1.1b-chat-v1.0.Q3_K_S.gguf","label":"TinyLlama‑1.1B‑Chat Q3_K_S (≈500 MB)"}'>TinyLlama‑1.1B‑Chat Q3_K_S (≈500 MB)</option>
+ <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q4_0.gguf","label":"SmolLM2‑360M Q4_0 (≈229 MB)"}'>SmolLM2‑360M Q4_0 (≈229 MB)</option>
+ <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q3_K_S.gguf","label":"SmolLM2‑360M Q3_K_S (≈219 MB, faster)"}'>SmolLM2‑360M Q3_K_S (≈219 MB, faster)</option>
+ <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q2_K.gguf","label":"SmolLM2‑360M Q2_K (≈200 MB, min RAM / quality drop)"}'>SmolLM2‑360M Q2_K (≈200 MB, min RAM / quality drop)</option>
+ <option value='{"custom":true,"label":"Custom HF GGUF (e.g., Gemma‑3‑270M)"}'>Custom HF GGUF (e.g., Gemma‑3‑270M)</option>
+ </select>
+
+ <details id="customBox">
+ <summary class="tiny">Custom GGUF (paste HF repo + file)</summary>
+ <div class="row">
+ <label class="tiny">HF repo id</label>
+ <input id="customRepo" type="text" placeholder="e.g. google/gemma-3-270m-GGUF (when available)" style="width:280px" />
+ <label class="tiny">file</label>
+ <input id="customFile" type="text" placeholder="e.g. gemma-3-270m.Q4_0.gguf" style="width:240px" />
+ </div>
+ <div class="note">Note: official <a href="https://huggingface.co/google/gemma-3-270m" target="_blank" rel="noreferrer">Gemma‑3‑270M</a> is the base HF repo. A ready‑to‑use public GGUF is now available at <a href="https://huggingface.co/ggml-org/gemma-3-270m-GGUF" target="_blank" rel="noreferrer">ggml‑org/gemma‑3‑270m‑GGUF</a> (currently providing <code>gemma-3-270m-Q8_0.gguf</code> ≈292 MB). For maximum speed on low‑RAM phones, the OpenELM‑270M‑Instruct Q3_K_S option above is even lighter, but Gemma‑3‑270M offers strong quality for its size.</div>
+ </details>
+
+ <div class="row">
+ <label>Max new tokens</label>
+ <input id="nPredict" type="number" min="1" max="512" step="1" value="128" />
+ </div>
+ <div class="row">
+ <label>Temp</label><input id="temp" type="number" min="0" max="2" step="0.1" value="0.7" style="width:80px" />
+ <label>Top‑p</label><input id="topp" type="number" min="0" max="1" step="0.05" value="0.9" style="width:80px" />
+ <label>Top‑k</label><input id="topk" type="number" min="1" max="100" step="1" value="40" style="width:80px" />
+ </div>
+
+ <div class="spacer"></div>
+
+ <button id="loadBtn" class="primary">Load model</button>
+ <button id="unloadBtn" class="ghost" disabled>Unload</button>
+
+ <div class="progress" title="download progress"><i id="prog"></i></div>
+ <div id="stats">idle</div>
  </div>

+ <div id="chat" aria-live="polite"></div>
+
+ <form class="inputbar" id="form">
+ <textarea id="input" placeholder="Ask me anything…" required></textarea>
+ <div class="row" style="flex-direction:column; gap:6px; align-items:flex-end">
+ <button id="sendBtn" class="primary">Send</button>
+ <button id="stopBtn" type="button" class="ghost" disabled>Stop</button>
+ <div class="tiny">Context kept small for mobile perf</div>
+ </div>
+ </form>
  </main>

  <script type="module">
+ // ——— Imports ———
  import { Wllama, LoggerWithoutDebug } from "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/index.js";

  const CONFIG_PATHS = {
  "single-thread/wllama.wasm": "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/single-thread/wllama.wasm",
  "multi-thread/wllama.wasm" : "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/multi-thread/wllama.wasm",

  // Keep RAM low for mobile: small context + FP16 V‑cache (WASM safe)
  const LOAD_CONFIG = {
  n_ctx: 768,
+ n_batch: 128,
+ cache_type_k: "q4_0",
+ cache_type_v: "f16",
  flash_attn: false,
  progressCallback: ({ loaded, total }) => {
  const pct = (total && total > 0) ? Math.round(loaded / total * 100) : 0;

  const messages = [ { role: "system", content: sysPrompt } ];

+ // ——— Chat template for Gemma IT ———
+ const GEMMA_JINJA = `{{ bos_token }}
+ {%- if messages[0]['role'] == 'system' -%}
+ {%- if messages[0]['content'] is string -%}
+ {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}
+ {%- else -%}
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}
+ {%- endif -%}
+ {%- set loop_messages = messages[1:] -%}
+ {%- else -%}
+ {%- set first_user_prefix = "" -%}
+ {%- set loop_messages = messages -%}
+ {%- endif -%}
+ {%- for message in loop_messages -%}
+ {%- set role = (message['role'] == 'assistant') and 'model' or message['role'] -%}
+ <start_of_turn>{{ role }}
+ {{ (loop.first and first_user_prefix or '') ~ (message['content'] if message['content'] is string else message['content'][0]['text']) | trim }}<end_of_turn>
+ {%- endfor -%}
+ {%- if add_generation_prompt -%}
+ <start_of_turn>model
+ {%- endif -%}`;
+
  // ——— UI helpers ———
  const ui = {
  add(role, text) {
 
  return parsed;
  }

+ function isGemmaSelected() {
+ const { id, file, label } = getSelectedModel();
+ return /gemma/i.test(id) || /gemma/i.test(file) || /gemma/i.test(label);
+ }
+
  async function ensureLoaded() {
  if (loaded) return;
  $prog.style.width = '0%';

  try {
  await wllama.loadModelFromHF(choice.id, choice.file, LOAD_CONFIG);
  } catch (e) {
  throw new Error(`Load failed for ${choice.id}/${choice.file}. If the repo is gated or lacks CORS, try a public mirror / different quant. Details: ${e?.message || e}`);
  }
  loaded = true;

  $input.value = '';

  const assistantBubble = ui.add('assistant', '');
+ truncateHistoryForMobile(600);

+ $send.disabled = true; $stop.disabled = true; // flip to false on stream start
  aborter = new AbortController();

  const nPredict = parseInt(document.getElementById('nPredict').value, 10);
 
  let outText = '';

  try {
+ const opts = {
  stream: true,
  useCache: true,
  nPredict,
  sampling: { temp, top_p, top_k },
  stopTokens: eotToken > 0 ? [eotToken] : undefined,
  abortSignal: aborter.signal
+ };
+
+ let stream;
+ if (isGemmaSelected()) {
+ // Render messages with Gemma template, then complete as plain text
+ const prompt = await wllama.formatChat(messages, /* addAssistant */ true, GEMMA_JINJA);
+ $stop.disabled = false;
+ stream = await wllama.createCompletion(prompt, opts);
+ } else {
+ // Other models: rely on their embedded chat templates
+ $stop.disabled = false;
+ stream = await wllama.createChatCompletion(messages, opts);
+ }

  for await (const chunk of stream) {
  const piece = new TextDecoder().decode(chunk.piece);
 
  </script>

  <!--
+ Changes for Gemma:
+ • Added GEMMA_JINJA chat template (<start_of_turn>/<end_of_turn> with BOS).
+ • When a Gemma model is selected, messages are formatted via wllama.formatChat(..., GEMMA_JINJA)
+   and sent to createCompletion() to avoid ChatML (<|im_start|>/<|im_end|>) fallback.
+ • Non‑Gemma models still use createChatCompletion().
  -->

  </body>
  </html>
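Since the whole point of the Gemma path is the prompt shape, a quick sanity check is to render a toy conversation through the template and inspect the result. A minimal sketch (the conversation is invented; `formatChat` with a custom-template argument is used exactly as this commit uses it, and the expected shape below follows Gemma's published turn format, with the system text folded into the first user turn):

```js
// Assumes the page's `wllama` instance and GEMMA_JINJA are in scope.
const demo = [
  { role: "system", content: "You are a concise assistant." },
  { role: "user", content: "Hello!" },
];

// Render with the generation prompt appended, as the send handler does.
const prompt = await wllama.formatChat(demo, /* addAssistant */ true, GEMMA_JINJA);
console.log(prompt);

// Expected shape (modulo whitespace):
// <bos><start_of_turn>user
// You are a concise assistant.
//
// Hello!<end_of_turn>
// <start_of_turn>model
```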