Fraser committed on
Commit f3080ed · 1 Parent(s): b39563c
Files changed (1)
  1. index.html +101 -82
index.html CHANGED
@@ -1,5 +1,4 @@
  <!doctype html>
-
  <html lang="en">
  <head>
  <meta charset="utf-8" />
@@ -50,84 +49,67 @@
  <main>
  <div class="bar">
  <label for="model">Model:</label>
- <!-- FIX: add the missing <select id="model"> and remove stray text node "$1" -->
  <select id="model" class="grow">
- <!-- NEW: Gemma-3-270M from ggml-org (public GGUF) -->
  <option selected value='{"id":"ggml-org/gemma-3-270m-GGUF","file":"gemma-3-270m-Q8_0.gguf","label":"Gemma‑3‑270M Q8_0 (≈292 MB)"}'>Gemma‑3‑270M Q8_0 (≈292 MB)</option>
- <!-- Smallest RAM / fastest (good for phones) -->
  <option value='{"id":"mradermacher/OpenELM-270M-Instruct-GGUF","file":"OpenELM-270M-Instruct.Q3_K_S.gguf","label":"OpenELM‑270M‑Instruct Q3_K_S (≈134 MB)"}'>OpenELM‑270M‑Instruct Q3_K_S (≈134 MB)</option>
  <option value='{"id":"mradermacher/OpenELM-270M-Instruct-GGUF","file":"OpenELM-270M-Instruct.Q4_K_M.gguf","label":"OpenELM‑270M‑Instruct Q4_K_M (≈175 MB)"}'>OpenELM‑270M‑Instruct Q4_K_M (≈175 MB)</option>
-
- ```
- <!-- Good quality while still small -->
- <option value='{"id":"mav23/SmolLM-135M-Instruct-GGUF","file":"smollm-135m-instruct.Q3_K_S.gguf","label":"SmolLM‑135M‑Instruct Q3_K_S (≈88 MB)"}'>SmolLM‑135M‑Instruct Q3_K_S (≈88 MB)</option>
- <option value='{"id":"QuantFactory/SmolLM-360M-Instruct-GGUF","file":"SmolLM-360M-Instruct.Q3_K_S.gguf","label":"SmolLM‑360M‑Instruct Q3_K_S (≈219 MB)"}'>SmolLM‑360M‑Instruct Q3_K_S (≈219 MB)</option>
-
- <!-- Stronger tiny model (bigger, still phone‑possible on high‑end) -->
- <option value='{"id":"Qwen/Qwen2.5-0.5B-Instruct-GGUF","file":"qwen2.5-0.5b-instruct-q3_k_m.gguf","label":"Qwen2.5‑0.5B‑Instruct Q3_K_M (≈432 MB)"}'>Qwen2.5‑0.5B‑Instruct Q3_K_M (≈432 MB)</option>
- <option value='{"id":"Qwen/Qwen2.5-0.5B-Instruct-GGUF","file":"qwen2.5-0.5b-instruct-q4_k_m.gguf","label":"Qwen2.5‑0.5B‑Instruct Q4_K_M (≈491 MB)"}'>Qwen2.5‑0.5B‑Instruct Q4_K_M (≈491 MB)</option>
-
- <!-- Optional: bigger but better; may be too heavy for some phones -->
- <option value='{"id":"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF","file":"tinyllama-1.1b-chat-v1.0.Q3_K_S.gguf","label":"TinyLlama‑1.1B‑Chat Q3_K_S (≈500 MB)"}'>TinyLlama‑1.1B‑Chat Q3_K_S (≈500 MB)</option>
-
- <!-- Your original SmolLM2 360M options (kept) -->
- <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q4_0.gguf","label":"SmolLM2‑360M Q4_0 (≈229 MB)"}'>SmolLM2‑360M Q4_0 (≈229 MB)</option>
- <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q3_K_S.gguf","label":"SmolLM2‑360M Q3_K_S (≈219 MB, faster)"}'>SmolLM2‑360M Q3_K_S (≈219 MB, faster)</option>
- <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q2_K.gguf","label":"SmolLM2‑360M Q2_K (≈200 MB, min RAM / quality drop)"}'>SmolLM2‑360M Q2_K (≈200 MB, min RAM / quality drop)</option>
-
- <!-- Custom (use for Gemma‑3‑270M when a public GGUF exists) -->
- <option value='{"custom":true,"label":"Custom HF GGUF (e.g., Gemma‑3‑270M)"}'>Custom HF GGUF (e.g., Gemma‑3‑270M)</option>
- </select>
-
- <details id="customBox">
- <summary class="tiny">Custom GGUF (paste HF repo + file)</summary>
- <div class="row">
- <label class="tiny">HF repo id</label>
- <input id="customRepo" type="text" placeholder="e.g. google/gemma-3-270m-GGUF (when available)" style="width:280px" />
- <label class="tiny">file</label>
- <input id="customFile" type="text" placeholder="e.g. gemma-3-270m.Q4_0.gguf" style="width:240px" />
  </div>
- <div class="note">Note: official <a href="https://huggingface.co/google/gemma-3-270m" target="_blank" rel="noreferrer">Gemma‑3‑270M</a> is the base HF repo. A ready‑to‑use public GGUF is now available at <a href="https://huggingface.co/ggml-org/gemma-3-270m-GGUF" target="_blank" rel="noreferrer">ggml‑org/gemma‑3‑270m‑GGUF</a> (currently providing <code>gemma-3-270m-Q8_0.gguf</code> ≈292 MB). For maximum speed on low‑RAM phones, the OpenELM‑270M‑Instruct Q3_K_S option above is even lighter, but Gemma‑3‑270M offers strong quality for its size.</div>
- </details>
-
- <div class="row">
- <label>Max new tokens</label>
- <input id="nPredict" type="number" min="1" max="512" step="1" value="128" />
- </div>
- <div class="row">
- <label>Temp</label><input id="temp" type="number" min="0" max="2" step="0.1" value="0.7" style="width:80px" />
- <label>Top‑p</label><input id="topp" type="number" min="0" max="1" step="0.05" value="0.9" style="width:80px" />
- <label>Top‑k</label><input id="topk" type="number" min="1" max="100" step="1" value="40" style="width:80px" />
- </div>
-
- <div class="spacer"></div>
-
- <button id="loadBtn" class="primary">Load model</button>
- <button id="unloadBtn" class="ghost" disabled>Unload</button>
-
- <div class="progress" title="download progress"><i id="prog"></i></div>
- <div id="stats">idle</div>
- </div>
-
- <div id="chat" aria-live="polite"></div>
-
- <form class="inputbar" id="form">
- <textarea id="input" placeholder="Ask me anything…" required></textarea>
- <div class="row" style="flex-direction:column; gap:6px; align-items:flex-end">
- <button id="sendBtn" class="primary">Send</button>
- <button id="stopBtn" type="button" class="ghost" disabled>Stop</button>
- <div class="tiny">Context kept small for mobile perf</div>
- </div>
- </form>
- ```

  </main>

  <script type="module">
- // ——— Fixed imports (pin version + explicit wasm paths) ———
  import { Wllama, LoggerWithoutDebug } from "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/index.js";

- // Provide the wasm URLs directly so there is no "+esm" indirection.
  const CONFIG_PATHS = {
  "single-thread/wllama.wasm": "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/single-thread/wllama.wasm",
  "multi-thread/wllama.wasm" : "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/multi-thread/wllama.wasm",
@@ -160,9 +142,9 @@
  // Keep RAM low for mobile: small context + FP16 V‑cache (WASM safe)
  const LOAD_CONFIG = {
  n_ctx: 768,
- n_batch: 128, // must be >= 64 to satisfy GGML_KQ_MASK_PAD and avoid batch overflow in wasm
- cache_type_k: "q4_0", // int4 K cache: reduces RAM without flash_attn
- cache_type_v: "f16", // IMPORTANT: V cache quant requires flash_attn; not available in WASM
  flash_attn: false,
  progressCallback: ({ loaded, total }) => {
  const pct = (total && total > 0) ? Math.round(loaded / total * 100) : 0;
@@ -172,6 +154,28 @@

  const messages = [ { role: "system", content: sysPrompt } ];

  // ——— UI helpers ———
  const ui = {
  add(role, text) {
@@ -218,6 +222,11 @@
  return parsed;
  }

  async function ensureLoaded() {
  if (loaded) return;
  $prog.style.width = '0%';
@@ -226,7 +235,6 @@
  try {
  await wllama.loadModelFromHF(choice.id, choice.file, LOAD_CONFIG);
  } catch (e) {
- // Common causes: gated repo, missing file, or CORS
  throw new Error(`Load failed for ${choice.id}/${choice.file}. If the repo is gated or lacks CORS, try a public mirror / different quant. Details: ${e?.message || e}`);
  }
  loaded = true;
@@ -267,9 +275,9 @@
  $input.value = '';

  const assistantBubble = ui.add('assistant', '');
- truncateHistoryForMobile(600); // trim harder to reduce initial prompt size vs 768 ctx

- $send.disabled = true; $stop.disabled = true; // will flip to true once stream starts
  aborter = new AbortController();

  const nPredict = parseInt(document.getElementById('nPredict').value, 10);
@@ -281,15 +289,26 @@
  let outText = '';

  try {
- $stop.disabled = false;
- const stream = await wllama.createChatCompletion(messages, {
  stream: true,
  useCache: true,
  nPredict,
  sampling: { temp, top_p, top_k },
  stopTokens: eotToken > 0 ? [eotToken] : undefined,
  abortSignal: aborter.signal
- });

  for await (const chunk of stream) {
  const piece = new TextDecoder().decode(chunk.piece);
@@ -324,11 +343,11 @@
  </script>

  <!--
- What changed:
- FIXED: the crash was caused by a missing <select id="model"> (document.getElementById('model') returned null). Added the select and removed a stray "$1" text node.
- Added explicit label-for association (for accessibility) and set the first option as selected.
- Kept all other logic identical.
  -->
-
  </body>
  </html>
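Taken together, the pinned import, explicit wasm paths, and WASM‑safe cache settings in the removed side reduce to a small standalone load script. A minimal sketch (assumes wllama 2.3.x as pinned above; passing `LoggerWithoutDebug` via the constructor's `logger` option is an assumption inferred from the import, not something this file shows verbatim):

```js
import { Wllama, LoggerWithoutDebug } from "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/index.js";

// Explicit wasm URLs: no "+esm" indirection, and the wasm version cannot drift from index.js.
const CONFIG_PATHS = {
  "single-thread/wllama.wasm": "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/single-thread/wllama.wasm",
  "multi-thread/wllama.wasm":  "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/multi-thread/wllama.wasm",
};

// ASSUMPTION: the logger option is inferred from the LoggerWithoutDebug import.
const wllama = new Wllama(CONFIG_PATHS, { logger: LoggerWithoutDebug });

// WASM-safe KV-cache choices, per the diff's own notes: the K cache can be
// quantized to q4_0, but the V cache must stay f16 because V-cache quantization
// requires flash_attn, which llama.cpp's WASM build lacks; n_batch >= 64 avoids
// the GGML_KQ_MASK_PAD batch-overflow issue.
await wllama.loadModelFromHF("ggml-org/gemma-3-270m-GGUF", "gemma-3-270m-Q8_0.gguf", {
  n_ctx: 768,
  n_batch: 128,
  cache_type_k: "q4_0",
  cache_type_v: "f16",
  flash_attn: false,
});
```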
 
  <!doctype html>
  <html lang="en">
  <head>
  <meta charset="utf-8" />

  <main>
  <div class="bar">
  <label for="model">Model:</label>
  <select id="model" class="grow">
  <option selected value='{"id":"ggml-org/gemma-3-270m-GGUF","file":"gemma-3-270m-Q8_0.gguf","label":"Gemma‑3‑270M Q8_0 (≈292 MB)"}'>Gemma‑3‑270M Q8_0 (≈292 MB)</option>
  <option value='{"id":"mradermacher/OpenELM-270M-Instruct-GGUF","file":"OpenELM-270M-Instruct.Q3_K_S.gguf","label":"OpenELM‑270M‑Instruct Q3_K_S (≈134 MB)"}'>OpenELM‑270M‑Instruct Q3_K_S (≈134 MB)</option>
  <option value='{"id":"mradermacher/OpenELM-270M-Instruct-GGUF","file":"OpenELM-270M-Instruct.Q4_K_M.gguf","label":"OpenELM‑270M‑Instruct Q4_K_M (≈175 MB)"}'>OpenELM‑270M‑Instruct Q4_K_M (≈175 MB)</option>
+ <option value='{"id":"mav23/SmolLM-135M-Instruct-GGUF","file":"smollm-135m-instruct.Q3_K_S.gguf","label":"SmolLM‑135M‑Instruct Q3_K_S (≈88 MB)"}'>SmolLM‑135M‑Instruct Q3_K_S (≈88 MB)</option>
+ <option value='{"id":"QuantFactory/SmolLM-360M-Instruct-GGUF","file":"SmolLM-360M-Instruct.Q3_K_S.gguf","label":"SmolLM‑360M‑Instruct Q3_K_S (≈219 MB)"}'>SmolLM‑360M‑Instruct Q3_K_S (≈219 MB)</option>
+ <option value='{"id":"Qwen/Qwen2.5-0.5B-Instruct-GGUF","file":"qwen2.5-0.5b-instruct-q3_k_m.gguf","label":"Qwen2.5‑0.5B‑Instruct Q3_K_M (≈432 MB)"}'>Qwen2.5‑0.5B‑Instruct Q3_K_M (≈432 MB)</option>
+ <option value='{"id":"Qwen/Qwen2.5-0.5B-Instruct-GGUF","file":"qwen2.5-0.5b-instruct-q4_k_m.gguf","label":"Qwen2.5‑0.5B‑Instruct Q4_K_M (≈491 MB)"}'>Qwen2.5‑0.5B‑Instruct Q4_K_M (≈491 MB)</option>
+ <option value='{"id":"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF","file":"tinyllama-1.1b-chat-v1.0.Q3_K_S.gguf","label":"TinyLlama‑1.1B‑Chat Q3_K_S (≈500 MB)"}'>TinyLlama‑1.1B‑Chat Q3_K_S (≈500 MB)</option>
+ <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q4_0.gguf","label":"SmolLM2‑360M Q4_0 (≈229 MB)"}'>SmolLM2‑360M Q4_0 (≈229 MB)</option>
+ <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q3_K_S.gguf","label":"SmolLM2‑360M Q3_K_S (≈219 MB, faster)"}'>SmolLM2‑360M Q3_K_S (≈219 MB, faster)</option>
+ <option value='{"id":"QuantFactory/SmolLM-360M-GGUF","file":"SmolLM-360M.Q2_K.gguf","label":"SmolLM2‑360M Q2_K (≈200 MB, min RAM / quality drop)"}'>SmolLM2‑360M Q2_K (≈200 MB, min RAM / quality drop)</option>
+ <option value='{"custom":true,"label":"Custom HF GGUF (e.g., Gemma‑3‑270M)"}'>Custom HF GGUF (e.g., Gemma‑3‑270M)</option>
+ </select>
+
+ <details id="customBox">
+ <summary class="tiny">Custom GGUF (paste HF repo + file)</summary>
+ <div class="row">
+ <label class="tiny">HF repo id</label>
+ <input id="customRepo" type="text" placeholder="e.g. google/gemma-3-270m-GGUF (when available)" style="width:280px" />
+ <label class="tiny">file</label>
+ <input id="customFile" type="text" placeholder="e.g. gemma-3-270m.Q4_0.gguf" style="width:240px" />
+ </div>
+ <div class="note">Note: official <a href="https://huggingface.co/google/gemma-3-270m" target="_blank" rel="noreferrer">Gemma‑3‑270M</a> is the base HF repo. A ready‑to‑use public GGUF is now available at <a href="https://huggingface.co/ggml-org/gemma-3-270m-GGUF" target="_blank" rel="noreferrer">ggml‑org/gemma‑3‑270m‑GGUF</a> (currently providing <code>gemma-3-270m-Q8_0.gguf</code> ≈292 MB). For maximum speed on low‑RAM phones, the OpenELM‑270M‑Instruct Q3_K_S option above is even lighter, but Gemma‑3‑270M offers strong quality for its size.</div>
+ </details>
+
+ <div class="row">
+ <label>Max new tokens</label>
+ <input id="nPredict" type="number" min="1" max="512" step="1" value="128" />
+ </div>
+ <div class="row">
+ <label>Temp</label><input id="temp" type="number" min="0" max="2" step="0.1" value="0.7" style="width:80px" />
+ <label>Top‑p</label><input id="topp" type="number" min="0" max="1" step="0.05" value="0.9" style="width:80px" />
+ <label>Top‑k</label><input id="topk" type="number" min="1" max="100" step="1" value="40" style="width:80px" />
+ </div>
+
+ <div class="spacer"></div>
+
+ <button id="loadBtn" class="primary">Load model</button>
+ <button id="unloadBtn" class="ghost" disabled>Unload</button>
+
+ <div class="progress" title="download progress"><i id="prog"></i></div>
+ <div id="stats">idle</div>
  </div>

+ <div id="chat" aria-live="polite"></div>
+
+ <form class="inputbar" id="form">
+ <textarea id="input" placeholder="Ask me anything…" required></textarea>
+ <div class="row" style="flex-direction:column; gap:6px; align-items:flex-end">
+ <button id="sendBtn" class="primary">Send</button>
+ <button id="stopBtn" type="button" class="ghost" disabled>Stop</button>
+ <div class="tiny">Context kept small for mobile perf</div>
+ </div>
+ </form>
  </main>

  <script type="module">
+ // ——— Imports ———
  import { Wllama, LoggerWithoutDebug } from "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/index.js";

  const CONFIG_PATHS = {
  "single-thread/wllama.wasm": "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/single-thread/wllama.wasm",
  "multi-thread/wllama.wasm" : "https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/esm/multi-thread/wllama.wasm",

  // Keep RAM low for mobile: small context + FP16 V‑cache (WASM safe)
  const LOAD_CONFIG = {
  n_ctx: 768,
+ n_batch: 128,
+ cache_type_k: "q4_0",
+ cache_type_v: "f16",
  flash_attn: false,
  progressCallback: ({ loaded, total }) => {
  const pct = (total && total > 0) ? Math.round(loaded / total * 100) : 0;

  const messages = [ { role: "system", content: sysPrompt } ];

+ // ——— Chat template for Gemma IT ———
+ const GEMMA_JINJA = `{{ bos_token }}
+ {%- if messages[0]['role'] == 'system' -%}
+ {%- if messages[0]['content'] is string -%}
+ {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}
+ {%- else -%}
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}
+ {%- endif -%}
+ {%- set loop_messages = messages[1:] -%}
+ {%- else -%}
+ {%- set first_user_prefix = "" -%}
+ {%- set loop_messages = messages -%}
+ {%- endif -%}
+ {%- for message in loop_messages -%}
+ {%- set role = (message['role'] == 'assistant') and 'model' or message['role'] -%}
+ <start_of_turn>{{ role }}
+ {{ (loop.first and first_user_prefix or '') ~ (message['content'] if message['content'] is string else message['content'][0]['text']) | trim }}<end_of_turn>
+ {%- endfor -%}
+ {%- if add_generation_prompt -%}
+ <start_of_turn>model
+ {%- endif -%}`;
+
  // ——— UI helpers ———
  const ui = {
  add(role, text) {
 
  return parsed;
  }

+ function isGemmaSelected() {
+ const { id, file, label } = getSelectedModel();
+ return /gemma/i.test(id) || /gemma/i.test(file) || /gemma/i.test(label);
+ }
+
  async function ensureLoaded() {
  if (loaded) return;
  $prog.style.width = '0%';

  try {
  await wllama.loadModelFromHF(choice.id, choice.file, LOAD_CONFIG);
  } catch (e) {
  throw new Error(`Load failed for ${choice.id}/${choice.file}. If the repo is gated or lacks CORS, try a public mirror / different quant. Details: ${e?.message || e}`);
  }
  loaded = true;

  $input.value = '';

  const assistantBubble = ui.add('assistant', '');
+ truncateHistoryForMobile(600);

+ $send.disabled = true; $stop.disabled = true; // flip to false on stream start
  aborter = new AbortController();

  const nPredict = parseInt(document.getElementById('nPredict').value, 10);
 
  let outText = '';

  try {
+ const opts = {
  stream: true,
  useCache: true,
  nPredict,
  sampling: { temp, top_p, top_k },
  stopTokens: eotToken > 0 ? [eotToken] : undefined,
  abortSignal: aborter.signal
+ };
+
+ let stream;
+ if (isGemmaSelected()) {
+ // Render messages with Gemma template, then complete as plain text
+ const prompt = await wllama.formatChat(messages, /* addAssistant */ true, GEMMA_JINJA);
+ $stop.disabled = false;
+ stream = await wllama.createCompletion(prompt, opts);
+ } else {
+ // Other models: rely on their embedded chat templates
+ $stop.disabled = false;
+ stream = await wllama.createChatCompletion(messages, opts);
+ }

  for await (const chunk of stream) {
  const piece = new TextDecoder().decode(chunk.piece);
 
  </script>

  <!--
+ Changes for Gemma:
+ • Added GEMMA_JINJA chat template (<start_of_turn>/<end_of_turn> with BOS).
+ • When a Gemma model is selected, messages are formatted via wllama.formatChat(..., GEMMA_JINJA)
+   and sent to createCompletion() to avoid ChatML (<|im_start|>/<|im_end|>) fallback.
+ • Non‑Gemma models still use createChatCompletion().
  -->

  </body>
  </html>
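Since the whole point of the Gemma path is the prompt shape, a quick sanity check is to render a toy conversation through the template and inspect the result. A minimal sketch (the conversation is invented; `formatChat` with a custom-template argument is used exactly as this commit uses it, and the expected shape below follows Gemma's published turn format, with the system text folded into the first user turn):

```js
// Assumes the page's `wllama` instance and GEMMA_JINJA are in scope.
const demo = [
  { role: "system", content: "You are a concise assistant." },
  { role: "user", content: "Hello!" },
];

// Render with the generation prompt appended, as the send handler does.
const prompt = await wllama.formatChat(demo, /* addAssistant */ true, GEMMA_JINJA);
console.log(prompt);

// Expected shape (modulo whitespace):
// <bos><start_of_turn>user
// You are a concise assistant.
//
// Hello!<end_of_turn>
// <start_of_turn>model
```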