smolvlm-realtime-webgpu

Running

App Files Files Community

ginipick committed on May 16

Commit

cda939c

verified ·

1 Parent(s): 1fd79c1

Update index.html

Browse files

Files changed (1) hide show

index.html +37 -64

index.html CHANGED Viewed

@@ -1,9 +1,9 @@
 <!DOCTYPE html>
-<html lang="ko">
   <head>
     <meta charset="UTF-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-    <title>카메라 상호작용 앱</title>
     <style>
       body {
         font-family: sans-serif;
@@ -77,10 +77,10 @@
         color: white;
       }
       #startButton.start {
-        background-color: #28a745; /* 녹색 */
       }
       #startButton.stop {
-        background-color: #dc3545; /* 빨간색 */
       }
       label {
         font-weight: bold;
@@ -96,18 +96,18 @@
     </style>
   </head>
   <body>
-    <h1>카메라 상호작용 앱</h1>
     <div id="videoContainer">
       <video id="videoFeed" autoplay playsinline></video>
-      <div id="loadingOverlay">로딩 중...</div>
     </div>
     <canvas id="canvas" class="hidden"></canvas>
-    <!-- 프레임 캡처용 -->
     <div class="io-areas">
       <div>
-        <label for="instructionText">지시사항:</label><br />
         <textarea
           id="instructionText"
           style="height: 2em; width: 40em"
@@ -115,28 +115,28 @@
         ></textarea>
       </div>
       <div>
-        <label for="responseText">응답:</label><br />
         <textarea
           id="responseText"
           style="height: 2em; width: 40em"
           name="Response"
           readonly
-          placeholder="서버 응답이 여기에 표시됩니다..."
         ></textarea>
       </div>
     </div>
     <div class="controls">
-      <label for="intervalSelect">요청 간 간격:</label>
       <select id="intervalSelect" name="Interval between 2 requests">
         <option value="0" selected>0ms</option>
         <option value="100">100ms</option>
         <option value="250">250ms</option>
         <option value="500">500ms</option>
-        <option value="1000">1초</option>
-        <option value="2000">2초</option>
       </select>
-      <button id="startButton" class="start">시작</button>
     </div>
     <script type="module">
@@ -145,7 +145,6 @@
         AutoModelForVision2Seq,
         RawImage,
       } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js";
       const video = document.getElementById("videoFeed");
       const canvas = document.getElementById("canvas");
       const instructionText = document.getElementById("instructionText");
@@ -153,20 +152,16 @@
       const intervalSelect = document.getElementById("intervalSelect");
       const startButton = document.getElementById("startButton");
       const loadingOverlay = document.getElementById("loadingOverlay");
-      instructionText.value = "무엇이 보이나요? 한국어로 대답해주세요."; // 기본 지시사항
       let stream;
       let isProcessing = false;
       let processor, model;
       async function initModel() {
-        const modelId = "HuggingFaceTB/SmolVLM-500M-Instruct"; // 또는 "HuggingFaceTB/SmolVLM-Instruct";
         loadingOverlay.style.display = "flex";
-        responseText.value = "프로세서 로딩 중...";
         processor = await AutoProcessor.from_pretrained(modelId);
-        responseText.value = "프로세서 로딩 완료. 모델 로딩 중...";
         model = await AutoModelForVision2Seq.from_pretrained(modelId, {
           dtype: {
             embed_tokens: "fp16",
@@ -175,10 +170,9 @@
           },
           device: "webgpu",
         });
-        responseText.value = "모델 로딩 완료. 카메라 초기화 중...";
         loadingOverlay.style.display = "none";
       }
       async function initCamera() {
         try {
           stream = await navigator.mediaDevices.getUserMedia({
@@ -186,19 +180,18 @@
             audio: false,
           });
           video.srcObject = stream;
-          responseText.value = "카메라 접근 권한이 허용되었습니다. 시작할 준비가 되었습니다.";
         } catch (err) {
-          console.error("카메라 접근 오류:", err);
-          responseText.value = `카메라 접근 오류: ${err.name} - ${err.message}. 권한이 허용되었는지, HTTPS 또는 localhost에서 실행 중인지 확인하세요.`;
           alert(
-            `카메라 접근 오류: ${err.name}. 권한을 허용했는지, HTTPS 또는 localhost에서 실행 중인지 확인하세요.`
           );
         }
       }
       function captureImage() {
         if (!stream || !video.videoWidth) {
-          console.warn("캡처할 비디오 스트림이 준비되지 않았습니다.");
           return null;
         }
         canvas.width = video.videoWidth;
@@ -208,18 +201,11 @@
         const frame = context.getImageData(0, 0, canvas.width, canvas.height);
         return new RawImage(frame.data, frame.width, frame.height, 4);
       }
       async function runLocalVisionInference(imgElement, instruction) {
-        // 지시사항에 한국어로 대답해달라는 문구가 없으면 추가
-        let koreanInstruction = instruction;
-        if (!instruction.includes("한국어") && !instruction.includes("Korean")) {
-          koreanInstruction = instruction + " (한국어로 대답해주세요)";
-        }
         const messages = [
           {
             role: "user",
-            content: [{ type: "image" }, { type: "text", text: koreanInstruction }],
           },
         ];
         const text = processor.apply_chat_template(messages, {
@@ -238,13 +224,12 @@
         );
         return output[0].trim();
       }
       async function sendData() {
         if (!isProcessing) return;
         const instruction = instructionText.value;
         const rawImg = captureImage();
         if (!rawImg) {
-          responseText.value = "캡처 실패";
           return;
         }
         try {
@@ -252,14 +237,12 @@
           responseText.value = reply;
         } catch (e) {
           console.error(e);
-          responseText.value = `오류: ${e.message}`;
         }
       }
       /**
        * Pause for a fixed amount of time.
        *
        * @param {number} ms - Delay handed to setTimeout.
        * @returns {Promise<undefined>} Settles, without a value, once the timer fires.
        */
       function sleep(ms) {
         return new Promise((done) => {
           setTimeout(done, ms);
         });
       }
       async function processingLoop() {
         const intervalMs = parseInt(intervalSelect.value, 10);
         while (isProcessing) {
@@ -268,37 +251,30 @@
           await sleep(intervalMs);
         }
       }
       /**
        * Switch the page into its running state and launch the processing loop.
        *
        * Does nothing beyond reporting the problem when no camera stream has
        * been stored in `stream`.
        */
       function handleStart() {
         // Without a stream there is nothing to capture: tell the user and bail out.
         if (!stream) {
-          responseText.value = "카메라를 사용할 수 없습니다. 시작할 수 없습니다.";
-          alert("카메라를 사용할 수 없습니다. 먼저 권한을 허용해주세요.");
           return;
         }
         isProcessing = true;
-        startButton.textContent = "중지";
         startButton.classList.replace("start", "stop");
         // Lock the inputs while running.
         instructionText.disabled = true;
         intervalSelect.disabled = true;
-        responseText.value = "처리 시작...";
         // Deliberately not awaited: the async loop keeps running in the background.
         // NOTE(review): a rejection from processingLoop() would go unhandled here - confirm.
         processingLoop();
       }
       /**
        * Return the page to its idle state: clear the running flag, restore the
        * button label/class and re-enable the inputs.
        */
       function handleStop() {
         isProcessing = false;
-        startButton.textContent = "시작";
         startButton.classList.replace("stop", "start");
         instructionText.disabled = false;
         intervalSelect.disabled = false;
         // Only overwrite the status text if it still begins with the start message;
         // any other text already in the response box is left in place.
-        if (responseText.value.startsWith("처리 시작...")) {
-          responseText.value = "처리 중지됨.";
         }
       }
       startButton.addEventListener("click", () => {
         if (isProcessing) {
           handleStop();
@@ -306,14 +282,13 @@
           handleStart();
         }
       });
       window.addEventListener("DOMContentLoaded", async () => {
-        // WebGPU 지원 확인
         if (!navigator.gpu) {
           const videoElement = document.getElementById("videoFeed");
           const warningElement = document.createElement("p");
           warningElement.textContent =
-            "이 브라우저에서는 WebGPU를 사용할 수 없습니다.";
           warningElement.style.color = "red";
           warningElement.style.textAlign = "center";
           videoElement.parentNode.insertBefore(
@@ -321,11 +296,9 @@
             videoElement.nextSibling
           );
         }
         await initModel();
         await initCamera();
       });
       window.addEventListener("beforeunload", () => {
         if (stream) {
           stream.getTracks().forEach((track) => track.stop());
@@ -333,4 +306,4 @@
       });
     </script>
   </body>
-</html>

 <!DOCTYPE html>
+<html lang="en">
   <head>
     <meta charset="UTF-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Camera Interaction App</title>
     <style>
       body {
         font-family: sans-serif;
         color: white;
       }
       #startButton.start {
+        background-color: #28a745; /* Green */
       }
       #startButton.stop {
+        background-color: #dc3545; /* Red */
       }
       label {
         font-weight: bold;
     </style>
   </head>
   <body>
+    <h1>Camera Interaction App</h1>
     <div id="videoContainer">
       <video id="videoFeed" autoplay playsinline></video>
+      <div id="loadingOverlay">Loading...</div>
     </div>
     <canvas id="canvas" class="hidden"></canvas>
+    <!-- For capturing frames -->
     <div class="io-areas">
       <div>
+        <label for="instructionText">Instruction:</label><br />
         <textarea
           id="instructionText"
           style="height: 2em; width: 40em"
         ></textarea>
       </div>
       <div>
+        <label for="responseText">Response:</label><br />
         <textarea
           id="responseText"
           style="height: 2em; width: 40em"
           name="Response"
           readonly
+          placeholder="Server response will appear here..."
         ></textarea>
       </div>
     </div>
     <div class="controls">
+      <label for="intervalSelect">Interval between 2 requests:</label>
       <select id="intervalSelect" name="Interval between 2 requests">
         <option value="0" selected>0ms</option>
         <option value="100">100ms</option>
         <option value="250">250ms</option>
         <option value="500">500ms</option>
+        <option value="1000">1s</option>
+        <option value="2000">2s</option>
       </select>
+      <button id="startButton" class="start">Start</button>
     </div>
     <script type="module">
         AutoModelForVision2Seq,
         RawImage,
       } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js";
       const video = document.getElementById("videoFeed");
       const canvas = document.getElementById("canvas");
       const instructionText = document.getElementById("instructionText");
       const intervalSelect = document.getElementById("intervalSelect");
       const startButton = document.getElementById("startButton");
       const loadingOverlay = document.getElementById("loadingOverlay");
+      instructionText.value = "What do you see?"; // default instruction
       let stream;
       let isProcessing = false;
       let processor, model;
       async function initModel() {
+        const modelId = "HuggingFaceTB/SmolVLM-500M-Instruct"; // or "HuggingFaceTB/SmolVLM-Instruct";
         loadingOverlay.style.display = "flex";
+        responseText.value = "Loading processor...";
         processor = await AutoProcessor.from_pretrained(modelId);
+        responseText.value = "Processor loaded. Loading model...";
         model = await AutoModelForVision2Seq.from_pretrained(modelId, {
           dtype: {
             embed_tokens: "fp16",
           },
           device: "webgpu",
         });
+        responseText.value = "Model loaded. Initializing camera...";
         loadingOverlay.style.display = "none";
       }
       async function initCamera() {
         try {
           stream = await navigator.mediaDevices.getUserMedia({
             audio: false,
           });
           video.srcObject = stream;
+          responseText.value = "Camera access granted. Ready to start.";
         } catch (err) {
+          console.error("Error accessing camera:", err);
+          responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
           alert(
+            `Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`
           );
         }
       }
       function captureImage() {
         if (!stream || !video.videoWidth) {
+          console.warn("Video stream not ready for capture.");
           return null;
         }
         canvas.width = video.videoWidth;
         const frame = context.getImageData(0, 0, canvas.width, canvas.height);
         return new RawImage(frame.data, frame.width, frame.height, 4);
       }
       async function runLocalVisionInference(imgElement, instruction) {
         const messages = [
           {
             role: "user",
+            content: [{ type: "image" }, { type: "text", text: instruction }],
           },
         ];
         const text = processor.apply_chat_template(messages, {
         );
         return output[0].trim();
       }
       async function sendData() {
         if (!isProcessing) return;
         const instruction = instructionText.value;
         const rawImg = captureImage();
         if (!rawImg) {
+          responseText.value = "Capture failed";
           return;
         }
         try {
           responseText.value = reply;
         } catch (e) {
           console.error(e);
+          responseText.value = `Error: ${e.message}`;
         }
       }
       /**
        * Pause for a fixed amount of time.
        *
        * @param {number} ms - Delay handed to setTimeout.
        * @returns {Promise<undefined>} Settles, without a value, once the timer fires.
        */
       function sleep(ms) {
         return new Promise((done) => {
           setTimeout(done, ms);
         });
       }
       async function processingLoop() {
         const intervalMs = parseInt(intervalSelect.value, 10);
         while (isProcessing) {
           await sleep(intervalMs);
         }
       }
       /**
        * Switch the page into its running state and launch the processing loop.
        *
        * Does nothing beyond reporting the problem when no camera stream has
        * been stored in `stream`.
        */
       function handleStart() {
         // Without a stream there is nothing to capture: tell the user and bail out.
         if (!stream) {
+          responseText.value = "Camera not available. Cannot start.";
+          alert("Camera not available. Please grant permission first.");
           return;
         }
         isProcessing = true;
+        startButton.textContent = "Stop";
         startButton.classList.replace("start", "stop");
         // Lock the inputs while running.
         instructionText.disabled = true;
         intervalSelect.disabled = true;
+        responseText.value = "Processing started...";
         // Deliberately not awaited: the async loop keeps running in the background.
         // NOTE(review): a rejection from processingLoop() would go unhandled here - confirm.
         processingLoop();
       }
       /**
        * Return the page to its idle state: clear the running flag, restore the
        * button label/class and re-enable the inputs.
        */
       function handleStop() {
         isProcessing = false;
+        startButton.textContent = "Start";
         startButton.classList.replace("stop", "start");
         instructionText.disabled = false;
         intervalSelect.disabled = false;
         // Only overwrite the status text if it still begins with the start message;
         // any other text already in the response box is left in place.
+        if (responseText.value.startsWith("Processing started...")) {
+          responseText.value = "Processing stopped.";
         }
       }
       startButton.addEventListener("click", () => {
         if (isProcessing) {
           handleStop();
           handleStart();
         }
       });
       window.addEventListener("DOMContentLoaded", async () => {
+        // Check for WebGPU support
         if (!navigator.gpu) {
           const videoElement = document.getElementById("videoFeed");
           const warningElement = document.createElement("p");
           warningElement.textContent =
+            "WebGPU is not available in this browser.";
           warningElement.style.color = "red";
           warningElement.style.textAlign = "center";
           videoElement.parentNode.insertBefore(
             videoElement.nextSibling
           );
         }
         await initModel();
         await initCamera();
       });
       window.addEventListener("beforeunload", () => {
         if (stream) {
           stream.getTracks().forEach((track) => track.stop());
       });
     </script>
   </body>
+</html>