monurcan committed
Commit 875f054 · 1 Parent(s): 8178d81
Files changed (4)
  1. .gitignore +1 -2
  2. README.md +7 -9
  3. app.py +72 -148
  4. requirements.txt +6 -0
.gitignore CHANGED
@@ -1,2 +1 @@
-/env/*
-__pycache__/
+/env/*
README.md CHANGED
@@ -1,16 +1,14 @@
1
  ---
2
- title: Efficient Test Time Scaling
3
- emoji: πŸ’¬
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.42.0
8
  app_file: app.py
9
  pinned: false
10
- hf_oauth: true
11
- hf_oauth_scopes:
12
- - inference-api
13
- short_description: Efficient Test-Time Scaling for Small Vision-Language Models
14
  ---
15
 
16
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Smolvlm2 500M Illustration Description
3
+ emoji: πŸ“Š
4
+ colorFrom: red
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 5.33.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
+ short_description: Illustration Description
 
 
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,8 +1,15 @@
 import gradio as gr
-import base64
+import torch
+from transformers import (
+    AutoModelForImageTextToText,
+    AutoProcessor,
+    TextIteratorStreamer,
+)
+from peft import PeftModel
+from transformers.image_utils import load_image
+from threading import Thread
 import time
 import html
-from huggingface_hub import InferenceClient
 
 
 def progress_bar_html(label: str) -> str:
@@ -28,134 +35,63 @@ def progress_bar_html(label: str) -> str:
 
 model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
 
-
-def model_inference(input_dict, history, *additional_inputs):
-    """
-    Use Hugging Face InferenceClient (streaming) to perform the multimodal chat completion.
-    Signature matches ChatInterface call pattern: (input_dict, history, *additional_inputs)
-    The OAuth token (from gr.LoginButton) is passed as `hf_token`.
-    """
-    # Extract hf_token from additional_inputs in a robust way (gradio sometimes passes extra args)
-    hf_token = None
-    for ai in additional_inputs:
-        if ai is None:
-            continue
-        # gradio may pass a small object with attribute `token`
-        if hasattr(ai, "token"):
-            hf_token = ai
-            break
-        # or a dict-like with a token key
-        if isinstance(ai, dict) and "token" in ai:
-
-            class _T:
-                pass
-
-            obj = _T()
-            obj.token = ai.get("token")
-            hf_token = obj
-            break
-        # or the token itself could be passed as a string
-        if isinstance(ai, str):
-
-            class _T2:
-                pass
-
-            obj = _T2()
-            obj.token = ai
-            hf_token = obj
-            break
-
-    text = input_dict.get("text", "")
-    files = input_dict.get("files", []) or []
-
-    if text == "" and not files:
-        # yield an error text so the streaming generator produces at least one value
-        yield "Please input a query and optionally image(s)."
-        return
-    if text == "" and files:
-        yield "Please input a text query along with the image(s)."
-        return
-
-    # Build the content list: images (as URLs or data URLs) followed by the text
-    content_list = []
-    for f in files:
-        try:
-            # If file looks like a URL, send as image_url
-            if isinstance(f, str) and f.startswith("http"):
-                content_list.append({"type": "image_url", "image_url": {"url": f}})
-            else:
-                # f is a local path-like object; read and convert to base64 data url
-                with open(f, "rb") as fh:
-                    b = fh.read()
-                b64 = base64.b64encode(b).decode("utf-8")
-                # naive mime type: jpeg; this should work for most common images
-                data_url = f"data:image/jpeg;base64,{b64}"
-                content_list.append(
-                    {"type": "image_url", "image_url": {"url": data_url}}
-                )
-        except Exception:
-            # if anything goes wrong reading the file, skip embedding that file
-            continue
-
-    content_list.append({"type": "text", "text": text})
-
-    messages = [{"role": "user", "content": content_list}]
-
-    if hf_token is None or not getattr(hf_token, "token", None):
-        yield "Please login with a Hugging Face account (use the Login button in the sidebar)."
-        return
-
-    client = InferenceClient(
-        token=hf_token.token, model=model_name, provider="hf-inference"
-    )
-
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=1024,
-        stream=True,
-    ):
-        choices = message.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
-
-        response += token
-        yield response
-
-    # for chunk in stream:
-    #     # chunk can be an object with attributes or a dict depending on client version
-    #     token = ""
-    #     try:
-    #         # attempt dict-style
-    #         if isinstance(chunk, dict):
-    #             choices = chunk.get("choices")
-    #             if choices and len(choices) > 0:
-    #                 delta = choices[0].get("delta", {})
-    #                 token = delta.get("content") or ""
-    #         else:
-    #             # attribute-style
-    #             choices = getattr(chunk, "choices", None)
-    #             if choices and len(choices) > 0:
-    #                 delta = getattr(choices[0], "delta", None)
-    #                 if isinstance(delta, dict):
-    #                     token = delta.get("content") or ""
-    #                 else:
-    #                     token = getattr(delta, "content", "")
-    #     except Exception:
-    #         token = ""
-
-    #     if token:
-    #         # escape incremental token to avoid raw HTML breaking the chat box
-    #         response += html.escape(token)
-    #         time.sleep(0.001)
-    #         yield response
-
-    # # ensure we yield at least one final message so the async iterator doesn't see StopIteration
-    # if response:
-    #     yield response
-    # else:
-    #     yield "(no text was returned by the model)"
+model = AutoModelForImageTextToText.from_pretrained(
+    model_name, dtype=torch.bfloat16, device_map="auto"
+).eval()
+
+processor = AutoProcessor.from_pretrained(model_name)
+
+print(f"Successfully load the model: {model}")
+
+
+def model_inference(input_dict, history):
+    text = input_dict["text"]
+    files = input_dict["files"]
+
+    if len(files) > 1:
+        images = [load_image(image) for image in files]
+    elif len(files) == 1:
+        images = [load_image(files[0])]
+    else:
+        images = []
+
+    if text == "" and not images:
+        gr.Error("Please input a query and optionally image(s).")
+        return
+    if text == "" and images:
+        gr.Error("Please input a text query along with the image(s).")
+        return
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *[{"type": "image", "image": image} for image in images],
+                {"type": "text", "text": text},
+            ],
+        }
+    ]
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    ).to(model.device, dtype=model.dtype)
+    streamer = TextIteratorStreamer(
+        processor, skip_prompt=True, skip_special_tokens=True
+    )
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    yield progress_bar_html("Processing...")
+    for new_text in streamer:
+        escaped_new_text = html.escape(new_text)
+        buffer += escaped_new_text
+        time.sleep(0.001)
+        yield buffer
 
 
 examples = [
@@ -173,27 +109,15 @@ examples = [
     ],
 ]
 
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        # Gradio LoginButton may not accept a `label` kwarg depending on the installed version
-        # so create it without that argument for maximum compatibility.
-        login_btn = gr.LoginButton()
-
-    chatbot = gr.ChatInterface(
-        fn=model_inference,
-        description="# **Smolvlm2-500M-illustration-description** \n (running on CPU) The model only sees the last input, it ignores the previous conversation history.",
-        examples=examples,
-        fill_height=True,
-        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
-        stop_btn="Stop Generation",
-        multimodal=True,
-        cache_examples=False,
-        additional_inputs=[login_btn],
-    )
-
-    # ChatInterface is already created inside the Blocks context; calling render() can duplicate it
-    # so we avoid calling chatbot.render() here.
-
-
-if __name__ == "__main__":
-    demo.launch(debug=True)
+demo = gr.ChatInterface(
+    fn=model_inference,
+    description="# **Smolvlm2-500M-illustration-description** \n (running on CPU) The model only sees the last input, it ignores the previous conversation history.",
+    examples=examples,
+    fill_height=True,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
+    stop_btn="Stop Generation",
+    multimodal=True,
+    cache_examples=False,
+)
+
+demo.launch(debug=True)
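
Note: the new `app.py` replaces the remote `InferenceClient` call with local, threaded streaming generation. Below is a minimal, self-contained sketch of that pattern, using the same model and API calls as the commit; the image URL and prompt are placeholders, not part of the Space.

```python
from threading import Thread

import torch
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    TextIteratorStreamer,
)
from transformers.image_utils import load_image

model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
model = AutoModelForImageTextToText.from_pretrained(
    model_name, dtype=torch.bfloat16, device_map="auto"
).eval()
processor = AutoProcessor.from_pretrained(model_name)

# Build a single-turn multimodal prompt, as model_inference does.
image = load_image("https://example.com/illustration.png")  # placeholder URL
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Describe this illustration."},
        ],
    }
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=model.dtype)

# generate() runs in a worker thread; the streamer yields decoded text
# chunks in the main thread, which is what lets the Gradio handler
# `yield` partial output while generation is still in progress.
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
Thread(target=model.generate, kwargs=generation_kwargs).start()
for new_text in streamer:
    print(new_text, end="", flush=True)
```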
requirements.txt CHANGED
@@ -0,0 +1,6 @@
+gradio
+transformers
+peft
+torch
+num2words
+torchvision