monurcan committed
Commit 875f054 · 1 Parent(s): 8178d81
Files changed (4)
  1. .gitignore +1 -2
  2. README.md +7 -9
  3. app.py +72 -148
  4. requirements.txt +6 -0
.gitignore CHANGED
@@ -1,2 +1 @@
-/env/*
-__pycache__/
+/env/*
README.md CHANGED
@@ -1,16 +1,14 @@
1
  ---
2
- title: Efficient Test Time Scaling
3
- emoji: πŸ’¬
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.42.0
8
  app_file: app.py
9
  pinned: false
10
- hf_oauth: true
11
- hf_oauth_scopes:
12
- - inference-api
13
- short_description: Efficient Test-Time Scaling for Small Vision-Language Models
14
  ---
15
 
16
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Smolvlm2 500M Illustration Description
3
+ emoji: πŸ“Š
4
+ colorFrom: red
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 5.33.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
+ short_description: Illustration Description
 
 
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,8 +1,15 @@
 import gradio as gr
-import base64
+import torch
+from transformers import (
+    AutoModelForImageTextToText,
+    AutoProcessor,
+    TextIteratorStreamer,
+)
+from peft import PeftModel
+from transformers.image_utils import load_image
+from threading import Thread
 import time
 import html
-from huggingface_hub import InferenceClient
 
 
 def progress_bar_html(label: str) -> str:
@@ -28,134 +35,63 @@ def progress_bar_html(label: str) -> str:
 
 model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
 
-
-def model_inference(input_dict, history, *additional_inputs):
-    """
-    Use Hugging Face InferenceClient (streaming) to perform the multimodal chat completion.
-    Signature matches ChatInterface call pattern: (input_dict, history, *additional_inputs)
-    The OAuth token (from gr.LoginButton) is passed as `hf_token`.
-    """
-    # Extract hf_token from additional_inputs in a robust way (gradio sometimes passes extra args)
-    hf_token = None
-    for ai in additional_inputs:
-        if ai is None:
-            continue
-        # gradio may pass a small object with attribute `token`
-        if hasattr(ai, "token"):
-            hf_token = ai
-            break
-        # or a dict-like with a token key
-        if isinstance(ai, dict) and "token" in ai:
-
-            class _T:
-                pass
-
-            obj = _T()
-            obj.token = ai.get("token")
-            hf_token = obj
-            break
-        # or the token itself could be passed as a string
-        if isinstance(ai, str):
-
-            class _T2:
-                pass
-
-            obj = _T2()
-            obj.token = ai
-            hf_token = obj
-            break
-
-    text = input_dict.get("text", "")
-    files = input_dict.get("files", []) or []
-
-    if text == "" and not files:
-        # yield an error text so the streaming generator produces at least one value
-        yield "Please input a query and optionally image(s)."
-        return
-    if text == "" and files:
-        yield "Please input a text query along with the image(s)."
-        return
-
-    # Build the content list: images (as URLs or data URLs) followed by the text
-    content_list = []
-    for f in files:
-        try:
-            # If file looks like a URL, send as image_url
-            if isinstance(f, str) and f.startswith("http"):
-                content_list.append({"type": "image_url", "image_url": {"url": f}})
-            else:
-                # f is a local path-like object; read and convert to base64 data url
-                with open(f, "rb") as fh:
-                    b = fh.read()
-                b64 = base64.b64encode(b).decode("utf-8")
-                # naive mime type: jpeg; this should work for most common images
-                data_url = f"data:image/jpeg;base64,{b64}"
-                content_list.append(
-                    {"type": "image_url", "image_url": {"url": data_url}}
-                )
-        except Exception:
-            # if anything goes wrong reading the file, skip embedding that file
-            continue
-
-    content_list.append({"type": "text", "text": text})
-
-    messages = [{"role": "user", "content": content_list}]
-
-    if hf_token is None or not getattr(hf_token, "token", None):
-        yield "Please login with a Hugging Face account (use the Login button in the sidebar)."
-        return
-
-    client = InferenceClient(
-        token=hf_token.token, model=model_name, provider="hf-inference"
-    )
-
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=1024,
-        stream=True,
-    ):
-        choices = message.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
-
-        response += token
-        yield response
-
-    # for chunk in stream:
-    #     # chunk can be an object with attributes or a dict depending on client version
-    #     token = ""
-    #     try:
-    #         # attempt dict-style
-    #         if isinstance(chunk, dict):
-    #             choices = chunk.get("choices")
-    #             if choices and len(choices) > 0:
-    #                 delta = choices[0].get("delta", {})
-    #                 token = delta.get("content") or ""
-    #         else:
-    #             # attribute-style
-    #             choices = getattr(chunk, "choices", None)
-    #             if choices and len(choices) > 0:
-    #                 delta = getattr(choices[0], "delta", None)
-    #                 if isinstance(delta, dict):
-    #                     token = delta.get("content") or ""
-    #                 else:
-    #                     token = getattr(delta, "content", "")
-    #     except Exception:
-    #         token = ""
-
-    #     if token:
-    #         # escape incremental token to avoid raw HTML breaking the chat box
-    #         response += html.escape(token)
-    #         time.sleep(0.001)
-    #         yield response
-
-    # # ensure we yield at least one final message so the async iterator doesn't see StopIteration
-    # if response:
-    #     yield response
-    # else:
-    #     yield "(no text was returned by the model)"
+model = AutoModelForImageTextToText.from_pretrained(
+    model_name, dtype=torch.bfloat16, device_map="auto"
+).eval()
+
+processor = AutoProcessor.from_pretrained(model_name)
+
+print(f"Successfully load the model: {model}")
+
+
+def model_inference(input_dict, history):
+    text = input_dict["text"]
+    files = input_dict["files"]
+
+    if len(files) > 1:
+        images = [load_image(image) for image in files]
+    elif len(files) == 1:
+        images = [load_image(files[0])]
+    else:
+        images = []
+
+    if text == "" and not images:
+        gr.Error("Please input a query and optionally image(s).")
+        return
+    if text == "" and images:
+        gr.Error("Please input a text query along with the image(s).")
+        return
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *[{"type": "image", "image": image} for image in images],
+                {"type": "text", "text": text},
+            ],
+        }
+    ]
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    ).to(model.device, dtype=model.dtype)
+    streamer = TextIteratorStreamer(
+        processor, skip_prompt=True, skip_special_tokens=True
+    )
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    yield progress_bar_html("Processing...")
+    for new_text in streamer:
+        escaped_new_text = html.escape(new_text)
+        buffer += escaped_new_text
+        time.sleep(0.001)
+        yield buffer
 
 
 examples = [
@@ -173,27 +109,15 @@ examples = [
     ],
 ]
 
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        # Gradio LoginButton may not accept a `label` kwarg depending on the installed version
-        # so create it without that argument for maximum compatibility.
-        login_btn = gr.LoginButton()
-
-    chatbot = gr.ChatInterface(
-        fn=model_inference,
-        description="# **Smolvlm2-500M-illustration-description** \n (running on CPU) The model only sees the last input, it ignores the previous conversation history.",
-        examples=examples,
-        fill_height=True,
-        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
-        stop_btn="Stop Generation",
-        multimodal=True,
-        cache_examples=False,
-        additional_inputs=[login_btn],
-    )
-
-    # ChatInterface is already created inside the Blocks context; calling render() can duplicate it
-    # so we avoid calling chatbot.render() here.
-
-
-if __name__ == "__main__":
-    demo.launch(debug=True)
+demo = gr.ChatInterface(
+    fn=model_inference,
+    description="# **Smolvlm2-500M-illustration-description** \n (running on CPU) The model only sees the last input, it ignores the previous conversation history.",
+    examples=examples,
+    fill_height=True,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
+    stop_btn="Stop Generation",
+    multimodal=True,
+    cache_examples=False,
+)
+
+demo.launch(debug=True)
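
Note: the new `app.py` replaces the remote `InferenceClient` call with local, threaded streaming generation. Below is a minimal, self-contained sketch of that pattern, using the same model and API calls as the commit; the image URL and prompt are placeholders, not part of the Space.

```python
from threading import Thread

import torch
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    TextIteratorStreamer,
)
from transformers.image_utils import load_image

model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
model = AutoModelForImageTextToText.from_pretrained(
    model_name, dtype=torch.bfloat16, device_map="auto"
).eval()
processor = AutoProcessor.from_pretrained(model_name)

# Build a single-turn multimodal prompt, as model_inference does.
image = load_image("https://example.com/illustration.png")  # placeholder URL
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Describe this illustration."},
        ],
    }
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=model.dtype)

# generate() runs in a worker thread; the streamer yields decoded text
# chunks in the main thread, which is what lets the Gradio handler
# `yield` partial output while generation is still in progress.
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
Thread(target=model.generate, kwargs=generation_kwargs).start()
for new_text in streamer:
    print(new_text, end="", flush=True)
```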
requirements.txt CHANGED
@@ -0,0 +1,6 @@
+gradio
+transformers
+peft
+torch
+num2words
+torchvision