prithivMLmods committed
Commit dbb6dae · verified · 1 Parent(s): a624742

Update app.py

Files changed (1)
  1. app.py +140 -89
app.py CHANGED
@@ -17,8 +17,8 @@ from transformers import (
17
  Qwen2_5_VLForConditionalGeneration,
18
  AutoProcessor,
19
  AutoModelForImageTextToText,
20
- AutoModelForVision2Seq,
21
- AutoTokenizer
22
  )
23
  from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
24
  from qwen_vl_utils import process_vision_info
@@ -26,6 +26,9 @@ from qwen_vl_utils import process_vision_info
26
  from gradio.themes import Soft
27
  from gradio.themes.utils import colors, fonts, sizes
28
 
29
  colors.orange_red = colors.Color(
30
  name="orange_red",
31
  c50="#FFF0E5",
@@ -97,6 +100,10 @@ orange_red_theme = OrangeRedTheme()
97
  device = "cuda" if torch.cuda.is_available() else "cpu"
98
  print(f"Running on device: {device}")
99
 
100
  print("🔄 Loading Fara-7B...")
101
  MODEL_ID_V = "microsoft/Fara-7B"
102
  try:
@@ -140,22 +147,27 @@ except Exception as e:
140
  processor_h = None
141
 
142
  print("🔄 Loading ActIO-UI-7B...")
143
- MODEL_ID_A = "Uniphore/actio-ui-7b-rlvr"
144
  try:
145
- processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
146
- model_a = AutoModelForVision2Seq.from_pretrained(
147
- MODEL_ID_A,
 
148
  trust_remote_code=True,
149
- torch_dtype="auto",
150
- device_map=device
151
- ).eval()
152
  except Exception as e:
153
- print(f"Failed to load ActIO: {e}")
154
- model_a = None
155
- processor_a = None
156
 
157
  print("✅ Models loading sequence complete.")
158
 
159
  def array_to_image(image_array: np.ndarray) -> Image.Image:
160
  if image_array is None: raise ValueError("No image provided.")
161
  return Image.fromarray(np.uint8(image_array))
@@ -171,13 +183,13 @@ def get_image_proc_params(processor) -> Dict[str, int]:
171
  min_pixels = getattr(ip, "min_pixels", default_min)
172
  max_pixels = getattr(ip, "max_pixels", default_max)
173
 
174
- # Some configs hide size in a dict
175
  size_config = getattr(ip, "size", {})
176
  if isinstance(size_config, dict):
177
  if "shortest_edge" in size_config:
178
- min_pixels = size_config.get("shortest_edge", default_min)
179
  if "longest_edge" in size_config:
180
- max_pixels = size_config.get("longest_edge", default_max)
181
 
182
  if min_pixels is None: min_pixels = default_min
183
  if max_pixels is None: max_pixels = default_max
@@ -190,11 +202,12 @@ def get_image_proc_params(processor) -> Dict[str, int]:
190
  }
191
 
192
  def apply_chat_template_compat(processor, messages: List[Dict[str, Any]], thinking: bool = True) -> str:
193
- # Handles compat for models that support/don't support the 'thinking' arg
194
  if hasattr(processor, "apply_chat_template"):
195
  try:
196
  return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, thinking=thinking)
197
  except TypeError:
 
198
  return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
199
 
200
  tok = getattr(processor, "tokenizer", None)
@@ -211,6 +224,10 @@ def trim_generated(generated_ids, inputs):
211
  return generated_ids
212
  return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
213
 
 
214
  def get_fara_prompt(task, image):
215
  OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
216
  You need to generate the next action to complete the task.
@@ -263,28 +280,32 @@ def get_actio_prompt(task, image):
263
  "You are a GUI agent. You are given a task and a screenshot of the screen. "
264
  "You need to perform a series of pyautogui actions to complete the task."
265
  )
266
- # ActIO specific format request
267
- user_text = (
268
- "Please perform the following task by providing the action and the coordinates "
269
- "in the format of <action>(x, y): " + task
270
  )
271
-
272
  return [
273
- {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
274
  {
275
  "role": "user",
276
  "content": [
277
- {"type": "text", "text": user_text},
278
  {"type": "image", "image": image},
279
  ],
280
  },
281
  ]
282
 
 
283
  def parse_click_response(text: str) -> List[Dict]:
284
  actions = []
285
  text = text.strip()
286
 
287
- matches_click = re.findall(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
 
 
288
  for m in matches_click:
289
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
290
 
@@ -296,6 +317,7 @@ def parse_click_response(text: str) -> List[Dict]:
296
  for m in matches_box:
297
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
298
 
 
299
  if not actions:
300
  matches_tuple = re.findall(r"(?:^|\s)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:$|\s|,)", text)
301
  for m in matches_tuple:
@@ -339,28 +361,31 @@ def parse_holo2_response(response: str) -> List[Dict]:
339
  "x": int(match.group(1)),
340
  "y": int(match.group(2)),
341
  "text": "Holo2",
342
- "norm": True # 0-1000 scale
343
  })
 
344
  return actions
345
 
346
- def parse_actio_response(text: str) -> List[Dict]:
 
 
347
  actions = []
348
- text = text.strip()
349
- # Pattern for <action>(x, y) e.g., click(500, 300) or type(200, 200)
350
- # Also handles optional text inside or loosely formatted
351
- pattern = r"([a-zA-Z_]+)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)"
352
- matches = re.findall(pattern, text)
353
-
354
- for m in matches:
355
  actions.append({
356
- "type": m[0],
357
- "x": int(m[1]),
358
- "y": int(m[2]),
359
- "text": text,
360
- "norm": False # ActIO usually outputs absolute pixels relative to input image
361
  })
362
  return actions
363
 
 
364
  def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
365
  if not actions: return None
366
  img_copy = original_image.copy()
@@ -379,32 +404,38 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
379
 
380
  color = 'red' if 'click' in act['type'].lower() else 'blue'
381
 
382
- # Crosshair
383
  line_len = 15
384
  width = 4
 
385
  draw.line((pixel_x - line_len, pixel_y, pixel_x + line_len, pixel_y), fill=color, width=width)
 
386
  draw.line((pixel_x, pixel_y - line_len, pixel_x, pixel_y + line_len), fill=color, width=width)
387
 
388
- # Circle
389
  r = 20
390
  draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=3)
391
 
392
- label = f"{act['type'].capitalize()}"
393
- if act.get('text') and len(act['text']) < 20:
394
- label += f": \"{act['text']}\""
395
 
396
  text_pos = (pixel_x + 25, pixel_y - 15)
397
 
 
398
  try:
399
  bbox = draw.textbbox(text_pos, label, font=font)
400
  padded_bbox = (bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2)
401
  draw.rectangle(padded_bbox, fill="yellow", outline=color)
402
  draw.text(text_pos, label, fill="black", font=font)
403
- except Exception:
404
  draw.text(text_pos, label, fill="white")
405
 
406
  return img_copy
407
 
 
408
  @spaces.GPU
409
  def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
410
  if input_numpy_image is None: return "⚠️ Please upload an image.", None
@@ -415,6 +446,9 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
415
  actions = []
416
  raw_response = ""
417
 
 
 
 
418
  if model_choice == "Fara-7B":
419
  if model_v is None: return "Error: Fara model failed to load.", None
420
  print("Using Fara Pipeline...")
@@ -437,48 +471,11 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
437
 
438
  generated_ids = trim_generated(generated_ids, inputs)
439
  raw_response = processor_v.batch_decode(generated_ids, skip_special_tokens=True)[0]
440
-
441
  actions = parse_fara_response(raw_response)
442
 
443
- elif model_choice == "ActIO-UI-7B":
444
- if model_a is None: return "Error: ActIO model failed to load.", None
445
- print("Using ActIO-UI Pipeline...")
446
-
447
- model, processor = model_a, processor_a
448
- ip_params = get_image_proc_params(processor)
449
-
450
- # Resize for performance and standard input compliance
451
- resized_h, resized_w = smart_resize(
452
- input_pil_image.height, input_pil_image.width,
453
- factor=ip_params["patch_size"] * ip_params["merge_size"],
454
- min_pixels=ip_params["min_pixels"],
455
- max_pixels=ip_params["max_pixels"],
456
- )
457
- proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
458
-
459
- messages = get_actio_prompt(task, proc_image)
460
- text_prompt = apply_chat_template_compat(processor, messages)
461
-
462
- # ActIO/Qwen processors usually handle image list via processor call
463
- inputs = processor(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
464
- inputs = {k: v.to(device) for k, v in inputs.items()}
465
-
466
- with torch.no_grad():
467
- generated_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)
468
-
469
- generated_ids = trim_generated(generated_ids, inputs)
470
- raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
471
-
472
- actions = parse_actio_response(raw_response)
473
-
474
- # Scale coordinates (Resized -> Original)
475
- if resized_w > 0 and resized_h > 0:
476
- scale_x = orig_w / resized_w
477
- scale_y = orig_h / resized_h
478
- for a in actions:
479
- a['x'] = int(a['x'] * scale_x)
480
- a['y'] = int(a['y'] * scale_y)
481
-
482
  elif model_choice == "Holo2-4B":
483
  if model_h is None: return "Error: Holo2 model failed to load.", None
484
  print("Using Holo2-4B Pipeline...")
@@ -505,14 +502,17 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
505
 
506
  generated_ids = trim_generated(generated_ids, inputs)
507
  raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
508
-
509
  actions = parse_holo2_response(raw_response)
510
 
 
511
  for a in actions:
512
  if a.get('norm', False):
513
  a['x'] = (a['x'] / 1000.0) * orig_w
514
  a['y'] = (a['y'] / 1000.0) * orig_h
515
 
 
 
 
516
  elif model_choice == "UI-TARS-1.5-7B":
517
  if model_x is None: return "Error: UI-TARS model failed to load.", None
518
  print("Using UI-TARS Pipeline...")
@@ -539,9 +539,9 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
539
 
540
  generated_ids = trim_generated(generated_ids, inputs)
541
  raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
542
-
543
  actions = parse_click_response(raw_response)
544
 
 
545
  if resized_w > 0 and resized_h > 0:
546
  scale_x = orig_w / resized_w
547
  scale_y = orig_h / resized_h
@@ -549,6 +549,54 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
549
  a['x'] = int(a['x'] * scale_x)
550
  a['y'] = int(a['y'] * scale_y)
551
 
552
  else:
553
  return f"Error: Unknown model '{model_choice}'", None
554
 
@@ -562,6 +610,9 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
562
 
563
  return raw_response, output_image
564
 
 
 
 
565
  css="""
566
  #col-container {
567
  margin: 0 auto;
@@ -571,7 +622,7 @@ css="""
571
  """
572
  with gr.Blocks() as demo:
573
  gr.Markdown("# **CUA GUI Operator 🖥️**", elem_id="main-title")
574
- gr.Markdown("Perform Computer Use Agent tasks with the models: [Fara-7B](https://huggingface.co/microsoft/Fara-7B), [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B), [Holo2-4B](https://huggingface.co/Hcompany/Holo2-4B) and [ActIO-UI-7B](https://huggingface.co/Uniphore/actio-ui-7b-rlvr).")
575
 
576
  with gr.Row():
577
  with gr.Column(scale=2):
@@ -579,7 +630,7 @@ with gr.Blocks() as demo:
579
 
580
  with gr.Row():
581
  model_choice = gr.Radio(
582
- choices=["Fara-7B", "UI-TARS-1.5-7B", "ActIO-UI-7B", "Holo2-4B"],
583
  label="Select Model",
584
  value="Fara-7B",
585
  interactive=True
@@ -606,8 +657,8 @@ with gr.Blocks() as demo:
606
  examples=[
607
  ["examples/1.png", "Click on the Fara-7B model.", "Fara-7B"],
608
  ["examples/2.png", "Click on the VLMs Collection", "UI-TARS-1.5-7B"],
609
- ["examples/2.png", "Search for 'PRO'", "ActIO-UI-7B"],
610
  ["examples/3.png", "Click on the 'Real-time vision models' collection.", "Holo2-4B"],
 
611
  ],
612
  inputs=[input_image, task_input, model_choice],
613
  label="Quick Examples"
 
17
  Qwen2_5_VLForConditionalGeneration,
18
  AutoProcessor,
19
  AutoModelForImageTextToText,
20
+ AutoTokenizer,
21
+ AutoModelForVision2Seq
22
  )
23
  from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
24
  from qwen_vl_utils import process_vision_info
 
26
  from gradio.themes import Soft
27
  from gradio.themes.utils import colors, fonts, sizes
28
 
29
+ # -----------------------------------------------------------------------------
30
+ # Theme Configuration
31
+ # -----------------------------------------------------------------------------
32
  colors.orange_red = colors.Color(
33
  name="orange_red",
34
  c50="#FFF0E5",
 
100
  device = "cuda" if torch.cuda.is_available() else "cpu"
101
  print(f"Running on device: {device}")
102
 
103
+ # -----------------------------------------------------------------------------
104
+ # Model Loading
105
+ # -----------------------------------------------------------------------------
106
+
107
  print("🔄 Loading Fara-7B...")
108
  MODEL_ID_V = "microsoft/Fara-7B"
109
  try:
 
147
  processor_h = None
148
 
149
  print("🔄 Loading ActIO-UI-7B...")
150
+ MODEL_ID_ACT = "Uniphore/actio-ui-7b-rlvr"
151
  try:
152
+ # ActIO usually relies on Qwen2VL architecture structure
153
+ processor_act = AutoProcessor.from_pretrained(MODEL_ID_ACT, trust_remote_code=True)
154
+ model_act = AutoModelForVision2Seq.from_pretrained(
155
+ MODEL_ID_ACT,
156
  trust_remote_code=True,
157
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
158
+ device_map=None # We will move to device manually to control memory
159
+ ).to(device).eval()
160
  except Exception as e:
161
+ print(f"Failed to load ActIO-UI: {e}")
162
+ model_act = None
163
+ processor_act = None
164
 
165
  print("✅ Models loading sequence complete.")
166
 
167
+ # -----------------------------------------------------------------------------
168
+ # Helper Functions
169
+ # -----------------------------------------------------------------------------
170
+
171
  def array_to_image(image_array: np.ndarray) -> Image.Image:
172
  if image_array is None: raise ValueError("No image provided.")
173
  return Image.fromarray(np.uint8(image_array))
 
183
  min_pixels = getattr(ip, "min_pixels", default_min)
184
  max_pixels = getattr(ip, "max_pixels", default_max)
185
 
186
+ # Holo2/Qwen specific sizing sometimes in 'size' dict
187
  size_config = getattr(ip, "size", {})
188
  if isinstance(size_config, dict):
189
  if "shortest_edge" in size_config:
190
+ min_pixels = size_config["shortest_edge"]
191
  if "longest_edge" in size_config:
192
+ max_pixels = size_config["longest_edge"]
193
 
194
  if min_pixels is None: min_pixels = default_min
195
  if max_pixels is None: max_pixels = default_max
 
202
  }
203
 
204
  def apply_chat_template_compat(processor, messages: List[Dict[str, Any]], thinking: bool = True) -> str:
205
+ # Holo2 specific: allows turning thinking off in template
206
  if hasattr(processor, "apply_chat_template"):
207
  try:
208
  return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, thinking=thinking)
209
  except TypeError:
210
+ # Fallback for processors that don't support 'thinking' kwarg
211
  return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
212
 
213
  tok = getattr(processor, "tokenizer", None)
 
224
  return generated_ids
225
  return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
226
 
227
+ # -----------------------------------------------------------------------------
228
+ # Prompt Construction
229
+ # -----------------------------------------------------------------------------
230
+
231
  def get_fara_prompt(task, image):
232
  OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
233
  You need to generate the next action to complete the task.
 
280
  "You are a GUI agent. You are given a task and a screenshot of the screen. "
281
  "You need to perform a series of pyautogui actions to complete the task."
282
  )
283
+ instruction_text = (
284
+ "Please perform the following task by providing the action and the coordinates in the format of <action>(x, y): "
285
+ + task
 
286
  )
 
287
  return [
288
+ {"role": "system", "content": system_prompt},
289
  {
290
  "role": "user",
291
  "content": [
292
+ {"type": "text", "text": instruction_text},
293
  {"type": "image", "image": image},
294
  ],
295
  },
296
  ]
297
 
298
+ # -----------------------------------------------------------------------------
299
+ # Output Parsing
300
+ # -----------------------------------------------------------------------------
301
+
302
  def parse_click_response(text: str) -> List[Dict]:
303
  actions = []
304
  text = text.strip()
305
 
306
+ # Generic Point parsing (ActIO uses similar click(x,y) format often)
307
+ # Looking for Click(x, y), left_click(x, y), etc.
308
+ matches_click = re.findall(r"(?:click|left_click|right_click|double_click)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
309
  for m in matches_click:
310
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
311
 
 
317
  for m in matches_box:
318
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
319
 
320
+ # Fallback tuple
321
  if not actions:
322
  matches_tuple = re.findall(r"(?:^|\s)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:$|\s|,)", text)
323
  for m in matches_tuple:
 
361
  "x": int(match.group(1)),
362
  "y": int(match.group(2)),
363
  "text": "Holo2",
364
+ "norm": True
365
  })
366
+ return actions
367
  return actions
368
 
369
+ def parse_actio_response(response: str) -> List[Dict]:
370
+ # Expected format: <action>(x, y) e.g., click(551, 355)
371
+ # It might also just output "click(551, 355)" or "left_click(551, 355)"
372
  actions = []
373
+ # General regex for name(x, y)
374
+ matches = re.findall(r"([a-zA-Z_]+)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", response)
375
+ for action_name, x, y in matches:
376
  actions.append({
377
+ "type": action_name,
378
+ "x": int(x),
379
+ "y": int(y),
380
+ "text": "",
381
+ "norm": False # ActIO usually outputs absolute coordinates relative to input image
382
  })
383
  return actions
384
 
385
+ # -----------------------------------------------------------------------------
386
+ # Visualization
387
+ # -----------------------------------------------------------------------------
388
+
389
  def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
390
  if not actions: return None
391
  img_copy = original_image.copy()
 
404
 
405
  color = 'red' if 'click' in act['type'].lower() else 'blue'
406
 
407
+ # Draw Crosshair
408
  line_len = 15
409
  width = 4
410
+ # Horizontal
411
  draw.line((pixel_x - line_len, pixel_y, pixel_x + line_len, pixel_y), fill=color, width=width)
412
+ # Vertical
413
  draw.line((pixel_x, pixel_y - line_len, pixel_x, pixel_y + line_len), fill=color, width=width)
414
 
415
+ # Outer Circle
416
  r = 20
417
  draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=3)
418
 
419
+ label = f"{act['type']}"
420
+ if act.get('text'): label += f": \"{act['text']}\""
 
421
 
422
  text_pos = (pixel_x + 25, pixel_y - 15)
423
 
424
+ # Label with background
425
  try:
426
  bbox = draw.textbbox(text_pos, label, font=font)
427
  padded_bbox = (bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2)
428
  draw.rectangle(padded_bbox, fill="yellow", outline=color)
429
  draw.text(text_pos, label, fill="black", font=font)
430
+ except Exception as e:
431
  draw.text(text_pos, label, fill="white")
432
 
433
  return img_copy
434
 
435
+ # -----------------------------------------------------------------------------
436
+ # Main Processing Logic
437
+ # -----------------------------------------------------------------------------
438
+
439
  @spaces.GPU
440
  def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
441
  if input_numpy_image is None: return "⚠️ Please upload an image.", None
 
446
  actions = []
447
  raw_response = ""
448
 
449
+ # ==========================
450
+ # FARA-7B
451
+ # ==========================
452
  if model_choice == "Fara-7B":
453
  if model_v is None: return "Error: Fara model failed to load.", None
454
  print("Using Fara Pipeline...")
 
471
 
472
  generated_ids = trim_generated(generated_ids, inputs)
473
  raw_response = processor_v.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
474
  actions = parse_fara_response(raw_response)
475
 
476
+ # ==========================
477
+ # HOLO2-4B
478
+ # ==========================
 
479
  elif model_choice == "Holo2-4B":
480
  if model_h is None: return "Error: Holo2 model failed to load.", None
481
  print("Using Holo2-4B Pipeline...")
 
502
 
503
  generated_ids = trim_generated(generated_ids, inputs)
504
  raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
505
  actions = parse_holo2_response(raw_response)
506
 
507
+ # Scale Holo2 coordinates (Normalized 0-1000 -> Original Pixel)
508
  for a in actions:
509
  if a.get('norm', False):
510
  a['x'] = (a['x'] / 1000.0) * orig_w
511
  a['y'] = (a['y'] / 1000.0) * orig_h
512
 
513
+ # ==========================
514
+ # UI-TARS
515
+ # ==========================
516
  elif model_choice == "UI-TARS-1.5-7B":
517
  if model_x is None: return "Error: UI-TARS model failed to load.", None
518
  print("Using UI-TARS Pipeline...")
 
539
 
540
  generated_ids = trim_generated(generated_ids, inputs)
541
  raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
542
  actions = parse_click_response(raw_response)
543
 
544
+ # Scale UI-TARS coordinates (Resized Pixel -> Original Pixel)
545
  if resized_w > 0 and resized_h > 0:
546
  scale_x = orig_w / resized_w
547
  scale_y = orig_h / resized_h
 
549
  a['x'] = int(a['x'] * scale_x)
550
  a['y'] = int(a['y'] * scale_y)
551
 
552
+ # ==========================
553
+ # ActIO-UI-7B
554
+ # ==========================
555
+ elif model_choice == "ActIO-UI-7B":
556
+ if model_act is None: return "Error: ActIO model failed to load.", None
557
+ print("Using ActIO-UI Pipeline...")
558
+
559
+ model, processor = model_act, processor_act
560
+
561
+ # ActIO generally uses Qwen2-VL like processing
562
+ # We need to construct the prompt with text and image
563
+ messages = get_actio_prompt(task, input_pil_image)
564
+
565
+ text_prompt = processor.apply_chat_template(
566
+ messages, tokenize=False, add_generation_prompt=True
567
+ )
568
+
569
+ # ActIO typically works with standard RGB images
570
+ inputs = processor(
571
+ text=[text_prompt],
572
+ images=[input_pil_image],
573
+ padding=True,
574
+ return_tensors="pt"
575
+ )
576
+ inputs = {k: v.to(device) for k, v in inputs.items()}
577
+
578
+ with torch.no_grad():
579
+ generated_ids = model.generate(
580
+ **inputs,
581
+ max_new_tokens=1024, # ActIO allows verbose output sometimes
582
+ do_sample=False,
583
+ )
584
+
585
+ generated_ids = trim_generated(generated_ids, inputs)
586
+ raw_response = processor.batch_decode(
587
+ generated_ids,
588
+ skip_special_tokens=True,
589
+ clean_up_tokenization_spaces=False
590
+ )[0]
591
+
592
+ actions = parse_actio_response(raw_response)
593
+
594
+ # ActIO usually outputs absolute coordinates based on the input image resolution provided to the processor.
595
+ # Since we passed the original PIL image (unless resized internally by the processor to something wildly different),
596
+ # these coords are usually correct. If ActIO resizes internally and outputs coords relative to resize,
597
+ # we might need scaling, but standard usage implies absolute.
598
+ pass
599
+
600
  else:
601
  return f"Error: Unknown model '{model_choice}'", None
602
 
 
610
 
611
  return raw_response, output_image
612
 
613
+ # -----------------------------------------------------------------------------
614
+ # Gradio UI
615
+ # -----------------------------------------------------------------------------
616
  css="""
617
  #col-container {
618
  margin: 0 auto;
 
622
  """
623
  with gr.Blocks() as demo:
624
  gr.Markdown("# **CUA GUI Operator 🖥️**", elem_id="main-title")
625
+ gr.Markdown("Perform Computer Use Agent tasks with the models: [Fara-7B](https://huggingface.co/microsoft/Fara-7B), [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B), [Holo2-4B](https://huggingface.co/Hcompany/Holo2-4B), and [ActIO-UI-7B](https://huggingface.co/Uniphore/actio-ui-7b-rlvr).")
626
 
627
  with gr.Row():
628
  with gr.Column(scale=2):
 
630
 
631
  with gr.Row():
632
  model_choice = gr.Radio(
633
+ choices=["Fara-7B", "UI-TARS-1.5-7B", "Holo2-4B", "ActIO-UI-7B"],
634
  label="Select Model",
635
  value="Fara-7B",
636
  interactive=True
 
657
  examples=[
658
  ["examples/1.png", "Click on the Fara-7B model.", "Fara-7B"],
659
  ["examples/2.png", "Click on the VLMs Collection", "UI-TARS-1.5-7B"],
 
660
  ["examples/3.png", "Click on the 'Real-time vision models' collection.", "Holo2-4B"],
661
+ ["examples/1.png", "Click on the Fara-7B model.", "ActIO-UI-7B"],
662
  ],
663
  inputs=[input_image, task_input, model_choice],
664
  label="Quick Examples"