prithivMLmods committed
Commit dbb6dae · verified · 1 Parent(s): a624742

Update app.py

Files changed (1)
  1. app.py +140 -89
app.py CHANGED
@@ -17,8 +17,8 @@ from transformers import (
17
  Qwen2_5_VLForConditionalGeneration,
18
  AutoProcessor,
19
  AutoModelForImageTextToText,
20
- AutoModelForVision2Seq,
21
- AutoTokenizer
22
  )
23
  from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
24
  from qwen_vl_utils import process_vision_info
@@ -26,6 +26,9 @@ from qwen_vl_utils import process_vision_info
26
  from gradio.themes import Soft
27
  from gradio.themes.utils import colors, fonts, sizes
28
 
29
  colors.orange_red = colors.Color(
30
  name="orange_red",
31
  c50="#FFF0E5",
@@ -97,6 +100,10 @@ orange_red_theme = OrangeRedTheme()
97
  device = "cuda" if torch.cuda.is_available() else "cpu"
98
  print(f"Running on device: {device}")
99
 
100
  print("🔄 Loading Fara-7B...")
101
  MODEL_ID_V = "microsoft/Fara-7B"
102
  try:
@@ -140,22 +147,27 @@ except Exception as e:
140
  processor_h = None
141
 
142
  print("🔄 Loading ActIO-UI-7B...")
143
- MODEL_ID_A = "Uniphore/actio-ui-7b-rlvr"
144
  try:
145
- processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
146
- model_a = AutoModelForVision2Seq.from_pretrained(
147
- MODEL_ID_A,
 
148
  trust_remote_code=True,
149
- torch_dtype="auto",
150
- device_map=device
151
- ).eval()
152
  except Exception as e:
153
- print(f"Failed to load ActIO: {e}")
154
- model_a = None
155
- processor_a = None
156
 
157
  print("✅ Models loading sequence complete.")
158
 
159
  def array_to_image(image_array: np.ndarray) -> Image.Image:
160
  if image_array is None: raise ValueError("No image provided.")
161
  return Image.fromarray(np.uint8(image_array))
@@ -171,13 +183,13 @@ def get_image_proc_params(processor) -> Dict[str, int]:
171
  min_pixels = getattr(ip, "min_pixels", default_min)
172
  max_pixels = getattr(ip, "max_pixels", default_max)
173
 
174
- # Some configs hide size in a dict
175
  size_config = getattr(ip, "size", {})
176
  if isinstance(size_config, dict):
177
  if "shortest_edge" in size_config:
178
- min_pixels = size_config.get("shortest_edge", default_min)
179
  if "longest_edge" in size_config:
180
- max_pixels = size_config.get("longest_edge", default_max)
181
 
182
  if min_pixels is None: min_pixels = default_min
183
  if max_pixels is None: max_pixels = default_max
@@ -190,11 +202,12 @@ def get_image_proc_params(processor) -> Dict[str, int]:
190
  }
191
 
192
  def apply_chat_template_compat(processor, messages: List[Dict[str, Any]], thinking: bool = True) -> str:
193
- # Handles compat for models that support/don't support the 'thinking' arg
194
  if hasattr(processor, "apply_chat_template"):
195
  try:
196
  return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, thinking=thinking)
197
  except TypeError:
 
198
  return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
199
 
200
  tok = getattr(processor, "tokenizer", None)
@@ -211,6 +224,10 @@ def trim_generated(generated_ids, inputs):
211
  return generated_ids
212
  return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
213
 
 
214
  def get_fara_prompt(task, image):
215
  OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
216
  You need to generate the next action to complete the task.
@@ -263,28 +280,32 @@ def get_actio_prompt(task, image):
263
  "You are a GUI agent. You are given a task and a screenshot of the screen. "
264
  "You need to perform a series of pyautogui actions to complete the task."
265
  )
266
- # ActIO specific format request
267
- user_text = (
268
- "Please perform the following task by providing the action and the coordinates "
269
- "in the format of <action>(x, y): " + task
270
  )
271
-
272
  return [
273
- {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
274
  {
275
  "role": "user",
276
  "content": [
277
- {"type": "text", "text": user_text},
278
  {"type": "image", "image": image},
279
  ],
280
  },
281
  ]
282
 
 
283
  def parse_click_response(text: str) -> List[Dict]:
284
  actions = []
285
  text = text.strip()
286
 
287
- matches_click = re.findall(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
 
 
288
  for m in matches_click:
289
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
290
 
@@ -296,6 +317,7 @@ def parse_click_response(text: str) -> List[Dict]:
296
  for m in matches_box:
297
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
298
 
 
299
  if not actions:
300
  matches_tuple = re.findall(r"(?:^|\s)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:$|\s|,)", text)
301
  for m in matches_tuple:
@@ -339,28 +361,31 @@ def parse_holo2_response(response: str) -> List[Dict]:
339
  "x": int(match.group(1)),
340
  "y": int(match.group(2)),
341
  "text": "Holo2",
342
- "norm": True # 0-1000 scale
343
  })
 
344
  return actions
345
 
346
- def parse_actio_response(text: str) -> List[Dict]:
 
 
347
  actions = []
348
- text = text.strip()
349
- # Pattern for <action>(x, y) e.g., click(500, 300) or type(200, 200)
350
- # Also handles optional text inside or loosely formatted
351
- pattern = r"([a-zA-Z_]+)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)"
352
- matches = re.findall(pattern, text)
353
-
354
- for m in matches:
355
  actions.append({
356
- "type": m[0],
357
- "x": int(m[1]),
358
- "y": int(m[2]),
359
- "text": text,
360
- "norm": False # ActIO usually outputs absolute pixels relative to input image
361
  })
362
  return actions
363
 
 
364
  def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
365
  if not actions: return None
366
  img_copy = original_image.copy()
@@ -379,32 +404,38 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
379
 
380
  color = 'red' if 'click' in act['type'].lower() else 'blue'
381
 
382
- # Crosshair
383
  line_len = 15
384
  width = 4
 
385
  draw.line((pixel_x - line_len, pixel_y, pixel_x + line_len, pixel_y), fill=color, width=width)
 
386
  draw.line((pixel_x, pixel_y - line_len, pixel_x, pixel_y + line_len), fill=color, width=width)
387
 
388
- # Circle
389
  r = 20
390
  draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=3)
391
 
392
- label = f"{act['type'].capitalize()}"
393
- if act.get('text') and len(act['text']) < 20:
394
- label += f": \"{act['text']}\""
395
 
396
  text_pos = (pixel_x + 25, pixel_y - 15)
397
 
 
398
  try:
399
  bbox = draw.textbbox(text_pos, label, font=font)
400
  padded_bbox = (bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2)
401
  draw.rectangle(padded_bbox, fill="yellow", outline=color)
402
  draw.text(text_pos, label, fill="black", font=font)
403
- except Exception:
404
  draw.text(text_pos, label, fill="white")
405
 
406
  return img_copy
407
 
 
408
  @spaces.GPU
409
  def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
410
  if input_numpy_image is None: return "⚠️ Please upload an image.", None
@@ -415,6 +446,9 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
415
  actions = []
416
  raw_response = ""
417
 
 
 
 
418
  if model_choice == "Fara-7B":
419
  if model_v is None: return "Error: Fara model failed to load.", None
420
  print("Using Fara Pipeline...")
@@ -437,48 +471,11 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
437
 
438
  generated_ids = trim_generated(generated_ids, inputs)
439
  raw_response = processor_v.batch_decode(generated_ids, skip_special_tokens=True)[0]
440
-
441
  actions = parse_fara_response(raw_response)
442
 
443
- elif model_choice == "ActIO-UI-7B":
444
- if model_a is None: return "Error: ActIO model failed to load.", None
445
- print("Using ActIO-UI Pipeline...")
446
-
447
- model, processor = model_a, processor_a
448
- ip_params = get_image_proc_params(processor)
449
-
450
- # Resize for performance and standard input compliance
451
- resized_h, resized_w = smart_resize(
452
- input_pil_image.height, input_pil_image.width,
453
- factor=ip_params["patch_size"] * ip_params["merge_size"],
454
- min_pixels=ip_params["min_pixels"],
455
- max_pixels=ip_params["max_pixels"],
456
- )
457
- proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
458
-
459
- messages = get_actio_prompt(task, proc_image)
460
- text_prompt = apply_chat_template_compat(processor, messages)
461
-
462
- # ActIO/Qwen processors usually handle image list via processor call
463
- inputs = processor(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
464
- inputs = {k: v.to(device) for k, v in inputs.items()}
465
-
466
- with torch.no_grad():
467
- generated_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)
468
-
469
- generated_ids = trim_generated(generated_ids, inputs)
470
- raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
471
-
472
- actions = parse_actio_response(raw_response)
473
-
474
- # Scale coordinates (Resized -> Original)
475
- if resized_w > 0 and resized_h > 0:
476
- scale_x = orig_w / resized_w
477
- scale_y = orig_h / resized_h
478
- for a in actions:
479
- a['x'] = int(a['x'] * scale_x)
480
- a['y'] = int(a['y'] * scale_y)
481
-
482
  elif model_choice == "Holo2-4B":
483
  if model_h is None: return "Error: Holo2 model failed to load.", None
484
  print("Using Holo2-4B Pipeline...")
@@ -505,14 +502,17 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
505
 
506
  generated_ids = trim_generated(generated_ids, inputs)
507
  raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
508
-
509
  actions = parse_holo2_response(raw_response)
510
 
 
511
  for a in actions:
512
  if a.get('norm', False):
513
  a['x'] = (a['x'] / 1000.0) * orig_w
514
  a['y'] = (a['y'] / 1000.0) * orig_h
515
 
 
 
 
516
  elif model_choice == "UI-TARS-1.5-7B":
517
  if model_x is None: return "Error: UI-TARS model failed to load.", None
518
  print("Using UI-TARS Pipeline...")
@@ -539,9 +539,9 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
539
 
540
  generated_ids = trim_generated(generated_ids, inputs)
541
  raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
542
-
543
  actions = parse_click_response(raw_response)
544
 
 
545
  if resized_w > 0 and resized_h > 0:
546
  scale_x = orig_w / resized_w
547
  scale_y = orig_h / resized_h
@@ -549,6 +549,54 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
549
  a['x'] = int(a['x'] * scale_x)
550
  a['y'] = int(a['y'] * scale_y)
551
 
552
  else:
553
  return f"Error: Unknown model '{model_choice}'", None
554
 
@@ -562,6 +610,9 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
562
 
563
  return raw_response, output_image
564
 
 
 
 
565
  css="""
566
  #col-container {
567
  margin: 0 auto;
@@ -571,7 +622,7 @@ css="""
571
  """
572
  with gr.Blocks() as demo:
573
  gr.Markdown("# **CUA GUI Operator 🖥️**", elem_id="main-title")
574
- gr.Markdown("Perform Computer Use Agent tasks with the models: [Fara-7B](https://huggingface.co/microsoft/Fara-7B), [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B), [Holo2-4B](https://huggingface.co/Hcompany/Holo2-4B) and [ActIO-UI-7B](https://huggingface.co/Uniphore/actio-ui-7b-rlvr).")
575
 
576
  with gr.Row():
577
  with gr.Column(scale=2):
@@ -579,7 +630,7 @@ with gr.Blocks() as demo:
579
 
580
  with gr.Row():
581
  model_choice = gr.Radio(
582
- choices=["Fara-7B", "UI-TARS-1.5-7B", "ActIO-UI-7B", "Holo2-4B"],
583
  label="Select Model",
584
  value="Fara-7B",
585
  interactive=True
@@ -606,8 +657,8 @@ with gr.Blocks() as demo:
606
  examples=[
607
  ["examples/1.png", "Click on the Fara-7B model.", "Fara-7B"],
608
  ["examples/2.png", "Click on the VLMs Collection", "UI-TARS-1.5-7B"],
609
- ["examples/2.png", "Search for 'PRO'", "ActIO-UI-7B"],
610
  ["examples/3.png", "Click on the 'Real-time vision models' collection.", "Holo2-4B"],
 
611
  ],
612
  inputs=[input_image, task_input, model_choice],
613
  label="Quick Examples"
 
17
  Qwen2_5_VLForConditionalGeneration,
18
  AutoProcessor,
19
  AutoModelForImageTextToText,
20
+ AutoTokenizer,
21
+ AutoModelForVision2Seq
22
  )
23
  from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
24
  from qwen_vl_utils import process_vision_info
 
26
  from gradio.themes import Soft
27
  from gradio.themes.utils import colors, fonts, sizes
28
 
29
+ # -----------------------------------------------------------------------------
30
+ # Theme Configuration
31
+ # -----------------------------------------------------------------------------
32
  colors.orange_red = colors.Color(
33
  name="orange_red",
34
  c50="#FFF0E5",
 
100
  device = "cuda" if torch.cuda.is_available() else "cpu"
101
  print(f"Running on device: {device}")
102
 
103
+ # -----------------------------------------------------------------------------
104
+ # Model Loading
105
+ # -----------------------------------------------------------------------------
106
+
107
  print("🔄 Loading Fara-7B...")
108
  MODEL_ID_V = "microsoft/Fara-7B"
109
  try:
 
147
  processor_h = None
148
 
149
  print("🔄 Loading ActIO-UI-7B...")
150
+ MODEL_ID_ACT = "Uniphore/actio-ui-7b-rlvr"
151
  try:
152
+ # ActIO usually relies on Qwen2VL architecture structure
153
+ processor_act = AutoProcessor.from_pretrained(MODEL_ID_ACT, trust_remote_code=True)
154
+ model_act = AutoModelForVision2Seq.from_pretrained(
155
+ MODEL_ID_ACT,
156
  trust_remote_code=True,
157
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
158
+ device_map=None # We will move to device manually to control memory
159
+ ).to(device).eval()
160
  except Exception as e:
161
+ print(f"Failed to load ActIO-UI: {e}")
162
+ model_act = None
163
+ processor_act = None
164
 
165
  print("✅ Models loading sequence complete.")
166
 
167
+ # -----------------------------------------------------------------------------
168
+ # Helper Functions
169
+ # -----------------------------------------------------------------------------
170
+
171
  def array_to_image(image_array: np.ndarray) -> Image.Image:
172
  if image_array is None: raise ValueError("No image provided.")
173
  return Image.fromarray(np.uint8(image_array))
 
183
  min_pixels = getattr(ip, "min_pixels", default_min)
184
  max_pixels = getattr(ip, "max_pixels", default_max)
185
 
186
+ # Holo2/Qwen specific sizing sometimes in 'size' dict
187
  size_config = getattr(ip, "size", {})
188
  if isinstance(size_config, dict):
189
  if "shortest_edge" in size_config:
190
+ min_pixels = size_config["shortest_edge"]
191
  if "longest_edge" in size_config:
192
+ max_pixels = size_config["longest_edge"]
193
 
194
  if min_pixels is None: min_pixels = default_min
195
  if max_pixels is None: max_pixels = default_max
 
202
  }
203
 
204
  def apply_chat_template_compat(processor, messages: List[Dict[str, Any]], thinking: bool = True) -> str:
205
+ # Holo2 specific: allows turning thinking off in template
206
  if hasattr(processor, "apply_chat_template"):
207
  try:
208
  return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, thinking=thinking)
209
  except TypeError:
210
+ # Fallback for processors that don't support 'thinking' kwarg
211
  return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
212
 
213
  tok = getattr(processor, "tokenizer", None)
 
224
  return generated_ids
225
  return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
226
 
227
+ # -----------------------------------------------------------------------------
228
+ # Prompt Construction
229
+ # -----------------------------------------------------------------------------
230
+
231
  def get_fara_prompt(task, image):
232
  OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
233
  You need to generate the next action to complete the task.
 
280
  "You are a GUI agent. You are given a task and a screenshot of the screen. "
281
  "You need to perform a series of pyautogui actions to complete the task."
282
  )
283
+ instruction_text = (
284
+ "Please perform the following task by providing the action and the coordinates in the format of <action>(x, y): "
285
+ + task
 
286
  )
 
287
  return [
288
+ {"role": "system", "content": system_prompt},
289
  {
290
  "role": "user",
291
  "content": [
292
+ {"type": "text", "text": instruction_text},
293
  {"type": "image", "image": image},
294
  ],
295
  },
296
  ]
297
 
298
+ # -----------------------------------------------------------------------------
299
+ # Output Parsing
300
+ # -----------------------------------------------------------------------------
301
+
302
  def parse_click_response(text: str) -> List[Dict]:
303
  actions = []
304
  text = text.strip()
305
 
306
+ # Generic Point parsing (ActIO uses similar click(x,y) format often)
307
+ # Looking for Click(x, y), left_click(x, y), etc.
308
+ matches_click = re.findall(r"(?:click|left_click|right_click|double_click)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
309
  for m in matches_click:
310
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
311
 
 
317
  for m in matches_box:
318
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
319
 
320
+ # Fallback tuple
321
  if not actions:
322
  matches_tuple = re.findall(r"(?:^|\s)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:$|\s|,)", text)
323
  for m in matches_tuple:
 
361
  "x": int(match.group(1)),
362
  "y": int(match.group(2)),
363
  "text": "Holo2",
364
+ "norm": True
365
  })
366
+ return actions
367
  return actions
368
 
369
+ def parse_actio_response(response: str) -> List[Dict]:
370
+ # Expected format: <action>(x, y) e.g., click(551, 355)
371
+ # It might also just output "click(551, 355)" or "left_click(551, 355)"
372
  actions = []
373
+ # General regex for name(x, y)
374
+ matches = re.findall(r"([a-zA-Z_]+)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", response)
375
+ for action_name, x, y in matches:
376
  actions.append({
377
+ "type": action_name,
378
+ "x": int(x),
379
+ "y": int(y),
380
+ "text": "",
381
+ "norm": False # ActIO usually outputs absolute coordinates relative to input image
382
  })
383
  return actions
384
 
385
+ # -----------------------------------------------------------------------------
386
+ # Visualization
387
+ # -----------------------------------------------------------------------------
388
+
389
  def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
390
  if not actions: return None
391
  img_copy = original_image.copy()
 
404
 
405
  color = 'red' if 'click' in act['type'].lower() else 'blue'
406
 
407
+ # Draw Crosshair
408
  line_len = 15
409
  width = 4
410
+ # Horizontal
411
  draw.line((pixel_x - line_len, pixel_y, pixel_x + line_len, pixel_y), fill=color, width=width)
412
+ # Vertical
413
  draw.line((pixel_x, pixel_y - line_len, pixel_x, pixel_y + line_len), fill=color, width=width)
414
 
415
+ # Outer Circle
416
  r = 20
417
  draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=3)
418
 
419
+ label = f"{act['type']}"
420
+ if act.get('text'): label += f": \"{act['text']}\""
 
421
 
422
  text_pos = (pixel_x + 25, pixel_y - 15)
423
 
424
+ # Label with background
425
  try:
426
  bbox = draw.textbbox(text_pos, label, font=font)
427
  padded_bbox = (bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2)
428
  draw.rectangle(padded_bbox, fill="yellow", outline=color)
429
  draw.text(text_pos, label, fill="black", font=font)
430
+ except Exception as e:
431
  draw.text(text_pos, label, fill="white")
432
 
433
  return img_copy
434
 
435
+ # -----------------------------------------------------------------------------
436
+ # Main Processing Logic
437
+ # -----------------------------------------------------------------------------
438
+
439
  @spaces.GPU
440
  def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
441
  if input_numpy_image is None: return "⚠️ Please upload an image.", None
 
446
  actions = []
447
  raw_response = ""
448
 
449
+ # ==========================
450
+ # FARA-7B
451
+ # ==========================
452
  if model_choice == "Fara-7B":
453
  if model_v is None: return "Error: Fara model failed to load.", None
454
  print("Using Fara Pipeline...")
 
471
 
472
  generated_ids = trim_generated(generated_ids, inputs)
473
  raw_response = processor_v.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
474
  actions = parse_fara_response(raw_response)
475
 
476
+ # ==========================
477
+ # HOLO2-4B
478
+ # ==========================
 
479
  elif model_choice == "Holo2-4B":
480
  if model_h is None: return "Error: Holo2 model failed to load.", None
481
  print("Using Holo2-4B Pipeline...")
 
502
 
503
  generated_ids = trim_generated(generated_ids, inputs)
504
  raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
505
  actions = parse_holo2_response(raw_response)
506
 
507
+ # Scale Holo2 coordinates (Normalized 0-1000 -> Original Pixel)
508
  for a in actions:
509
  if a.get('norm', False):
510
  a['x'] = (a['x'] / 1000.0) * orig_w
511
  a['y'] = (a['y'] / 1000.0) * orig_h
512
 
513
+ # ==========================
514
+ # UI-TARS
515
+ # ==========================
516
  elif model_choice == "UI-TARS-1.5-7B":
517
  if model_x is None: return "Error: UI-TARS model failed to load.", None
518
  print("Using UI-TARS Pipeline...")
 
539
 
540
  generated_ids = trim_generated(generated_ids, inputs)
541
  raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
542
  actions = parse_click_response(raw_response)
543
 
544
+ # Scale UI-TARS coordinates (Resized Pixel -> Original Pixel)
545
  if resized_w > 0 and resized_h > 0:
546
  scale_x = orig_w / resized_w
547
  scale_y = orig_h / resized_h
 
549
  a['x'] = int(a['x'] * scale_x)
550
  a['y'] = int(a['y'] * scale_y)
551
 
552
+ # ==========================
553
+ # ActIO-UI-7B
554
+ # ==========================
555
+ elif model_choice == "ActIO-UI-7B":
556
+ if model_act is None: return "Error: ActIO model failed to load.", None
557
+ print("Using ActIO-UI Pipeline...")
558
+
559
+ model, processor = model_act, processor_act
560
+
561
+ # ActIO generally uses Qwen2-VL like processing
562
+ # We need to construct the prompt with text and image
563
+ messages = get_actio_prompt(task, input_pil_image)
564
+
565
+ text_prompt = processor.apply_chat_template(
566
+ messages, tokenize=False, add_generation_prompt=True
567
+ )
568
+
569
+ # ActIO typically works with standard RGB images
570
+ inputs = processor(
571
+ text=[text_prompt],
572
+ images=[input_pil_image],
573
+ padding=True,
574
+ return_tensors="pt"
575
+ )
576
+ inputs = {k: v.to(device) for k, v in inputs.items()}
577
+
578
+ with torch.no_grad():
579
+ generated_ids = model.generate(
580
+ **inputs,
581
+ max_new_tokens=1024, # ActIO allows verbose output sometimes
582
+ do_sample=False,
583
+ )
584
+
585
+ generated_ids = trim_generated(generated_ids, inputs)
586
+ raw_response = processor.batch_decode(
587
+ generated_ids,
588
+ skip_special_tokens=True,
589
+ clean_up_tokenization_spaces=False
590
+ )[0]
591
+
592
+ actions = parse_actio_response(raw_response)
593
+
594
+ # ActIO usually outputs absolute coordinates based on the input image resolution provided to the processor.
595
+ # Since we passed the original PIL image (unless resized internally by the processor to something wildly different),
596
+ # these coords are usually correct. If ActIO resizes internally and outputs coords relative to resize,
597
+ # we might need scaling, but standard usage implies absolute.
598
+ pass
599
+
600
  else:
601
  return f"Error: Unknown model '{model_choice}'", None
602
 
 
610
 
611
  return raw_response, output_image
612
 
613
+ # -----------------------------------------------------------------------------
614
+ # Gradio UI
615
+ # -----------------------------------------------------------------------------
616
  css="""
617
  #col-container {
618
  margin: 0 auto;
 
622
  """
623
  with gr.Blocks() as demo:
624
  gr.Markdown("# **CUA GUI Operator 🖥️**", elem_id="main-title")
625
+ gr.Markdown("Perform Computer Use Agent tasks with the models: [Fara-7B](https://huggingface.co/microsoft/Fara-7B), [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B), [Holo2-4B](https://huggingface.co/Hcompany/Holo2-4B), and [ActIO-UI-7B](https://huggingface.co/Uniphore/actio-ui-7b-rlvr).")
626
 
627
  with gr.Row():
628
  with gr.Column(scale=2):
 
630
 
631
  with gr.Row():
632
  model_choice = gr.Radio(
633
+ choices=["Fara-7B", "UI-TARS-1.5-7B", "Holo2-4B", "ActIO-UI-7B"],
634
  label="Select Model",
635
  value="Fara-7B",
636
  interactive=True
 
657
  examples=[
658
  ["examples/1.png", "Click on the Fara-7B model.", "Fara-7B"],
659
  ["examples/2.png", "Click on the VLMs Collection", "UI-TARS-1.5-7B"],
 
660
  ["examples/3.png", "Click on the 'Real-time vision models' collection.", "Holo2-4B"],
661
+ ["examples/1.png", "Click on the Fara-7B model.", "ActIO-UI-7B"],
662
  ],
663
  inputs=[input_image, task_input, model_choice],
664
  label="Quick Examples"