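"""Sanity check for a GUI-grounding vision-language model.

Loads the model with trust_remote_code, wraps a screenshot and an instruction
in a grounding prompt, and prints the raw model response.

Usage (the script filename is whatever you saved this file as):

    python sanity_check.py "Click on the submit button"

Expects an image at ./screenshot.png in the working directory.
"""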
import base64
import sys
import torch
from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
from PIL import Image


def encode_image(image_path: str) -> str:
    """Encode image to base64 string for model input."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode()


def load_model(
    model_path: str,
) -> tuple[AutoModelForVision2Seq, AutoTokenizer, AutoProcessor]:
    """Load OpenCUA model, tokenizer, and image processor."""
    # trust_remote_code=True downloads and runs the custom modeling code shipped
    # in the model repository; pass revision="<commit>" to the from_pretrained
    # calls below if you need to pin a specific version.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForVision2Seq.from_pretrained(
        model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
    )
    image_processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

    return model, tokenizer, image_processor


def create_grounding_messages(image_path: str, instruction: str) -> list[dict]:
    """Create chat messages for GUI grounding task."""
    system_prompt = (
        "You are a GUI agent. You are given a task and a screenshot of the screen. "
        "You need to perform a series of pyautogui actions to complete the task."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please perform the following task by providing the action and the coordinates in the format of <action>(x, y): "
                    + instruction,
                },
                {
                    "type": "image",
                    "image": f"data:image/png;base64,{encode_image(image_path)}",
                },
            ],
        },
    ]
    return messages
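
# Note: the base64 data URL in the messages above is what apply_chat_template
# sees when it inserts the image placeholder tokens; the actual pixel data is
# supplied separately as a PIL image in run_inference() below. (This split is
# an assumption about how this model's remote-code processor works.)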


def run_inference(
    model: AutoModelForVision2Seq,
    tokenizer: AutoTokenizer,
    image_processor: AutoProcessor,
    messages: list[dict],
    image_path: str,
) -> str:
    """Run inference on the model."""
    # Prepare text from messages
    text = image_processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Open image
    image = Image.open(image_path).convert("RGB")

    # Process inputs using the processor
    inputs = image_processor(
        text=[text], images=[image], padding=True, return_tensors="pt"
    )

    # Move inputs to model device
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=2048,
            do_sample=False,
        )

    # Decode output (skip the input tokens)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :]
        for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
    ]
    output_text = image_processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    return output_text
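

# A minimal sketch for pulling coordinates out of the reply. It assumes the
# model honors the "<action>(x, y)" format requested in the prompt (e.g.
# "pyautogui.click(512, 384)"); real outputs may deviate, so treat a None
# result as "could not parse". This helper is illustrative and is not called
# by main() below.
import re


def parse_action(output_text: str) -> tuple[str, int, int] | None:
    """Extract (action, x, y) from text like 'pyautogui.click(512, 384)'."""
    match = re.search(
        r"([\w.]+)\(\s*(?:x\s*=\s*)?(\d+)\s*,\s*(?:y\s*=\s*)?(\d+)", output_text
    )
    if match is None:
        return None
    return match.group(1), int(match.group(2)), int(match.group(3))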


def main():
    """Main function to run the sanity check."""
    # Configuration
    model_path = "Uniphore/actio-ui-7b-rlvr"  # or other model variants
    image_path = "screenshot.png"
    instruction = "Click on the submit button"

    # Check if custom instruction provided
    if len(sys.argv) > 1:
        instruction = " ".join(sys.argv[1:])

    print(f"Loading model from: {model_path}")
    try:
        model, tokenizer, image_processor = load_model(model_path)
        print("✓ Model loaded successfully")
    except Exception as e:
        print(f"✗ Error loading model: {e}")
        return 1

    print(f"Processing image: {image_path}")
    print(f"Instruction: {instruction}")

    try:
        messages = create_grounding_messages(image_path, instruction)
        result = run_inference(model, tokenizer, image_processor, messages, image_path)

        print("\n" + "=" * 60)
        print("MODEL OUTPUT:")
        print("=" * 60)
        print(result)
        print("=" * 60)
        return 0
    except Exception as e:
        print(f"✗ Error during inference: {e}")
        import traceback

        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())