Spaces:

anycoderapps
/

LongCat-Image-Edit

Running on Zero

File size: 16,931 Bytes

import spaces
import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor
from longcat_image.models import LongCatImageTransformer2DModel
from longcat_image.pipelines import LongCatImageEditPipeline, LongCatImagePipeline
import numpy as np

# Load models directly at startup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _load_pipeline(model_id, pipeline_cls):
    """Load the text processor, transformer and pipeline for one checkpoint.

    Both models in this file are loaded with the same three steps, so the
    sequence lives here instead of being duplicated per model.

    Args:
        model_id: Repository id passed to every ``from_pretrained`` call.
        pipeline_cls: Pipeline class to instantiate with the loaded parts
            (``LongCatImagePipeline`` or ``LongCatImageEditPipeline``).

    Returns:
        Tuple ``(text_processor, transformer, pipe)``. The transformer is
        loaded in bfloat16 and moved to ``device``; the pipeline is moved to
        ``device`` with dtype bfloat16.
    """
    text_processor = AutoProcessor.from_pretrained(
        model_id,
        subfolder='tokenizer',
    )

    transformer = LongCatImageTransformer2DModel.from_pretrained(
        model_id,
        subfolder='transformer',
        torch_dtype=torch.bfloat16,
        use_safetensors=True,
    ).to(device)

    # The pre-loaded transformer and text processor are injected so the
    # pipeline uses exactly the objects configured above.
    pipe = pipeline_cls.from_pretrained(
        model_id,
        transformer=transformer,
        text_processor=text_processor,
    )
    pipe.to(device, torch.bfloat16)

    return text_processor, transformer, pipe


# Text-to-Image Model
t2i_model_id = 'meituan-longcat/LongCat-Image'
print(f"🔄 Loading Text-to-Image model from {t2i_model_id}...")

t2i_text_processor, t2i_transformer, t2i_pipe = _load_pipeline(
    t2i_model_id, LongCatImagePipeline
)

print("✅ Text-to-Image model loaded successfully")

# Image Edit Model
edit_model_id = 'meituan-longcat/LongCat-Image-Edit'
print(f"🔄 Loading Image Edit model from {edit_model_id}...")

edit_text_processor, edit_transformer, edit_pipe = _load_pipeline(
    edit_model_id, LongCatImageEditPipeline
)

print(f"✅ Image Edit model loaded successfully on {device}")

@spaces.GPU(duration=120)
def generate_image(
    prompt: str,
    negative_prompt: str,
    width: int,
    height: int,
    guidance_scale: float,
    num_inference_steps: int,
    seed: int,
    enable_cfg_renorm: bool,
    enable_prompt_rewrite: bool,
    progress=gr.Progress()
):
    """Generate one image from a text prompt with the text-to-image pipeline.

    Args:
        prompt: Description of the image; must contain non-whitespace text.
        negative_prompt: Forwarded to the pipeline as ``negative_prompt``.
        width: Output width forwarded to the pipeline (coerced to ``int``).
        height: Output height forwarded to the pipeline (coerced to ``int``).
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``.
        num_inference_steps: Number of steps forwarded to the pipeline
            (coerced to ``int``).
        seed: Seed for the ``torch.Generator`` handed to the pipeline
            (coerced to ``int``).
        enable_cfg_renorm: Forwarded to the pipeline as ``enable_cfg_renorm``.
        enable_prompt_rewrite: Forwarded to the pipeline as
            ``enable_prompt_rewrite``.
        progress: Gradio progress tracker used to report coarse stages.

    Returns:
        The first entry of the pipeline output's ``images``.

    Raises:
        gr.Error: If the prompt is empty, or if anything fails while
            preparing or running the pipeline (original exception chained).
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt")

    try:
        progress(0.1, desc="Preparing generation...")

        # Slider values can arrive as floats (e.g. 42.0), and
        # torch.Generator.manual_seed only accepts integers, so normalise
        # every integer-valued setting before use.
        width = int(width)
        height = int(height)
        num_inference_steps = int(num_inference_steps)
        seed = int(seed)

        progress(0.2, desc="Generating image...")

        # Set random seed for reproducibility
        generator_device = "cuda" if torch.cuda.is_available() else "cpu"
        generator = torch.Generator(generator_device).manual_seed(seed)

        # Run the pipeline
        with torch.inference_mode():
            output = t2i_pipe(
                prompt,
                negative_prompt=negative_prompt,
                height=height,
                width=width,
                guidance_scale=guidance_scale,
                num_inference_steps=num_inference_steps,
                num_images_per_prompt=1,
                generator=generator,
                enable_cfg_renorm=enable_cfg_renorm,
                enable_prompt_rewrite=enable_prompt_rewrite
            )

        progress(1.0, desc="Done!")

        return output.images[0]

    except Exception as e:
        # Surface the failure in the UI while keeping the original traceback
        # attached as the cause for server-side debugging.
        raise gr.Error(f"Error during image generation: {str(e)}") from e

@spaces.GPU(duration=120)
def edit_image(
    input_image: Image.Image,
    prompt: str,
    negative_prompt: str,
    guidance_scale: float,
    num_inference_steps: int,
    seed: int,
    progress=gr.Progress()
):
    """Edit an uploaded image according to a text instruction.

    Args:
        input_image: Image to edit; converted to RGB when its mode differs.
        prompt: Edit instruction; must contain non-whitespace text.
        negative_prompt: Forwarded to the pipeline as ``negative_prompt``.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``.
        num_inference_steps: Number of steps forwarded to the pipeline
            (coerced to ``int``).
        seed: Seed for the ``torch.Generator`` handed to the pipeline
            (coerced to ``int``).
        progress: Gradio progress tracker used to report coarse stages.

    Returns:
        The first entry of the pipeline output's ``images``.

    Raises:
        gr.Error: If no image or no instruction was supplied, or if anything
            fails while preparing or running the pipeline (original
            exception chained).
    """
    if input_image is None:
        raise gr.Error("Please upload an image first")

    if not prompt or not prompt.strip():
        raise gr.Error("Please enter an edit instruction")

    try:
        progress(0.1, desc="Preparing image...")

        # Convert to RGB if needed
        if input_image.mode != 'RGB':
            input_image = input_image.convert('RGB')

        # Slider values can arrive as floats (e.g. 42.0), and
        # torch.Generator.manual_seed only accepts integers.
        num_inference_steps = int(num_inference_steps)
        seed = int(seed)

        progress(0.2, desc="Generating edited image...")

        # Set random seed for reproducibility
        generator_device = "cuda" if torch.cuda.is_available() else "cpu"
        generator = torch.Generator(generator_device).manual_seed(seed)

        # Run the pipeline
        with torch.inference_mode():
            output = edit_pipe(
                input_image,
                prompt,
                negative_prompt=negative_prompt,
                guidance_scale=guidance_scale,
                num_inference_steps=num_inference_steps,
                num_images_per_prompt=1,
                generator=generator
            )

        progress(1.0, desc="Done!")

        return output.images[0]

    except Exception as e:
        # Surface the failure in the UI while keeping the original traceback
        # attached as the cause for server-side debugging.
        raise gr.Error(f"Error during image editing: {str(e)}") from e

# Sample row for the image-edit tab. Column order follows the `inputs` list
# of its gr.Examples call: image, instruction, negative prompt, guidance
# scale, inference steps, seed.
edit_example_image_url = (
    "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png"
)
edit_example_data = [[edit_example_image_url, "Add a mustache", "", 4.5, 50, 42]]

# Sample rows for the text-to-image tab. Only the prompt, size and seed vary;
# every row shares an empty negative prompt, guidance 4.5, 50 steps and both
# checkboxes enabled. Column order: prompt, negative prompt, width, height,
# guidance scale, inference steps, seed, CFG renorm, prompt rewrite.
t2i_example_prompts = [
    [text, "", example_width, example_height, 4.5, 50, example_seed, True, True]
    for text, example_width, example_height, example_seed in (
        (
            "一个年轻的亚裔女性，身穿黄色针织衫，搭配白色项链。她的双手放在膝盖上，表情恬静。背景是一堵粗糙的砖墙，午后的阳光温暖地洒在她身上，营造出一种宁静而温馨的氛围。",
            1344,
            768,
            43,
        ),
        (
            "A serene mountain landscape at sunset with golden clouds",
            1344,
            768,
            42,
        ),
        (
            "A cute robot sitting at a desk, digital art style",
            1024,
            1024,
            44,
        ),
    )
]

# Build Gradio interface
# Contents, top to bottom: header banner, a two-tab area (text-to-image and
# image edit), a usage note, the click handlers for both buttons, and a footer.
with gr.Blocks(fill_height=True) as demo:
    # Header banner: title, tagline and links to anycoder and both model pages
    gr.HTML("""
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>🎨 LongCat Image Studio</h1>
            <p style="font-size: 16px; color: #666;">
                Generate images from text or edit existing images with AI-powered tools
            </p>
            <p style="font-size: 14px; margin-top: 10px;">
                Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #4A90E2; text-decoration: none;">anycoder</a>
            </p>
            <p style="font-size: 12px; color: #888; margin-top: 5px;">
                ⚡ Powered by Zero-GPU | 🤗 Models: 
                <a href="https://huggingface.co/meituan-longcat/LongCat-Image" target="_blank" style="color: #4A90E2;">Text-to-Image</a> & 
                <a href="https://huggingface.co/meituan-longcat/LongCat-Image-Edit" target="_blank" style="color: #4A90E2;">Image Edit</a>
            </p>
        </div>
    """)
    
    with gr.Tabs():
        # Text-to-Image Tab
        with gr.TabItem("🖼️ Text to Image"):
            with gr.Row():
                # Left column: prompt, generation settings and the trigger button
                with gr.Column(scale=1):
                    gr.Markdown("### 📝 Prompt")
                    t2i_prompt = gr.Textbox(
                        label="Image Description",
                        placeholder="Describe the image you want to generate (supports English and Chinese)",
                        lines=5
                    )
                    
                    # Settings accordion starts expanded (open=True)
                    with gr.Accordion("⚙️ Settings", open=True):
                        t2i_negative_prompt = gr.Textbox(
                            label="Negative Prompt (Optional)",
                            placeholder="What you don't want in the image",
                            lines=2
                        )
                        
                        # Width and height share one row; both range 512-2048 in steps of 64
                        with gr.Row():
                            t2i_width = gr.Slider(
                                minimum=512,
                                maximum=2048,
                                value=1344,
                                step=64,
                                label="Width",
                            )
                            
                            t2i_height = gr.Slider(
                                minimum=512,
                                maximum=2048,
                                value=768,
                                step=64,
                                label="Height",
                            )
                        
                        t2i_guidance_scale = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=4.5,
                            step=0.5,
                            label="Guidance Scale",
                            info="Higher values = stronger adherence to prompt"
                        )
                        
                        t2i_num_inference_steps = gr.Slider(
                            minimum=20,
                            maximum=100,
                            value=50,
                            step=5,
                            label="Inference Steps",
                            info="More steps = higher quality but slower"
                        )
                        
                        t2i_seed = gr.Slider(
                            minimum=0,
                            maximum=999999,
                            value=42,
                            step=1,
                            label="Random Seed",
                        )
                        
                        t2i_enable_cfg_renorm = gr.Checkbox(
                            label="Enable CFG Renormalization",
                            value=True,
                            info="Improves image quality"
                        )
                        
                        t2i_enable_prompt_rewrite = gr.Checkbox(
                            label="Enable Prompt Rewrite",
                            value=True,
                            info="Uses text encoder as built-in prompt enhancer"
                        )
                    
                    generate_btn = gr.Button("✨ Generate Image", variant="primary", size="lg")
                    
                # Right column: generated image and usage tips
                with gr.Column(scale=1):
                    gr.Markdown("### 🎯 Generated Image")
                    t2i_output = gr.Image(
                        label="Output",
                        type="pil",
                        height=500,
                        buttons=["download"]
                    )
                    
                    gr.Markdown("### 💡 Tips")
                    gr.Markdown("""
                    - Be detailed and specific in your descriptions
                    - Supports both English and Chinese prompts
                    - Try different aspect ratios for varied compositions
                    - Enable prompt rewrite for enhanced descriptions
                    - Higher inference steps = better quality (but slower)
                    """)
            
            # Each row of t2i_example_prompts supplies the nine inputs listed
            # below, in that order. cache_examples=False: example outputs are
            # not cached.
            gr.Markdown("### 📝 Example Prompts")
            gr.Examples(
                examples=t2i_example_prompts,
                inputs=[t2i_prompt, t2i_negative_prompt, t2i_width, t2i_height, t2i_guidance_scale, t2i_num_inference_steps, t2i_seed, t2i_enable_cfg_renorm, t2i_enable_prompt_rewrite],
                outputs=t2i_output,
                fn=generate_image,
                cache_examples=False,
                label="Click to try these examples"
            )
        
        # Image Edit Tab
        with gr.TabItem("✏️ Image Edit"):
            with gr.Row():
                # Left column: source image, instruction, advanced settings, trigger button
                with gr.Column(scale=1):
                    gr.Markdown("### 📤 Input")
                    input_image = gr.Image(
                        label="Upload Image",
                        type="pil",
                        sources=["upload", "clipboard"],
                        height=400
                    )
                    
                    prompt = gr.Textbox(
                        label="Edit Instruction",
                        placeholder="Describe how you want to edit the image",
                        lines=3
                    )
                    
                    # Advanced settings start collapsed (open=False), unlike the
                    # text-to-image settings accordion
                    with gr.Accordion("⚙️ Advanced Settings", open=False):
                        negative_prompt = gr.Textbox(
                            label="Negative Prompt (Optional)",
                            placeholder="What you don't want in the image",
                            lines=2
                        )
                        
                        guidance_scale = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=4.5,
                            step=0.5,
                            label="Guidance Scale",
                            info="Higher values = stronger adherence to prompt"
                        )
                        
                        num_inference_steps = gr.Slider(
                            minimum=20,
                            maximum=100,
                            value=50,
                            step=5,
                            label="Inference Steps",
                            info="More steps = higher quality but slower"
                        )
                        
                        seed = gr.Slider(
                            minimum=0,
                            maximum=999999,
                            value=42,
                            step=1,
                            label="Random Seed",
                        )
                    
                    edit_btn = gr.Button("✨ Edit Image", variant="primary", size="lg")
                    
                # Right column: edited image and usage tips
                with gr.Column(scale=1):
                    gr.Markdown("### 🎯 Output")
                    output_image = gr.Image(
                        label="Edited Image",
                        type="pil",
                        height=400,
                        buttons=["download"]
                    )
                    
                    gr.Markdown("### 💡 Tips")
                    gr.Markdown("""
                    - Upload a clear, well-lit image for best results
                    - Be specific in your edit instructions
                    - Supports both English and Chinese prompts
                    - Try different guidance scales for varied results
                    """)
            
            # The single row of edit_example_data supplies the six inputs
            # listed below, in that order; its first column is an image URL.
            gr.Markdown("### 📝 Example")
            gr.Examples(
                examples=edit_example_data,
                inputs=[input_image, prompt, negative_prompt, guidance_scale, num_inference_steps, seed],
                outputs=output_image,
                fn=edit_image,
                cache_examples=False,
                label="Click to try this example"
            )
    
    # Usage note; the 120-second figure matches duration=120 on both
    # @spaces.GPU-decorated handlers
    gr.HTML("""
        <div style="padding: 10px; background-color: #f0f7ff; border-radius: 8px; margin: 20px 0;">
            <p style="margin: 0; font-size: 12px; color: #555;">
                ⏱️ <strong>Note:</strong> Zero-GPU provides 120 seconds of GPU time per request. 
                Models are loaded at startup from Hugging Face Hub.
                Processing typically takes 30-60 seconds depending on settings.
            </p>
        </div>
    """)
    
    # Event handlers
    # The `inputs` lists follow the parameter order of generate_image and
    # edit_image respectively (excluding their trailing `progress` argument).
    # NOTE(review): api_visibility="public" presumably exposes each handler as
    # an API endpoint — confirm against the installed Gradio version.
    generate_btn.click(
        fn=generate_image,
        inputs=[
            t2i_prompt,
            t2i_negative_prompt,
            t2i_width,
            t2i_height,
            t2i_guidance_scale,
            t2i_num_inference_steps,
            t2i_seed,
            t2i_enable_cfg_renorm,
            t2i_enable_prompt_rewrite
        ],
        outputs=t2i_output,
        api_visibility="public"
    )
    
    edit_btn.click(
        fn=edit_image,
        inputs=[
            input_image,
            prompt,
            negative_prompt,
            guidance_scale,
            num_inference_steps,
            seed
        ],
        outputs=output_image,
        api_visibility="public"
    )
    
    # Footer
    gr.HTML("""
        <div style="text-align: center; margin-top: 40px; padding: 20px; border-top: 1px solid #eee;">
            <p style="color: #666; font-size: 14px;">
                Powered by <a href="https://huggingface.co/meituan-longcat/LongCat-Image" target="_blank" style="color: #4A90E2;">LongCat Image</a> & 
                <a href="https://huggingface.co/meituan-longcat/LongCat-Image-Edit" target="_blank" style="color: #4A90E2;">LongCat Image Edit</a> | 
                <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #4A90E2;">Built with anycoder</a>
            </p>
        </div>
    """)

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    # Soft theme with a blue/indigo/slate palette and the Inter font.
    studio_theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="lg",
        radius_size="md",
    )

    # Single footer entry pointing back to the anycoder Space.
    anycoder_link = {
        "label": "Built with anycoder",
        "url": "https://huggingface.co/spaces/akhaliq/anycoder",
    }

    demo.launch(
        theme=studio_theme,
        footer_links=[anycoder_link],
        mcp_server=True,
    )