text-guided-image-colorization

Running

App Files Files Community

LogicGoInfotechSpaces committed on Nov 13

Commit

8f6f449

1 Parent(s): eb42092

Align pipeline with text-guided colorization Space

Browse files

Files changed (3) hide show

app/colorize_model.py +259 -254
app/config.py +14 -2
app/main.py +3 -2

app/colorize_model.py CHANGED Viewed

@@ -1,275 +1,280 @@
 """
-ColorizeNet model wrapper for image colorization
 """
 import logging
 import os
 import torch
-import numpy as np
 from PIL import Image
-from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, StableDiffusionXLControlNetPipeline, StableDiffusionImg2ImgPipeline
-from diffusers.utils import load_image
-from transformers import pipeline
-from huggingface_hub import hf_hub_download
 from app.config import settings
 logger = logging.getLogger(__name__)
 class ColorizeModel:
-    """Wrapper for ColorizeNet model"""
-    def __init__(self, model_id: str | None = None):
-        """
-        Initialize the ColorizeNet model
-        Args:
-            model_id: Hugging Face model ID for ColorizeNet
-        """
-        if model_id is None:
-            model_id = settings.MODEL_ID
-        self.model_id = model_id
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info("Using device: %s", self.device)
-        self.dtype = torch.float16 if self.device == "cuda" else torch.float32
-        # Check for Hugging Face token (try both environment variable names)
-        self.hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") or None
-        # Configure writable cache to avoid permission issues on Spaces
-        # Prefer DATA_DIR if available, otherwise fallback to /tmp
-        data_dir = os.getenv("DATA_DIR")
-        if not data_dir:
-            data_dir = "/tmp"
-        hf_cache_dir = os.path.join(data_dir, "hf_cache")
-        # Set cache environment variables
-        os.environ["HF_HOME"] = hf_cache_dir
-        os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache_dir
-        os.environ["TRANSFORMERS_CACHE"] = hf_cache_dir
-        try:
-            os.makedirs(hf_cache_dir, exist_ok=True)
-            logger.info("HF cache directory: %s", hf_cache_dir)
-        except Exception as e:
-            # Fallback to /tmp/hf_cache if DATA_DIR was set but not writable
-            tmp_cache_dir = os.path.join("/tmp", "hf_cache")
-            logger.warning("Failed to create cache in %s: %s, trying %s", data_dir, str(e), tmp_cache_dir)
-            hf_cache_dir = tmp_cache_dir
-            os.environ["HF_HOME"] = hf_cache_dir
-            os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache_dir
-            os.environ["TRANSFORMERS_CACHE"] = hf_cache_dir
-            try:
-                os.makedirs(hf_cache_dir, exist_ok=True)
-                logger.info("HF cache directory (tmp): %s", hf_cache_dir)
-            except Exception as e_tmp:
-                # Final fallback to user home (local dev)
-                logger.warning("Failed to create cache in /tmp: %s, trying user home", str(e_tmp))
-                default_home_cache = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
-                hf_cache_dir = default_home_cache
-                os.environ["HF_HOME"] = hf_cache_dir
-                os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache_dir
-                os.environ["TRANSFORMERS_CACHE"] = hf_cache_dir
-                try:
-                    os.makedirs(hf_cache_dir, exist_ok=True)
-                    logger.info("HF cache directory (home): %s", hf_cache_dir)
-                except Exception as e2:
-                    logger.error("Failed to create cache directory: %s", str(e2))
-                    raise RuntimeError(f"Cannot create Hugging Face cache directory: {str(e2)}")
-        else:
-            # Ensure environment variables reflect the final cache dir
-            os.environ["HF_HOME"] = hf_cache_dir
-            os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache_dir
-            os.environ["TRANSFORMERS_CACHE"] = hf_cache_dir
-        # Avoid libgomp warning by setting a valid integer
         os.environ.setdefault("OMP_NUM_THREADS", "1")
-        try:
-            # Decide whether to use ControlNet based on model_id
-            wants_controlnet = "control" in self.model_id.lower()
-            if wants_controlnet:
-                # Try loading as ControlNet with Stable Diffusion
-                logger.info("Attempting to load model as ControlNet: %s", self.model_id)
                 try:
-                    # Load ControlNet model
-                    self.controlnet = ControlNetModel.from_pretrained(
-                        self.model_id,
-                        torch_dtype=self.dtype,
-                        token=self.hf_token,
-                        cache_dir=hf_cache_dir
-                    )
-                    # Try SDXL first, fallback to SD 1.5
-                    try:
-                        self.pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
-                            "stabilityai/stable-diffusion-xl-base-1.0",
-                            controlnet=self.controlnet,
-                            torch_dtype=self.dtype,
-                            safety_checker=None,
-                            requires_safety_checker=False,
-                            token=self.hf_token,
-                            cache_dir=hf_cache_dir
-                        )
-                        logger.info("Loaded with SDXL base model")
-                    except Exception:
-                        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
-                            "runwayml/stable-diffusion-v1-5",
-                            controlnet=self.controlnet,
-                            torch_dtype=self.dtype,
-                            safety_checker=None,
-                            requires_safety_checker=False,
-                            token=self.hf_token,
-                            cache_dir=hf_cache_dir
-                        )
-                        logger.info("Loaded with SD 1.5 base model")
-                    self.pipe.to(self.device)
-                    # Enable memory efficient attention if available
-                    if hasattr(self.pipe, "enable_xformers_memory_efficient_attention"):
-                        try:
-                            self.pipe.enable_xformers_memory_efficient_attention()
-                            logger.info("XFormers memory efficient attention enabled")
-                        except Exception as e:
-                            logger.warning("Could not enable XFormers: %s", str(e))
-                    logger.info("ColorizeNet model loaded successfully as ControlNet")
-                    self.model_type = "controlnet"
-                except Exception as e:
-                    logger.warning("Failed to load as ControlNet: %s", str(e))
-                    wants_controlnet = False  # fall through to pipeline
-            if not wants_controlnet:
-                # Load as image-to-image pipeline
-                logger.info("Trying to load as image-to-image pipeline...")
-                self.pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-                    self.model_id,
-                    torch_dtype=self.dtype,
-                    safety_checker=None,
-                    requires_safety_checker=False,
-                    use_safetensors=True,
-                    cache_dir=hf_cache_dir,
-                    token=self.hf_token
-                ).to(self.device)
-                logger.info("ColorizeNet model loaded using image-to-image pipeline")
-                self.model_type = "pipeline"
-        except Exception as e:
-            logger.error("Failed to load ColorizeNet model: %s", str(e))
-            raise RuntimeError(f"Could not load ColorizeNet model: {str(e)}")
-    def preprocess_image(self, image: Image.Image) -> Image.Image:
-        """
-        Preprocess image for colorization
-        Args:
-            image: PIL Image
-        Returns:
-            Preprocessed PIL Image
-        """
-        # Convert to grayscale if needed
-        if image.mode != "L":
-            # Convert to grayscale
-            image = image.convert("L")
-        # Convert back to RGB (grayscale image with 3 channels)
-        image = image.convert("RGB")
-        # Resize to standard size (512x512 for SD models)
-        image = image.resize((512, 512), Image.Resampling.LANCZOS)
-        return image
-    def colorize(self, image: Image.Image, num_inference_steps: int = None) -> Image.Image:
-        """
-        Colorize a grayscale image
-        Args:
-            image: PIL Image (grayscale or color)
-            num_inference_steps: Number of inference steps (auto-adjusted for CPU/GPU)
-        Returns:
-            Colorized PIL Image
-        """
         try:
-            # Optimize inference steps based on device
-            if num_inference_steps is None:
-                # Use fewer steps on CPU for faster processing
-                num_inference_steps = 8 if self.device == "cpu" else 20
-            # Preprocess image
-            control_image = self.preprocess_image(image)
             original_size = image.size
-            # Prepare prompt for colorization
-            prompt = "colorize this black and white image, high quality, detailed, vibrant colors, natural colors"
-            negative_prompt = "black and white, grayscale, monochrome, low quality, blurry, desaturated"
-            # Adjust guidance scale for CPU (lower = faster)
-            guidance_scale = 5.0 if self.device == "cpu" else 7.5
-            # Generate colorized image based on model type
-            if self.model_type == "controlnet":
-                # Use ControlNet pipeline
-                result = self.pipe(
-                    prompt=prompt,
-                    image=control_image,
-                    negative_prompt=negative_prompt,
-                    num_inference_steps=num_inference_steps,
-                    guidance_scale=guidance_scale,
-                    controlnet_conditioning_scale=1.0,
-                    generator=torch.Generator(device=self.device).manual_seed(42)
-                )
-                if isinstance(result, dict) and "images" in result:
-                    colorized = result["images"][0]
-                elif isinstance(result, list) and len(result) > 0:
-                    colorized = result[0]
-                else:
-                    colorized = result
-            else:
-                # Use pipeline directly
-                result = self.pipe(
-                    prompt=prompt,
-                    image=control_image,
-                    negative_prompt=negative_prompt,
-                    num_inference_steps=num_inference_steps,
-                    guidance_scale=guidance_scale,
-                    strength=1.0
-                )
-                if isinstance(result, dict) and "images" in result:
-                    colorized = result["images"][0]
-                elif isinstance(result, list) and len(result) > 0:
-                    colorized = result[0]
-                else:
-                    colorized = result
-            # Ensure we have a PIL Image
-            if not isinstance(colorized, Image.Image):
-                if isinstance(colorized, np.ndarray):
-                    # Handle numpy array
-                    if colorized.dtype != np.uint8:
-                        colorized = (colorized * 255).astype(np.uint8)
-                    if len(colorized.shape) == 3 and colorized.shape[2] == 3:
-                        colorized = Image.fromarray(colorized, 'RGB')
-                    else:
-                        colorized = Image.fromarray(colorized)
-                elif torch.is_tensor(colorized):
-                    # Handle torch tensor
-                    colorized = colorized.cpu().permute(1, 2, 0).numpy()
-                    colorized = (colorized * 255).astype(np.uint8)
-                    colorized = Image.fromarray(colorized, 'RGB')
-                else:
-                    raise ValueError(f"Unexpected output type: {type(colorized)}")
-            # Resize back to original size
-            if original_size != (512, 512):
                 colorized = colorized.resize(original_size, Image.Resampling.LANCZOS)
-            return colorized
-        except Exception as e:
-            logger.error("Error during colorization: %s", str(e))
             raise

 """
+Colorize model wrapper replicating the behaviour of the
+`fffiloni/text-guided-image-colorization` Space.
 """
+from __future__ import annotations
 import logging
 import os
+from typing import Tuple
 import torch
 from PIL import Image
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    StableDiffusionXLControlNetPipeline,
+    UNet2DConditionModel,
+)
+from huggingface_hub import hf_hub_download, snapshot_download
+from safetensors.torch import load_file
+from transformers import BlipForConditionalGeneration, BlipProcessor
 from app.config import settings
 logger = logging.getLogger(__name__)
+def _ensure_cache_dir() -> str:
+    """Pick the first usable Hugging Face cache directory and export it.
+
+    Candidates are tried in order: ``$DATA_DIR/hf_cache`` (only when ``DATA_DIR``
+    is set), ``/tmp/hf_cache``, then ``~/.cache/huggingface``. The first one that
+    can be created and is writable is exported through ``HF_HOME``,
+    ``HUGGINGFACE_HUB_CACHE`` and ``TRANSFORMERS_CACHE``.
+
+    Returns:
+        The path of the selected cache directory.
+
+    Raises:
+        RuntimeError: If no candidate can be created and written to.
+    """
+    data_dir = os.getenv("DATA_DIR")
+    candidate_dirs = []
+    if data_dir:
+        candidate_dirs.append(os.path.join(data_dir, "hf_cache"))
+    candidate_dirs.extend(
+        [
+            os.path.join("/tmp", "hf_cache"),
+            os.path.join(os.path.expanduser("~"), ".cache", "huggingface"),
+        ]
+    )
+    for path in candidate_dirs:
+        try:
+            os.makedirs(path, exist_ok=True)
+        except OSError as exc:  # pragma: no cover - best effort
+            logger.warning("Failed to create cache dir %s: %s", path, exc)
+            continue
+        # makedirs(exist_ok=True) also succeeds for a pre-existing directory
+        # that we cannot write to, so check access explicitly before using it.
+        if not os.access(path, os.W_OK | os.X_OK):
+            logger.warning("Cache dir %s is not writable, trying next candidate", path)
+            continue
+        logger.info("Using HF cache directory: %s", path)
+        os.environ["HF_HOME"] = path
+        os.environ["HUGGINGFACE_HUB_CACHE"] = path
+        os.environ["TRANSFORMERS_CACHE"] = path
+        return path
+    raise RuntimeError("Unable to create a writable cache directory for Hugging Face downloads.")
+def _apply_color(luminance_image: Image.Image, color_map: Image.Image) -> Image.Image:
+    """Recolor ``luminance_image`` with the chroma of ``color_map``.
+
+    Both inputs are converted to LAB. The result keeps the L channel of
+    ``luminance_image``, takes the A and B channels from ``color_map``, and is
+    returned converted to RGB.
+    """
+    lightness = luminance_image.convert("LAB").split()[0]
+    chroma_a, chroma_b = color_map.convert("LAB").split()[1:]
+    recolored = Image.merge("LAB", (lightness, chroma_a, chroma_b))
+    return recolored.convert("RGB")
+def _remove_unlikely_words(prompt: str) -> str:
+    """Strip monochrome/vintage descriptors from a BLIP caption.
+
+    Removes year references for 1900-1999 (``1950``, ``1950s``, ``year 1950``,
+    ``circa 1950`` and the digit-spaced forms such as ``1 9 5 0 s``) plus a
+    fixed list of phrases such as ``black and white`` or ``grainy``.
+
+    Terms are matched as whole words, so unrelated words that merely contain a
+    term (``bw`` inside "subway") are left alone. Longer phrases are tried
+    before their shorter prefixes, so ``photo taken in`` or ``year 1950`` is
+    removed whole instead of leaving a fragment behind.
+
+    Args:
+        prompt: Raw caption text.
+
+    Returns:
+        The caption with the descriptors removed, gaps and stray commas tidied,
+        and leading/trailing spaces and commas stripped. May be empty.
+    """
+    import re  # local import: the module's import block does not provide it
+
+    manual_terms = [
+        "black and white,", "black and white", "black & white,", "black & white",
+        "circa", "monochrome,", "monochrome", "bw", "bw,", "b&w", "b&w,",
+        "grainy", "grainy photo", "grainy photograph", "grainy footage",
+        "black-and-white", "black - and - white", "black on white",
+        "historical photo", "historic photo", "restored", "desaturated",
+        "low contrast", "blurry", "overcast", "taken in", "photo taken in",
+        ", photo", ",  photo", ",   photo", ", photograph",
+    ]
+    # One expression for every 19xx year form, optionally introduced by
+    # "year " / "circa " and optionally followed by the decade "s".
+    year_pattern = (
+        r"(?<!\w)(?:(?:year|circa) )?"
+        r"(?:19[0-9]{2}s?|1 9 [0-9] [0-9](?: s)?)(?!\w)"
+    )
+
+    def _bounded(term: str) -> str:
+        # Only anchor on a word boundary where the term itself starts/ends with
+        # a word character; terms like ", photo" or "bw," must match after or
+        # before ordinary text.
+        prefix = r"(?<!\w)" if term[0].isalnum() else ""
+        suffix = r"(?!\w)" if term[-1].isalnum() else ""
+        return prefix + re.escape(term) + suffix
+
+    alternatives = [year_pattern]
+    alternatives.extend(
+        _bounded(term) for term in sorted(manual_terms, key=len, reverse=True)
+    )
+    pattern = re.compile("|".join(f"(?:{alt})" for alt in alternatives))
+
+    cleaned = pattern.sub("", prompt)
+    # Tidy what the removals leave behind: doubled spaces, " ," and ",,".
+    cleaned = re.sub(r"\s+", " ", cleaned)
+    cleaned = re.sub(r" ,", ",", cleaned)
+    cleaned = re.sub(r",(?: ?,)+", ",", cleaned)
+    return cleaned.strip(" ,")
 class ColorizeModel:
+    """Text-guided colorization built from SDXL, a ControlNet and BLIP captions.
+
+    Constructing an instance loads the diffusion pipeline and the captioning
+    model. ``colorize`` captions the input, runs the ControlNet pipeline on a
+    grayscale copy and transfers the generated colors onto that copy.
+    """
+    CONTROLNET_REPO = "nickpai/sdxl_light_caption_output"
+    CONTROLNET_SUBDIR = os.path.join("checkpoint-30000", "controlnet")
+    BASE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
+    LIGHTNING_REPO = "ByteDance/SDXL-Lightning"
+    LIGHTNING_WEIGHTS = "sdxl_lightning_8step_unet.safetensors"
+    CAPTION_MODEL = "Salesforce/blip-image-captioning-large"
+    def __init__(self, model_id: str | None = None) -> None:
+        """Read generation settings and load all models.
+
+        Args:
+            model_id: Stored on ``self.model_id`` (falls back to
+                ``settings.MODEL_ID``); the repositories actually loaded are
+                the class-level constants.
+        """
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logger.info("Using device: %s", self.device)
+        # Half precision only on CUDA; everything stays float32 otherwise.
+        self.dtype = torch.float16 if self.device.type == "cuda" else torch.float32
         os.environ.setdefault("OMP_NUM_THREADS", "1")
+        self.hf_token = (
+            os.getenv("HF_TOKEN")
+            or os.getenv("HUGGINGFACE_HUB_TOKEN")
+            or None
+        )
+        self.cache_dir = _ensure_cache_dir()
+        self.num_inference_steps = settings.NUM_INFERENCE_STEPS
+        self.guidance_scale = settings.GUIDANCE_SCALE
+        self.controlnet_scale = settings.CONTROLNET_SCALE
+        self.positive_prompt = settings.POSITIVE_PROMPT
+        self.negative_prompt = settings.NEGATIVE_PROMPT
+        self.caption_prefix = settings.CAPTION_PREFIX
+        self.seed = settings.COLORIZE_SEED
+        self.model_id = model_id or settings.MODEL_ID
+        self._load_pipeline()
+        self._load_caption_model()
+        # Caption produced by the most recent colorize() call, if any.
+        self.last_caption: str | None = None
+    # --------------------------------------------------------------------- #
+    # Initialisation helpers
+    # --------------------------------------------------------------------- #
+    def _download_controlnet(self) -> str:
+        """Download the ControlNet repository snapshot into the cache directory.
+
+        Returns:
+            Path of the ``CONTROLNET_SUBDIR`` folder inside the snapshot.
+
+        Raises:
+            RuntimeError: If that folder is missing from the snapshot.
+        """
+        logger.info("Downloading ControlNet snapshot: %s", self.CONTROLNET_REPO)
+        local_dir = os.path.join(self.cache_dir, "sdxl_light_caption_output")
+        path = snapshot_download(
+            repo_id=self.CONTROLNET_REPO,
+            local_dir=local_dir,
+            local_dir_use_symlinks=False,
+            token=self.hf_token,
+        )
+        controlnet_path = os.path.join(path, self.CONTROLNET_SUBDIR)
+        if not os.path.isdir(controlnet_path):
+            raise RuntimeError(f"ControlNet weights not found at {controlnet_path}")
+        return controlnet_path
+    def _load_pipeline(self) -> None:
+        """Assemble the SDXL ControlNet pipeline and move it to ``self.device``.
+
+        The VAE comes from ``BASE_MODEL``; the UNet is built from the
+        ``BASE_MODEL`` config and filled with the ``LIGHTNING_WEIGHTS`` file;
+        the ControlNet is loaded from the downloaded snapshot. On CUDA,
+        xformers attention is enabled when available (failure is only logged).
+        """
+        controlnet_path = self._download_controlnet()
+        logger.info("Loading SDXL components...")
+        vae = AutoencoderKL.from_pretrained(
+            self.BASE_MODEL,
+            subfolder="vae",
+            torch_dtype=self.dtype,
+            token=self.hf_token,
+        )
+        unet = UNet2DConditionModel.from_config(
+            self.BASE_MODEL,
+            subfolder="unet",
+            token=self.hf_token,
+        )
+        lightning_path = hf_hub_download(
+            repo_id=self.LIGHTNING_REPO,
+            filename=self.LIGHTNING_WEIGHTS,
+            token=self.hf_token,
+        )
+        unet.load_state_dict(load_file(lightning_path))
+        controlnet = ControlNetModel.from_pretrained(
+            controlnet_path,
+            torch_dtype=self.dtype,
+        )
+        self.pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+            self.BASE_MODEL,
+            vae=vae,
+            unet=unet,
+            controlnet=controlnet,
+            torch_dtype=self.dtype,
+            safety_checker=None,
+            requires_safety_checker=False,
+            token=self.hf_token,
+        )
+        self.pipe.set_progress_bar_config(disable=True)
+        # The move is identical on every device; only xformers is CUDA-specific.
+        self.pipe.to(self.device, dtype=self.dtype)
+        if self.device.type == "cuda":
+            if hasattr(self.pipe, "enable_xformers_memory_efficient_attention"):
                 try:
+                    self.pipe.enable_xformers_memory_efficient_attention()
+                except Exception as exc:  # pragma: no cover
+                    logger.warning("Could not enable xformers attention: %s", exc)
+        logger.info("Colorization pipeline ready.")
+    def _load_caption_model(self) -> None:
+        """Load the BLIP processor and captioning model onto ``self.device``."""
+        logger.info("Loading BLIP captioning model...")
+        processor = BlipProcessor.from_pretrained(self.CAPTION_MODEL, token=self.hf_token)
+        model = BlipForConditionalGeneration.from_pretrained(
+            self.CAPTION_MODEL,
+            torch_dtype=self.dtype if self.device.type == "cuda" else torch.float32,
+            token=self.hf_token,
+        )
+        self.caption_processor = processor
+        self.caption_model = model.to(self.device)
+    # --------------------------------------------------------------------- #
+    # Public API
+    # --------------------------------------------------------------------- #
+    def caption_image(self, image: Image.Image) -> str:
+        """Generate a cleaned caption for the image.
+
+        The caption is conditioned on ``self.caption_prefix`` and passed through
+        ``_remove_unlikely_words``; if cleaning leaves nothing, the raw caption
+        is returned instead.
+        """
+        encoded = self.caption_processor(
+            image,
+            self.caption_prefix,
+            return_tensors="pt",
+        )
+        # Cast only floating-point tensors (e.g. pixel values) to the dtype the
+        # caption model was loaded with. Integer tensors such as token ids must
+        # keep their dtype: casting them to float breaks the embedding lookup,
+        # and leaving float32 inputs in front of a float16 model fails on CUDA.
+        inputs = {}
+        for key, value in encoded.items():
+            if isinstance(value, torch.Tensor):
+                if torch.is_floating_point(value):
+                    value = value.to(self.device, dtype=self.dtype)
+                else:
+                    value = value.to(self.device)
+            inputs[key] = value
+        with torch.inference_mode():
+            caption_ids = self.caption_model.generate(**inputs)
+        caption = self.caption_processor.decode(caption_ids[0], skip_special_tokens=True)
+        cleaned_caption = _remove_unlikely_words(caption)
+        return cleaned_caption or caption
+    def colorize(self, image: Image.Image, num_inference_steps: int | None = None) -> Tuple[Image.Image, str]:
+        """Colorize a grayscale image.
+
+        Args:
+            image: Input image; it is converted to a 512x512 grayscale control
+                image internally.
+            num_inference_steps: Step count override; a falsy value (``None``
+                or ``0``) uses ``self.num_inference_steps``.
+
+        Returns:
+            ``(colorized, caption)``: the colorized image resized back to the
+            input size, and the caption used to build the prompt.
+
+        Raises:
+            Exception: Any failure is logged with its traceback and re-raised.
+        """
         try:
             original_size = image.size
+            control_image = image.convert("L").convert("RGB").resize(
+                (512, 512), Image.Resampling.LANCZOS
+            )
+            caption = self.caption_image(image)
+            self.last_caption = caption
+            prompt_parts = [caption]
+            if self.positive_prompt:
+                prompt_parts.insert(0, self.positive_prompt)
+            final_prompt = ", ".join([part for part in prompt_parts if part])
+            negative_prompt = self.negative_prompt or None
+            steps = num_inference_steps or self.num_inference_steps
+            # Fixed seed so the same input and settings reuse the same noise.
+            generator = torch.Generator(device=self.device).manual_seed(self.seed)
+            logger.info("Running SDXL pipeline with prompt: %s", final_prompt)
+            result = self.pipe(
+                prompt=final_prompt,
+                negative_prompt=negative_prompt,
+                image=control_image,
+                num_inference_steps=steps,
+                guidance_scale=self.guidance_scale,
+                controlnet_conditioning_scale=self.controlnet_scale,
+                generator=generator,
+            )
+            generated_image = result.images[0]
+            # Keep the input's luminance and take only color from the output.
+            colorized = _apply_color(control_image, generated_image)
+            if colorized.size != original_size:
                 colorized = colorized.resize(original_size, Image.Resampling.LANCZOS)
+            return colorized, caption
+        except Exception as exc:
+            logger.exception("Error during colorization: %s", exc)
             raise

app/config.py CHANGED Viewed

@@ -18,8 +18,20 @@ class Settings(BaseSettings):
     BASE_URL: str = os.getenv("BASE_URL", "http://localhost:8000")
     # Model settings
-    MODEL_ID: str = os.getenv("MODEL_ID", "lllyasviel/control_v11f1e_sd15_color")
-    NUM_INFERENCE_STEPS: int = int(os.getenv("NUM_INFERENCE_STEPS", "20"))
     # Storage settings
     UPLOAD_DIR: str = os.getenv("UPLOAD_DIR", "uploads")

     BASE_URL: str = os.getenv("BASE_URL", "http://localhost:8000")
     # Model settings
+    MODEL_ID: str = os.getenv("MODEL_ID", "nickpai/sdxl_light_caption_output")
+    NUM_INFERENCE_STEPS: int = int(os.getenv("NUM_INFERENCE_STEPS", "8"))
+    POSITIVE_PROMPT: str = os.getenv(
+        "POSITIVE_PROMPT",
+        "high quality color photo, vibrant natural colors, detailed lighting"
+    )
+    NEGATIVE_PROMPT: str = os.getenv(
+        "NEGATIVE_PROMPT",
+        "low quality, monochrome, black and white, desaturated, blurry, grainy"
+    )
+    GUIDANCE_SCALE: float = float(os.getenv("GUIDANCE_SCALE", "1.0"))
+    CONTROLNET_SCALE: float = float(os.getenv("CONTROLNET_SCALE", "1.0"))
+    CAPTION_PREFIX: str = os.getenv("CAPTION_PREFIX", "a photography of")
+    COLORIZE_SEED: int = int(os.getenv("COLORIZE_SEED", "123"))
     # Storage settings
     UPLOAD_DIR: str = os.getenv("UPLOAD_DIR", "uploads")

app/main.py CHANGED Viewed

@@ -254,7 +254,7 @@ async def colorize_image(
         # Colorize the image
         logger.info("Colorizing image...")
-        colorized_image = colorize_model.colorize(image)
         # Save colorized image
         file_id = str(uuid.uuid4())
@@ -274,7 +274,8 @@ async def colorize_image(
             "result_id": file_id,
             "download_url": download_url,
             "api_download_url": api_download_url,
-            "filename": result_filename
         }
     except Exception as e:
         logger.error("Error colorizing image: %s", str(e))

         # Colorize the image
         logger.info("Colorizing image...")
+        colorized_image, caption = colorize_model.colorize(image)
         # Save colorized image
         file_id = str(uuid.uuid4())
             "result_id": file_id,
             "download_url": download_url,
             "api_download_url": api_download_url,
+            "filename": result_filename,
+            "caption": caption
         }
     except Exception as e:
         logger.error("Error colorizing image: %s", str(e))