text-guided-image-colorization

Running

App Files Community

LogicGoInfotechSpaces committed on Nov 13

Commit

8d0a1ae

1 Parent(s): d58eb50

Restore local ControlNet colorization pipeline

Browse files

Files changed (2) hide show

app/colorize_model.py +134 -92
app/config.py +6 -2

app/colorize_model.py CHANGED Viewed

@@ -1,17 +1,24 @@
 """
-Colorize model wrapper that forwards requests to the Hugging Face Inference API.
 """
 from __future__ import annotations
-import io
 import logging
 import os
 from typing import Tuple
-import requests
 import torch
 from PIL import Image
 from transformers import BlipForConditionalGeneration, BlipProcessor
 from app.config import settings
@@ -20,92 +27,138 @@ logger = logging.getLogger(__name__)
 def _ensure_cache_dir() -> str:
-    """Ensure we have a writable Hugging Face cache directory."""
-    data_dir = os.getenv("DATA_DIR")
-    candidates = []
-    if data_dir:
-        candidates.append(os.path.join(data_dir, "hf_cache"))
-    candidates.extend(
-        [
-            os.path.join("/tmp", "hf_cache"),
-            os.path.join(os.path.expanduser("~"), ".cache", "huggingface"),
-        ]
-    )
-    for path in candidates:
-        try:
-            os.makedirs(path, exist_ok=True)
-            logger.info("Using HF cache directory: %s", path)
-            os.environ["HF_HOME"] = path
-            os.environ["HUGGINGFACE_HUB_CACHE"] = path
-            os.environ["TRANSFORMERS_CACHE"] = path
-            return path
-        except Exception as exc:
-            logger.warning("Failed to create cache dir %s: %s", path, exc)
-    raise RuntimeError("Unable to create a writable cache directory for Hugging Face downloads.")
 def _clean_caption(prompt: str) -> str:
-    replacements = [
-        "black and white", "black & white", "monochrome", "monochromatic",
-        "bw photo", "blurry", "grainy", "historical", "restored", "circa",
-        "taken in", "overcast", "desaturated", "low contrast",
     ]
     cleaned = prompt
-    for word in replacements:
-        cleaned = cleaned.replace(word, "")
     return cleaned.strip(" ,")
 class ColorizeModel:
-    """Colorization model that leverages the HF Inference API."""
-    CAPTION_MODEL = "Salesforce/blip-image-captioning-large"
     def __init__(self, model_id: str | None = None) -> None:
-        self.model_id = model_id or settings.MODEL_ID
-        self.api_url = f"https://router.huggingface.co/hf-inference/models/{self.model_id}"
-        self.api_token = (
-            os.getenv("HUGGINGFACE_API_TOKEN")
             or os.getenv("HUGGINGFACE_HUB_TOKEN")
-            or os.getenv("HF_TOKEN")
         )
-        if not self.api_token:
-            raise RuntimeError(
-                "HUGGINGFACE_API_TOKEN (or HUGGINGFACE_HUB_TOKEN / HF_TOKEN) is not set. "
-                "Please provide an access token with Inference API permissions."
-            )
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.dtype = torch.float16 if self.device.type == "cuda" else torch.float32
         os.environ.setdefault("OMP_NUM_THREADS", "1")
-        self.cache_dir = _ensure_cache_dir()
-        self.positive_prompt = settings.POSITIVE_PROMPT
-        self.negative_prompt = settings.NEGATIVE_PROMPT
         self.num_inference_steps = settings.NUM_INFERENCE_STEPS
         self.guidance_scale = settings.GUIDANCE_SCALE
         self.caption_prefix = settings.CAPTION_PREFIX
         self.seed = settings.COLORIZE_SEED
-        self.timeout = settings.INFERENCE_TIMEOUT
-        self.provider = settings.INFERENCE_PROVIDER
         self._load_caption_model()
     def _load_caption_model(self) -> None:
-        logger.info("Loading BLIP captioning model for prompt generation...")
         self.caption_processor = BlipProcessor.from_pretrained(
-            self.CAPTION_MODEL,
-            cache_dir=self.cache_dir
         )
         self.caption_model = BlipForConditionalGeneration.from_pretrained(
-            self.CAPTION_MODEL,
             torch_dtype=self.dtype if self.device.type == "cuda" else torch.float32,
-            cache_dir=self.cache_dir
         ).to(self.device)
     def caption_image(self, image: Image.Image) -> str:
         inputs = self.caption_processor(
             image,
@@ -121,43 +174,32 @@ class ColorizeModel:
         caption = self.caption_processor.decode(caption_ids[0], skip_special_tokens=True)
         return _clean_caption(caption)
-    def _build_payload(self, prompt: str) -> dict:
-        payload = {
-            "inputs": prompt,
-            "parameters": {
-                "num_inference_steps": self.num_inference_steps,
-                "guidance_scale": self.guidance_scale,
-                "negative_prompt": self.negative_prompt,
-                "seed": self.seed,
-            },
-        }
-        if self.provider:
-            payload["provider"] = {"name": self.provider}
-        return payload
-    def colorize(self, image: Image.Image, _num_inference_steps: int | None = None) -> Tuple[Image.Image, str]:
-        caption = self.caption_image(image)
-        prompt_parts = [self.positive_prompt, caption]
-        prompt = ", ".join([p for p in prompt_parts if p])
-        headers = {
-            "Authorization": f"Bearer {self.api_token}",
-            "Content-Type": "application/json",
-        }
-        payload = self._build_payload(prompt)
-        logger.info("Calling HF Inference API for prompt: %s", prompt)
-        response = requests.post(self.api_url, headers=headers, json=payload, timeout=self.timeout)
-        if response.status_code != 200:
-            try:
-                data = response.json()
-            except ValueError:
-                data = response.text
-            logger.error("Inference API error (%s): %s", response.status_code, data)
-            raise RuntimeError(f"Inference API error ({response.status_code}): {data}")
-        colorized = Image.open(io.BytesIO(response.content)).convert("RGB")
-        colorized = colorized.resize(image.size, Image.Resampling.LANCZOS)
         return colorized, caption

 """
+Colorize model wrapper replicating the behaviour of the
+`fffiloni/text-guided-image-colorization` Space.
 """
 from __future__ import annotations
 import logging
 import os
 from typing import Tuple
 import torch
 from PIL import Image
+from diffusers import (
+    AutoencoderKL,
+    ControlNetModel,
+    StableDiffusionXLControlNetPipeline,
+    UNet2DConditionModel,
+)
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
 from transformers import BlipForConditionalGeneration, BlipProcessor
 from app.config import settings
 def _ensure_cache_dir() -> str:
+    cache_dir = os.environ.get("HF_HOME") or "/tmp/hf_cache"
+    try:
+        os.makedirs(cache_dir, exist_ok=True)
+    except Exception as exc:  # pragma: no cover
+        logger.warning("Could not create cache directory %s: %s", cache_dir, exc)
+    os.environ["HF_HOME"] = cache_dir
+    os.environ["TRANSFORMERS_CACHE"] = cache_dir
+    os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir
+    os.environ["HF_HUB_CACHE"] = cache_dir
+    return cache_dir
+def _apply_lab_merge(original_luminance: Image.Image, color_map: Image.Image) -> Image.Image:
+    base_lab = original_luminance.convert("LAB")
+    color_lab = color_map.convert("LAB")
+    l_channel, _, _ = base_lab.split()
+    _, a_channel, b_channel = color_lab.split()
+    merged = Image.merge("LAB", (l_channel, a_channel, b_channel))
+    return merged.convert("RGB")
 def _clean_caption(prompt: str) -> str:
+    remove_terms = [
+        "black and white", "black & white", "monochrome", "bw photo",
+        "historical", "restored", "low contrast", "desaturated", "overcast",
     ]
     cleaned = prompt
+    for term in remove_terms:
+        cleaned = cleaned.replace(term, "")
     return cleaned.strip(" ,")
 class ColorizeModel:
+    """Colorization model that runs the SDXL + ControlNet pipeline locally."""
     def __init__(self, model_id: str | None = None) -> None:
+        self.cache_dir = _ensure_cache_dir()
+        self.hf_token = (
+            os.getenv("HF_TOKEN")
             or os.getenv("HUGGINGFACE_HUB_TOKEN")
+            or os.getenv("HUGGINGFACE_API_TOKEN")
         )
+        if not self.hf_token:
+            logger.warning("HF token not provided – attempting to download public models only.")
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.dtype = torch.float16 if self.device.type == "cuda" else torch.float32
         os.environ.setdefault("OMP_NUM_THREADS", "1")
+        self.controlnet_id = model_id or settings.MODEL_ID
+        self.base_model_id = settings.BASE_MODEL_ID
+        self.lightning_repo = settings.LIGHTNING_REPO
+        self.lightning_weights = settings.LIGHTNING_WEIGHTS
+        self.caption_model_id = settings.CAPTION_MODEL_ID
         self.num_inference_steps = settings.NUM_INFERENCE_STEPS
         self.guidance_scale = settings.GUIDANCE_SCALE
+        self.controlnet_scale = settings.CONTROLNET_SCALE
+        self.positive_prompt = settings.POSITIVE_PROMPT
+        self.negative_prompt = settings.NEGATIVE_PROMPT
         self.caption_prefix = settings.CAPTION_PREFIX
         self.seed = settings.COLORIZE_SEED
         self._load_caption_model()
+        self._load_pipeline()
     def _load_caption_model(self) -> None:
+        logger.info("Loading BLIP captioning model: %s", self.caption_model_id)
         self.caption_processor = BlipProcessor.from_pretrained(
+            self.caption_model_id,
+            cache_dir=self.cache_dir,
+            token=self.hf_token,
         )
         self.caption_model = BlipForConditionalGeneration.from_pretrained(
+            self.caption_model_id,
+            cache_dir=self.cache_dir,
+            token=self.hf_token,
             torch_dtype=self.dtype if self.device.type == "cuda" else torch.float32,
         ).to(self.device)
+    def _load_pipeline(self) -> None:
+        logger.info("Loading ControlNet model: %s", self.controlnet_id)
+        controlnet = ControlNetModel.from_pretrained(
+            self.controlnet_id,
+            torch_dtype=self.dtype,
+            cache_dir=self.cache_dir,
+            token=self.hf_token,
+        )
+        logger.info("Loading SDXL base model components: %s", self.base_model_id)
+        vae = AutoencoderKL.from_pretrained(
+            self.base_model_id,
+            subfolder="vae",
+            torch_dtype=self.dtype,
+            cache_dir=self.cache_dir,
+            token=self.hf_token,
+        )
+        unet = UNet2DConditionModel.from_config(
+            self.base_model_id,
+            subfolder="unet",
+            cache_dir=self.cache_dir,
+            token=self.hf_token,
+        )
+        lightning_path = hf_hub_download(
+            repo_id=self.lightning_repo,
+            filename=self.lightning_weights,
+            cache_dir=self.cache_dir,
+            token=self.hf_token,
+        )
+        unet.load_state_dict(load_file(lightning_path))
+        self.pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+            self.base_model_id,
+            vae=vae,
+            unet=unet,
+            controlnet=controlnet,
+            torch_dtype=self.dtype,
+            cache_dir=self.cache_dir,
+            token=self.hf_token,
+            safety_checker=None,
+            requires_safety_checker=False,
+        )
+        self.pipe.set_progress_bar_config(disable=True)
+        self.pipe.to(self.device, dtype=self.dtype)
+        if self.device.type == "cuda" and hasattr(self.pipe, "enable_xformers_memory_efficient_attention"):
+            try:
+                self.pipe.enable_xformers_memory_efficient_attention()
+            except Exception as exc:  # pragma: no cover
+                logger.warning("Could not enable xFormers optimizations: %s", exc)
+        logger.info("Colorization pipeline ready.")
     def caption_image(self, image: Image.Image) -> str:
         inputs = self.caption_processor(
             image,
         caption = self.caption_processor.decode(caption_ids[0], skip_special_tokens=True)
         return _clean_caption(caption)
+    def colorize(self, image: Image.Image, num_inference_steps: int | None = None) -> Tuple[Image.Image, str]:
+        original_size = image.size
+        control_image = image.convert("L").convert("RGB").resize((512, 512), Image.Resampling.LANCZOS)
+        caption = self.caption_image(image)
+        prompt_components = [self.positive_prompt, caption]
+        prompt = ", ".join([p for p in prompt_components if p])
+        steps = num_inference_steps or self.num_inference_steps
+        generator = torch.Generator(device=self.device).manual_seed(self.seed)
+        logger.info("Running ControlNet pipeline with prompt: %s", prompt)
+        result = self.pipe(
+            prompt=prompt,
+            negative_prompt=self.negative_prompt or None,
+            image=control_image,
+            control_image=control_image,
+            num_inference_steps=steps,
+            guidance_scale=self.guidance_scale,
+            controlnet_conditioning_scale=self.controlnet_scale,
+            generator=generator,
+        )
+        generated = result.images[0]
+        colorized = _apply_lab_merge(control_image, generated)
+        if colorized.size != original_size:
+            colorized = colorized.resize(original_size, Image.Resampling.LANCZOS)
         return colorized, caption

app/config.py CHANGED Viewed

@@ -18,8 +18,12 @@ class Settings(BaseSettings):
     BASE_URL: str = os.getenv("BASE_URL", "http://localhost:8000")
     # Model / inference settings
-    MODEL_ID: str = os.getenv("MODEL_ID", "stabilityai/stable-diffusion-xl-base-1.0")
-    NUM_INFERENCE_STEPS: int = int(os.getenv("NUM_INFERENCE_STEPS", "30"))
     POSITIVE_PROMPT: str = os.getenv(
         "POSITIVE_PROMPT",
         "high quality color photo, vibrant natural colors, detailed lighting"

     BASE_URL: str = os.getenv("BASE_URL", "http://localhost:8000")
     # Model / inference settings
+    MODEL_ID: str = os.getenv("MODEL_ID", "fffiloni/controlnet-colorization-sdxl")
+    BASE_MODEL_ID: str = os.getenv("BASE_MODEL_ID", "stabilityai/stable-diffusion-xl-base-1.0")
+    LIGHTNING_REPO: str = os.getenv("LIGHTNING_REPO", "ByteDance/SDXL-Lightning")
+    LIGHTNING_WEIGHTS: str = os.getenv("LIGHTNING_WEIGHTS", "sdxl_lightning_8step_unet.safetensors")
+    CAPTION_MODEL_ID: str = os.getenv("CAPTION_MODEL_ID", "Salesforce/blip-image-captioning-base")
+    NUM_INFERENCE_STEPS: int = int(os.getenv("NUM_INFERENCE_STEPS", "20"))
     POSITIVE_PROMPT: str = os.getenv(
         "POSITIVE_PROMPT",
         "high quality color photo, vibrant natural colors, detailed lighting"