Commit ec7bfd1
Parent(s): 0a1a3e1

Improve PyTorch colorizer: Better preprocessing, output handling, and debugging

1 file changed: app/pytorch_colorizer.py (+48 -7)

app/pytorch_colorizer.py CHANGED
@@ -141,6 +141,12 @@ class PyTorchColorizer:
             # Otherwise, it's likely a state_dict
             state_dict = loaded_obj
 
+            # Log state dict keys to understand model structure
+            if isinstance(state_dict, dict):
+                keys = list(state_dict.keys())[:10]  # First 10 keys
+                logger.info(f"Model state_dict keys (sample): {keys}")
+                logger.info(f"Total state_dict keys: {len(state_dict.keys())}")
+
         except Exception as e:
             logger.error(f"Failed to load model file: {e}")
             raise
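The added logging is easy to reproduce outside the class. A minimal standalone sketch, assuming a hypothetical checkpoint path "model.pth" (not part of the commit):

import torch

# torch.load may return a full nn.Module or, more commonly, a plain dict
# of parameter tensors (a state_dict); only the dict case is inspected here.
loaded_obj = torch.load("model.pth", map_location="cpu")

if isinstance(loaded_obj, dict):
    keys = list(loaded_obj.keys())[:10]  # first 10 keys, as in the commit
    print(f"Sample keys: {keys}")        # typically names like 'conv1.weight'
    print(f"Total keys: {len(loaded_obj)}")

The key names hint at the layer structure of the saved network, which is presumably why the commit logs a sample before trying to reconstruct the model.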
@@ -216,30 +222,65 @@ class PyTorchColorizer:
         if image.mode != "L":
             image = image.convert("L")
 
+        # Try to maintain aspect ratio and use a better resize
+        # Many GAN models work better with 256x256 or 512x512
+        target_size = 256
+        if max(original_size) > 512:
+            # Scale down proportionally but keep max dimension reasonable
+            scale = target_size / max(original_size)
+            new_size = (int(original_size[0] * scale), int(original_size[1] * scale))
+        else:
+            new_size = original_size
+
         # Transform to tensor
+        # GAN colorization models typically expect normalized input
         transform = transforms.Compose([
-            transforms.Resize(
-            transforms.ToTensor(),
-            transforms.Normalize(mean=[0.5], std=[0.5])  # Normalize to [-1, 1]
+            transforms.Resize(new_size, Image.Resampling.LANCZOS),
+            transforms.ToTensor(),  # Converts to [0, 1]
         ])
 
         input_tensor = transform(image).unsqueeze(0).to(self.device)
 
+        # Normalize to [-1, 1] for GAN models (common for Pix2Pix style models)
+        input_tensor = (input_tensor - 0.5) / 0.5
+
         # Run inference
         with torch.no_grad():
-            output_tensor = self.model(input_tensor)
+            try:
+                output_tensor = self.model(input_tensor)
+                logger.debug(f"Model output shape: {output_tensor.shape}, min: {output_tensor.min():.3f}, max: {output_tensor.max():.3f}, mean: {output_tensor.mean():.3f}")
+            except Exception as e:
+                logger.error(f"Model inference error: {e}")
+                # If model fails, try with different input format (without normalization)
+                input_tensor_alt = transform(image).unsqueeze(0).to(self.device)
+                output_tensor = self.model(input_tensor_alt)
 
         # Convert output back to PIL Image
-        # Output is typically in range [-1, 1] from Tanh activation
         output_tensor = output_tensor.squeeze(0).cpu()
-        output_tensor = (output_tensor + 1) / 2.0
+
+        # Handle different output ranges
+        # Check if output is in [-1, 1] range (from Tanh) or [0, 1] range
+        output_min = output_tensor.min().item()
+        output_max = output_tensor.max().item()
+        logger.debug(f"Output tensor range: [{output_min:.3f}, {output_max:.3f}]")
+
+        if output_min < -0.5:
+            # Likely [-1, 1] range, denormalize
+            output_tensor = (output_tensor + 1) / 2.0
+            logger.debug("Applied [-1, 1] denormalization")
+        elif output_max > 1.5:
+            # Might be in [0, 255] range
+            output_tensor = output_tensor / 255.0
+            logger.debug("Applied [0, 255] normalization")
+        # If already in [0, 1], use as-is
+
         output_tensor = torch.clamp(output_tensor, 0, 1)
 
         # Convert to numpy and then PIL
         output_array = (output_tensor.permute(1, 2, 0).numpy() * 255).astype('uint8')
         output_image = Image.fromarray(output_array, 'RGB')
 
-        # Resize back to original size
+        # Resize back to original size with high-quality resampling
         if output_image.size != original_size:
             output_image = output_image.resize(original_size, Image.Resampling.LANCZOS)
 
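The input normalization above composes two affine steps: transforms.ToTensor() maps 8-bit pixels into [0, 1], and (x - 0.5) / 0.5 then maps [0, 1] onto [-1, 1], equivalent to the transforms.Normalize(mean=[0.5], std=[0.5]) call it replaces. A short standalone check of the round trip (not part of the commit):

import torch

x = torch.rand(1, 1, 4, 4)           # stands in for ToTensor() output in [0, 1]
normalized = (x - 0.5) / 0.5         # [0, 1] -> [-1, 1]
assert normalized.min() >= -1 and normalized.max() <= 1

recovered = (normalized + 1) / 2.0   # the inverse, applied on the output side
assert torch.allclose(recovered, x)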
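The adaptive resize in the same hunk keeps aspect ratio: anything whose longest side exceeds 512 px is scaled so that side becomes target_size (256). A worked sketch of the arithmetic, with the hypothetical helper name compute_new_size (standalone, values chosen for illustration):

def compute_new_size(original_size, target_size=256, threshold=512):
    # Mirrors the commit: only downscale when the longest side exceeds 512 px,
    # keeping aspect ratio so the longest side lands at target_size.
    if max(original_size) > threshold:
        scale = target_size / max(original_size)
        return (int(original_size[0] * scale), int(original_size[1] * scale))
    return original_size

assert compute_new_size((1024, 768)) == (256, 192)  # longest side 1024 -> 256
assert compute_new_size((400, 300)) == (400, 300)   # <= 512, left untouched

One consequence of this rule: inputs whose longest side is between 256 and 512 px pass through at native resolution rather than being resized to target_size.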
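Finally, the output-range handling reads naturally as a small pure function. A sketch mirroring the commit's heuristic, with the hypothetical name denormalize_output (not part of the commit):

import torch

def denormalize_output(t: torch.Tensor) -> torch.Tensor:
    # Map a model output of unknown range into [0, 1].
    if t.min().item() < -0.5:    # likely a Tanh head, range [-1, 1]
        t = (t + 1) / 2.0
    elif t.max().item() > 1.5:   # possibly raw pixel values in [0, 255]
        t = t / 255.0
    return torch.clamp(t, 0, 1)  # safety clamp, as in the commit

# A fake Tanh-range output lands cleanly in [0, 1]:
out = denormalize_output(torch.tanh(torch.randn(3, 8, 8)))
assert out.min() >= 0 and out.max() <= 1

The -0.5 and 1.5 thresholds are the commit's own; they assume a colorizer output uses enough of its range that a Tanh head's minimum dips below -0.5, so very low-contrast outputs could be misclassified.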