Spaces: Running on Zero

HanzhouLiu committed · Commit b56342d
1 Parent(s): 9f55394
Add application file

This view is limited to 50 files because it contains too many changes. See raw diff.
- README.md +17 -7
- app.py +123 -0
- aug.py +56 -0
- config/xyscannetp_gopro/config_stage1.yaml +40 -0
- config/xyscannetp_gopro/config_stage2.yaml +41 -0
- config/xyscannetp_realj/config_stage2.yml +40 -0
- config/xyscannetp_realr/config_stage2.yml +40 -0
- dataset.py +140 -0
- datasets/datasets.txt +2 -0
- evaluate_NIQE.m +57 -0
- evaluate_RealBlur_J.py +117 -0
- evaluate_RealBlur_R.py +110 -0
- evaluation_GoPro.m +60 -0
- evaluation_HIDE.m +60 -0
- examples/blur1.png +3 -0
- examples/blur2.png +3 -0
- examples/blur3.png +3 -0
- examples/blur4.png +3 -0
- examples/blur5.png +3 -0
- license +37 -0
- metric_counter.py +55 -0
- models/XYScanNet.py +737 -0
- models/XYScanNetP.py +737 -0
- models/__init__.py +0 -0
- models/__pycache__/XYScanNet.cpython-38.pyc +0 -0
- models/__pycache__/XYScanNetP.cpython-38.pyc +0 -0
- models/__pycache__/__init__.cpython-38.pyc +0 -0
- models/__pycache__/networks.cpython-38.pyc +0 -0
- models/losses.py +233 -0
- models/models.py +36 -0
- models/networks.py +16 -0
- models/sota/FFTformer.py +324 -0
- models/sota/Restormer.py +340 -0
- models/sota/Stripformer.py +429 -0
- models/sota/XYScanNet.py +754 -0
- out/Results.txt +1 -0
- predict_GoPro_test_results.py +89 -0
- predict_HIDE_test_results.py +69 -0
- predict_RWBI_test_results.py +88 -0
- predict_RealBlur_J_test_results.py +97 -0
- predict_RealBlur_R_test_results.py +96 -0
- requirements.txt +11 -0
- results/xyscannetp_gopro/models/best_XYScanNet_stage2.pth +3 -0
- schedulers.py +59 -0
- train_XYScanNet_stage1.py +182 -0
- train_XYScanNet_stage2.py +182 -0
- util/__init__.py +0 -0
- util/__pycache__/__init__.cpython-310.pyc +0 -0
- util/__pycache__/__init__.cpython-36.pyc +0 -0
- util/__pycache__/__init__.cpython-38.pyc +0 -0
README.md
CHANGED
@@ -1,13 +1,23 @@
 ---
-title: XYScanNet
-emoji:
-colorFrom:
-colorTo:
+title: XYScanNet
+emoji: 🚀
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
-sdk_version:
+sdk_version: "4.44.1"
 app_file: app.py
 pinned: false
-license: apache-2.0
 ---
 
-
+# XYScanNet: Mamba-based Image Deblurring Demo
+
+This Space runs the **XYScanNet** deblurring model on GPU using **Gradio**.
+Upload a blurry image, and the model will restore a sharp version automatically.
+
+🧠 **Tech Highlights**
+- Based on the **Mamba selective state space model**
+- Implements cross-directional strip attention (horizontal & vertical)
+- Runs efficiently on GPU with automatic padding
+
+👤 Author: [Hanzhou Liu](https://huggingface.co/HanzhouLiu)
+📦 Model weights: [HanzhouLiu/XYScanNet-weights](https://huggingface.co/spaces/HanzhouLiu/XYScanNet_Demo)
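The README above describes an interactive demo. For readers who prefer to call the Space programmatically, a minimal sketch using gradio_client follows; the Space id is taken from the README link, and the endpoint name and argument handling are assumptions about the default gr.Interface API rather than something documented in this commit.

# Minimal sketch: call the deblurring Space from Python with gradio_client.
# Assumes the Space "HanzhouLiu/XYScanNet_Demo" is running and public; the
# "/predict" endpoint is the gradio default for a single-function Interface.
from gradio_client import Client, handle_file

client = Client("HanzhouLiu/XYScanNet_Demo")
result = client.predict(
    handle_file("examples/blur1.png"),  # any local blurry image
    api_name="/predict",
)
print("Deblurred image written to:", result)  # gradio_client returns a file path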
app.py
ADDED
@@ -0,0 +1,123 @@
import gradio as gr
import spaces
import torch
import torchvision
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
from PIL import Image
import yaml
import os
from models.networks import get_generator


# ===========================
# 1. Device setup
# ===========================
# Automatically choose GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔥 Using device: {device}")


# ===========================
# 2. Model Loading
# ===========================
def load_model(job_name="xyscannetp_gopro"):
    """
    Load the pretrained XYScanNet model on CPU or GPU automatically.
    """
    cfg_path = os.path.join("config", job_name, "config_stage2.yaml")
    with open(cfg_path, "r") as f:
        config = yaml.safe_load(f)

    weights_path = os.path.join(
        "results", job_name, "models", f"best_{config['experiment_desc']}.pth"
    )
    print(f"🔹 Loading model from {weights_path}")

    model = get_generator(config["model"])
    model.load_state_dict(torch.load(weights_path, map_location=device))
    model.eval().to(device)

    print(f"✅ Model loaded on {device}")
    return model


print("Initializing XYScanNet model...")
MODEL = load_model()
print("Model ready.")


# ===========================
# 3. Helper functions
# ===========================
def pad_to_multiple_of_8(img_tensor):
    """
    Pad the image tensor so that both height and width are multiples of 8.
    """
    _, _, h, w = img_tensor.shape
    pad_h = (8 - h % 8) % 8
    pad_w = (8 - w % 8) % 8
    img_tensor = F.pad(img_tensor, (0, pad_w, 0, pad_h), mode="reflect")
    return img_tensor, h, w


def crop_back(img_tensor, orig_h, orig_w):
    """Crop output back to original image size."""
    return img_tensor[:, :, :orig_h, :orig_w]


# ===========================
# 4. Inference Function
# ===========================
# The decorator below *requests* GPU if available,
# but won't crash if only CPU exists.
@spaces.GPU
def run_deblur(input_image: Image.Image):
    """
    Run deblurring inference on GPU if available, else CPU.
    """
    # Convert PIL RGB → Tensor [B,C,H,W] normalized to [-0.5,0.5]
    img = np.array(input_image.convert("RGB"))
    img_tensor = (
        torch.from_numpy(np.transpose(img / 255.0, (2, 0, 1)).astype("float32")) - 0.5
    )
    img_tensor = Variable(img_tensor.unsqueeze(0)).to(device)

    # Pad to valid window size
    img_tensor, orig_h, orig_w = pad_to_multiple_of_8(img_tensor)

    # Inference
    with torch.no_grad():
        result_image, _, _ = MODEL(img_tensor)
        result_image = result_image + 0.5
        result_image = crop_back(result_image, orig_h, orig_w)

    # Convert to PIL Image for display
    out_img = result_image.squeeze(0).clamp(0, 1).cpu()
    out_pil = torchvision.transforms.ToPILImage()(out_img)
    return out_pil


# ===========================
# 5. Gradio Interface
# ===========================
demo = gr.Interface(
    fn=run_deblur,
    inputs=gr.Image(type="pil", label="Upload a Blurry Image"),
    outputs=gr.Image(type="pil", label="Deblurred Result"),
    title="XYScanNet: Mamba-based Image Deblurring (GPU Demo)",
    description=(
        "Upload a blurry image to see how XYScanNet restores it using a Mamba-based vision state-space model."
    ),
    examples=[
        # the example images added in this commit are .png files in examples/
        ["examples/blur1.png"],
        ["examples/blur2.png"],
        ["examples/blur3.png"],
        ["examples/blur4.png"],
        ["examples/blur5.png"],
    ],
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()
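For a quick local check of the same inference path without the web UI, the function defined in app.py can be called directly. This is only a sketch: importing app loads the model at import time, so the stage-2 config, the .pth weights from this commit, and the mamba_ssm dependencies must already be in place.

# Sketch: run one image through the deblurring pipeline defined in app.py.
# Assumes this is executed from the repository root with all weights present.
from PIL import Image

import app  # importing app builds MODEL from results/xyscannetp_gopro/models/

blurry = Image.open("examples/blur1.png")
sharp = app.run_deblur(blurry)  # pads, runs the network, crops back
sharp.save("deblurred.png")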
aug.py
ADDED
@@ -0,0 +1,56 @@
from typing import List

import albumentations as albu
from torchvision import transforms

def get_transforms(size: int, scope: str = 'geometric', crop='random'):
    augs = {'strong': albu.Compose([albu.HorizontalFlip(),
                                    albu.ShiftScaleRotate(shift_limit=0.0, scale_limit=0.2, rotate_limit=20, p=.4),
                                    albu.ElasticTransform(),
                                    albu.OpticalDistortion(),
                                    albu.OneOf([
                                        albu.CLAHE(clip_limit=2),
                                        albu.Sharpen(),
                                        albu.Emboss(),
                                        albu.RandomBrightnessContrast(),
                                        albu.RandomGamma()
                                    ], p=0.5),
                                    albu.OneOf([
                                        albu.RGBShift(),
                                        albu.HueSaturationValue(),
                                    ], p=0.5),
                                    ]),
            'weak': albu.Compose([albu.HorizontalFlip(),
                                  ]),
            'geometric': albu.Compose([albu.HorizontalFlip(),
                                       albu.VerticalFlip(),
                                       albu.RandomRotate90(),
                                       ]),
            'None': None
            }

    aug_fn = augs[scope]
    crop_fn = {'random': albu.RandomCrop(size, size, always_apply=True),
               'center': albu.CenterCrop(size, size, always_apply=True)}[crop]

    pipeline = albu.Compose([aug_fn, crop_fn], additional_targets={'target': 'image'})


    def process(a, b):
        r = pipeline(image=a, target=b)
        return r['image'], r['target']

    return process


def get_normalize():
    transform = transforms.Compose([
        transforms.ToTensor()
    ])

    def process(a, b):
        image = transform(a).permute(1, 2, 0) - 0.5
        target = transform(b).permute(1, 2, 0) - 0.5
        return image, target

    return process
config/xyscannetp_gopro/config_stage1.yaml
ADDED
@@ -0,0 +1,40 @@
---
experiment_desc: XYScanNet_stage1

train:
  files_a: /scratch/user/hanzhou1996/datasets/deblur/GOPRO_/train/blur/**/*.png
  files_b: /scratch/user/hanzhou1996/datasets/deblur/GOPRO_/train/sharp/**/*.png
  size: &SIZE 252
  crop: random
  preload: &PRELOAD false
  preload_size: &PRELOAD_SIZE 0
  bounds: [0, 1]
  scope: geometric

val:
  files_a: /scratch/user/hanzhou1996/datasets/deblur/GOPRO_/test/blur/**/*.png
  files_b: /scratch/user/hanzhou1996/datasets/deblur/GOPRO_/test/sharp/**/*.png
  size: *SIZE
  scope: None
  crop: random
  preload: *PRELOAD
  preload_size: *PRELOAD_SIZE
  bounds: [0, 1]

model:
  g_name: XYScanNetP
  content_loss: Stripformer_Loss

num_epochs: 4000
train_batches_per_epoch: 2103
val_batches_per_epoch: 1111
batch_size: 16
image_size: [252, 252]

optimizer:
  name: adam
  lr: 0.00022
scheduler:
  name: cosine
  start_epoch: 50
  min_lr: 0.0000001
config/xyscannetp_gopro/config_stage2.yaml
ADDED
@@ -0,0 +1,41 @@
---
experiment_desc: XYScanNet_stage2

train:
  #/mnt/g/RESEARCH/PHD/Motion_Deblurred/datasets/GOPRO_
  files_a: /scratch/user/hanzhou1996/datasets/deblur/GOPRO_/train/blur/**/*.png
  files_b: /scratch/user/hanzhou1996/datasets/deblur/GOPRO_/train/sharp/**/*.png
  size: &SIZE 320
  crop: random
  preload: &PRELOAD false
  preload_size: &PRELOAD_SIZE 0
  bounds: [0, 1]
  scope: geometric

val:
  files_a: /scratch/user/hanzhou1996/datasets/deblur/GOPRO_/test/blur/**/*.png
  files_b: /scratch/user/hanzhou1996/datasets/deblur/GOPRO_/test/sharp/**/*.png
  size: *SIZE
  scope: None
  crop: random
  preload: *PRELOAD
  preload_size: *PRELOAD_SIZE
  bounds: [0, 1]

model:
  g_name: XYScanNetP
  content_loss: Stripformer_Loss

num_epochs: 4000
train_batches_per_epoch: 2103
val_batches_per_epoch: 1111
batch_size: 8
image_size: [320, 320]

optimizer:
  name: adam
  lr: 0.00015
scheduler:
  name: cosine
  start_epoch: 50
  min_lr: 0.0000001
config/xyscannetp_realj/config_stage2.yml
ADDED
@@ -0,0 +1,40 @@
---
experiment_desc: XYScanNet_stage2

train:
  files_a: /scratch/user/hanzhou1996/datasets/deblur/RealBlur_J/train/trainA/*.png
  files_b: /scratch/user/hanzhou1996/datasets/deblur/RealBlur_J/train/trainB/*.png
  size: &SIZE 320
  crop: random
  preload: &PRELOAD false
  preload_size: &PRELOAD_SIZE 0
  bounds: [0, 1]
  scope: geometric

val:
  files_a: /scratch/user/hanzhou1996/datasets/deblur/RealBlur_J/test/testA/*.png
  files_b: /scratch/user/hanzhou1996/datasets/deblur/RealBlur_J/test/testB/*.png
  size: *SIZE
  scope: None
  crop: random
  preload: *PRELOAD
  preload_size: *PRELOAD_SIZE
  bounds: [0, 1]

model:
  g_name: XYScanNetP
  content_loss: Stripformer_Loss

num_epochs: 2000
train_batches_per_epoch: 3758
val_batches_per_epoch: 980
batch_size: 8
image_size: [320, 320]

optimizer:
  name: adam
  lr: 0.0001
scheduler:
  name: cosine
  start_epoch: 50
  min_lr: 0.0000001
config/xyscannetp_realr/config_stage2.yml
ADDED
@@ -0,0 +1,40 @@
---
experiment_desc: XYScanNet_stage2

train:
  files_a: /scratch/user/hanzhou1996/datasets/deblur/RealBlur_R/train/trainA/*.png
  files_b: /scratch/user/hanzhou1996/datasets/deblur/RealBlur_R/train/trainB/*.png
  size: &SIZE 320
  crop: random
  preload: &PRELOAD false
  preload_size: &PRELOAD_SIZE 0
  bounds: [0, 1]
  scope: geometric

val:
  files_a: /scratch/user/hanzhou1996/datasets/deblur/RealBlur_R/test/testA/*.png
  files_b: /scratch/user/hanzhou1996/datasets/deblur/RealBlur_R/test/testB/*.png
  size: *SIZE
  scope: None
  crop: random
  preload: *PRELOAD
  preload_size: *PRELOAD_SIZE
  bounds: [0, 1]

model:
  g_name: XYScanNetP
  content_loss: Stripformer_Loss

num_epochs: 2000
train_batches_per_epoch: 3758
val_batches_per_epoch: 980
batch_size: 8
image_size: [320, 320]

optimizer:
  name: adam
  lr: 0.0001
scheduler:
  name: cosine
  start_epoch: 50
  min_lr: 0.0000001
dataset.py
ADDED
@@ -0,0 +1,140 @@
import os
from copy import deepcopy
from functools import partial
from glob import glob
from hashlib import sha1
from typing import Callable, Iterable, Optional, Tuple

import cv2
import numpy as np
from glog import logger
from joblib import Parallel, cpu_count, delayed
from skimage.io import imread
from torch.utils.data import Dataset
from tqdm import tqdm

import aug


def subsample(data: Iterable, bounds: Tuple[float, float], hash_fn: Callable, n_buckets=100, salt='', verbose=True):
    data = list(data)
    buckets = split_into_buckets(data, n_buckets=n_buckets, salt=salt, hash_fn=hash_fn)

    lower_bound, upper_bound = [x * n_buckets for x in bounds]
    msg = f'Subsampling buckets from {lower_bound} to {upper_bound}, total buckets number is {n_buckets}'
    if salt:
        msg += f'; salt is {salt}'
    if verbose:
        logger.info(msg)
    return np.array([sample for bucket, sample in zip(buckets, data) if lower_bound <= bucket < upper_bound])


def hash_from_paths(x: Tuple[str, str], salt: str = '') -> str:
    path_a, path_b = x
    names = ''.join(map(os.path.basename, (path_a, path_b)))
    return sha1(f'{names}_{salt}'.encode()).hexdigest()


def split_into_buckets(data: Iterable, n_buckets: int, hash_fn: Callable, salt=''):
    hashes = map(partial(hash_fn, salt=salt), data)
    return np.array([int(x, 16) % n_buckets for x in hashes])


def _read_img(x: str):
    img = cv2.imread(x)
    if img is None:
        logger.warning(f'Can not read image {x} with OpenCV, switching to scikit-image')
        img = imread(x)
    return img


class PairedDataset(Dataset):
    def __init__(self,
                 files_a: Tuple[str],
                 files_b: Tuple[str],
                 transform_fn: Callable,
                 normalize_fn: Callable,
                 corrupt_fn: Optional[Callable] = None,
                 preload: bool = True,
                 preload_size: Optional[int] = 0,
                 verbose=True):

        assert len(files_a) == len(files_b)

        self.preload = preload
        self.data_a = files_a
        self.data_b = files_b
        self.verbose = verbose
        self.corrupt_fn = corrupt_fn
        self.transform_fn = transform_fn
        self.normalize_fn = normalize_fn
        logger.info(f'Dataset has been created with {len(self.data_a)} samples')

        if preload:
            preload_fn = partial(self._bulk_preload, preload_size=preload_size)
            if files_a == files_b:
                self.data_a = self.data_b = preload_fn(self.data_a)
            else:
                self.data_a, self.data_b = map(preload_fn, (self.data_a, self.data_b))
            self.preload = True

    def _bulk_preload(self, data: Iterable[str], preload_size: int):
        jobs = [delayed(self._preload)(x, preload_size=preload_size) for x in data]
        jobs = tqdm(jobs, desc='preloading images', disable=not self.verbose)
        return Parallel(n_jobs=cpu_count(), backend='threading')(jobs)

    @staticmethod
    def _preload(x: str, preload_size: int):
        img = _read_img(x)
        if preload_size:
            h, w, *_ = img.shape
            h_scale = preload_size / h
            w_scale = preload_size / w
            scale = max(h_scale, w_scale)
            img = cv2.resize(img, fx=scale, fy=scale, dsize=None)
            assert min(img.shape[:2]) >= preload_size, f'weird img shape: {img.shape}'
        return img

    def _preprocess(self, img, res):
        def transpose(x):
            return np.transpose(x, (2, 0, 1))

        return map(transpose, self.normalize_fn(img, res))

    def __len__(self):
        return len(self.data_a)

    def __getitem__(self, idx):
        a, b = self.data_a[idx], self.data_b[idx]
        if not self.preload:
            a, b = map(_read_img, (a, b))
        a, b = self.transform_fn(a, b)
        if self.corrupt_fn is not None:
            a = self.corrupt_fn(a)
        a, b = self._preprocess(a, b)
        return {'a': a, 'b': b}

    @staticmethod
    def from_config(config):
        config = deepcopy(config)
        files_a, files_b = map(lambda x: sorted(glob(config[x], recursive=True)), ('files_a', 'files_b'))
        transform_fn = aug.get_transforms(size=config['size'], scope=config['scope'], crop=config['crop'])
        normalize_fn = aug.get_normalize()

        hash_fn = hash_from_paths
        # ToDo: add more hash functions
        verbose = config.get('verbose', True)
        data = subsample(data=zip(files_a, files_b),
                         bounds=config.get('bounds', (0, 1)),
                         hash_fn=hash_fn,
                         verbose=verbose)

        files_a, files_b = map(list, zip(*data))

        return PairedDataset(files_a=files_a,
                             files_b=files_b,
                             preload=config['preload'],
                             preload_size=config['preload_size'],
                             normalize_fn=normalize_fn,
                             transform_fn=transform_fn,
                             verbose=verbose)
datasets/datasets.txt
ADDED
@@ -0,0 +1,2 @@
A good way to maintain the datasets in a project is to create a soft link.
In that case, you simply set the dataset path to the current path in the config files.
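As a concrete illustration of the soft-link suggestion above, the snippet below creates a datasets/GOPRO_ link that points at an existing dataset directory, so the glob patterns in the config files can be rewritten relative to the repository. The source path is only an example taken from the configs in this commit; substitute your own location.

# Sketch: link an external dataset directory into ./datasets so config paths
# such as datasets/GOPRO_/train/blur/**/*.png resolve inside the repo.
import os

src = "/scratch/user/hanzhou1996/datasets/deblur/GOPRO_"  # real dataset location (example)
dst = "datasets/GOPRO_"                                   # path the configs can reference

os.makedirs("datasets", exist_ok=True)
if not os.path.islink(dst) and not os.path.exists(dst):
    os.symlink(src, dst)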
evaluate_NIQE.m
ADDED
@@ -0,0 +1,57 @@
%p = genpath("G:\RESEARCH\PHD\Motion_Deblurred\xyscannet\visualization_bidir\comparison\epoch3k\uni");
%gt = genpath("G:\RESEARCH\PHD\Motion_Deblurred\xyscannet\visualization_bidir\comparison\epoch3k\gt");

length_p = size(p,2);
path = {};
temp = [];
for i = 1:length_p
    if p(i) ~= ';'
        temp = [temp p(i)];
    else
        temp = [temp '\'];
        path = [path ; temp];
        temp = [];
    end
end
clear p length_p temp;
length_gt = size(gt,2);
path_gt = {};
temp_gt = [];
for i = 1:length_gt
    if gt(i) ~= ';'
        temp_gt = [temp_gt gt(i)];
    else
        temp_gt = [temp_gt '\'];
        path_gt = [path_gt ; temp_gt];
        temp_gt = [];
    end
end
clear gt length_gt temp_gt;

file_num = size(path,1);
total_niqe = 0;
n = 0;
for i = 1:file_num
    file_path = path{i};
    gt_file_path = path_gt{i};
    img_path_list = dir(strcat(file_path,'*.png'));
    gt_path_list = dir(strcat(gt_file_path,'*.png'));
    img_num = length(img_path_list);
    if img_num > 0
        for j = 1:img_num
            image_name = img_path_list(j).name;
            gt_name = gt_path_list(j).name;
            image = imread(strcat(file_path,image_name));
            gt = imread(strcat(gt_file_path,gt_name));
            size(image);
            size(gt);
            cur_niqe = niqe(image);
            fprintf('%d', cur_niqe);
            total_niqe = total_niqe + cur_niqe;
            n = n + 1
        end
    end
end
niqe_score = total_niqe / n
close all;clear all;
evaluate_RealBlur_J.py
ADDED
@@ -0,0 +1,117 @@
import os
from skimage import io
import cv2
import numpy as np
from skimage.metrics import structural_similarity
import concurrent.futures

def image_align(deblurred, gt):
    # this function is based on kohler evaluation code
    z = deblurred
    c = np.ones_like(z)
    x = gt

    zs = (np.sum(x * z) / np.sum(z * z)) * z  # simple intensity matching

    warp_mode = cv2.MOTION_HOMOGRAPHY
    warp_matrix = np.eye(3, 3, dtype=np.float32)

    # Specify the number of iterations.
    number_of_iterations = 100

    termination_eps = 0

    criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT,
                number_of_iterations, termination_eps)

    # Run the ECC algorithm. The results are stored in warp_matrix.
    (cc, warp_matrix) = cv2.findTransformECC(cv2.cvtColor(x, cv2.COLOR_RGB2GRAY), cv2.cvtColor(zs, cv2.COLOR_RGB2GRAY),
                                             warp_matrix, warp_mode, criteria, inputMask=None, gaussFiltSize=5)

    target_shape = x.shape
    shift = warp_matrix

    zr = cv2.warpPerspective(
        zs,
        warp_matrix,
        (target_shape[1], target_shape[0]),
        flags=cv2.INTER_CUBIC + cv2.WARP_INVERSE_MAP,
        borderMode=cv2.BORDER_REFLECT)

    cr = cv2.warpPerspective(
        np.ones_like(zs, dtype='float32'),
        warp_matrix,
        (target_shape[1], target_shape[0]),
        flags=cv2.INTER_NEAREST + cv2.WARP_INVERSE_MAP,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=0)

    zr = zr * cr
    xr = x * cr

    return zr, xr, cr, shift

def compute_psnr(image_true, image_test, image_mask, data_range=None):
    # this function is based on skimage.metrics.peak_signal_noise_ratio
    err = np.sum((image_true - image_test) ** 2, dtype=np.float64) / np.sum(image_mask)
    return 10 * np.log10((data_range ** 2) / err)


def compute_ssim(tar_img, prd_img, cr1):
    ssim_pre, ssim_map = structural_similarity(tar_img, prd_img, channel_axis=2, gaussian_weights=True,
                                               use_sample_covariance=False, data_range=1.0, full=True)
    ssim_map = ssim_map * cr1
    r = int(3.5 * 1.5 + 0.5)  # radius as in ndimage
    win_size = 2 * r + 1
    pad = (win_size - 1) // 2
    ssim = ssim_map[pad:-pad, pad:-pad, :]
    crop_cr1 = cr1[pad:-pad, pad:-pad, :]
    ssim = ssim.sum(axis=0).sum(axis=0) / crop_cr1.sum(axis=0).sum(axis=0)
    ssim = np.mean(ssim)
    return ssim

total_psnr = 0.
total_ssim = 0.
count = 0
#img_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/xyscannet/ablation/v33/run1/images_realj'
#img_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/xyscannet/sota/algnet/images/RealBlur_J'
#img_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/xyscannet/sota/deeprft/FMIMOUNetPLUS_RealBlur/RealBlur_J--'
#img_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/xyscannet/sota/deeprft/images_author/RealBlur_J'
#img_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/xyscannet/sota/mprnet/images/RealBlur_J'
#img_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/xyscannet/sota/stripformer/images/RealBlur_J'
img_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/xyscannet/sota/xyscannetp/images/realj_final_stage3'

gt_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/datasets/Realblur_J/test/testB'

print(img_path)
for file in os.listdir(img_path):
    #for img_name in os.listdir(img_path + '/' + file):
    img_name = file
    count += 1
    number = img_name.split('_')[1]
    #number = img_name.split('-')[1]
    #gt_name = 'gt_' + number
    img_dir = img_path + '/' + file
    s = file.split('_')
    #s = file.split('-')
    #gt_file = s[0] + '_' + 'gt_' + number
    gt_file = '_'.join([s[0], 'gt', s[-1]])
    gt_dir = gt_path + '/' + gt_file
    print(gt_file)
    print(img_dir)
    with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
        tar_img = io.imread(gt_dir)
        prd_img = io.imread(img_dir)
        tar_img = tar_img.astype(np.float32) / 255.0
        prd_img = prd_img.astype(np.float32) / 255.0
        prd_img, tar_img, cr1, shift = image_align(prd_img, tar_img)
        PSNR = compute_psnr(tar_img, prd_img, cr1, data_range=1)
        SSIM = compute_ssim(tar_img, prd_img, cr1)
        total_psnr += PSNR
        total_ssim += SSIM
        print(count, PSNR)

print('PSNR:', total_psnr / count)
print('SSIM:', total_ssim / count)
print(img_path)
evaluate_RealBlur_R.py
ADDED
@@ -0,0 +1,110 @@
import os
from skimage import io
import cv2
import numpy as np
from skimage.metrics import structural_similarity
import concurrent.futures

def image_align(deblurred, gt):
    # this function is based on kohler evaluation code
    z = deblurred
    c = np.ones_like(z)
    x = gt

    zs = (np.sum(x * z) / np.sum(z * z)) * z  # simple intensity matching

    warp_mode = cv2.MOTION_HOMOGRAPHY
    warp_matrix = np.eye(3, 3, dtype=np.float32)

    # Specify the number of iterations.
    number_of_iterations = 100

    termination_eps = 0

    criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT,
                number_of_iterations, termination_eps)

    # Run the ECC algorithm. The results are stored in warp_matrix.
    (cc, warp_matrix) = cv2.findTransformECC(cv2.cvtColor(x, cv2.COLOR_RGB2GRAY), cv2.cvtColor(zs, cv2.COLOR_RGB2GRAY),
                                             warp_matrix, warp_mode, criteria, inputMask=None, gaussFiltSize=5)

    target_shape = x.shape
    shift = warp_matrix

    zr = cv2.warpPerspective(
        zs,
        warp_matrix,
        (target_shape[1], target_shape[0]),
        flags=cv2.INTER_CUBIC + cv2.WARP_INVERSE_MAP,
        borderMode=cv2.BORDER_REFLECT)

    cr = cv2.warpPerspective(
        np.ones_like(zs, dtype='float32'),
        warp_matrix,
        (target_shape[1], target_shape[0]),
        flags=cv2.INTER_NEAREST + cv2.WARP_INVERSE_MAP,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=0)

    zr = zr * cr
    xr = x * cr

    return zr, xr, cr, shift

def compute_psnr(image_true, image_test, image_mask, data_range=None):
    # this function is based on skimage.metrics.peak_signal_noise_ratio
    err = np.sum((image_true - image_test) ** 2, dtype=np.float64) / np.sum(image_mask)
    return 10 * np.log10((data_range ** 2) / err)


def compute_ssim(tar_img, prd_img, cr1):
    ssim_pre, ssim_map = structural_similarity(tar_img, prd_img, channel_axis=2, gaussian_weights=True,
                                               use_sample_covariance=False, data_range=1.0, full=True)
    ssim_map = ssim_map * cr1
    r = int(3.5 * 1.5 + 0.5)  # radius as in ndimage
    win_size = 2 * r + 1
    pad = (win_size - 1) // 2
    ssim = ssim_map[pad:-pad, pad:-pad, :]
    crop_cr1 = cr1[pad:-pad, pad:-pad, :]
    ssim = ssim.sum(axis=0).sum(axis=0) / crop_cr1.sum(axis=0).sum(axis=0)
    ssim = np.mean(ssim)
    return ssim

total_psnr = 0.
total_ssim = 0.
count = 0
#gt_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/datasets/Realblur_R/test/testB'
img_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/xyscannet/sota/deeprft/FMIMOUNetPLUS_RealBlur/RealBlur_R__'
#img_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/xyscannet/sota/stripformer/images/RealBlur_R'
#img_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/xyscannet/sota/deeprft/images/RealBlur_R'


gt_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/datasets/Realblur_R/test/testB'
print(img_path)
for file in os.listdir(img_path):
    #for img_name in os.listdir(img_path + '/' + file):
    img_name = file
    count += 1
    number = img_name.split('_')[1]
    #gt_name = 'gt_' + number
    img_dir = img_path + '/' + file
    s = file.split('_')
    gt_file = '_'.join([s[0], 'gt', s[-1]])
    gt_dir = gt_path + '/' + gt_file
    print(img_dir)
    with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
        tar_img = io.imread(gt_dir)
        prd_img = io.imread(img_dir)
        tar_img = tar_img.astype(np.float32) / 255.0
        prd_img = prd_img.astype(np.float32) / 255.0
        prd_img, tar_img, cr1, shift = image_align(prd_img, tar_img)
        PSNR = compute_psnr(tar_img, prd_img, cr1, data_range=1)
        SSIM = compute_ssim(tar_img, prd_img, cr1)
        total_psnr += PSNR
        total_ssim += SSIM
        print(count, PSNR)

print('PSNR:', total_psnr / count)
print('SSIM:', total_ssim / count)
print(img_path)
evaluation_GoPro.m
ADDED
@@ -0,0 +1,60 @@
p = genpath('.\out\Stripformer_GoPro_results');% GoPro Deblur Results
gt = genpath('.\datasets\GoPro\test\sharp');% GoPro GT Results

length_p = size(p,2);
path = {};
temp = [];
for i = 1:length_p
    if p(i) ~= ';'
        temp = [temp p(i)];
    else
        temp = [temp '\'];
        path = [path ; temp];
        temp = [];
    end
end
clear p length_p temp;
length_gt = size(gt,2);
path_gt = {};
temp_gt = [];
for i = 1:length_gt
    if gt(i) ~= ';'
        temp_gt = [temp_gt gt(i)];
    else
        temp_gt = [temp_gt '\'];
        path_gt = [path_gt ; temp_gt];
        temp_gt = [];
    end
end
clear gt length_gt temp_gt;

file_num = size(path,1);
total_psnr = 0;
n = 0;
total_ssim = 0;
for i = 1:file_num
    file_path = path{i};
    gt_file_path = path_gt{i};
    img_path_list = dir(strcat(file_path,'*.png'));
    gt_path_list = dir(strcat(gt_file_path,'*.png'));
    img_num = length(img_path_list);
    if img_num > 0
        for j = 1:img_num
            image_name = img_path_list(j).name;
            gt_name = gt_path_list(j).name;
            image = imread(strcat(file_path,image_name));
            gt = imread(strcat(gt_file_path,gt_name));
            size(image);
            size(gt);
            peaksnr = psnr(image,gt);
            ssimval = ssim(image,gt);
            total_psnr = total_psnr + peaksnr;
            total_ssim = total_ssim + ssimval;
            n = n + 1
        end
    end
end
psnr = total_psnr / n
ssim = total_ssim / n
close all;clear all;
evaluation_HIDE.m
ADDED
@@ -0,0 +1,60 @@
p = genpath('.\out\Stripformer_HIDE_results');% HIDE Deblur Results
gt = genpath('.\datasets\HIDE\sharp');% HIDE GT Results

length_p = size(p,2);
path = {};
temp = [];
for i = 1:length_p
    if p(i) ~= ';'
        temp = [temp p(i)];
    else
        temp = [temp '\'];
        path = [path ; temp];
        temp = [];
    end
end
clear p length_p temp;
length_gt = size(gt,2);
path_gt = {};
temp_gt = [];
for i = 1:length_gt
    if gt(i) ~= ';'
        temp_gt = [temp_gt gt(i)];
    else
        temp_gt = [temp_gt '\'];
        path_gt = [path_gt ; temp_gt];
        temp_gt = [];
    end
end
clear gt length_gt temp_gt;

file_num = size(path,1);
total_psnr = 0;
n = 0;
total_ssim = 0;
for i = 1:file_num
    file_path = path{i};
    gt_file_path = path_gt{i};
    img_path_list = dir(strcat(file_path,'*.png'));
    gt_path_list = dir(strcat(gt_file_path,'*.png'));
    img_num = length(img_path_list);
    if img_num > 0
        for j = 1:img_num
            image_name = img_path_list(j).name;
            gt_name = gt_path_list(j).name;
            image = imread(strcat(file_path,image_name));
            gt = imread(strcat(gt_file_path,gt_name));
            size(image);
            size(gt);
            peaksnr = psnr(image,gt);
            ssimval = ssim(image,gt);
            total_psnr = total_psnr + peaksnr;
            total_ssim = total_ssim + ssimval;
            n = n + 1
        end
    end
end
psnr = total_psnr / n
ssim = total_ssim / n
close all;clear all;
examples/blur1.png
ADDED (Git LFS)

examples/blur2.png
ADDED (Git LFS)

examples/blur3.png
ADDED (Git LFS)

examples/blur4.png
ADDED (Git LFS)

examples/blur5.png
ADDED (Git LFS)
license
ADDED
@@ -0,0 +1,37 @@

Copyright (c) 2025 Hanzhou Liu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software with non-commercial usage, including non-commercial usage
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------- LICENSE FOR DeblurGANv2 --------------------------------
BSD License

For DeblurGANv2 software
Copyright (c) 2019, Orest Kupyn, Tetiana Martyniuk, Junru Wu and Zhangyang Wang
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
metric_counter.py
ADDED
@@ -0,0 +1,55 @@
import logging
from collections import defaultdict

import numpy as np
from tensorboardX import SummaryWriter

WINDOW_SIZE = 100


class MetricCounter:
    def __init__(self, exp_name):
        self.writer = SummaryWriter(exp_name)
        logging.basicConfig(filename='{}.log'.format(exp_name), level=logging.DEBUG)
        self.metrics = defaultdict(list)
        self.images = defaultdict(list)
        self.best_metric = 0

    def add_image(self, x: np.ndarray, tag: str):
        self.images[tag].append(x)

    def clear(self):
        self.metrics = defaultdict(list)
        self.images = defaultdict(list)

    def add_losses(self, l_G):
        for name, value in zip(('G_loss', None), (l_G, None)):
            self.metrics[name].append(value)

    def add_metrics(self, psnr, ssim):
        for name, value in zip(('PSNR', 'SSIM'),
                               (psnr, ssim)):
            self.metrics[name].append(value)

    def loss_message(self):
        metrics = ((k, np.mean(self.metrics[k][-WINDOW_SIZE:])) for k in ('G_loss', 'PSNR', 'SSIM'))
        return '; '.join(map(lambda x: f'{x[0]}={x[1]:.4f}', metrics))

    def write_to_tensorboard(self, epoch_num, validation=False):
        scalar_prefix = 'Validation' if validation else 'Train'
        for tag in ('G_loss', 'SSIM', 'PSNR'):
            self.writer.add_scalar(f'{scalar_prefix}_{tag}', np.mean(self.metrics[tag]), global_step=epoch_num)
        for tag in self.images:
            imgs = self.images[tag]
            if imgs:
                imgs = np.array(imgs)
                self.writer.add_images(tag, imgs[:, :, :, ::-1].astype('float32') / 255, dataformats='NHWC',
                                       global_step=epoch_num)
                self.images[tag] = []

    def update_best_model(self):
        cur_metric = np.mean(self.metrics['PSNR'])
        if self.best_metric < cur_metric:
            self.best_metric = cur_metric
            return True
        return False
models/XYScanNet.py
ADDED
|
@@ -0,0 +1,737 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numbers
|
| 2 |
+
import math
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from torch import Tensor
|
| 9 |
+
|
| 10 |
+
from einops import rearrange, repeat
|
| 11 |
+
|
| 12 |
+
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
|
| 16 |
+
except ImportError:
|
| 17 |
+
causal_conv1d_fn, causal_conv1d_update = None, None
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
|
| 21 |
+
except ImportError:
|
| 22 |
+
selective_state_update = None
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
|
| 26 |
+
except ImportError:
|
| 27 |
+
RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def to_3d(x):
|
| 31 |
+
return rearrange(x, 'b c h w -> b (h w) c')
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def to_4d(x, h, w):
|
| 35 |
+
return rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class BiasFree_LayerNorm(nn.Module):
|
| 39 |
+
def __init__(self, normalized_shape):
|
| 40 |
+
super(BiasFree_LayerNorm, self).__init__()
|
| 41 |
+
if isinstance(normalized_shape, numbers.Integral):
|
| 42 |
+
normalized_shape = (normalized_shape,)
|
| 43 |
+
normalized_shape = torch.Size(normalized_shape)
|
| 44 |
+
|
| 45 |
+
assert len(normalized_shape) == 1
|
| 46 |
+
|
| 47 |
+
self.weight = nn.Parameter(torch.ones(normalized_shape))
|
| 48 |
+
self.normalized_shape = normalized_shape
|
| 49 |
+
|
| 50 |
+
def forward(self, x):
|
| 51 |
+
sigma = x.var(-1, keepdim=True, unbiased=False)
|
| 52 |
+
return x / torch.sqrt(sigma + 1e-5) * self.weight
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class WithBias_LayerNorm(nn.Module):
|
| 56 |
+
def __init__(self, normalized_shape):
|
| 57 |
+
super(WithBias_LayerNorm, self).__init__()
|
| 58 |
+
if isinstance(normalized_shape, numbers.Integral):
|
| 59 |
+
normalized_shape = (normalized_shape,)
|
| 60 |
+
normalized_shape = torch.Size(normalized_shape)
|
| 61 |
+
|
| 62 |
+
assert len(normalized_shape) == 1
|
| 63 |
+
|
| 64 |
+
self.weight = nn.Parameter(torch.ones(normalized_shape))
|
| 65 |
+
self.bias = nn.Parameter(torch.zeros(normalized_shape))
|
| 66 |
+
self.normalized_shape = normalized_shape
|
| 67 |
+
|
| 68 |
+
def forward(self, x):
|
| 69 |
+
mu = x.mean(-1, keepdim=True)
|
| 70 |
+
sigma = x.var(-1, keepdim=True, unbiased=False)
|
| 71 |
+
return (x - mu) / torch.sqrt(sigma + 1e-5) * self.weight + self.bias
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class LayerNorm(nn.Module):
|
| 75 |
+
def __init__(self, dim, LayerNorm_type):
|
| 76 |
+
super(LayerNorm, self).__init__()
|
| 77 |
+
if LayerNorm_type == 'BiasFree':
|
| 78 |
+
self.body = BiasFree_LayerNorm(dim)
|
| 79 |
+
else:
|
| 80 |
+
self.body = WithBias_LayerNorm(dim)
|
| 81 |
+
|
| 82 |
+
def forward(self, x):
|
| 83 |
+
h, w = x.shape[-2:]
|
| 84 |
+
return to_4d(self.body(to_3d(x)), h, w)
|
| 85 |
+
|
| 86 |
+
##########################################################################
|
| 87 |
+
def conv(in_channels, out_channels, kernel_size, bias=False, stride = 1):
|
| 88 |
+
return nn.Conv2d(
|
| 89 |
+
in_channels, out_channels, kernel_size,
|
| 90 |
+
padding=(kernel_size//2), bias=bias, stride = stride)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
"""
|
| 94 |
+
Borrow from "https://github.com/state-spaces/mamba.git"
|
| 95 |
+
@article{mamba,
|
| 96 |
+
title={Mamba: Linear-Time Sequence Modeling with Selective State Spaces},
|
| 97 |
+
author={Gu, Albert and Dao, Tri},
|
| 98 |
+
journal={arXiv preprint arXiv:2312.00752},
|
| 99 |
+
year={2023}
|
| 100 |
+
}
|
| 101 |
+
"""
|
| 102 |
+
class Mamba(nn.Module):
|
| 103 |
+
def __init__(
|
| 104 |
+
self,
|
| 105 |
+
d_model,
|
| 106 |
+
d_state=16,
|
| 107 |
+
d_conv=4,
|
| 108 |
+
expand=2,
|
| 109 |
+
dt_rank="auto",
|
| 110 |
+
dt_min=0.001,
|
| 111 |
+
dt_max=0.1,
|
| 112 |
+
dt_init="random",
|
| 113 |
+
dt_scale=1.0,
|
| 114 |
+
dt_init_floor=1e-4,
|
| 115 |
+
conv_bias=True,
|
| 116 |
+
bias=False,
|
| 117 |
+
use_fast_path=True, # Fused kernel options
|
| 118 |
+
layer_idx=None,
|
| 119 |
+
device=None,
|
| 120 |
+
dtype=None,
|
| 121 |
+
):
|
| 122 |
+
factory_kwargs = {"device": device, "dtype": dtype}
|
| 123 |
+
super().__init__()
|
| 124 |
+
self.d_model = d_model
|
| 125 |
+
self.d_state = d_state
|
| 126 |
+
self.d_conv = d_conv
|
| 127 |
+
self.expand = expand
|
| 128 |
+
self.d_inner = int(self.expand * self.d_model)
|
| 129 |
+
self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank
|
| 130 |
+
self.use_fast_path = use_fast_path
|
| 131 |
+
self.layer_idx = layer_idx
|
| 132 |
+
|
| 133 |
+
self.in_proj = nn.Linear(self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs)
|
| 134 |
+
|
| 135 |
+
self.conv1d = nn.Conv1d(
|
| 136 |
+
in_channels=self.d_inner,
|
| 137 |
+
out_channels=self.d_inner,
|
| 138 |
+
bias=conv_bias,
|
| 139 |
+
kernel_size=d_conv,
|
| 140 |
+
groups=self.d_inner,
|
| 141 |
+
padding=d_conv - 1,
|
| 142 |
+
**factory_kwargs,
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
self.activation = "silu"
|
| 146 |
+
self.act = nn.SiLU()
|
| 147 |
+
|
| 148 |
+
self.x_proj = nn.Linear(
|
| 149 |
+
self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs
|
| 150 |
+
)
|
| 151 |
+
self.dt_proj = nn.Linear(self.dt_rank, self.d_inner, bias=True, **factory_kwargs)
|
| 152 |
+
|
| 153 |
+
# Initialize special dt projection to preserve variance at initialization
|
| 154 |
+
dt_init_std = self.dt_rank**-0.5 * dt_scale
|
| 155 |
+
if dt_init == "constant":
|
| 156 |
+
nn.init.constant_(self.dt_proj.weight, dt_init_std)
|
| 157 |
+
elif dt_init == "random":
|
| 158 |
+
nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
|
| 159 |
+
else:
|
| 160 |
+
raise NotImplementedError
|
| 161 |
+
|
| 162 |
+
# Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
|
| 163 |
+
dt = torch.exp(
|
| 164 |
+
torch.rand(self.d_inner, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
|
| 165 |
+
+ math.log(dt_min)
|
| 166 |
+
).clamp(min=dt_init_floor)
|
| 167 |
+
# Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
| 168 |
+
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
| 169 |
+
with torch.no_grad():
|
| 170 |
+
self.dt_proj.bias.copy_(inv_dt)
|
| 171 |
+
# Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit
|
| 172 |
+
self.dt_proj.bias._no_reinit = True
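# Sanity note on the initialization above: inv_dt = dt + log(-expm1(-dt))
# equals log(exp(dt) - 1), i.e. the inverse of softplus, so
# F.softplus(self.dt_proj.bias) reproduces the sampled dt values, which by
# construction lie in [dt_min, dt_max] (up to the dt_init_floor clamp).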
|
| 173 |
+
|
| 174 |
+
# S4D real initialization
|
| 175 |
+
A = repeat(
|
| 176 |
+
torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
|
| 177 |
+
"n -> d n",
|
| 178 |
+
d=self.d_inner,
|
| 179 |
+
).contiguous()
|
| 180 |
+
A_log = torch.log(A) # Keep A_log in fp32
|
| 181 |
+
self.A_log = nn.Parameter(A_log)
|
| 182 |
+
self.A_log._no_weight_decay = True
|
| 183 |
+
|
| 184 |
+
# D "skip" parameter
|
| 185 |
+
self.D = nn.Parameter(torch.ones(self.d_inner, device=device)) # Keep in fp32
|
| 186 |
+
self.D._no_weight_decay = True
|
| 187 |
+
|
| 188 |
+
self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
|
| 189 |
+
|
| 190 |
+
def forward(self, hidden_states, inference_params=None):
|
| 191 |
+
"""
|
| 192 |
+
hidden_states: (B, L, D)
|
| 193 |
+
Returns: same shape as hidden_states
|
| 194 |
+
"""
|
| 195 |
+
batch, seqlen, dim = hidden_states.shape
|
| 196 |
+
|
| 197 |
+
conv_state, ssm_state = None, None
|
| 198 |
+
if inference_params is not None:
|
| 199 |
+
conv_state, ssm_state = self._get_states_from_cache(inference_params, batch)
|
| 200 |
+
if inference_params.seqlen_offset > 0:
|
| 201 |
+
# The states are updated inplace
|
| 202 |
+
out, _, _ = self.step(hidden_states, conv_state, ssm_state)
|
| 203 |
+
return out
|
| 204 |
+
|
| 205 |
+
# We do matmul and transpose BLH -> HBL at the same time
|
| 206 |
+
xz = rearrange(
|
| 207 |
+
self.in_proj.weight @ rearrange(hidden_states, "b l d -> d (b l)"),
|
| 208 |
+
"d (b l) -> b d l",
|
| 209 |
+
l=seqlen,
|
| 210 |
+
)
|
| 211 |
+
if self.in_proj.bias is not None:
|
| 212 |
+
xz = xz + rearrange(self.in_proj.bias.to(dtype=xz.dtype), "d -> d 1")
|
| 213 |
+
|
| 214 |
+
A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
|
| 215 |
+
# In the backward pass we write dx and dz next to each other to avoid torch.cat
|
| 216 |
+
if self.use_fast_path and causal_conv1d_fn is not None and inference_params is None: # Doesn't support outputting the states
|
| 217 |
+
out = mamba_inner_fn(
|
| 218 |
+
xz,
|
| 219 |
+
self.conv1d.weight,
|
| 220 |
+
self.conv1d.bias,
|
| 221 |
+
self.x_proj.weight,
|
| 222 |
+
self.dt_proj.weight,
|
| 223 |
+
self.out_proj.weight,
|
| 224 |
+
self.out_proj.bias,
|
| 225 |
+
A,
|
| 226 |
+
None, # input-dependent B
|
| 227 |
+
None, # input-dependent C
|
| 228 |
+
self.D.float(),
|
| 229 |
+
delta_bias=self.dt_proj.bias.float(),
|
| 230 |
+
delta_softplus=True,
|
| 231 |
+
)
|
| 232 |
+
else:
|
| 233 |
+
x, z = xz.chunk(2, dim=1)
|
| 234 |
+
# Compute short convolution
|
| 235 |
+
if conv_state is not None:
|
| 236 |
+
# If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
|
| 237 |
+
# Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
|
| 238 |
+
conv_state.copy_(F.pad(x, (self.d_conv - x.shape[-1], 0))) # Update state (B D W)
|
| 239 |
+
if causal_conv1d_fn is None:
|
| 240 |
+
x = self.act(self.conv1d(x)[..., :seqlen])
|
| 241 |
+
else:
|
| 242 |
+
assert self.activation in ["silu", "swish"]
|
| 243 |
+
x = causal_conv1d_fn(
|
| 244 |
+
x=x,
|
| 245 |
+
weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
|
| 246 |
+
bias=self.conv1d.bias,
|
| 247 |
+
activation=self.activation,
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
+
# We're careful here about the layout, to avoid extra transposes.
|
| 251 |
+
# We want dt to have d as the slowest moving dimension
|
| 252 |
+
# and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
|
| 253 |
+
x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d")) # (bl d)
|
| 254 |
+
dt, B, C = torch.split(x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1)
|
| 255 |
+
dt = self.dt_proj.weight @ dt.t()
|
| 256 |
+
dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
|
| 257 |
+
B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
|
| 258 |
+
C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
|
| 259 |
+
assert self.activation in ["silu", "swish"]
|
| 260 |
+
y = selective_scan_fn(
|
| 261 |
+
x,
|
| 262 |
+
dt,
|
| 263 |
+
A,
|
| 264 |
+
B,
|
| 265 |
+
C,
|
| 266 |
+
self.D.float(),
|
| 267 |
+
z=z,
|
| 268 |
+
delta_bias=self.dt_proj.bias.float(),
|
| 269 |
+
delta_softplus=True,
|
| 270 |
+
return_last_state=ssm_state is not None,
|
| 271 |
+
)
|
| 272 |
+
if ssm_state is not None:
|
| 273 |
+
y, last_state = y
|
| 274 |
+
ssm_state.copy_(last_state)
|
| 275 |
+
y = rearrange(y, "b d l -> b l d")
|
| 276 |
+
out = self.out_proj(y)
|
| 277 |
+
return out
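# A minimal usage sketch for this block (hypothetical sizes; assumes a CUDA
# build of mamba_ssm, since selective_scan_fn runs on the GPU):
#   >>> m = Mamba(d_model=36).cuda()
#   >>> y = m(torch.randn(2, 1024, 36, device="cuda"))  # (B, L, D) -> (B, L, D)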
|
| 278 |
+
|
| 279 |
+
def step(self, hidden_states, conv_state, ssm_state):
|
| 280 |
+
dtype = hidden_states.dtype
|
| 281 |
+
assert hidden_states.shape[1] == 1, "Only support decoding with 1 token at a time for now"
|
| 282 |
+
xz = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
|
| 283 |
+
x, z = xz.chunk(2, dim=-1) # (B D)
|
| 284 |
+
|
| 285 |
+
# Conv step
|
| 286 |
+
if causal_conv1d_update is None:
|
| 287 |
+
conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
|
| 288 |
+
conv_state[:, :, -1] = x
|
| 289 |
+
x = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
|
| 290 |
+
if self.conv1d.bias is not None:
|
| 291 |
+
x = x + self.conv1d.bias
|
| 292 |
+
x = self.act(x).to(dtype=dtype)
|
| 293 |
+
else:
|
| 294 |
+
x = causal_conv1d_update(
|
| 295 |
+
x,
|
| 296 |
+
conv_state,
|
| 297 |
+
rearrange(self.conv1d.weight, "d 1 w -> d w"),
|
| 298 |
+
self.conv1d.bias,
|
| 299 |
+
self.activation,
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
x_db = self.x_proj(x) # (B dt_rank+2*d_state)
|
| 303 |
+
dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
|
| 304 |
+
# Don't add dt_bias here
|
| 305 |
+
dt = F.linear(dt, self.dt_proj.weight) # (B d_inner)
|
| 306 |
+
A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
|
| 307 |
+
|
| 308 |
+
# SSM step
|
| 309 |
+
if selective_state_update is None:
|
| 310 |
+
# Discretize A and B
|
| 311 |
+
dt = F.softplus(dt + self.dt_proj.bias.to(dtype=dt.dtype))
|
| 312 |
+
dA = torch.exp(torch.einsum("bd,dn->bdn", dt, A))
|
| 313 |
+
dB = torch.einsum("bd,bn->bdn", dt, B)
|
| 314 |
+
ssm_state.copy_(ssm_state * dA + rearrange(x, "b d -> b d 1") * dB)
|
| 315 |
+
y = torch.einsum("bdn,bn->bd", ssm_state.to(dtype), C)
|
| 316 |
+
y = y + self.D.to(dtype) * x
|
| 317 |
+
y = y * self.act(z) # (B D)
|
| 318 |
+
else:
|
| 319 |
+
y = selective_state_update(
|
| 320 |
+
ssm_state, x, dt, A, B, C, self.D, z=z, dt_bias=self.dt_proj.bias, dt_softplus=True
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
out = self.out_proj(y)
|
| 324 |
+
return out.unsqueeze(1), conv_state, ssm_state
|
| 325 |
+
|
| 326 |
+
def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
|
| 327 |
+
device = self.out_proj.weight.device
|
| 328 |
+
conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
|
| 329 |
+
conv_state = torch.zeros(
|
| 330 |
+
batch_size, self.d_model * self.expand, self.d_conv, device=device, dtype=conv_dtype
|
| 331 |
+
)
|
| 332 |
+
ssm_dtype = self.dt_proj.weight.dtype if dtype is None else dtype
|
| 333 |
+
# ssm_dtype = torch.float32
|
| 334 |
+
ssm_state = torch.zeros(
|
| 335 |
+
batch_size, self.d_model * self.expand, self.d_state, device=device, dtype=ssm_dtype
|
| 336 |
+
)
|
| 337 |
+
return conv_state, ssm_state
|
| 338 |
+
|
| 339 |
+
def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False):
|
| 340 |
+
assert self.layer_idx is not None
|
| 341 |
+
if self.layer_idx not in inference_params.key_value_memory_dict:
|
| 342 |
+
batch_shape = (batch_size,)
|
| 343 |
+
conv_state = torch.zeros(
|
| 344 |
+
batch_size,
|
| 345 |
+
self.d_model * self.expand,
|
| 346 |
+
self.d_conv,
|
| 347 |
+
device=self.conv1d.weight.device,
|
| 348 |
+
dtype=self.conv1d.weight.dtype,
|
| 349 |
+
)
|
| 350 |
+
ssm_state = torch.zeros(
|
| 351 |
+
batch_size,
|
| 352 |
+
self.d_model * self.expand,
|
| 353 |
+
self.d_state,
|
| 354 |
+
device=self.dt_proj.weight.device,
|
| 355 |
+
dtype=self.dt_proj.weight.dtype,
|
| 356 |
+
# dtype=torch.float32,
|
| 357 |
+
)
|
| 358 |
+
inference_params.key_value_memory_dict[self.layer_idx] = (conv_state, ssm_state)
|
| 359 |
+
else:
|
| 360 |
+
conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_idx]
|
| 361 |
+
# TODO: What if batch size changes between generation, and we reuse the same states?
|
| 362 |
+
if initialize_states:
|
| 363 |
+
conv_state.zero_()
|
| 364 |
+
ssm_state.zero_()
|
| 365 |
+
return conv_state, ssm_state
|
| 366 |
+
|
| 367 |
+
##########################################################################
|
| 368 |
+
## Feed-forward Network
|
| 369 |
+
class FFN(nn.Module):
|
| 370 |
+
def __init__(self, dim, ffn_expansion_factor, bias):
|
| 371 |
+
super(FFN, self).__init__()
|
| 372 |
+
|
| 373 |
+
hidden_features = int(dim*ffn_expansion_factor)
|
| 374 |
+
|
| 375 |
+
self.project_in = nn.Conv2d(dim, hidden_features*2, kernel_size=1, bias=bias)
|
| 376 |
+
|
| 377 |
+
self.dwconv = nn.Conv2d(hidden_features*2, hidden_features*2, kernel_size=3, stride=1, padding=1, groups=hidden_features*2, bias=bias, dilation=1)
|
| 378 |
+
|
| 379 |
+
self.win_size = 8
|
| 380 |
+
|
| 381 |
+
self.modulator = nn.Parameter(torch.ones(self.win_size, self.win_size, dim*2)) # modulator
|
| 382 |
+
|
| 383 |
+
self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1, bias=bias)
|
| 384 |
+
|
| 385 |
+
def forward(self, x):
|
| 386 |
+
b, c, h, w = x.shape
|
| 387 |
+
h1, w1 = h//self.win_size, w//self.win_size
|
| 388 |
+
x = self.project_in(x)
|
| 389 |
+
x = self.dwconv(x)
|
| 390 |
+
x_win = rearrange(x, 'b c (wsh h1) (wsw w1) -> b h1 w1 wsh wsw c', wsh=self.win_size, wsw=self.win_size)
|
| 391 |
+
x_win = x_win * self.modulator
|
| 392 |
+
x = rearrange(x_win, 'b h1 w1 wsh wsw c -> b c (wsh h1) (wsw w1)', wsh=self.win_size, wsw=self.win_size, h1=h1, w1=w1)
|
| 393 |
+
x1, x2 = x.chunk(2, dim=1)
|
| 394 |
+
x = x1 * x2
|
| 395 |
+
x = self.project_out(x)
|
| 396 |
+
return x
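# Note: the learnable window modulator above assumes H and W are divisible by
# self.win_size (8), and its last dim (dim*2) matches the hidden_features*2
# channels only when ffn_expansion_factor == 1. In this file, Strip_VSSB
# instantiates the GDFN variant below rather than this FFN.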
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
##########################################################################
|
| 400 |
+
## Gated Depth-wise Feed-forward Network (GDFN)
|
| 401 |
+
class GDFN(nn.Module):
|
| 402 |
+
def __init__(self, dim, ffn_expansion_factor, bias):
|
| 403 |
+
super(GDFN, self).__init__()
|
| 404 |
+
|
| 405 |
+
hidden_features = int(dim*ffn_expansion_factor)
|
| 406 |
+
|
| 407 |
+
self.project_in = nn.Conv2d(dim, hidden_features*2, kernel_size=1, bias=bias)
|
| 408 |
+
|
| 409 |
+
self.dwconv = nn.Conv2d(hidden_features*2, hidden_features*2, kernel_size=3, stride=1, padding=1, groups=hidden_features*2, bias=bias, dilation=1)
|
| 410 |
+
|
| 411 |
+
self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1, bias=bias)
|
| 412 |
+
|
| 413 |
+
def forward(self, x):
|
| 414 |
+
x = self.project_in(x)
|
| 415 |
+
x = self.dwconv(x)
|
| 416 |
+
x1, x2 = x.chunk(2, dim=1)
|
| 417 |
+
x = F.silu(x1) * x2
|
| 418 |
+
x = self.project_out(x)
|
| 419 |
+
return x
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
##########################################################################
|
| 423 |
+
## Overlapped image patch embedding with 3x3 Conv
|
| 424 |
+
class OverlapPatchEmbed(nn.Module):
|
| 425 |
+
def __init__(self, in_c=3, embed_dim=48, bias=False):
|
| 426 |
+
super(OverlapPatchEmbed, self).__init__()
|
| 427 |
+
|
| 428 |
+
self.proj = nn.Conv2d(in_c, embed_dim, kernel_size=3, stride=1, padding=1, bias=bias)
|
| 429 |
+
|
| 430 |
+
def forward(self, x):
|
| 431 |
+
x = self.proj(x)
|
| 432 |
+
|
| 433 |
+
return x
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
##########################################################################
|
| 437 |
+
## Resizing modules
|
| 438 |
+
class Downsample(nn.Module):
|
| 439 |
+
def __init__(self, n_feat):
|
| 440 |
+
super(Downsample, self).__init__()
|
| 441 |
+
|
| 442 |
+
self.body = nn.Sequential(nn.Upsample(scale_factor=0.5, mode='bilinear', align_corners=False),
|
| 443 |
+
nn.Conv2d(n_feat, n_feat * 2, 3, stride=1, padding=1, bias=False))
|
| 444 |
+
|
| 445 |
+
def forward(self, x):
|
| 446 |
+
return self.body(x)
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
class Upsample(nn.Module):
|
| 450 |
+
def __init__(self, n_feat):
|
| 451 |
+
super(Upsample, self).__init__()
|
| 452 |
+
|
| 453 |
+
self.body = nn.Sequential(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
|
| 454 |
+
nn.Conv2d(n_feat, n_feat // 2, 3, stride=1, padding=1, bias=False))
|
| 455 |
+
|
| 456 |
+
def forward(self, x):
|
| 457 |
+
return self.body(x)
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
"""
|
| 461 |
+
Borrow from "https://github.com/pp00704831/Stripformer-ECCV-2022-.git"
|
| 462 |
+
@inproceedings{Tsai2022Stripformer,
|
| 463 |
+
author = {Fu-Jen Tsai and Yan-Tsung Peng and Yen-Yu Lin and Chung-Chi Tsai and Chia-Wen Lin},
|
| 464 |
+
title = {Stripformer: Strip Transformer for Fast Image Deblurring},
|
| 465 |
+
booktitle = {ECCV},
|
| 466 |
+
year = {2022}
|
| 467 |
+
}
|
| 468 |
+
"""
|
| 469 |
+
class Intra_VSSM(nn.Module):
|
| 470 |
+
def __init__(self, dim, vssm_expansion_factor, bias): # gated = True
|
| 471 |
+
super(Intra_VSSM, self).__init__()
|
| 472 |
+
hidden = int(dim*vssm_expansion_factor)
|
| 473 |
+
|
| 474 |
+
self.proj_in = nn.Conv2d(dim, hidden*2, kernel_size=1, bias=bias)
|
| 475 |
+
self.dwconv = nn.Conv2d(hidden*2, hidden*2, kernel_size=3, stride=1, padding=1, groups=hidden*2, bias=bias)
|
| 476 |
+
self.proj_out = nn.Conv2d(hidden, dim, kernel_size=1, bias=bias)
|
| 477 |
+
|
| 478 |
+
self.conv_input = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
|
| 479 |
+
self.fuse_out = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
|
| 480 |
+
self.mamba = Mamba(d_model=hidden // 2)
|
| 481 |
+
|
| 482 |
+
def forward_core(self, x):
|
| 483 |
+
B, C, H, W = x.size()
|
| 484 |
+
|
| 485 |
+
x_input = torch.chunk(self.conv_input(x), 2, dim=1)
|
| 486 |
+
|
| 487 |
+
feature_h = (x_input[0]).permute(0, 2, 3, 1).contiguous()
|
| 488 |
+
feature_h = feature_h.view(B * H, W, C//2)
|
| 489 |
+
|
| 490 |
+
feature_v = (x_input[1]).permute(0, 3, 2, 1).contiguous()
|
| 491 |
+
feature_v = feature_v.view(B * W, H, C//2)
|
| 492 |
+
|
| 493 |
+
if H == W:
|
| 494 |
+
feature = torch.cat((feature_h, feature_v), dim=0) # B * H * 2, W, C//2
|
| 495 |
+
scan_output = self.mamba(feature)
|
| 496 |
+
scan_output = torch.chunk(scan_output, 2, dim=0)
|
| 497 |
+
scan_output_h = scan_output[0]
|
| 498 |
+
scan_output_v = scan_output[1]
|
| 499 |
+
else:
|
| 500 |
+
scan_output_h = self.mamba(feature_h)
|
| 501 |
+
scan_output_v = self.mamba(feature_v)
|
| 502 |
+
|
| 503 |
+
scan_output_h = scan_output_h.view(B, H, W, C//2).permute(0, 3, 1, 2).contiguous()
|
| 504 |
+
scan_output_v = scan_output_v.view(B, W, H, C//2).permute(0, 3, 2, 1).contiguous()
|
| 505 |
+
scan_output = self.fuse_out(torch.cat((scan_output_h, scan_output_v), dim=1))
|
| 506 |
+
|
| 507 |
+
return scan_output
|
| 508 |
+
|
| 509 |
+
def forward(self, x):
|
| 510 |
+
x = self.proj_in(x)
|
| 511 |
+
x, x_ = self.dwconv(x).chunk(2, dim=1)
|
| 512 |
+
x = self.forward_core(x)
|
| 513 |
+
x = F.silu(x_) * x
|
| 514 |
+
x = self.proj_out(x)
|
| 515 |
+
return x
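# How the intra-strip scan above works: one half of the features is flattened
# into horizontal strips (each row becomes a length-W sequence) and the other
# half into vertical strips (each column becomes a length-H sequence), and
# each strip is processed by a shared Mamba SSM. When H == W the two strip
# batches have identical shapes, so they are concatenated along the batch dim
# and scanned in a single Mamba call; otherwise two calls are made. The two
# directional outputs are fused back into a (B, C, H, W) map by a 1x1 conv.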
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
class Inter_VSSM(nn.Module):
|
| 519 |
+
def __init__(self, dim, vssm_expansion_factor, bias): # gated = True
|
| 520 |
+
super(Inter_VSSM, self).__init__()
|
| 521 |
+
hidden = int(dim*vssm_expansion_factor)
|
| 522 |
+
|
| 523 |
+
self.proj_in = nn.Conv2d(dim, hidden*2, kernel_size=1, bias=bias)
|
| 524 |
+
self.dwconv = nn.Conv2d(hidden*2, hidden*2, kernel_size=3, stride=1, padding=1, groups=hidden*2, bias=bias)
|
| 525 |
+
self.proj_out = nn.Conv2d(hidden, dim, kernel_size=1, bias=bias)
|
| 526 |
+
|
| 527 |
+
self.avg_pool = nn.AdaptiveAvgPool2d((None,1))
|
| 528 |
+
self.conv_input = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
|
| 529 |
+
self.fuse_out = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
|
| 530 |
+
self.mamba = Mamba(d_model=hidden // 2)
|
| 531 |
+
self.sigmoid = nn.Sigmoid()
|
| 532 |
+
|
| 533 |
+
def forward_core(self, x):
|
| 534 |
+
B, C, H, W = x.size()
|
| 535 |
+
|
| 536 |
+
x_input = torch.chunk(self.conv_input(x), 2, dim=1) # B, C, H, W
|
| 537 |
+
|
| 538 |
+
feature_h = x_input[0].permute(0, 2, 1, 3).contiguous() # B, H, C//2, W
|
| 539 |
+
feature_h_score = self.avg_pool(feature_h) # B, H, C//2, 1
|
| 540 |
+
feature_h_score = feature_h_score.view(B, H, -1)
|
| 541 |
+
|
| 542 |
+
feature_v = x_input[1].permute(0, 3, 1, 2).contiguous() # B, W, C//2, H
|
| 543 |
+
feature_v_score = self.avg_pool(feature_v) # B, W, C//2, 1
|
| 544 |
+
feature_v_score = feature_v_score.view(B, W, -1)
|
| 545 |
+
|
| 546 |
+
if H == W:
|
| 547 |
+
feature_score = torch.cat((feature_h_score, feature_v_score), dim=0) # B * 2, W or H, C//2
|
| 548 |
+
scan_score = self.mamba(feature_score)
|
| 549 |
+
scan_score = torch.chunk(scan_score, 2, dim=0)
|
| 550 |
+
scan_score_h = scan_score[0]
|
| 551 |
+
scan_score_v = scan_score[1]
|
| 552 |
+
else:
|
| 553 |
+
scan_score_h = self.mamba(feature_h_score)
|
| 554 |
+
scan_score_v = self.mamba(feature_v_score)
|
| 555 |
+
|
| 556 |
+
scan_score_h = self.sigmoid(scan_score_h)
|
| 557 |
+
scan_score_v = self.sigmoid(scan_score_v)
|
| 558 |
+
feature_h = feature_h*scan_score_h[:,:,:,None]
|
| 559 |
+
feature_v = feature_v*scan_score_v[:,:,:,None]
|
| 560 |
+
feature_h = feature_h.view(B, H, C//2, W).permute(0, 2, 1, 3).contiguous()
|
| 561 |
+
feature_v = feature_v.view(B, W, C//2, H).permute(0, 2, 3, 1).contiguous()
|
| 562 |
+
output = self.fuse_out(torch.cat((feature_h, feature_v), dim=1))
|
| 563 |
+
|
| 564 |
+
return output
|
| 565 |
+
|
| 566 |
+
def forward(self, x):
|
| 567 |
+
x = self.proj_in(x)
|
| 568 |
+
x, x_ = self.dwconv(x).chunk(2, dim=1)
|
| 569 |
+
x = self.forward_core(x)
|
| 570 |
+
x = F.silu(x_) * x
|
| 571 |
+
x = self.proj_out(x)
|
| 572 |
+
return x
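# How the inter-strip modulation above works: each horizontal strip (row) and
# vertical strip (column) is average-pooled along its length into a channel
# descriptor, the sequence of strip descriptors is scanned by a shared Mamba
# SSM, and a sigmoid turns the result into per-strip channel gates that
# re-weight the original strips before the 1x1 fusion conv. This models
# dependencies *across* strips, complementing the within-strip Intra_VSSM scan.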
|
| 573 |
+
|
| 574 |
+
|
| 575 |
+
##########################################################################
|
| 576 |
+
class Strip_VSSB(nn.Module):
|
| 577 |
+
def __init__(self, dim, vssm_expansion_factor, ffn_expansion_factor, bias=False, ssm=False, LayerNorm_type='WithBias'):
|
| 578 |
+
super(Strip_VSSB, self).__init__()
|
| 579 |
+
self.ssm = ssm
|
| 580 |
+
if self.ssm:
|
| 581 |
+
self.norm1_ssm = LayerNorm(dim, LayerNorm_type)
|
| 582 |
+
self.norm2_ssm = LayerNorm(dim, LayerNorm_type)
|
| 583 |
+
self.intra = Intra_VSSM(dim, vssm_expansion_factor, bias)
|
| 584 |
+
self.inter = Inter_VSSM(dim, vssm_expansion_factor, bias)
|
| 585 |
+
self.norm1_ffn = LayerNorm(dim, LayerNorm_type)
|
| 586 |
+
self.norm2_ffn = LayerNorm(dim, LayerNorm_type)
|
| 587 |
+
self.ffn1 = GDFN(dim, ffn_expansion_factor, bias)
|
| 588 |
+
self.ffn2 = GDFN(dim, ffn_expansion_factor, bias)
|
| 589 |
+
|
| 590 |
+
def forward(self, x):
|
| 591 |
+
if self.ssm:
|
| 592 |
+
x = x + self.intra(self.norm1_ssm(x))
|
| 593 |
+
x = x + self.ffn1(self.norm1_ffn(x))
|
| 594 |
+
if self.ssm:
|
| 595 |
+
x = x + self.inter(self.norm2_ssm(x))
|
| 596 |
+
x = x + self.ffn2(self.norm2_ffn(x))
|
| 597 |
+
|
| 598 |
+
return x
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
##########################################################################
|
| 602 |
+
##---------- Cross-level Feature Fusion by Adding Sigmoid(KL-Div) * Multi-Scale Feat -----------------------
|
| 603 |
+
class CLFF(nn.Module):
|
| 604 |
+
def __init__(self, dim, dim_n1, dim_n2, bias=False):
|
| 605 |
+
super(CLFF, self).__init__()
|
| 606 |
+
|
| 607 |
+
self.conv = nn.Conv2d(dim, dim, kernel_size=1, bias=bias)
|
| 608 |
+
self.conv_n1 = nn.Conv2d(dim_n1, dim, kernel_size=1, bias=bias)
|
| 609 |
+
self.conv_n2 = nn.Conv2d(dim_n2, dim, kernel_size=1, bias=bias)
|
| 610 |
+
self.fuse_out1 = nn.Conv2d(dim*2, dim, kernel_size=1, bias=bias)
|
| 611 |
+
|
| 612 |
+
self.log_sigmoid = nn.LogSigmoid()
|
| 613 |
+
self.sigmoid = nn.Sigmoid()
|
| 614 |
+
|
| 615 |
+
def forward(self, x, n1, n2):
|
| 616 |
+
x_ = self.conv(x)
|
| 617 |
+
n1_ = self.conv_n1(n1)
|
| 618 |
+
n2_ = self.conv_n2(n2)
|
| 619 |
+
kl_n1 = F.kl_div(input=self.log_sigmoid(n1_), target=self.log_sigmoid(x_), log_target=True)
|
| 620 |
+
kl_n2 = F.kl_div(input=self.log_sigmoid(n2_), target=self.log_sigmoid(x_), log_target=True)
|
| 621 |
+
#g = self.sigmoid(x_)
|
| 622 |
+
g1 = self.sigmoid(kl_n1)
|
| 623 |
+
g2 = self.sigmoid(kl_n2)
|
| 624 |
+
#x = (1 + g) * x_ + (1 - g) * (g1 * n1_ + g2 * n2_)
|
| 625 |
+
x = self.fuse_out1(torch.cat((x_, g1 * n1_ + g2 * n2_), dim=1))
|
| 626 |
+
|
| 627 |
+
return x
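# Note: F.kl_div is called with its default reduction ('mean'), so kl_n1 and
# kl_n2 are scalars; g1 and g2 therefore act as single global gates on the
# rescaled neighbour features n1_/n2_ before the 1x1 fusion conv, rather than
# as per-pixel weights.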
|
| 628 |
+
|
| 629 |
+
##########################################################################
|
| 630 |
+
##---------- XYScanNet -----------------------
|
| 631 |
+
class XYScanNet(nn.Module):
|
| 632 |
+
def __init__(self,
|
| 633 |
+
inp_channels=3,
|
| 634 |
+
out_channels=3,
|
| 635 |
+
dim = 72, # 48, 72, 96, 120, 144
|
| 636 |
+
num_blocks = [3,3,6],
|
| 637 |
+
vssm_expansion_factor = 1, # 1 or 2
|
| 638 |
+
ffn_expansion_factor = 1, # 1 or 3
|
| 639 |
+
bias = False,
|
| 640 |
+
LayerNorm_type = 'WithBias', ## Other option 'BiasFree'
|
| 641 |
+
):
|
| 642 |
+
|
| 643 |
+
super(XYScanNet, self).__init__()
|
| 644 |
+
|
| 645 |
+
self.patch_embed = OverlapPatchEmbed(inp_channels, dim)
|
| 646 |
+
|
| 647 |
+
self.encoder_level1 = nn.Sequential(*[Strip_VSSB(dim=dim, vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 648 |
+
bias=bias, ssm=False, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[0])])
|
| 649 |
+
|
| 650 |
+
self.down1_2 = Downsample(dim) ## From Level 1 to Level 2
|
| 651 |
+
self.encoder_level2 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**1), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 652 |
+
bias=bias, ssm=False, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[1])])
|
| 653 |
+
|
| 654 |
+
self.down2_3 = Downsample(int(dim*2**1)) ## From Level 2 to Level 3
|
| 655 |
+
self.encoder_level3 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**2), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 656 |
+
bias=bias, ssm=False, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[2])])
|
| 657 |
+
|
| 658 |
+
self.decoder_level3 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**2), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 659 |
+
bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[2])])
|
| 660 |
+
|
| 661 |
+
self.up3_2 = Upsample(int(dim*2**2)) ## From Level 3 to Level 2
|
| 662 |
+
self.clff_level2 = CLFF(int(dim*2**1), dim_n1=int(dim*2**0), dim_n2=(dim*2**2), bias=bias)
|
| 663 |
+
self.reduce_chan_level2 = nn.Conv2d(int(dim*2**2), int(dim*2**1), kernel_size=1, bias=bias)
|
| 664 |
+
self.decoder_level2 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**1), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 665 |
+
bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[1])])
|
| 666 |
+
|
| 667 |
+
self.up2_1 = Upsample(int(dim*2**1)) ## From Level 2 to Level 1
|
| 668 |
+
self.clff_level1 = CLFF(int(dim*2**0), dim_n1=int(dim*2**1), dim_n2=(dim*2**2), bias=bias)
|
| 669 |
+
self.reduce_chan_level1 = nn.Conv2d(int(dim*2**1), int(dim*2**0), kernel_size=1, bias=bias)
|
| 670 |
+
self.decoder_level1 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**0), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 671 |
+
bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[0])])
|
| 672 |
+
|
| 673 |
+
# self.refinement = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**0), expansion_factor=expansion_factor, bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_refinement_blocks)])
|
| 674 |
+
|
| 675 |
+
self.output = nn.Conv2d(int(dim*2**0), out_channels, kernel_size=3, stride=1, padding=1, bias=bias)
|
| 676 |
+
|
| 677 |
+
def forward(self, inp_img):
|
| 678 |
+
|
| 679 |
+
# Encoder
|
| 680 |
+
inp_enc_level1 = self.patch_embed(inp_img)
|
| 681 |
+
out_enc_level1 = self.encoder_level1(inp_enc_level1)
|
| 682 |
+
out_enc_level1_2 = F.interpolate(out_enc_level1, scale_factor=0.5) # dim channels, lvl1 feature down-scaled to lvl2 resolution
|
| 683 |
+
|
| 684 |
+
inp_enc_level2 = self.down1_2(out_enc_level1)
|
| 685 |
+
out_enc_level2 = self.encoder_level2(inp_enc_level2)
|
| 686 |
+
out_enc_level2_1 = F.interpolate(out_enc_level2, scale_factor=2) # dim*2, lvl2 up-scaled to lvl1
|
| 687 |
+
|
| 688 |
+
inp_enc_level3 = self.down2_3(out_enc_level2)
|
| 689 |
+
out_enc_level3 = self.encoder_level3(inp_enc_level3)
|
| 690 |
+
out_enc_level3_2 = F.interpolate(out_enc_level3, scale_factor=2) # dim*2**2, lvl3 up-scaled to lvl2 (lvl3->lvl2)
|
| 691 |
+
out_enc_level3_1 = F.interpolate(out_enc_level3_2, scale_factor=2) # dim*2**2, lvl3 up-scaled to lvl1 (lvl3->lvl2->lvl1)
|
| 692 |
+
|
| 693 |
+
out_enc_level1 = self.clff_level1(out_enc_level1, out_enc_level2_1, out_enc_level3_1)
|
| 694 |
+
out_enc_level2 = self.clff_level2(out_enc_level2, out_enc_level1_2, out_enc_level3_2)
|
| 695 |
+
|
| 696 |
+
# Decoder
|
| 697 |
+
out_dec_level3_decomp1 = self.decoder_level3(out_enc_level3)
|
| 698 |
+
|
| 699 |
+
inp_dec_level2_decomp1 = self.up3_2(out_dec_level3_decomp1)
|
| 700 |
+
inp_dec_level2_decomp1 = self.reduce_chan_level2(torch.cat((inp_dec_level2_decomp1, out_enc_level2), dim=1))
|
| 701 |
+
out_dec_level2_decomp1 = self.decoder_level2(inp_dec_level2_decomp1)
|
| 702 |
+
|
| 703 |
+
inp_dec_level1_decomp1 = self.up2_1(out_dec_level2_decomp1)
|
| 704 |
+
inp_dec_level1_decomp1 = self.reduce_chan_level1(torch.cat((inp_dec_level1_decomp1, out_enc_level1), dim=1))
|
| 705 |
+
out_dec_level1_decomp1 = self.decoder_level1(inp_dec_level1_decomp1)
|
| 706 |
+
|
| 707 |
+
out_dec_level1_decomp1 = self.output(out_dec_level1_decomp1)
|
| 708 |
+
|
| 709 |
+
out_dec_level1 = out_dec_level1_decomp1 + inp_img
|
| 710 |
+
|
| 711 |
+
|
| 712 |
+
return out_dec_level1, out_dec_level1_decomp1, None
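# A minimal inference sketch (an assumption, not part of the training code):
# the two Downsample/Upsample stages mean H and W should be divisible by 4
# for the skip connections to line up, so inputs are typically reflect-padded
# to a safe multiple (e.g. 8) and cropped back afterwards:
#   >>> _, _, h, w = blur.shape
#   >>> ph, pw = (8 - h % 8) % 8, (8 - w % 8) % 8
#   >>> out, _, _ = model(F.pad(blur, (0, pw, 0, ph), mode='reflect'))
#   >>> sharp = out[..., :h, :w]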
|
| 713 |
+
|
| 714 |
+
def count_parameters(model):
|
| 715 |
+
total = sum(p.numel() for p in model.parameters())
|
| 716 |
+
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 717 |
+
print(f"Total parameters: {total:,}")
|
| 718 |
+
print(f"Trainable parameters: {trainable:,}")
|
| 719 |
+
|
| 720 |
+
|
| 721 |
+
def main():
|
| 722 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 723 |
+
model = XYScanNet().to(device)
|
| 724 |
+
|
| 725 |
+
print("Model architecture:\n")
|
| 726 |
+
print(model)
|
| 727 |
+
|
| 728 |
+
count_parameters(model)
|
| 729 |
+
|
| 730 |
+
# Optionally test with a dummy input
|
| 731 |
+
dummy_input = torch.randn(1, 3, 256, 256).to(device)
|
| 732 |
+
output, _, _ = model(dummy_input)
|
| 733 |
+
print(f"Output shape: {output.shape}")
|
| 734 |
+
|
| 735 |
+
|
| 736 |
+
if __name__ == "__main__":
|
| 737 |
+
main()
|
models/XYScanNetP.py
ADDED
|
@@ -0,0 +1,737 @@
|
| 1 |
+
import numbers
|
| 2 |
+
import math
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from torch import Tensor
|
| 9 |
+
|
| 10 |
+
from einops import rearrange, repeat
|
| 11 |
+
|
| 12 |
+
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
|
| 16 |
+
except ImportError:
|
| 17 |
+
causal_conv1d_fn, causal_conv1d_update = None, None
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
|
| 21 |
+
except ImportError:
|
| 22 |
+
selective_state_update = None
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
|
| 26 |
+
except ImportError:
|
| 27 |
+
RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def to_3d(x):
|
| 31 |
+
return rearrange(x, 'b c h w -> b (h w) c')
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def to_4d(x, h, w):
|
| 35 |
+
return rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class BiasFree_LayerNorm(nn.Module):
|
| 39 |
+
def __init__(self, normalized_shape):
|
| 40 |
+
super(BiasFree_LayerNorm, self).__init__()
|
| 41 |
+
if isinstance(normalized_shape, numbers.Integral):
|
| 42 |
+
normalized_shape = (normalized_shape,)
|
| 43 |
+
normalized_shape = torch.Size(normalized_shape)
|
| 44 |
+
|
| 45 |
+
assert len(normalized_shape) == 1
|
| 46 |
+
|
| 47 |
+
self.weight = nn.Parameter(torch.ones(normalized_shape))
|
| 48 |
+
self.normalized_shape = normalized_shape
|
| 49 |
+
|
| 50 |
+
def forward(self, x):
|
| 51 |
+
sigma = x.var(-1, keepdim=True, unbiased=False)
|
| 52 |
+
return x / torch.sqrt(sigma + 1e-5) * self.weight
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class WithBias_LayerNorm(nn.Module):
|
| 56 |
+
def __init__(self, normalized_shape):
|
| 57 |
+
super(WithBias_LayerNorm, self).__init__()
|
| 58 |
+
if isinstance(normalized_shape, numbers.Integral):
|
| 59 |
+
normalized_shape = (normalized_shape,)
|
| 60 |
+
normalized_shape = torch.Size(normalized_shape)
|
| 61 |
+
|
| 62 |
+
assert len(normalized_shape) == 1
|
| 63 |
+
|
| 64 |
+
self.weight = nn.Parameter(torch.ones(normalized_shape))
|
| 65 |
+
self.bias = nn.Parameter(torch.zeros(normalized_shape))
|
| 66 |
+
self.normalized_shape = normalized_shape
|
| 67 |
+
|
| 68 |
+
def forward(self, x):
|
| 69 |
+
mu = x.mean(-1, keepdim=True)
|
| 70 |
+
sigma = x.var(-1, keepdim=True, unbiased=False)
|
| 71 |
+
return (x - mu) / torch.sqrt(sigma + 1e-5) * self.weight + self.bias
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class LayerNorm(nn.Module):
|
| 75 |
+
def __init__(self, dim, LayerNorm_type):
|
| 76 |
+
super(LayerNorm, self).__init__()
|
| 77 |
+
if LayerNorm_type == 'BiasFree':
|
| 78 |
+
self.body = BiasFree_LayerNorm(dim)
|
| 79 |
+
else:
|
| 80 |
+
self.body = WithBias_LayerNorm(dim)
|
| 81 |
+
|
| 82 |
+
def forward(self, x):
|
| 83 |
+
h, w = x.shape[-2:]
|
| 84 |
+
return to_4d(self.body(to_3d(x)), h, w)
|
| 85 |
+
|
| 86 |
+
##########################################################################
|
| 87 |
+
def conv(in_channels, out_channels, kernel_size, bias=False, stride = 1):
|
| 88 |
+
return nn.Conv2d(
|
| 89 |
+
in_channels, out_channels, kernel_size,
|
| 90 |
+
padding=(kernel_size//2), bias=bias, stride = stride)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
"""
|
| 94 |
+
Borrow from "https://github.com/state-spaces/mamba.git"
|
| 95 |
+
@article{mamba,
|
| 96 |
+
title={Mamba: Linear-Time Sequence Modeling with Selective State Spaces},
|
| 97 |
+
author={Gu, Albert and Dao, Tri},
|
| 98 |
+
journal={arXiv preprint arXiv:2312.00752},
|
| 99 |
+
year={2023}
|
| 100 |
+
}
|
| 101 |
+
"""
|
| 102 |
+
class Mamba(nn.Module):
|
| 103 |
+
def __init__(
|
| 104 |
+
self,
|
| 105 |
+
d_model,
|
| 106 |
+
d_state=16,
|
| 107 |
+
d_conv=4,
|
| 108 |
+
expand=2,
|
| 109 |
+
dt_rank="auto",
|
| 110 |
+
dt_min=0.001,
|
| 111 |
+
dt_max=0.1,
|
| 112 |
+
dt_init="random",
|
| 113 |
+
dt_scale=1.0,
|
| 114 |
+
dt_init_floor=1e-4,
|
| 115 |
+
conv_bias=True,
|
| 116 |
+
bias=False,
|
| 117 |
+
use_fast_path=True, # Fused kernel options
|
| 118 |
+
layer_idx=None,
|
| 119 |
+
device=None,
|
| 120 |
+
dtype=None,
|
| 121 |
+
):
|
| 122 |
+
factory_kwargs = {"device": device, "dtype": dtype}
|
| 123 |
+
super().__init__()
|
| 124 |
+
self.d_model = d_model
|
| 125 |
+
self.d_state = d_state
|
| 126 |
+
self.d_conv = d_conv
|
| 127 |
+
self.expand = expand
|
| 128 |
+
self.d_inner = int(self.expand * self.d_model)
|
| 129 |
+
self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank
|
| 130 |
+
self.use_fast_path = use_fast_path
|
| 131 |
+
self.layer_idx = layer_idx
|
| 132 |
+
|
| 133 |
+
self.in_proj = nn.Linear(self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs)
|
| 134 |
+
|
| 135 |
+
self.conv1d = nn.Conv1d(
|
| 136 |
+
in_channels=self.d_inner,
|
| 137 |
+
out_channels=self.d_inner,
|
| 138 |
+
bias=conv_bias,
|
| 139 |
+
kernel_size=d_conv,
|
| 140 |
+
groups=self.d_inner,
|
| 141 |
+
padding=d_conv - 1,
|
| 142 |
+
**factory_kwargs,
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
self.activation = "silu"
|
| 146 |
+
self.act = nn.SiLU()
|
| 147 |
+
|
| 148 |
+
self.x_proj = nn.Linear(
|
| 149 |
+
self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs
|
| 150 |
+
)
|
| 151 |
+
self.dt_proj = nn.Linear(self.dt_rank, self.d_inner, bias=True, **factory_kwargs)
|
| 152 |
+
|
| 153 |
+
# Initialize special dt projection to preserve variance at initialization
|
| 154 |
+
dt_init_std = self.dt_rank**-0.5 * dt_scale
|
| 155 |
+
if dt_init == "constant":
|
| 156 |
+
nn.init.constant_(self.dt_proj.weight, dt_init_std)
|
| 157 |
+
elif dt_init == "random":
|
| 158 |
+
nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
|
| 159 |
+
else:
|
| 160 |
+
raise NotImplementedError
|
| 161 |
+
|
| 162 |
+
# Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
|
| 163 |
+
dt = torch.exp(
|
| 164 |
+
torch.rand(self.d_inner, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
|
| 165 |
+
+ math.log(dt_min)
|
| 166 |
+
).clamp(min=dt_init_floor)
|
| 167 |
+
# Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
| 168 |
+
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
| 169 |
+
with torch.no_grad():
|
| 170 |
+
self.dt_proj.bias.copy_(inv_dt)
|
| 171 |
+
# Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit
|
| 172 |
+
self.dt_proj.bias._no_reinit = True
|
| 173 |
+
|
| 174 |
+
# S4D real initialization
|
| 175 |
+
A = repeat(
|
| 176 |
+
torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
|
| 177 |
+
"n -> d n",
|
| 178 |
+
d=self.d_inner,
|
| 179 |
+
).contiguous()
|
| 180 |
+
A_log = torch.log(A) # Keep A_log in fp32
|
| 181 |
+
self.A_log = nn.Parameter(A_log)
|
| 182 |
+
self.A_log._no_weight_decay = True
|
| 183 |
+
|
| 184 |
+
# D "skip" parameter
|
| 185 |
+
self.D = nn.Parameter(torch.ones(self.d_inner, device=device)) # Keep in fp32
|
| 186 |
+
self.D._no_weight_decay = True
|
| 187 |
+
|
| 188 |
+
self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
|
| 189 |
+
|
| 190 |
+
def forward(self, hidden_states, inference_params=None):
|
| 191 |
+
"""
|
| 192 |
+
hidden_states: (B, L, D)
|
| 193 |
+
Returns: same shape as hidden_states
|
| 194 |
+
"""
|
| 195 |
+
batch, seqlen, dim = hidden_states.shape
|
| 196 |
+
|
| 197 |
+
conv_state, ssm_state = None, None
|
| 198 |
+
if inference_params is not None:
|
| 199 |
+
conv_state, ssm_state = self._get_states_from_cache(inference_params, batch)
|
| 200 |
+
if inference_params.seqlen_offset > 0:
|
| 201 |
+
# The states are updated inplace
|
| 202 |
+
out, _, _ = self.step(hidden_states, conv_state, ssm_state)
|
| 203 |
+
return out
|
| 204 |
+
|
| 205 |
+
# We do matmul and transpose BLH -> HBL at the same time
|
| 206 |
+
xz = rearrange(
|
| 207 |
+
self.in_proj.weight @ rearrange(hidden_states, "b l d -> d (b l)"),
|
| 208 |
+
"d (b l) -> b d l",
|
| 209 |
+
l=seqlen,
|
| 210 |
+
)
|
| 211 |
+
if self.in_proj.bias is not None:
|
| 212 |
+
xz = xz + rearrange(self.in_proj.bias.to(dtype=xz.dtype), "d -> d 1")
|
| 213 |
+
|
| 214 |
+
A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
|
| 215 |
+
# In the backward pass we write dx and dz next to each other to avoid torch.cat
|
| 216 |
+
if self.use_fast_path and causal_conv1d_fn is not None and inference_params is None: # Doesn't support outputting the states
|
| 217 |
+
out = mamba_inner_fn(
|
| 218 |
+
xz,
|
| 219 |
+
self.conv1d.weight,
|
| 220 |
+
self.conv1d.bias,
|
| 221 |
+
self.x_proj.weight,
|
| 222 |
+
self.dt_proj.weight,
|
| 223 |
+
self.out_proj.weight,
|
| 224 |
+
self.out_proj.bias,
|
| 225 |
+
A,
|
| 226 |
+
None, # input-dependent B
|
| 227 |
+
None, # input-dependent C
|
| 228 |
+
self.D.float(),
|
| 229 |
+
delta_bias=self.dt_proj.bias.float(),
|
| 230 |
+
delta_softplus=True,
|
| 231 |
+
)
|
| 232 |
+
else:
|
| 233 |
+
x, z = xz.chunk(2, dim=1)
|
| 234 |
+
# Compute short convolution
|
| 235 |
+
if conv_state is not None:
|
| 236 |
+
# If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
|
| 237 |
+
# Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
|
| 238 |
+
conv_state.copy_(F.pad(x, (self.d_conv - x.shape[-1], 0))) # Update state (B D W)
|
| 239 |
+
if causal_conv1d_fn is None:
|
| 240 |
+
x = self.act(self.conv1d(x)[..., :seqlen])
|
| 241 |
+
else:
|
| 242 |
+
assert self.activation in ["silu", "swish"]
|
| 243 |
+
x = causal_conv1d_fn(
|
| 244 |
+
x=x,
|
| 245 |
+
weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
|
| 246 |
+
bias=self.conv1d.bias,
|
| 247 |
+
activation=self.activation,
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
+
# We're careful here about the layout, to avoid extra transposes.
|
| 251 |
+
# We want dt to have d as the slowest moving dimension
|
| 252 |
+
# and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
|
| 253 |
+
x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d")) # (bl d)
|
| 254 |
+
dt, B, C = torch.split(x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1)
|
| 255 |
+
dt = self.dt_proj.weight @ dt.t()
|
| 256 |
+
dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
|
| 257 |
+
B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
|
| 258 |
+
C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
|
| 259 |
+
assert self.activation in ["silu", "swish"]
|
| 260 |
+
y = selective_scan_fn(
|
| 261 |
+
x,
|
| 262 |
+
dt,
|
| 263 |
+
A,
|
| 264 |
+
B,
|
| 265 |
+
C,
|
| 266 |
+
self.D.float(),
|
| 267 |
+
z=z,
|
| 268 |
+
delta_bias=self.dt_proj.bias.float(),
|
| 269 |
+
delta_softplus=True,
|
| 270 |
+
return_last_state=ssm_state is not None,
|
| 271 |
+
)
|
| 272 |
+
if ssm_state is not None:
|
| 273 |
+
y, last_state = y
|
| 274 |
+
ssm_state.copy_(last_state)
|
| 275 |
+
y = rearrange(y, "b d l -> b l d")
|
| 276 |
+
out = self.out_proj(y)
|
| 277 |
+
return out
|
| 278 |
+
|
| 279 |
+
def step(self, hidden_states, conv_state, ssm_state):
|
| 280 |
+
dtype = hidden_states.dtype
|
| 281 |
+
assert hidden_states.shape[1] == 1, "Only support decoding with 1 token at a time for now"
|
| 282 |
+
xz = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
|
| 283 |
+
x, z = xz.chunk(2, dim=-1) # (B D)
|
| 284 |
+
|
| 285 |
+
# Conv step
|
| 286 |
+
if causal_conv1d_update is None:
|
| 287 |
+
conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
|
| 288 |
+
conv_state[:, :, -1] = x
|
| 289 |
+
x = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
|
| 290 |
+
if self.conv1d.bias is not None:
|
| 291 |
+
x = x + self.conv1d.bias
|
| 292 |
+
x = self.act(x).to(dtype=dtype)
|
| 293 |
+
else:
|
| 294 |
+
x = causal_conv1d_update(
|
| 295 |
+
x,
|
| 296 |
+
conv_state,
|
| 297 |
+
rearrange(self.conv1d.weight, "d 1 w -> d w"),
|
| 298 |
+
self.conv1d.bias,
|
| 299 |
+
self.activation,
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
x_db = self.x_proj(x) # (B dt_rank+2*d_state)
|
| 303 |
+
dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
|
| 304 |
+
# Don't add dt_bias here
|
| 305 |
+
dt = F.linear(dt, self.dt_proj.weight) # (B d_inner)
|
| 306 |
+
A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
|
| 307 |
+
|
| 308 |
+
# SSM step
|
| 309 |
+
if selective_state_update is None:
|
| 310 |
+
# Discretize A and B
|
| 311 |
+
dt = F.softplus(dt + self.dt_proj.bias.to(dtype=dt.dtype))
|
| 312 |
+
dA = torch.exp(torch.einsum("bd,dn->bdn", dt, A))
|
| 313 |
+
dB = torch.einsum("bd,bn->bdn", dt, B)
|
| 314 |
+
ssm_state.copy_(ssm_state * dA + rearrange(x, "b d -> b d 1") * dB)
|
| 315 |
+
y = torch.einsum("bdn,bn->bd", ssm_state.to(dtype), C)
|
| 316 |
+
y = y + self.D.to(dtype) * x
|
| 317 |
+
y = y * self.act(z) # (B D)
|
| 318 |
+
else:
|
| 319 |
+
y = selective_state_update(
|
| 320 |
+
ssm_state, x, dt, A, B, C, self.D, z=z, dt_bias=self.dt_proj.bias, dt_softplus=True
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
out = self.out_proj(y)
|
| 324 |
+
return out.unsqueeze(1), conv_state, ssm_state
|
| 325 |
+
|
| 326 |
+
def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
|
| 327 |
+
device = self.out_proj.weight.device
|
| 328 |
+
conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
|
| 329 |
+
conv_state = torch.zeros(
|
| 330 |
+
batch_size, self.d_model * self.expand, self.d_conv, device=device, dtype=conv_dtype
|
| 331 |
+
)
|
| 332 |
+
ssm_dtype = self.dt_proj.weight.dtype if dtype is None else dtype
|
| 333 |
+
# ssm_dtype = torch.float32
|
| 334 |
+
ssm_state = torch.zeros(
|
| 335 |
+
batch_size, self.d_model * self.expand, self.d_state, device=device, dtype=ssm_dtype
|
| 336 |
+
)
|
| 337 |
+
return conv_state, ssm_state
|
| 338 |
+
|
| 339 |
+
def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False):
|
| 340 |
+
assert self.layer_idx is not None
|
| 341 |
+
if self.layer_idx not in inference_params.key_value_memory_dict:
|
| 342 |
+
batch_shape = (batch_size,)
|
| 343 |
+
conv_state = torch.zeros(
|
| 344 |
+
batch_size,
|
| 345 |
+
self.d_model * self.expand,
|
| 346 |
+
self.d_conv,
|
| 347 |
+
device=self.conv1d.weight.device,
|
| 348 |
+
dtype=self.conv1d.weight.dtype,
|
| 349 |
+
)
|
| 350 |
+
ssm_state = torch.zeros(
|
| 351 |
+
batch_size,
|
| 352 |
+
self.d_model * self.expand,
|
| 353 |
+
self.d_state,
|
| 354 |
+
device=self.dt_proj.weight.device,
|
| 355 |
+
dtype=self.dt_proj.weight.dtype,
|
| 356 |
+
# dtype=torch.float32,
|
| 357 |
+
)
|
| 358 |
+
inference_params.key_value_memory_dict[self.layer_idx] = (conv_state, ssm_state)
|
| 359 |
+
else:
|
| 360 |
+
conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_idx]
|
| 361 |
+
# TODO: What if batch size changes between generation, and we reuse the same states?
|
| 362 |
+
if initialize_states:
|
| 363 |
+
conv_state.zero_()
|
| 364 |
+
ssm_state.zero_()
|
| 365 |
+
return conv_state, ssm_state
|
| 366 |
+
|
| 367 |
+
##########################################################################
|
| 368 |
+
## Feed-forward Network
|
| 369 |
+
class FFN(nn.Module):
|
| 370 |
+
def __init__(self, dim, ffn_expansion_factor, bias):
|
| 371 |
+
super(FFN, self).__init__()
|
| 372 |
+
|
| 373 |
+
hidden_features = int(dim*ffn_expansion_factor)
|
| 374 |
+
|
| 375 |
+
self.project_in = nn.Conv2d(dim, hidden_features*2, kernel_size=1, bias=bias)
|
| 376 |
+
|
| 377 |
+
self.dwconv = nn.Conv2d(hidden_features*2, hidden_features*2, kernel_size=3, stride=1, padding=1, groups=hidden_features*2, bias=bias, dilation=1)
|
| 378 |
+
|
| 379 |
+
self.win_size = 8
|
| 380 |
+
|
| 381 |
+
self.modulator = nn.Parameter(torch.ones(self.win_size, self.win_size, dim*2)) # modulator
|
| 382 |
+
|
| 383 |
+
self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1, bias=bias)
|
| 384 |
+
|
| 385 |
+
def forward(self, x):
|
| 386 |
+
b, c, h, w = x.shape
|
| 387 |
+
h1, w1 = h//self.win_size, w//self.win_size
|
| 388 |
+
x = self.project_in(x)
|
| 389 |
+
x = self.dwconv(x)
|
| 390 |
+
x_win = rearrange(x, 'b c (wsh h1) (wsw w1) -> b h1 w1 wsh wsw c', wsh=self.win_size, wsw=self.win_size)
|
| 391 |
+
x_win = x_win * self.modulator
|
| 392 |
+
x = rearrange(x_win, 'b h1 w1 wsh wsw c -> b c (wsh h1) (wsw w1)', wsh=self.win_size, wsw=self.win_size, h1=h1, w1=w1)
|
| 393 |
+
x1, x2 = x.chunk(2, dim=1)
|
| 394 |
+
x = x1 * x2
|
| 395 |
+
x = self.project_out(x)
|
| 396 |
+
return x
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
##########################################################################
|
| 400 |
+
## Gated Depth-wise Feed-forward Network (GDFN)
|
| 401 |
+
class GDFN(nn.Module):
|
| 402 |
+
def __init__(self, dim, ffn_expansion_factor, bias):
|
| 403 |
+
super(GDFN, self).__init__()
|
| 404 |
+
|
| 405 |
+
hidden_features = int(dim*ffn_expansion_factor)
|
| 406 |
+
|
| 407 |
+
self.project_in = nn.Conv2d(dim, hidden_features*2, kernel_size=1, bias=bias)
|
| 408 |
+
|
| 409 |
+
self.dwconv = nn.Conv2d(hidden_features*2, hidden_features*2, kernel_size=3, stride=1, padding=1, groups=hidden_features*2, bias=bias, dilation=1)
|
| 410 |
+
|
| 411 |
+
self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1, bias=bias)
|
| 412 |
+
|
| 413 |
+
def forward(self, x):
|
| 414 |
+
x = self.project_in(x)
|
| 415 |
+
x = self.dwconv(x)
|
| 416 |
+
x1, x2 = x.chunk(2, dim=1)
|
| 417 |
+
x = F.silu(x1) * x2
|
| 418 |
+
x = self.project_out(x)
|
| 419 |
+
return x
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
##########################################################################
|
| 423 |
+
## Overlapped image patch embedding with 3x3 Conv
|
| 424 |
+
class OverlapPatchEmbed(nn.Module):
|
| 425 |
+
def __init__(self, in_c=3, embed_dim=48, bias=False):
|
| 426 |
+
super(OverlapPatchEmbed, self).__init__()
|
| 427 |
+
|
| 428 |
+
self.proj = nn.Conv2d(in_c, embed_dim, kernel_size=3, stride=1, padding=1, bias=bias)
|
| 429 |
+
|
| 430 |
+
def forward(self, x):
|
| 431 |
+
x = self.proj(x)
|
| 432 |
+
|
| 433 |
+
return x
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
##########################################################################
|
| 437 |
+
## Resizing modules
|
| 438 |
+
class Downsample(nn.Module):
|
| 439 |
+
def __init__(self, n_feat):
|
| 440 |
+
super(Downsample, self).__init__()
|
| 441 |
+
|
| 442 |
+
self.body = nn.Sequential(nn.Upsample(scale_factor=0.5, mode='bilinear', align_corners=False),
|
| 443 |
+
nn.Conv2d(n_feat, n_feat * 2, 3, stride=1, padding=1, bias=False))
|
| 444 |
+
|
| 445 |
+
def forward(self, x):
|
| 446 |
+
return self.body(x)
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
class Upsample(nn.Module):
|
| 450 |
+
def __init__(self, n_feat):
|
| 451 |
+
super(Upsample, self).__init__()
|
| 452 |
+
|
| 453 |
+
self.body = nn.Sequential(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
|
| 454 |
+
nn.Conv2d(n_feat, n_feat // 2, 3, stride=1, padding=1, bias=False))
|
| 455 |
+
|
| 456 |
+
def forward(self, x):
|
| 457 |
+
return self.body(x)
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
"""
|
| 461 |
+
Borrow from "https://github.com/pp00704831/Stripformer-ECCV-2022-.git"
|
| 462 |
+
@inproceedings{Tsai2022Stripformer,
|
| 463 |
+
author = {Fu-Jen Tsai and Yan-Tsung Peng and Yen-Yu Lin and Chung-Chi Tsai and Chia-Wen Lin},
|
| 464 |
+
title = {Stripformer: Strip Transformer for Fast Image Deblurring},
|
| 465 |
+
booktitle = {ECCV},
|
| 466 |
+
year = {2022}
|
| 467 |
+
}
|
| 468 |
+
"""
|
| 469 |
+
class Intra_VSSM(nn.Module):
|
| 470 |
+
def __init__(self, dim, vssm_expansion_factor, bias): # gated = True
|
| 471 |
+
super(Intra_VSSM, self).__init__()
|
| 472 |
+
hidden = int(dim*vssm_expansion_factor)
|
| 473 |
+
|
| 474 |
+
self.proj_in = nn.Conv2d(dim, hidden*2, kernel_size=1, bias=bias)
|
| 475 |
+
self.dwconv = nn.Conv2d(hidden*2, hidden*2, kernel_size=3, stride=1, padding=1, groups=hidden*2, bias=bias)
|
| 476 |
+
        self.proj_out = nn.Conv2d(hidden, dim, kernel_size=1, bias=bias)

        self.conv_input = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
        self.fuse_out = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
        self.mamba = Mamba(d_model=hidden // 2)

    def forward_core(self, x):
        B, C, H, W = x.size()

        x_input = torch.chunk(self.conv_input(x), 2, dim=1)

        feature_h = (x_input[0]).permute(0, 2, 3, 1).contiguous()
        feature_h = feature_h.view(B * H, W, C//2)

        feature_v = (x_input[1]).permute(0, 3, 2, 1).contiguous()
        feature_v = feature_v.view(B * W, H, C//2)

        if H == W:
            feature = torch.cat((feature_h, feature_v), dim=0)  # B * H * 2, W, C//2
            scan_output = self.mamba(feature)
            scan_output = torch.chunk(scan_output, 2, dim=0)
            scan_output_h = scan_output[0]
            scan_output_v = scan_output[1]
        else:
            scan_output_h = self.mamba(feature_h)
            scan_output_v = self.mamba(feature_v)

        scan_output_h = scan_output_h.view(B, H, W, C//2).permute(0, 3, 1, 2).contiguous()
        scan_output_v = scan_output_v.view(B, W, H, C//2).permute(0, 3, 2, 1).contiguous()
        scan_output = self.fuse_out(torch.cat((scan_output_h, scan_output_v), dim=1))

        return scan_output

    def forward(self, x):
        x = self.proj_in(x)
        x, x_ = self.dwconv(x).chunk(2, dim=1)
        x = self.forward_core(x)
        x = F.silu(x_) * x
        x = self.proj_out(x)
        return x


class Inter_VSSM(nn.Module):
    def __init__(self, dim, vssm_expansion_factor, bias):  # gated = True
        super(Inter_VSSM, self).__init__()
        hidden = int(dim*vssm_expansion_factor)

        self.proj_in = nn.Conv2d(dim, hidden*2, kernel_size=1, bias=bias)
        self.dwconv = nn.Conv2d(hidden*2, hidden*2, kernel_size=3, stride=1, padding=1, groups=hidden*2, bias=bias)
        self.proj_out = nn.Conv2d(hidden, dim, kernel_size=1, bias=bias)

        self.avg_pool = nn.AdaptiveAvgPool2d((None, 1))
        self.conv_input = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
        self.fuse_out = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
        self.mamba = Mamba(d_model=hidden // 2)
        self.sigmoid = nn.Sigmoid()

    def forward_core(self, x):
        B, C, H, W = x.size()

        x_input = torch.chunk(self.conv_input(x), 2, dim=1)  # B, C, H, W

        feature_h = x_input[0].permute(0, 2, 1, 3).contiguous()  # B, H, C//2, W
        feature_h_score = self.avg_pool(feature_h)  # B, H, C//2, 1
        feature_h_score = feature_h_score.view(B, H, -1)

        feature_v = x_input[1].permute(0, 3, 1, 2).contiguous()  # B, W, C//2, H
        feature_v_score = self.avg_pool(feature_v)  # B, W, C//2, 1
        feature_v_score = feature_v_score.view(B, W, -1)

        if H == W:
            feature_score = torch.cat((feature_h_score, feature_v_score), dim=0)  # B * 2, W or H, C//2
            scan_score = self.mamba(feature_score)
            scan_score = torch.chunk(scan_score, 2, dim=0)
            scan_score_h = scan_score[0]
            scan_score_v = scan_score[1]
        else:
            scan_score_h = self.mamba(feature_h_score)
            scan_score_v = self.mamba(feature_v_score)

        scan_score_h = self.sigmoid(scan_score_h)
        scan_score_v = self.sigmoid(scan_score_v)
        feature_h = feature_h * scan_score_h[:, :, :, None]
        feature_v = feature_v * scan_score_v[:, :, :, None]
        feature_h = feature_h.view(B, H, C//2, W).permute(0, 2, 1, 3).contiguous()
        feature_v = feature_v.view(B, W, C//2, H).permute(0, 2, 3, 1).contiguous()
        output = self.fuse_out(torch.cat((feature_h, feature_v), dim=1))

        return output

    def forward(self, x):
        x = self.proj_in(x)
        x, x_ = self.dwconv(x).chunk(2, dim=1)
        x = self.forward_core(x)
        x = F.silu(x_) * x
        x = self.proj_out(x)
        return x


##########################################################################
class Strip_VSSB(nn.Module):
    def __init__(self, dim, vssm_expansion_factor, ffn_expansion_factor, bias=False, ssm=False, LayerNorm_type='WithBias'):
        super(Strip_VSSB, self).__init__()
        self.ssm = ssm
        if self.ssm == True:
            self.norm1_ssm = LayerNorm(dim, LayerNorm_type)
            self.norm2_ssm = LayerNorm(dim, LayerNorm_type)
            self.intra = Intra_VSSM(dim, vssm_expansion_factor, bias)
            self.inter = Inter_VSSM(dim, vssm_expansion_factor, bias)
        self.norm1_ffn = LayerNorm(dim, LayerNorm_type)
        self.norm2_ffn = LayerNorm(dim, LayerNorm_type)
        self.ffn1 = GDFN(dim, ffn_expansion_factor, bias)
        self.ffn2 = GDFN(dim, ffn_expansion_factor, bias)

    def forward(self, x):
        if self.ssm == True:
            x = x + self.intra(self.norm1_ssm(x))
        x = x + self.ffn1(self.norm1_ffn(x))
        if self.ssm == True:
            x = x + self.inter(self.norm2_ssm(x))
        x = x + self.ffn2(self.norm2_ffn(x))

        return x


##########################################################################
##---------- Cross-level Feature Fusion by Adding Sigmoid(KL-Div) * Multi-Scale Feat -----------------------
class CLFF(nn.Module):
    def __init__(self, dim, dim_n1, dim_n2, bias=False):
        super(CLFF, self).__init__()

        self.conv = nn.Conv2d(dim, dim, kernel_size=1, bias=bias)
        self.conv_n1 = nn.Conv2d(dim_n1, dim, kernel_size=1, bias=bias)
        self.conv_n2 = nn.Conv2d(dim_n2, dim, kernel_size=1, bias=bias)
        self.fuse_out1 = nn.Conv2d(dim*2, dim, kernel_size=1, bias=bias)

        self.log_sigmoid = nn.LogSigmoid()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, n1, n2):
        x_ = self.conv(x)
        n1_ = self.conv_n1(n1)
        n2_ = self.conv_n2(n2)
        kl_n1 = F.kl_div(input=self.log_sigmoid(n1_), target=self.log_sigmoid(x_), log_target=True)
        kl_n2 = F.kl_div(input=self.log_sigmoid(n2_), target=self.log_sigmoid(x_), log_target=True)
        #g = self.sigmoid(x_)
        g1 = self.sigmoid(kl_n1)
        g2 = self.sigmoid(kl_n2)
        #x = (1 + g) * x_ + (1 - g) * (g1 * n1_ + g2 * n2_)
        x = self.fuse_out1(torch.cat((x_, g1 * n1_ + g2 * n2_), dim=1))

        return x


##########################################################################
##---------- StripScanNet -----------------------
class XYScanNetP(nn.Module):
    def __init__(self,
                 inp_channels=3,
                 out_channels=3,
                 dim=144,  # 48, 72, 96, 120, 144
                 num_blocks=[3, 3, 6],
                 vssm_expansion_factor=1,  # 1 or 2
                 ffn_expansion_factor=1,  # 1 or 3
                 bias=False,
                 LayerNorm_type='WithBias',  ## Other option 'BiasFree'
                 ):

        super(XYScanNetP, self).__init__()

        self.patch_embed = OverlapPatchEmbed(inp_channels, dim)

        self.encoder_level1 = nn.Sequential(*[Strip_VSSB(dim=dim, vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor=ffn_expansion_factor,
                                                         bias=bias, ssm=False, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[0])])

        self.down1_2 = Downsample(dim)  ## From Level 1 to Level 2
        self.encoder_level2 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**1), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor=ffn_expansion_factor,
                                                         bias=bias, ssm=False, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[1])])

        self.down2_3 = Downsample(int(dim*2**1))  ## From Level 2 to Level 3
        self.encoder_level3 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**2), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor=ffn_expansion_factor,
                                                         bias=bias, ssm=False, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[2])])

        self.decoder_level3 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**2), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor=ffn_expansion_factor,
                                                         bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[2])])

        self.up3_2 = Upsample(int(dim*2**2))  ## From Level 3 to Level 2
        self.clff_level2 = CLFF(int(dim*2**1), dim_n1=int(dim*2**0), dim_n2=(dim*2**2), bias=bias)
        self.reduce_chan_level2 = nn.Conv2d(int(dim*2**2), int(dim*2**1), kernel_size=1, bias=bias)
        self.decoder_level2 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**1), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor=ffn_expansion_factor,
                                                         bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[1])])

        self.up2_1 = Upsample(int(dim*2**1))  ## From Level 2 to Level 1
        self.clff_level1 = CLFF(int(dim*2**0), dim_n1=int(dim*2**1), dim_n2=(dim*2**2), bias=bias)
        self.reduce_chan_level1 = nn.Conv2d(int(dim*2**1), int(dim*2**0), kernel_size=1, bias=bias)
        self.decoder_level1 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**0), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor=ffn_expansion_factor,
                                                         bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[0])])

        # self.refinement = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**0), expansion_factor=expansion_factor, bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_refinement_blocks)])

        self.output = nn.Conv2d(int(dim*2**0), out_channels, kernel_size=3, stride=1, padding=1, bias=bias)

    def forward(self, inp_img):

        # Encoder
        inp_enc_level1 = self.patch_embed(inp_img)
        out_enc_level1 = self.encoder_level1(inp_enc_level1)
        out_enc_level1_2 = F.interpolate(out_enc_level1, scale_factor=0.5)  # dim*2, lvl1 down-scaled to lvl2

        inp_enc_level2 = self.down1_2(out_enc_level1)
        out_enc_level2 = self.encoder_level2(inp_enc_level2)
        out_enc_level2_1 = F.interpolate(out_enc_level2, scale_factor=2)  # dim*2, lvl2 up-scaled to lvl1

        inp_enc_level3 = self.down2_3(out_enc_level2)
        out_enc_level3 = self.encoder_level3(inp_enc_level3)
        out_enc_level3_2 = F.interpolate(out_enc_level3, scale_factor=2)  # dim*2**2, lvl3 up-scaled to lvl2 (lvl3->lvl2)
        out_enc_level3_1 = F.interpolate(out_enc_level3_2, scale_factor=2)  # dim*2**2, lvl3 up-scaled to lvl1 (lvl3->lvl2->lvl1)

        out_enc_level1 = self.clff_level1(out_enc_level1, out_enc_level2_1, out_enc_level3_1)
        out_enc_level2 = self.clff_level2(out_enc_level2, out_enc_level1_2, out_enc_level3_2)

        # Decoder
        out_dec_level3_decomp1 = self.decoder_level3(out_enc_level3)

        inp_dec_level2_decomp1 = self.up3_2(out_dec_level3_decomp1)
        inp_dec_level2_decomp1 = self.reduce_chan_level2(torch.cat((inp_dec_level2_decomp1, out_enc_level2), dim=1))
        out_dec_level2_decomp1 = self.decoder_level2(inp_dec_level2_decomp1)

        inp_dec_level1_decomp1 = self.up2_1(out_dec_level2_decomp1)
        inp_dec_level1_decomp1 = self.reduce_chan_level1(torch.cat((inp_dec_level1_decomp1, out_enc_level1), dim=1))
        out_dec_level1_decomp1 = self.decoder_level1(inp_dec_level1_decomp1)

        out_dec_level1_decomp1 = self.output(out_dec_level1_decomp1)

        out_dec_level1 = out_dec_level1_decomp1 + inp_img

        return out_dec_level1, out_dec_level1_decomp1, None


def count_parameters(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total:,}")
    print(f"Trainable parameters: {trainable:,}")


def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = XYScanNetP().to(device)

    print("Model architecture:\n")
    print(model)

    count_parameters(model)

    # Optionally test with a dummy input
    dummy_input = torch.randn(1, 3, 256, 256).to(device)
    output, _, _ = model(dummy_input)
    print(f"Output shape: {output.shape}")


if __name__ == "__main__":
    main()
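Usage note (not part of the commit): the encoder above downsamples twice and the strip scans reshape features per row/column, so inference is simplest when the input height and width are multiples of 8. A minimal padded-inference sketch, assuming the class is importable as models.XYScanNetP.XYScanNetP and that trained weights are loaded elsewhere; deblur_padded and its arguments are hypothetical names.

import torch
import torch.nn.functional as F
from models.XYScanNetP import XYScanNetP

def deblur_padded(model, blur, multiple=8):
    # pad the blurry image so H and W are multiples of `multiple`, then crop back
    _, _, h, w = blur.shape
    ph = (multiple - h % multiple) % multiple
    pw = (multiple - w % multiple) % multiple
    padded = F.pad(blur, (0, pw, 0, ph), mode='reflect')
    with torch.no_grad():
        restored, _, _ = model(padded)   # forward returns (output, residual, None)
    return restored[..., :h, :w]

model = XYScanNetP().cuda().eval()
out = deblur_padded(model, torch.rand(1, 3, 720, 1280).cuda())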
models/__init__.py
ADDED
File without changes
models/__pycache__/XYScanNet.cpython-38.pyc
ADDED
Binary file (21.2 kB)
models/__pycache__/XYScanNetP.cpython-38.pyc
ADDED
Binary file (21.2 kB)
models/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (140 Bytes)
models/__pycache__/networks.cpython-38.pyc
ADDED
Binary file (691 Bytes)
models/losses.py
ADDED
@@ -0,0 +1,233 @@
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F

class Vgg19(torch.nn.Module):
    def __init__(self, requires_grad=False):
        super(Vgg19, self).__init__()
        vgg_pretrained_features = vgg19(pretrained=True).features
        self.slice1 = torch.nn.Sequential()

        for x in range(12):
            self.slice1.add_module(str(x), vgg_pretrained_features[x].eval())

        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        h_relu1 = self.slice1(X)
        return h_relu1

class ContrastLoss(nn.Module):
    def __init__(self, ablation=False):

        super(ContrastLoss, self).__init__()
        self.vgg = Vgg19().cuda()
        self.l1 = nn.L1Loss()
        self.ab = ablation
        self.down_sample_4 = nn.Upsample(scale_factor=1 / 4, mode='bilinear')

    def forward(self, restore, sharp, blur):
        B, C, H, W = restore.size()
        restore_vgg, sharp_vgg, blur_vgg = self.vgg(restore), self.vgg(sharp), self.vgg(blur)

        # filter out sharp regions
        threshold = 0.01
        mask = torch.mean(torch.abs(sharp-blur), dim=1).view(B, 1, H, W)
        mask[mask <= threshold] = 0
        mask[mask > threshold] = 1
        mask = self.down_sample_4(mask)
        d_ap = torch.mean(torch.abs((restore_vgg - sharp_vgg.detach())), dim=1).view(B, 1, H//4, W//4)
        d_an = torch.mean(torch.abs((restore_vgg - blur_vgg.detach())), dim=1).view(B, 1, H//4, W//4)
        mask_size = torch.sum(mask)
        contrastive = torch.sum((d_ap / (d_an + 1e-7)) * mask) / mask_size

        return contrastive


class ContrastLoss_Ori(nn.Module):
    def __init__(self, ablation=False):
        super(ContrastLoss_Ori, self).__init__()
        self.vgg = Vgg19().cuda()
        self.l1 = nn.L1Loss()
        self.ab = ablation

    def forward(self, restore, sharp, blur):

        restore_vgg, sharp_vgg, blur_vgg = self.vgg(restore), self.vgg(sharp), self.vgg(blur)
        d_ap = self.l1(restore_vgg, sharp_vgg.detach())
        d_an = self.l1(restore_vgg, blur_vgg.detach())
        contrastive_loss = d_ap / (d_an + 1e-7)

        return contrastive_loss


class CharbonnierLoss(nn.Module):
    """Charbonnier Loss (L1)"""

    def __init__(self, eps=1e-3):
        super(CharbonnierLoss, self).__init__()
        self.eps = eps

    def forward(self, x, y):
        diff = x - y
        # loss = torch.sum(torch.sqrt(diff * diff + self.eps))
        loss = torch.mean(torch.sqrt((diff * diff) + (self.eps * self.eps)))
        return loss


class EdgeLoss(nn.Module):
    def __init__(self):
        super(EdgeLoss, self).__init__()
        k = torch.Tensor([[.05, .25, .4, .25, .05]])
        self.kernel = torch.matmul(k.t(), k).unsqueeze(0).repeat(3, 1, 1, 1)
        if torch.cuda.is_available():
            self.kernel = self.kernel.cuda()
        self.loss = CharbonnierLoss()

    def conv_gauss(self, img):
        n_channels, _, kw, kh = self.kernel.shape
        img = F.pad(img, (kw // 2, kh // 2, kw // 2, kh // 2), mode='replicate')
        return F.conv2d(img, self.kernel, groups=n_channels)

    def laplacian_kernel(self, current):
        filtered = self.conv_gauss(current)  # filter
        down = filtered[:, :, ::2, ::2]  # downsample
        new_filter = torch.zeros_like(filtered)
        new_filter[:, :, ::2, ::2] = down * 4  # upsample
        filtered = self.conv_gauss(new_filter)  # filter
        diff = current - filtered
        return diff

    def forward(self, x, y):
        # x = torch.clamp(x + 0.5, min = 0,max = 1)
        # y = torch.clamp(y + 0.5, min = 0,max = 1)
        loss = self.loss(self.laplacian_kernel(x), self.laplacian_kernel(y))
        return loss


class Stripformer_Loss(nn.Module):

    def __init__(self, ):
        super(Stripformer_Loss, self).__init__()

        self.char = CharbonnierLoss()
        self.edge = EdgeLoss()
        self.contrastive = ContrastLoss()

    def forward(self, restore, sharp, blur):
        char = self.char(restore, sharp)
        edge = 0.05 * self.edge(restore, sharp)
        contrastive = 0.0005 * self.contrastive(restore, sharp, blur)
        loss = char + edge + contrastive
        return loss


def get_loss(model):
    if model['content_loss'] == 'Stripformer_Loss':
        content_loss = Stripformer_Loss()
    elif model['content_loss'] == 'CharbonnierLoss':
        content_loss = CharbonnierLoss()
    else:
        raise ValueError("ContentLoss [%s] not recognized." % model['content_loss'])
    return content_loss

from typing import Union, List, Dict, Any, cast

import torch
import torch.nn as nn

class VGG(nn.Module):
    def __init__(
        self, features: nn.Module, num_classes: int = 1000, init_weights: bool = True, dropout: float = 0.5
    ) -> None:
        super().__init__()
        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, num_classes),
        )
        if init_weights:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                    if m.bias is not None:
                        nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.BatchNorm2d):
                    nn.init.constant_(m.weight, 1)
                    nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.Linear):
                    nn.init.normal_(m.weight, 0, 0.01)
                    nn.init.constant_(m.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x


def make_layers(cfg: List[Union[str, int]], batch_norm: bool = False) -> nn.Sequential:
    layers: List[nn.Module] = []
    in_channels = 3
    for v in cfg:
        if v == "M":
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            v = cast(int, v)
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)


cfgs: Dict[str, List[Union[str, int]]] = {
    "A": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
    "B": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
    "D": [64, 64, "M", 128, 128, "M", 256, 256, 256, "M", 512, 512, 512, "M", 512, 512, 512, "M"],
    "E": [64, 64, "M", 128, 128, "M", 256, 256, 256, 256, "M", 512, 512, 512, 512, "M", 512, 512, 512, 512, "M"],
}

def _vgg(arch: str, cfg: str, batch_norm: bool, pretrained: bool, progress: bool, **kwargs: Any) -> VGG:
    if pretrained:
        kwargs["init_weights"] = False
    model = VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs)
    if pretrained:
        state_dict = torch.load("/home/hanzhou1996/low-level/StripMamba/models/vgg19-dcbb9e9d.pth")  # change the path to vgg19.pth
        model.load_state_dict(state_dict)
    return model


def vgg19(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG:
    r"""VGG 19-layer model (configuration "E")
    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_.
    The required minimum input size of the model is 32x32.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _vgg("vgg19", "E", False, pretrained, progress, **kwargs)
"""
if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    #model = VGG(make_layers(cfgs["E"], batch_norm=False)).to(device)
    #model.load_state_dict(torch.load("models/vgg19-dcbb9e9d.pth"))
    model = vgg19().to(device)
    print(model.features)
    BATCH_SIZE = 3
    x = torch.randn(3, 3, 224, 224).to(device)
    assert model(x).shape == torch.Size([BATCH_SIZE, 1000])
    print(model(x).shape)
"""
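Usage note (not part of the commit): get_loss expects a config dict whose 'content_loss' entry names one of the losses above; Stripformer_Loss combines Charbonnier, edge, and contrastive terms. A hedged sketch of one training step, where model, optimizer, blur, and sharp are placeholder objects and must live on the GPU, since ContrastLoss builds Vgg19().cuda() internally.

criterion = get_loss({'content_loss': 'Stripformer_Loss'})
restored, _, _ = model(blur)             # XYScanNet/XYScanNetP return a 3-tuple
loss = criterion(restored, sharp, blur)  # restored vs. sharp (positive) and blur (negative)
optimizer.zero_grad()
loss.backward()
optimizer.step()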
models/models.py
ADDED
@@ -0,0 +1,36 @@
import numpy as np
import torch.nn as nn
#from skimage.measure import compare_ssim as SSIM
from skimage.metrics import structural_similarity as SSIM

from util.metrics import PSNR


class DeblurModel(nn.Module):
    def __init__(self):
        super(DeblurModel, self).__init__()

    def get_input(self, data):
        img = data['a']
        inputs = img
        targets = data['b']
        inputs, targets = inputs.cuda(), targets.cuda()
        return inputs, targets

    def tensor2im(self, image_tensor, imtype=np.uint8):
        image_numpy = image_tensor[0].cpu().float().numpy()
        image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + 0.5) * 255.0
        return image_numpy

    def get_images_and_metrics(self, inp, output, target) -> (float, float, np.ndarray):
        inp = self.tensor2im(inp)
        fake = self.tensor2im(output.data)
        real = self.tensor2im(target.data)
        psnr = PSNR(fake, real)
        ssim = SSIM(fake.astype('uint8'), real.astype('uint8'), channel_axis=2)
        vis_img = np.hstack((inp, fake, real))
        return psnr, ssim, vis_img


def get_model(model_config):
    return DeblurModel()
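Usage note (not part of the commit): DeblurModel only moves data to the GPU and converts tensors back to images for metric computation; the generator itself comes from models.networks. A small sketch where blur_batch, sharp_batch, and restored are assumed placeholder tensors in the [-0.5, 0.5] range used by tensor2im.

dm = get_model(None)  # the config argument is ignored by get_model
inputs, targets = dm.get_input({'a': blur_batch, 'b': sharp_batch})
psnr, ssim, vis = dm.get_images_and_metrics(inputs, restored, targets)
print(f"PSNR: {psnr:.2f} dB, SSIM: {ssim:.4f}")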
models/networks.py
ADDED
@@ -0,0 +1,16 @@
import torch.nn as nn
from models.XYScanNet import XYScanNet
from models.XYScanNetP import XYScanNetP

def get_generator(model_config):
    generator_name = model_config['g_name']
    if generator_name == 'XYScanNet':
        model_g = XYScanNet()
    elif generator_name == 'XYScanNetP':
        model_g = XYScanNetP()
    else:
        raise ValueError("Generator Network [%s] not recognized." % generator_name)
    return nn.DataParallel(model_g)

def get_nets(model_config):
    return get_generator(model_config)
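Usage note (not part of the commit): get_nets wraps the chosen generator in nn.DataParallel, so checkpoints saved from it typically carry 'module.'-prefixed keys. A hedged loading sketch for the stage-2 weights shipped in this Space; the 'model' checkpoint key is an assumption.

import torch
net = get_nets({'g_name': 'XYScanNetP'}).cuda()
state = torch.load('results/xyscannetp_gopro/models/best_XYScanNet_stage2.pth', map_location='cuda')
net.load_state_dict(state.get('model', state))  # fall back to a raw state_dict
net.eval()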
models/sota/FFTformer.py
ADDED
@@ -0,0 +1,324 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numbers
from einops import rearrange


def to_3d(x):
    return rearrange(x, 'b c h w -> b (h w) c')


def to_4d(x, h, w):
    return rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)


class BiasFree_LayerNorm(nn.Module):
    def __init__(self, normalized_shape):
        super(BiasFree_LayerNorm, self).__init__()
        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        normalized_shape = torch.Size(normalized_shape)

        assert len(normalized_shape) == 1

        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.normalized_shape = normalized_shape

    def forward(self, x):
        sigma = x.var(-1, keepdim=True, unbiased=False)
        return x / torch.sqrt(sigma + 1e-5) * self.weight


class WithBias_LayerNorm(nn.Module):
    def __init__(self, normalized_shape):
        super(WithBias_LayerNorm, self).__init__()
        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        normalized_shape = torch.Size(normalized_shape)

        assert len(normalized_shape) == 1

        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.normalized_shape = normalized_shape

    def forward(self, x):
        mu = x.mean(-1, keepdim=True)
        sigma = x.var(-1, keepdim=True, unbiased=False)
        return (x - mu) / torch.sqrt(sigma + 1e-5) * self.weight + self.bias


class LayerNorm(nn.Module):
    def __init__(self, dim, LayerNorm_type):
        super(LayerNorm, self).__init__()
        if LayerNorm_type == 'BiasFree':
            self.body = BiasFree_LayerNorm(dim)
        else:
            self.body = WithBias_LayerNorm(dim)

    def forward(self, x):
        h, w = x.shape[-2:]
        return to_4d(self.body(to_3d(x)), h, w)


class DFFN(nn.Module):
    def __init__(self, dim, ffn_expansion_factor, bias):

        super(DFFN, self).__init__()

        hidden_features = int(dim * ffn_expansion_factor)

        self.patch_size = 8

        self.dim = dim
        self.project_in = nn.Conv2d(dim, hidden_features * 2, kernel_size=1, bias=bias)

        self.dwconv = nn.Conv2d(hidden_features * 2, hidden_features * 2, kernel_size=3, stride=1, padding=1,
                                groups=hidden_features * 2, bias=bias)

        self.fft = nn.Parameter(torch.ones((hidden_features * 2, 1, 1, self.patch_size, self.patch_size // 2 + 1)))
        self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1, bias=bias)

    def forward(self, x):
        x = self.project_in(x)
        x_patch = rearrange(x, 'b c (h patch1) (w patch2) -> b c h w patch1 patch2', patch1=self.patch_size,
                            patch2=self.patch_size)
        x_patch_fft = torch.fft.rfft2(x_patch.float())
        x_patch_fft = x_patch_fft * self.fft
        x_patch = torch.fft.irfft2(x_patch_fft, s=(self.patch_size, self.patch_size))
        x = rearrange(x_patch, 'b c h w patch1 patch2 -> b c (h patch1) (w patch2)', patch1=self.patch_size,
                      patch2=self.patch_size)
        x1, x2 = self.dwconv(x).chunk(2, dim=1)

        x = F.gelu(x1) * x2
        x = self.project_out(x)
        return x


class FSAS(nn.Module):
    def __init__(self, dim, bias):
        super(FSAS, self).__init__()

        self.to_hidden = nn.Conv2d(dim, dim * 6, kernel_size=1, bias=bias)
        self.to_hidden_dw = nn.Conv2d(dim * 6, dim * 6, kernel_size=3, stride=1, padding=1, groups=dim * 6, bias=bias)

        self.project_out = nn.Conv2d(dim * 2, dim, kernel_size=1, bias=bias)

        self.norm = LayerNorm(dim * 2, LayerNorm_type='WithBias')

        self.patch_size = 8

    def forward(self, x):
        hidden = self.to_hidden(x)

        q, k, v = self.to_hidden_dw(hidden).chunk(3, dim=1)

        q_patch = rearrange(q, 'b c (h patch1) (w patch2) -> b c h w patch1 patch2', patch1=self.patch_size,
                            patch2=self.patch_size)
        k_patch = rearrange(k, 'b c (h patch1) (w patch2) -> b c h w patch1 patch2', patch1=self.patch_size,
                            patch2=self.patch_size)
        q_fft = torch.fft.rfft2(q_patch.float())
        k_fft = torch.fft.rfft2(k_patch.float())

        out = q_fft * k_fft
        out = torch.fft.irfft2(out, s=(self.patch_size, self.patch_size))
        out = rearrange(out, 'b c h w patch1 patch2 -> b c (h patch1) (w patch2)', patch1=self.patch_size,
                        patch2=self.patch_size)

        out = self.norm(out)

        output = v * out
        output = self.project_out(output)

        return output


##########################################################################
class TransformerBlock(nn.Module):
    def __init__(self, dim, ffn_expansion_factor=2.66, bias=False, LayerNorm_type='WithBias', att=False):
        super(TransformerBlock, self).__init__()

        self.att = att
        if self.att:
            self.norm1 = LayerNorm(dim, LayerNorm_type)
            self.attn = FSAS(dim, bias)

        self.norm2 = LayerNorm(dim, LayerNorm_type)
        self.ffn = DFFN(dim, ffn_expansion_factor, bias)

    def forward(self, x):
        if self.att:
            x = x + self.attn(self.norm1(x))

        x = x + self.ffn(self.norm2(x))

        return x


class Fuse(nn.Module):
    def __init__(self, n_feat):
        super(Fuse, self).__init__()
        self.n_feat = n_feat
        self.att_channel = TransformerBlock(dim=n_feat * 2)

        self.conv = nn.Conv2d(n_feat * 2, n_feat * 2, 1, 1, 0)
        self.conv2 = nn.Conv2d(n_feat * 2, n_feat * 2, 1, 1, 0)

    def forward(self, enc, dnc):
        x = self.conv(torch.cat((enc, dnc), dim=1))
        x = self.att_channel(x)
        x = self.conv2(x)
        e, d = torch.split(x, [self.n_feat, self.n_feat], dim=1)
        output = e + d

        return output


##########################################################################
## Overlapped image patch embedding with 3x3 Conv
class OverlapPatchEmbed(nn.Module):
    def __init__(self, in_c=3, embed_dim=48, bias=False):
        super(OverlapPatchEmbed, self).__init__()

        self.proj = nn.Conv2d(in_c, embed_dim, kernel_size=3, stride=1, padding=1, bias=bias)

    def forward(self, x):
        x = self.proj(x)

        return x


##########################################################################
## Resizing modules
class Downsample(nn.Module):
    def __init__(self, n_feat):
        super(Downsample, self).__init__()

        self.body = nn.Sequential(nn.Upsample(scale_factor=0.5, mode='bilinear', align_corners=False),
                                  nn.Conv2d(n_feat, n_feat * 2, 3, stride=1, padding=1, bias=False))

    def forward(self, x):
        return self.body(x)


class Upsample(nn.Module):
    def __init__(self, n_feat):
        super(Upsample, self).__init__()

        self.body = nn.Sequential(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
                                  nn.Conv2d(n_feat, n_feat // 2, 3, stride=1, padding=1, bias=False))

    def forward(self, x):
        return self.body(x)


##########################################################################
##---------- FFTformer -----------------------
class fftformer(nn.Module):
    def __init__(self,
                 inp_channels=3,
                 out_channels=3,
                 dim=8,
                 num_blocks=[6, 6, 12, 8],
                 num_refinement_blocks=4,
                 ffn_expansion_factor=3,
                 bias=False,
                 ):
        super(fftformer, self).__init__()

        self.patch_embed = OverlapPatchEmbed(inp_channels, dim)

        self.encoder_level1 = nn.Sequential(*[
            TransformerBlock(dim=dim, ffn_expansion_factor=ffn_expansion_factor, bias=bias) for i in
            range(num_blocks[0])])

        self.down1_2 = Downsample(dim)
        self.encoder_level2 = nn.Sequential(*[
            TransformerBlock(dim=int(dim * 2 ** 1), ffn_expansion_factor=ffn_expansion_factor,
                             bias=bias) for i in range(num_blocks[1])])

        self.down2_3 = Downsample(int(dim * 2 ** 1))
        self.encoder_level3 = nn.Sequential(*[
            TransformerBlock(dim=int(dim * 2 ** 2), ffn_expansion_factor=ffn_expansion_factor,
                             bias=bias) for i in range(num_blocks[2])])

        self.decoder_level3 = nn.Sequential(*[
            TransformerBlock(dim=int(dim * 2 ** 2), ffn_expansion_factor=ffn_expansion_factor,
                             bias=bias, att=True) for i in range(num_blocks[2])])

        self.up3_2 = Upsample(int(dim * 2 ** 2))
        self.reduce_chan_level2 = nn.Conv2d(int(dim * 2 ** 2), int(dim * 2 ** 1), kernel_size=1, bias=bias)
        self.decoder_level2 = nn.Sequential(*[
            TransformerBlock(dim=int(dim * 2 ** 1), ffn_expansion_factor=ffn_expansion_factor,
                             bias=bias, att=True) for i in range(num_blocks[1])])

        self.up2_1 = Upsample(int(dim * 2 ** 1))

        self.decoder_level1 = nn.Sequential(*[
            TransformerBlock(dim=int(dim), ffn_expansion_factor=ffn_expansion_factor,
                             bias=bias, att=True) for i in range(num_blocks[0])])

        self.refinement = nn.Sequential(*[
            TransformerBlock(dim=int(dim), ffn_expansion_factor=ffn_expansion_factor,
                             bias=bias, att=True) for i in range(num_refinement_blocks)])

        self.fuse2 = Fuse(dim * 2)
        self.fuse1 = Fuse(dim)
        self.output = nn.Conv2d(int(dim), out_channels, kernel_size=3, stride=1, padding=1, bias=bias)

    def forward(self, inp_img):
        inp_enc_level1 = self.patch_embed(inp_img)
        out_enc_level1 = self.encoder_level1(inp_enc_level1)

        inp_enc_level2 = self.down1_2(out_enc_level1)
        out_enc_level2 = self.encoder_level2(inp_enc_level2)

        inp_enc_level3 = self.down2_3(out_enc_level2)
        out_enc_level3 = self.encoder_level3(inp_enc_level3)

        out_dec_level3 = self.decoder_level3(out_enc_level3)

        inp_dec_level2 = self.up3_2(out_dec_level3)

        inp_dec_level2 = self.fuse2(inp_dec_level2, out_enc_level2)

        out_dec_level2 = self.decoder_level2(inp_dec_level2)

        inp_dec_level1 = self.up2_1(out_dec_level2)

        inp_dec_level1 = self.fuse1(inp_dec_level1, out_enc_level1)
        out_dec_level1 = self.decoder_level1(inp_dec_level1)

        out_dec_level1 = self.refinement(out_dec_level1)

        out_dec_level1 = self.output(out_dec_level1) + inp_img

        return out_dec_level1

#"""
import time
start_time = time.time()
inp = torch.randn(1, 3, 512, 512).cuda()#.to(dtype=torch.float16)
model = fftformer().cuda()#.to(dtype=torch.float16)
out = model(inp)
print(out.shape)
print("--- %s seconds ---" % (time.time() - start_time))
pytorch_total_params = sum(p.numel() for p in model.parameters())
print("--- {num} parameters ---".format(num = pytorch_total_params))
pytorch_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("--- {num} trainable parameters ---".format(num = pytorch_trainable_params))
gpu_memmem_usage_bytes = torch.cuda.max_memory_allocated()
print(gpu_memmem_usage_bytes / 1024 / 1024 / 1024) # 64: 1.32 128: 4.94 256: 19.12; 512: OOM
#"""
"""
import torch
from ptflops import get_model_complexity_info

with torch.cuda.device(0):
    net = model
    macs, params = get_model_complexity_info(net, (3, 256, 256), as_strings=True,
                                             print_per_layer_stat=True, verbose=True)
    print('{:<30} {:<8}'.format('Computational complexity: ', macs)) # 31.97 GMac
    print('{:<30} {:<8}'.format('Number of parameters: ', params)) # 8.37 M
"""
models/sota/Restormer.py
ADDED
@@ -0,0 +1,340 @@
| 1 |
+
## Restormer: Efficient Transformer for High-Resolution Image Restoration
|
| 2 |
+
## Syed Waqas Zamir, Aditya Arora, Salman Khan, Munawar Hayat, Fahad Shahbaz Khan, and Ming-Hsuan Yang
|
| 3 |
+
## https://arxiv.org/abs/2111.09881
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
from pdb import set_trace as stx
|
| 10 |
+
import numbers
|
| 11 |
+
|
| 12 |
+
from einops import rearrange
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
##########################################################################
|
| 17 |
+
## Layer Norm
|
| 18 |
+
|
| 19 |
+
def to_3d(x):
|
| 20 |
+
return rearrange(x, 'b c h w -> b (h w) c')
|
| 21 |
+
|
| 22 |
+
def to_4d(x,h,w):
|
| 23 |
+
return rearrange(x, 'b (h w) c -> b c h w',h=h,w=w)
|
| 24 |
+
|
| 25 |
+
class BiasFree_LayerNorm(nn.Module):
|
| 26 |
+
def __init__(self, normalized_shape):
|
| 27 |
+
super(BiasFree_LayerNorm, self).__init__()
|
| 28 |
+
if isinstance(normalized_shape, numbers.Integral):
|
| 29 |
+
normalized_shape = (normalized_shape,)
|
| 30 |
+
normalized_shape = torch.Size(normalized_shape)
|
| 31 |
+
|
| 32 |
+
assert len(normalized_shape) == 1
|
| 33 |
+
|
| 34 |
+
self.weight = nn.Parameter(torch.ones(normalized_shape))
|
| 35 |
+
self.normalized_shape = normalized_shape
|
| 36 |
+
|
| 37 |
+
def forward(self, x):
|
| 38 |
+
sigma = x.var(-1, keepdim=True, unbiased=False)
|
| 39 |
+
return x / torch.sqrt(sigma+1e-5) * self.weight
|
| 40 |
+
|
| 41 |
+
class WithBias_LayerNorm(nn.Module):
|
| 42 |
+
def __init__(self, normalized_shape):
|
| 43 |
+
super(WithBias_LayerNorm, self).__init__()
|
| 44 |
+
if isinstance(normalized_shape, numbers.Integral):
|
| 45 |
+
normalized_shape = (normalized_shape,)
|
| 46 |
+
normalized_shape = torch.Size(normalized_shape)
|
| 47 |
+
|
| 48 |
+
assert len(normalized_shape) == 1
|
| 49 |
+
|
| 50 |
+
self.weight = nn.Parameter(torch.ones(normalized_shape))
|
| 51 |
+
self.bias = nn.Parameter(torch.zeros(normalized_shape))
|
| 52 |
+
self.normalized_shape = normalized_shape
|
| 53 |
+
|
| 54 |
+
def forward(self, x):
|
| 55 |
+
mu = x.mean(-1, keepdim=True)
|
| 56 |
+
sigma = x.var(-1, keepdim=True, unbiased=False)
|
| 57 |
+
return (x - mu) / torch.sqrt(sigma+1e-5) * self.weight + self.bias
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class LayerNorm(nn.Module):
|
| 61 |
+
def __init__(self, dim, LayerNorm_type):
|
| 62 |
+
super(LayerNorm, self).__init__()
|
| 63 |
+
if LayerNorm_type =='BiasFree':
|
| 64 |
+
self.body = BiasFree_LayerNorm(dim)
|
| 65 |
+
else:
|
| 66 |
+
self.body = WithBias_LayerNorm(dim)
|
| 67 |
+
|
| 68 |
+
def forward(self, x):
|
| 69 |
+
h, w = x.shape[-2:]
|
| 70 |
+
return to_4d(self.body(to_3d(x)), h, w)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
##########################################################################
|
| 75 |
+
## Gated-Dconv Feed-Forward Network (GDFN)
|
| 76 |
+
class FeedForward(nn.Module):
|
| 77 |
+
def __init__(self, dim, ffn_expansion_factor, bias):
|
| 78 |
+
super(FeedForward, self).__init__()
|
| 79 |
+
|
| 80 |
+
hidden_features = int(dim*ffn_expansion_factor)
|
| 81 |
+
|
| 82 |
+
self.project_in = nn.Conv2d(dim, hidden_features*2, kernel_size=1, bias=bias)
|
| 83 |
+
|
| 84 |
+
self.dwconv = nn.Conv2d(hidden_features*2, hidden_features*2, kernel_size=3, stride=1, padding=1, groups=hidden_features*2, bias=bias)
|
| 85 |
+
|
| 86 |
+
self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1, bias=bias)
|
| 87 |
+
|
| 88 |
+
def forward(self, x):
|
| 89 |
+
x = self.project_in(x)
|
| 90 |
+
x1, x2 = self.dwconv(x).chunk(2, dim=1)
|
| 91 |
+
x = F.gelu(x1) * x2
|
| 92 |
+
x = self.project_out(x)
|
| 93 |
+
return x
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
##########################################################################
|
| 98 |
+
## Multi-DConv Head Transposed Self-Attention (MDTA)
|
| 99 |
+
class Attention(nn.Module):
|
| 100 |
+
def __init__(self, dim, num_heads, bias):
|
| 101 |
+
super(Attention, self).__init__()
|
| 102 |
+
self.num_heads = num_heads
|
| 103 |
+
self.temperature = nn.Parameter(torch.ones(num_heads, 1, 1))
|
| 104 |
+
|
| 105 |
+
self.qkv = nn.Conv2d(dim, dim*3, kernel_size=1, bias=bias)
|
| 106 |
+
self.qkv_dwconv = nn.Conv2d(dim*3, dim*3, kernel_size=3, stride=1, padding=1, groups=dim*3, bias=bias)
|
| 107 |
+
self.project_out = nn.Conv2d(dim, dim, kernel_size=1, bias=bias)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def forward(self, x):
|
| 112 |
+
b,c,h,w = x.shape
|
| 113 |
+
|
| 114 |
+
qkv = self.qkv_dwconv(self.qkv(x))
|
| 115 |
+
q,k,v = qkv.chunk(3, dim=1)
|
| 116 |
+
|
| 117 |
+
q = rearrange(q, 'b (head c) h w -> b head c (h w)', head=self.num_heads)
|
| 118 |
+
k = rearrange(k, 'b (head c) h w -> b head c (h w)', head=self.num_heads)
|
| 119 |
+
v = rearrange(v, 'b (head c) h w -> b head c (h w)', head=self.num_heads)
|
| 120 |
+
|
| 121 |
+
q = torch.nn.functional.normalize(q, dim=-1)
|
| 122 |
+
k = torch.nn.functional.normalize(k, dim=-1)
|
| 123 |
+
|
| 124 |
+
attn = (q @ k.transpose(-2, -1)) * self.temperature
|
| 125 |
+
attn = attn.softmax(dim=-1)
|
| 126 |
+
|
| 127 |
+
out = (attn @ v)
|
| 128 |
+
|
| 129 |
+
out = rearrange(out, 'b head c (h w) -> b (head c) h w', head=self.num_heads, h=h, w=w)
|
| 130 |
+
|
| 131 |
+
out = self.project_out(out)
|
| 132 |
+
return out
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
##########################################################################
|
| 137 |
+
class TransformerBlock(nn.Module):
|
| 138 |
+
def __init__(self, dim, num_heads, ffn_expansion_factor, bias, LayerNorm_type):
|
| 139 |
+
super(TransformerBlock, self).__init__()
|
| 140 |
+
|
| 141 |
+
self.norm1 = LayerNorm(dim, LayerNorm_type)
|
| 142 |
+
self.attn = Attention(dim, num_heads, bias)
|
| 143 |
+
self.norm2 = LayerNorm(dim, LayerNorm_type)
|
| 144 |
+
self.ffn = FeedForward(dim, ffn_expansion_factor, bias)
|
| 145 |
+
|
| 146 |
+
def forward(self, x):
|
| 147 |
+
x = x + self.attn(self.norm1(x))
|
| 148 |
+
x = x + self.ffn(self.norm2(x))
|
| 149 |
+
|
| 150 |
+
return x
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
##########################################################################
|
| 155 |
+
## Overlapped image patch embedding with 3x3 Conv
|
| 156 |
+
class OverlapPatchEmbed(nn.Module):
|
| 157 |
+
def __init__(self, in_c=3, embed_dim=48, bias=False):
|
| 158 |
+
super(OverlapPatchEmbed, self).__init__()
|
| 159 |
+
|
| 160 |
+
self.proj = nn.Conv2d(in_c, embed_dim, kernel_size=3, stride=1, padding=1, bias=bias)
|
| 161 |
+
|
| 162 |
+
def forward(self, x):
|
| 163 |
+
x = self.proj(x)
|
| 164 |
+
|
| 165 |
+
return x
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
##########################################################################
|
| 170 |
+
## Resizing modules
|
| 171 |
+
class Downsample(nn.Module):
|
| 172 |
+
def __init__(self, n_feat):
|
| 173 |
+
super(Downsample, self).__init__()
|
| 174 |
+
|
| 175 |
+
self.body = nn.Sequential(nn.Conv2d(n_feat, n_feat//2, kernel_size=3, stride=1, padding=1, bias=False),
|
| 176 |
+
nn.PixelUnshuffle(2))
|
| 177 |
+
|
| 178 |
+
def forward(self, x):
|
| 179 |
+
return self.body(x)
|
| 180 |
+
|
| 181 |
+
class Upsample(nn.Module):
|
| 182 |
+
def __init__(self, n_feat):
|
| 183 |
+
super(Upsample, self).__init__()
|
| 184 |
+
|
| 185 |
+
self.body = nn.Sequential(nn.Conv2d(n_feat, n_feat*2, kernel_size=3, stride=1, padding=1, bias=False),
|
| 186 |
+
nn.PixelShuffle(2))
|
| 187 |
+
|
| 188 |
+
def forward(self, x):
|
| 189 |
+
return self.body(x)
|
| 190 |
+
|
| 191 |
+
##########################################################################
|
| 192 |
+
class Strip_VSSB(nn.Module):
|
| 193 |
+
def __init__(self, dim, head_num):
|
| 194 |
+
super(Strip_VSSB, self).__init__()
|
| 195 |
+
|
| 196 |
+
self.intra = TransformerBlock(dim=32, num_heads=head_num, ffn_expansion_factor=2.66, bias=False, LayerNorm_type='WithBias')
|
| 197 |
+
self.inter = TransformerBlock(dim=32, num_heads=head_num, ffn_expansion_factor=2.66, bias=False, LayerNorm_type='WithBias')
|
| 198 |
+
|
| 199 |
+
def forward(self, x):
|
| 200 |
+
x = self.intra(x)
|
| 201 |
+
x = self.inter(x)
|
| 202 |
+
|
| 203 |
+
return x
|
| 204 |
+
|
| 205 |
+
##########################################################################
|
| 206 |
+
##---------- Restormer -----------------------
|
| 207 |
+
class Restormer(nn.Module):
|
| 208 |
+
def __init__(self,
|
| 209 |
+
inp_channels=3,
|
| 210 |
+
out_channels=3,
|
| 211 |
+
dim = 12,
|
| 212 |
+
num_blocks = [4,6,6,8],
|
| 213 |
+
num_refinement_blocks = 4,
|
| 214 |
+
heads = [1,2,4,8],
|
| 215 |
+
ffn_expansion_factor = 2.66,
|
| 216 |
+
bias = False,
|
| 217 |
+
LayerNorm_type = 'WithBias', ## Other option 'BiasFree'
|
| 218 |
+
dual_pixel_task = False ## True for dual-pixel defocus deblurring only. Also set inp_channels=6
|
| 219 |
+
):
|
| 220 |
+
|
| 221 |
+
super(Restormer, self).__init__()
|
| 222 |
+
|
| 223 |
+
self.patch_embed = OverlapPatchEmbed(inp_channels, dim)
|
| 224 |
+
|
| 225 |
+
self.encoder_level1 = nn.Sequential(*[TransformerBlock(dim=dim, num_heads=heads[0], ffn_expansion_factor=ffn_expansion_factor, bias=bias, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[0])])
|
| 226 |
+
|
| 227 |
+
self.down1_2 = Downsample(dim) ## From Level 1 to Level 2
|
| 228 |
+
self.encoder_level2 = nn.Sequential(*[TransformerBlock(dim=int(dim*2**1), num_heads=heads[1], ffn_expansion_factor=ffn_expansion_factor, bias=bias, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[1])])
|
| 229 |
+
|
| 230 |
+
self.down2_3 = Downsample(int(dim*2**1)) ## From Level 2 to Level 3
|
| 231 |
+
self.encoder_level3 = nn.Sequential(*[TransformerBlock(dim=int(dim*2**2), num_heads=heads[2], ffn_expansion_factor=ffn_expansion_factor, bias=bias, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[2])])
|
| 232 |
+
|
| 233 |
+
self.down3_4 = Downsample(int(dim*2**2)) ## From Level 3 to Level 4
|
| 234 |
+
self.latent = nn.Sequential(*[TransformerBlock(dim=int(dim*2**3), num_heads=heads[3], ffn_expansion_factor=ffn_expansion_factor, bias=bias, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[3])])
|
| 235 |
+
|
| 236 |
+
self.up4_3 = Upsample(int(dim*2**3)) ## From Level 4 to Level 3
|
| 237 |
+
self.reduce_chan_level3 = nn.Conv2d(int(dim*2**3), int(dim*2**2), kernel_size=1, bias=bias)
|
| 238 |
+
self.decoder_level3 = nn.Sequential(*[TransformerBlock(dim=int(dim*2**2), num_heads=heads[2], ffn_expansion_factor=ffn_expansion_factor, bias=bias, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[2])])
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
self.up3_2 = Upsample(int(dim*2**2)) ## From Level 3 to Level 2
|
| 242 |
+
self.reduce_chan_level2 = nn.Conv2d(int(dim*2**2), int(dim*2**1), kernel_size=1, bias=bias)
|
| 243 |
+
self.decoder_level2 = nn.Sequential(*[TransformerBlock(dim=int(dim*2**1), num_heads=heads[1], ffn_expansion_factor=ffn_expansion_factor, bias=bias, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[1])])
|
| 244 |
+
|
| 245 |
+
self.up2_1 = Upsample(int(dim*2**1)) ## From Level 2 to Level 1 (NO 1x1 conv to reduce channels)
|
| 246 |
+
|
| 247 |
+
self.decoder_level1 = nn.Sequential(*[TransformerBlock(dim=int(dim*2**1), num_heads=heads[0], ffn_expansion_factor=ffn_expansion_factor, bias=bias, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[0])])
|
| 248 |
+
|
| 249 |
+
        self.refinement = nn.Sequential(*[TransformerBlock(dim=int(dim*2**1), num_heads=heads[0], ffn_expansion_factor=ffn_expansion_factor, bias=bias, LayerNorm_type=LayerNorm_type) for i in range(num_refinement_blocks)])

        #### For Dual-Pixel Defocus Deblurring Task ####
        self.dual_pixel_task = dual_pixel_task
        if self.dual_pixel_task:
            self.skip_conv = nn.Conv2d(dim, int(dim*2**1), kernel_size=1, bias=bias)
        ###########################

        self.output = nn.Conv2d(int(dim*2**1), out_channels, kernel_size=3, stride=1, padding=1, bias=bias)

    def forward(self, inp_img):

        inp_enc_level1 = self.patch_embed(inp_img)
        out_enc_level1 = self.encoder_level1(inp_enc_level1)

        inp_enc_level2 = self.down1_2(out_enc_level1)
        out_enc_level2 = self.encoder_level2(inp_enc_level2)

        inp_enc_level3 = self.down2_3(out_enc_level2)
        out_enc_level3 = self.encoder_level3(inp_enc_level3)

        inp_enc_level4 = self.down3_4(out_enc_level3)
        latent = self.latent(inp_enc_level4)

        inp_dec_level3 = self.up4_3(latent)
        inp_dec_level3 = torch.cat([inp_dec_level3, out_enc_level3], 1)
        inp_dec_level3 = self.reduce_chan_level3(inp_dec_level3)
        out_dec_level3 = self.decoder_level3(inp_dec_level3)

        inp_dec_level2 = self.up3_2(out_dec_level3)
        inp_dec_level2 = torch.cat([inp_dec_level2, out_enc_level2], 1)
        inp_dec_level2 = self.reduce_chan_level2(inp_dec_level2)
        out_dec_level2 = self.decoder_level2(inp_dec_level2)

        inp_dec_level1 = self.up2_1(out_dec_level2)
        inp_dec_level1 = torch.cat([inp_dec_level1, out_enc_level1], 1)
        out_dec_level1 = self.decoder_level1(inp_dec_level1)

        out_dec_level1 = self.refinement(out_dec_level1)

        #### For Dual-Pixel Defocus Deblurring Task ####
        if self.dual_pixel_task:
            out_dec_level1 = out_dec_level1 + self.skip_conv(inp_enc_level1)
            out_dec_level1 = self.output(out_dec_level1)
        ###########################
        else:
            out_dec_level1 = self.output(out_dec_level1) + inp_img

        return out_dec_level1

#"""
import time
start_time = time.time()
inp = torch.randn(1, 3, 256, 256).cuda()#.to(dtype=torch.float16)
model = Restormer().cuda()#.to(dtype=torch.float16)
out = model(inp)
print(out.shape)
print("--- %s seconds ---" % (time.time() - start_time))
pytorch_total_params = sum(p.numel() for p in model.parameters())
print("--- {num} parameters ---".format(num = pytorch_total_params))
pytorch_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("--- {num} trainable parameters ---".format(num = pytorch_trainable_params))
gpu_memmem_usage_bytes = torch.cuda.max_memory_allocated()
print(gpu_memmem_usage_bytes / 1024 / 1024 / 1024)  # 64: 0.97 128: 3.04 256: 11.93; 512: OOM
#"""
"""
import torch
from ptflops import get_model_complexity_info

with torch.cuda.device(0):
    net = model
    macs, params = get_model_complexity_info(net, (3, 256, 256), as_strings=True,
                                             print_per_layer_stat=True, verbose=True)
    print('{:<30} {:<8}'.format('Computational complexity: ', macs)) # 31.97 GMac
    print('{:<30} {:<8}'.format('Number of parameters: ', params)) # 8.37 M
"""
"""
import time
start_time = time.time()
inp = torch.randn(1, 32, 64, 64).cuda()#.to(dtype=torch.float16)
model = Strip_VSSB(dim=32, head_num=4).cuda()#.to(dtype=torch.float16)
out = model(inp)
print(out.shape)
print("--- %s seconds ---" % (time.time() - start_time))
pytorch_total_params = sum(p.numel() for p in model.parameters())
print("--- {num} parameters ---".format(num = pytorch_total_params))
pytorch_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("--- {num} trainable parameters ---".format(num = pytorch_trainable_params))
gpu_memmem_usage_bytes = torch.cuda.max_memory_allocated()
print(gpu_memmem_usage_bytes / 1024 / 1024 / 1024)  # 64: 0.16; 128: 0.22; 192: 0.37; 256: 0.56; 512: 2.10;
"""
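
Note on the commented-out timing snippet above: it reads time.time() around model construction and a single forward pass without synchronizing CUDA, so kernel-launch and allocation overhead are folded into the reported seconds. The sketch below is illustrative only and not part of this repository; it assumes a CUDA device, warms the model up, and synchronizes before reading the clock.

import time
import torch

# Illustrative helper (an assumption, not part of this repo):
# average forward-pass time with proper GPU synchronization.
def benchmark(model, inp, warmup=3, runs=10):
    model.eval()
    with torch.no_grad():
        for _ in range(warmup):
            model(inp)                    # warm-up passes absorb kernel-launch / autotune cost
        torch.cuda.synchronize()
        start = time.time()
        for _ in range(runs):
            model(inp)
        torch.cuda.synchronize()          # wait for queued kernels before stopping the clock
    return (time.time() - start) / runs

# Example usage (assumes Restormer defined above and a CUDA device):
# model = Restormer().cuda()
# inp = torch.randn(1, 3, 256, 256).cuda()
# print(benchmark(model, inp))
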
models/sota/Stripformer.py
ADDED
|
@@ -0,0 +1,429 @@
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
class Embeddings(nn.Module):
|
| 6 |
+
def __init__(self):
|
| 7 |
+
super(Embeddings, self).__init__()
|
| 8 |
+
|
| 9 |
+
self.activation = nn.LeakyReLU(0.2, True)
|
| 10 |
+
|
| 11 |
+
self.en_layer1_1 = nn.Sequential(
|
| 12 |
+
nn.Conv2d(3, 64, kernel_size=3, padding=1),
|
| 13 |
+
self.activation,
|
| 14 |
+
)
|
| 15 |
+
self.en_layer1_2 = nn.Sequential(
|
| 16 |
+
nn.Conv2d(64, 64, kernel_size=3, padding=1),
|
| 17 |
+
self.activation,
|
| 18 |
+
nn.Conv2d(64, 64, kernel_size=3, padding=1))
|
| 19 |
+
self.en_layer1_3 = nn.Sequential(
|
| 20 |
+
nn.Conv2d(64, 64, kernel_size=3, padding=1),
|
| 21 |
+
self.activation,
|
| 22 |
+
nn.Conv2d(64, 64, kernel_size=3, padding=1))
|
| 23 |
+
self.en_layer1_4 = nn.Sequential(
|
| 24 |
+
nn.Conv2d(64, 64, kernel_size=3, padding=1),
|
| 25 |
+
self.activation,
|
| 26 |
+
nn.Conv2d(64, 64, kernel_size=3, padding=1))
|
| 27 |
+
|
| 28 |
+
self.en_layer2_1 = nn.Sequential(
|
| 29 |
+
nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
|
| 30 |
+
self.activation,
|
| 31 |
+
)
|
| 32 |
+
self.en_layer2_2 = nn.Sequential(
|
| 33 |
+
nn.Conv2d(128, 128, kernel_size=3, padding=1),
|
| 34 |
+
self.activation,
|
| 35 |
+
nn.Conv2d(128, 128, kernel_size=3, padding=1))
|
| 36 |
+
self.en_layer2_3 = nn.Sequential(
|
| 37 |
+
nn.Conv2d(128, 128, kernel_size=3, padding=1),
|
| 38 |
+
self.activation,
|
| 39 |
+
nn.Conv2d(128, 128, kernel_size=3, padding=1))
|
| 40 |
+
self.en_layer2_4 = nn.Sequential(
|
| 41 |
+
nn.Conv2d(128, 128, kernel_size=3, padding=1),
|
| 42 |
+
self.activation,
|
| 43 |
+
nn.Conv2d(128, 128, kernel_size=3, padding=1))
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
self.en_layer3_1 = nn.Sequential(
|
| 47 |
+
nn.Conv2d(128, 320, kernel_size=3, stride=2, padding=1),
|
| 48 |
+
self.activation,
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def forward(self, x):
|
| 53 |
+
|
| 54 |
+
hx = self.en_layer1_1(x)
|
| 55 |
+
hx = self.activation(self.en_layer1_2(hx) + hx)
|
| 56 |
+
hx = self.activation(self.en_layer1_3(hx) + hx)
|
| 57 |
+
hx = self.activation(self.en_layer1_4(hx) + hx)
|
| 58 |
+
residual_1 = hx
|
| 59 |
+
hx = self.en_layer2_1(hx)
|
| 60 |
+
hx = self.activation(self.en_layer2_2(hx) + hx)
|
| 61 |
+
hx = self.activation(self.en_layer2_3(hx) + hx)
|
| 62 |
+
hx = self.activation(self.en_layer2_4(hx) + hx)
|
| 63 |
+
residual_2 = hx
|
| 64 |
+
hx = self.en_layer3_1(hx)
|
| 65 |
+
|
| 66 |
+
return hx, residual_1, residual_2
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class Embeddings_output(nn.Module):
|
| 70 |
+
def __init__(self):
|
| 71 |
+
super(Embeddings_output, self).__init__()
|
| 72 |
+
|
| 73 |
+
self.activation = nn.LeakyReLU(0.2, True)
|
| 74 |
+
|
| 75 |
+
self.de_layer3_1 = nn.Sequential(
|
| 76 |
+
nn.ConvTranspose2d(320, 192, kernel_size=4, stride=2, padding=1),
|
| 77 |
+
self.activation,
|
| 78 |
+
)
|
| 79 |
+
head_num = 3
|
| 80 |
+
dim = 192
|
| 81 |
+
|
| 82 |
+
self.de_layer2_2 = nn.Sequential(
|
| 83 |
+
nn.Conv2d(192+128, 192, kernel_size=1, padding=0),
|
| 84 |
+
self.activation,
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
self.de_block_1 = Intra_SA(dim, head_num)
|
| 88 |
+
self.de_block_2 = Inter_SA(dim, head_num)
|
| 89 |
+
self.de_block_3 = Intra_SA(dim, head_num)
|
| 90 |
+
self.de_block_4 = Inter_SA(dim, head_num)
|
| 91 |
+
self.de_block_5 = Intra_SA(dim, head_num)
|
| 92 |
+
self.de_block_6 = Inter_SA(dim, head_num)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
self.de_layer2_1 = nn.Sequential(
|
| 96 |
+
nn.ConvTranspose2d(192, 64, kernel_size=4, stride=2, padding=1),
|
| 97 |
+
self.activation,
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
self.de_layer1_3 = nn.Sequential(
|
| 101 |
+
nn.Conv2d(128, 64, kernel_size=1, padding=0),
|
| 102 |
+
self.activation,
|
| 103 |
+
nn.Conv2d(64, 64, kernel_size=3, padding=1))
|
| 104 |
+
self.de_layer1_2 = nn.Sequential(
|
| 105 |
+
nn.Conv2d(64, 64, kernel_size=3, padding=1),
|
| 106 |
+
self.activation,
|
| 107 |
+
nn.Conv2d(64, 64, kernel_size=3, padding=1))
|
| 108 |
+
self.de_layer1_1 = nn.Sequential(
|
| 109 |
+
nn.Conv2d(64, 3, kernel_size=3, padding=1),
|
| 110 |
+
self.activation
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
def forward(self, x, residual_1, residual_2):
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
hx = self.de_layer3_1(x)
|
| 117 |
+
|
| 118 |
+
hx = self.de_layer2_2(torch.cat((hx, residual_2), dim = 1))
|
| 119 |
+
hx = self.de_block_1(hx)
|
| 120 |
+
hx = self.de_block_2(hx)
|
| 121 |
+
hx = self.de_block_3(hx)
|
| 122 |
+
hx = self.de_block_4(hx)
|
| 123 |
+
hx = self.de_block_5(hx)
|
| 124 |
+
hx = self.de_block_6(hx)
|
| 125 |
+
hx = self.de_layer2_1(hx)
|
| 126 |
+
|
| 127 |
+
hx = self.activation(self.de_layer1_3(torch.cat((hx, residual_1), dim = 1)) + hx)
|
| 128 |
+
hx = self.activation(self.de_layer1_2(hx) + hx)
|
| 129 |
+
hx = self.de_layer1_1(hx)
|
| 130 |
+
|
| 131 |
+
return hx
|
| 132 |
+
|
| 133 |
+
class Attention(nn.Module):
|
| 134 |
+
def __init__(self, head_num):
|
| 135 |
+
super(Attention, self).__init__()
|
| 136 |
+
self.num_attention_heads = head_num
|
| 137 |
+
self.softmax = nn.Softmax(dim=-1)
|
| 138 |
+
|
| 139 |
+
def transpose_for_scores(self, x):
|
| 140 |
+
B, N, C = x.size()
|
| 141 |
+
attention_head_size = int(C / self.num_attention_heads)
|
| 142 |
+
new_x_shape = x.size()[:-1] + (self.num_attention_heads, attention_head_size)
|
| 143 |
+
x = x.view(*new_x_shape)
|
| 144 |
+
return x.permute(0, 2, 1, 3).contiguous()
|
| 145 |
+
|
| 146 |
+
def forward(self, query_layer, key_layer, value_layer):
|
| 147 |
+
B, N, C = query_layer.size()
|
| 148 |
+
query_layer = self.transpose_for_scores(query_layer)
|
| 149 |
+
key_layer = self.transpose_for_scores(key_layer)
|
| 150 |
+
value_layer = self.transpose_for_scores(value_layer)
|
| 151 |
+
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
| 152 |
+
_, _, _, d = query_layer.size()
|
| 153 |
+
attention_scores = attention_scores / math.sqrt(d)
|
| 154 |
+
attention_probs = self.softmax(attention_scores)
|
| 155 |
+
context_layer = torch.matmul(attention_probs, value_layer)
|
| 156 |
+
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
| 157 |
+
new_context_layer_shape = context_layer.size()[:-2] + (C,)
|
| 158 |
+
attention_out = context_layer.view(*new_context_layer_shape)
|
| 159 |
+
|
| 160 |
+
return attention_out
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
class Mlp(nn.Module):
|
| 164 |
+
def __init__(self, hidden_size):
|
| 165 |
+
super(Mlp, self).__init__()
|
| 166 |
+
self.fc1 = nn.Linear(hidden_size, 4*hidden_size)
|
| 167 |
+
self.fc2 = nn.Linear(4*hidden_size, hidden_size)
|
| 168 |
+
self.act_fn = torch.nn.functional.gelu
|
| 169 |
+
self._init_weights()
|
| 170 |
+
|
| 171 |
+
def _init_weights(self):
|
| 172 |
+
nn.init.xavier_uniform_(self.fc1.weight)
|
| 173 |
+
nn.init.xavier_uniform_(self.fc2.weight)
|
| 174 |
+
nn.init.normal_(self.fc1.bias, std=1e-6)
|
| 175 |
+
nn.init.normal_(self.fc2.bias, std=1e-6)
|
| 176 |
+
|
| 177 |
+
def forward(self, x):
|
| 178 |
+
x = self.fc1(x)
|
| 179 |
+
x = self.act_fn(x)
|
| 180 |
+
x = self.fc2(x)
|
| 181 |
+
return x
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# CPE (Conditional Positional Embedding)
|
| 185 |
+
class PEG(nn.Module):
|
| 186 |
+
def __init__(self, hidden_size):
|
| 187 |
+
super(PEG, self).__init__()
|
| 188 |
+
self.PEG = nn.Conv2d(hidden_size, hidden_size, kernel_size=3, padding=1, groups=hidden_size)
|
| 189 |
+
|
| 190 |
+
def forward(self, x):
|
| 191 |
+
x = self.PEG(x) + x
|
| 192 |
+
return x
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
class Intra_SA(nn.Module):
|
| 196 |
+
def __init__(self, dim, head_num):
|
| 197 |
+
super(Intra_SA, self).__init__()
|
| 198 |
+
self.hidden_size = dim // 2
|
| 199 |
+
self.head_num = head_num
|
| 200 |
+
self.attention_norm = nn.LayerNorm(dim)
|
| 201 |
+
self.conv_input = nn.Conv2d(dim, dim, kernel_size=1, padding=0)
|
| 202 |
+
self.qkv_local_h = nn.Linear(self.hidden_size, self.hidden_size * 3) # qkv_h
|
| 203 |
+
self.qkv_local_v = nn.Linear(self.hidden_size, self.hidden_size * 3) # qkv_v
|
| 204 |
+
self.fuse_out = nn.Conv2d(dim, dim, kernel_size=1, padding=0)
|
| 205 |
+
self.ffn_norm = nn.LayerNorm(dim)
|
| 206 |
+
self.ffn = Mlp(dim)
|
| 207 |
+
self.attn = Attention(head_num=self.head_num)
|
| 208 |
+
self.PEG = PEG(dim)
|
| 209 |
+
def forward(self, x):
|
| 210 |
+
h = x
|
| 211 |
+
B, C, H, W = x.size()
|
| 212 |
+
|
| 213 |
+
x = x.view(B, C, H*W).permute(0, 2, 1).contiguous()
|
| 214 |
+
x = self.attention_norm(x).permute(0, 2, 1).contiguous()
|
| 215 |
+
x = x.view(B, C, H, W)
|
| 216 |
+
|
| 217 |
+
x_input = torch.chunk(self.conv_input(x), 2, dim=1)
|
| 218 |
+
feature_h = (x_input[0]).permute(0, 2, 3, 1).contiguous()
|
| 219 |
+
feature_h = feature_h.view(B * H, W, C//2)
|
| 220 |
+
feature_v = (x_input[1]).permute(0, 3, 2, 1).contiguous()
|
| 221 |
+
feature_v = feature_v.view(B * W, H, C//2)
|
| 222 |
+
qkv_h = torch.chunk(self.qkv_local_h(feature_h), 3, dim=2)
|
| 223 |
+
qkv_v = torch.chunk(self.qkv_local_v(feature_v), 3, dim=2)
|
| 224 |
+
q_h, k_h, v_h = qkv_h[0], qkv_h[1], qkv_h[2]
|
| 225 |
+
q_v, k_v, v_v = qkv_v[0], qkv_v[1], qkv_v[2]
|
| 226 |
+
|
| 227 |
+
if H == W:
|
| 228 |
+
query = torch.cat((q_h, q_v), dim=0)
|
| 229 |
+
key = torch.cat((k_h, k_v), dim=0)
|
| 230 |
+
value = torch.cat((v_h, v_v), dim=0)
|
| 231 |
+
attention_output = self.attn(query, key, value)
|
| 232 |
+
attention_output = torch.chunk(attention_output, 2, dim=0)
|
| 233 |
+
attention_output_h = attention_output[0]
|
| 234 |
+
attention_output_v = attention_output[1]
|
| 235 |
+
attention_output_h = attention_output_h.view(B, H, W, C//2).permute(0, 3, 1, 2).contiguous()
|
| 236 |
+
attention_output_v = attention_output_v.view(B, W, H, C//2).permute(0, 3, 2, 1).contiguous()
|
| 237 |
+
attn_out = self.fuse_out(torch.cat((attention_output_h, attention_output_v), dim=1))
|
| 238 |
+
else:
|
| 239 |
+
attention_output_h = self.attn(q_h, k_h, v_h)
|
| 240 |
+
attention_output_v = self.attn(q_v, k_v, v_v)
|
| 241 |
+
attention_output_h = attention_output_h.view(B, H, W, C//2).permute(0, 3, 1, 2).contiguous()
|
| 242 |
+
attention_output_v = attention_output_v.view(B, W, H, C//2).permute(0, 3, 2, 1).contiguous()
|
| 243 |
+
attn_out = self.fuse_out(torch.cat((attention_output_h, attention_output_v), dim=1))
|
| 244 |
+
|
| 245 |
+
x = attn_out + h
|
| 246 |
+
x = x.view(B, C, H*W).permute(0, 2, 1).contiguous()
|
| 247 |
+
h = x
|
| 248 |
+
x = self.ffn_norm(x)
|
| 249 |
+
x = self.ffn(x)
|
| 250 |
+
x = x + h
|
| 251 |
+
x = x.permute(0, 2, 1).contiguous()
|
| 252 |
+
x = x.view(B, C, H, W)
|
| 253 |
+
|
| 254 |
+
x = self.PEG(x)
|
| 255 |
+
|
| 256 |
+
return x
|
| 257 |
+
|
| 258 |
+
class Inter_SA(nn.Module):
|
| 259 |
+
def __init__(self,dim, head_num):
|
| 260 |
+
super(Inter_SA, self).__init__()
|
| 261 |
+
self.hidden_size = dim
|
| 262 |
+
self.head_num = head_num
|
| 263 |
+
self.attention_norm = nn.LayerNorm(self.hidden_size)
|
| 264 |
+
self.conv_input = nn.Conv2d(self.hidden_size, self.hidden_size, kernel_size=1, padding=0)
|
| 265 |
+
self.conv_h = nn.Conv2d(self.hidden_size//2, 3 * (self.hidden_size//2), kernel_size=1, padding=0) # qkv_h
|
| 266 |
+
self.conv_v = nn.Conv2d(self.hidden_size//2, 3 * (self.hidden_size//2), kernel_size=1, padding=0) # qkv_v
|
| 267 |
+
self.ffn_norm = nn.LayerNorm(self.hidden_size)
|
| 268 |
+
self.ffn = Mlp(self.hidden_size)
|
| 269 |
+
self.fuse_out = nn.Conv2d(self.hidden_size, self.hidden_size, kernel_size=1, padding=0)
|
| 270 |
+
self.attn = Attention(head_num=self.head_num)
|
| 271 |
+
self.PEG = PEG(dim)
|
| 272 |
+
|
| 273 |
+
def forward(self, x):
|
| 274 |
+
h = x
|
| 275 |
+
B, C, H, W = x.size()
|
| 276 |
+
|
| 277 |
+
x = x.view(B, C, H*W).permute(0, 2, 1).contiguous()
|
| 278 |
+
x = self.attention_norm(x).permute(0, 2, 1).contiguous()
|
| 279 |
+
x = x.view(B, C, H, W)
|
| 280 |
+
#print(x.shape)
|
| 281 |
+
|
| 282 |
+
x_input = torch.chunk(self.conv_input(x), 2, dim=1)
|
| 283 |
+
feature_h = torch.chunk(self.conv_h(x_input[0]), 3, dim=1)
|
| 284 |
+
feature_v = torch.chunk(self.conv_v(x_input[1]), 3, dim=1)
|
| 285 |
+
query_h, key_h, value_h = feature_h[0], feature_h[1], feature_h[2]
|
| 286 |
+
query_v, key_v, value_v = feature_v[0], feature_v[1], feature_v[2]
|
| 287 |
+
|
| 288 |
+
horizontal_groups = torch.cat((query_h, key_h, value_h), dim=0)
|
| 289 |
+
horizontal_groups = horizontal_groups.permute(0, 2, 1, 3).contiguous()
|
| 290 |
+
horizontal_groups = horizontal_groups.view(3*B, H, -1)
|
| 291 |
+
horizontal_groups = torch.chunk(horizontal_groups, 3, dim=0)
|
| 292 |
+
query_h, key_h, value_h = horizontal_groups[0], horizontal_groups[1], horizontal_groups[2]
|
| 293 |
+
|
| 294 |
+
vertical_groups = torch.cat((query_v, key_v, value_v), dim=0)
|
| 295 |
+
vertical_groups = vertical_groups.permute(0, 3, 1, 2).contiguous()
|
| 296 |
+
vertical_groups = vertical_groups.view(3*B, W, -1)
|
| 297 |
+
vertical_groups = torch.chunk(vertical_groups, 3, dim=0)
|
| 298 |
+
query_v, key_v, value_v = vertical_groups[0], vertical_groups[1], vertical_groups[2]
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
if H == W:
|
| 302 |
+
query = torch.cat((query_h, query_v), dim=0)
|
| 303 |
+
key = torch.cat((key_h, key_v), dim=0)
|
| 304 |
+
value = torch.cat((value_h, value_v), dim=0)
|
| 305 |
+
attention_output = self.attn(query, key, value)
|
| 306 |
+
attention_output = torch.chunk(attention_output, 2, dim=0)
|
| 307 |
+
attention_output_h = attention_output[0]
|
| 308 |
+
attention_output_v = attention_output[1]
|
| 309 |
+
attention_output_h = attention_output_h.view(B, H, C//2, W).permute(0, 2, 1, 3).contiguous()
|
| 310 |
+
attention_output_v = attention_output_v.view(B, W, C//2, H).permute(0, 2, 3, 1).contiguous()
|
| 311 |
+
attn_out = self.fuse_out(torch.cat((attention_output_h, attention_output_v), dim=1))
|
| 312 |
+
else:
|
| 313 |
+
attention_output_h = self.attn(query_h, key_h, value_h)
|
| 314 |
+
attention_output_v = self.attn(query_v, key_v, value_v)
|
| 315 |
+
attention_output_h = attention_output_h.view(B, H, C//2, W).permute(0, 2, 1, 3).contiguous()
|
| 316 |
+
attention_output_v = attention_output_v.view(B, W, C//2, H).permute(0, 2, 3, 1).contiguous()
|
| 317 |
+
attn_out = self.fuse_out(torch.cat((attention_output_h, attention_output_v), dim=1))
|
| 318 |
+
|
| 319 |
+
x = attn_out + h
|
| 320 |
+
x = x.view(B, C, H*W).permute(0, 2, 1).contiguous()
|
| 321 |
+
h = x
|
| 322 |
+
x = self.ffn_norm(x)
|
| 323 |
+
x = self.ffn(x)
|
| 324 |
+
x = x + h
|
| 325 |
+
x = x.permute(0, 2, 1).contiguous()
|
| 326 |
+
x = x.view(B, C, H, W)
|
| 327 |
+
|
| 328 |
+
x = self.PEG(x)
|
| 329 |
+
|
| 330 |
+
return x
|
| 331 |
+
|
| 332 |
+
##########################################################################
|
| 333 |
+
class Strip_VSSB(nn.Module):
|
| 334 |
+
def __init__(self, dim, head_num):
|
| 335 |
+
super(Strip_VSSB, self).__init__()
|
| 336 |
+
|
| 337 |
+
self.intra = Intra_SA(dim, head_num)
|
| 338 |
+
self.inter = Inter_SA(dim, head_num)
|
| 339 |
+
|
| 340 |
+
def forward(self, x):
|
| 341 |
+
x = self.intra(x)
|
| 342 |
+
x = self.inter(x)
|
| 343 |
+
|
| 344 |
+
return x
|
| 345 |
+
|
| 346 |
+
class Stripformer(nn.Module):
|
| 347 |
+
def __init__(self):
|
| 348 |
+
super(Stripformer, self).__init__()
|
| 349 |
+
|
| 350 |
+
self.encoder = Embeddings()
|
| 351 |
+
head_num = 5
|
| 352 |
+
dim = 320
|
| 353 |
+
self.Trans_block_1 = Intra_SA(dim, head_num)
|
| 354 |
+
self.Trans_block_2 = Inter_SA(dim, head_num)
|
| 355 |
+
self.Trans_block_3 = Intra_SA(dim, head_num)
|
| 356 |
+
self.Trans_block_4 = Inter_SA(dim, head_num)
|
| 357 |
+
self.Trans_block_5 = Intra_SA(dim, head_num)
|
| 358 |
+
self.Trans_block_6 = Inter_SA(dim, head_num)
|
| 359 |
+
self.Trans_block_7 = Intra_SA(dim, head_num)
|
| 360 |
+
self.Trans_block_8 = Inter_SA(dim, head_num)
|
| 361 |
+
self.Trans_block_9 = Intra_SA(dim, head_num)
|
| 362 |
+
self.Trans_block_10 = Inter_SA(dim, head_num)
|
| 363 |
+
self.Trans_block_11 = Intra_SA(dim, head_num)
|
| 364 |
+
self.Trans_block_12 = Inter_SA(dim, head_num)
|
| 365 |
+
self.decoder = Embeddings_output()
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
def forward(self, x):
|
| 369 |
+
|
| 370 |
+
hx, residual_1, residual_2 = self.encoder(x)
|
| 371 |
+
hx = self.Trans_block_1(hx)
|
| 372 |
+
hx = self.Trans_block_2(hx)
|
| 373 |
+
hx = self.Trans_block_3(hx)
|
| 374 |
+
hx = self.Trans_block_4(hx)
|
| 375 |
+
hx = self.Trans_block_5(hx)
|
| 376 |
+
hx = self.Trans_block_6(hx)
|
| 377 |
+
hx = self.Trans_block_7(hx)
|
| 378 |
+
hx = self.Trans_block_8(hx)
|
| 379 |
+
hx = self.Trans_block_9(hx)
|
| 380 |
+
hx = self.Trans_block_10(hx)
|
| 381 |
+
hx = self.Trans_block_11(hx)
|
| 382 |
+
hx = self.Trans_block_12(hx)
|
| 383 |
+
hx = self.decoder(hx, residual_1, residual_2)
|
| 384 |
+
|
| 385 |
+
return hx + x

#"""
import time
start_time = time.time()
inp = torch.randn(1, 3, 64, 64).cuda()
model = Stripformer().cuda()
out = model(inp)
print(out.shape)
print("--- %s seconds ---" % (time.time() - start_time))
pytorch_total_params = sum(p.numel() for p in model.parameters())
print("--- {num} parameters ---".format(num = pytorch_total_params))
pytorch_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("--- {num} trainable parameters ---".format(num = pytorch_trainable_params))
gpu_memmem_usage_bytes = torch.cuda.max_memory_allocated()
print(gpu_memmem_usage_bytes / 1024 / 1024 / 1024)  # 64: 0.37 128: 0.84 -> 256: 3.02 -> 512: 12.55
#"""
"""
import torch
from ptflops import get_model_complexity_info

with torch.cuda.device(0):
    net = model
    macs, params = get_model_complexity_info(net, (3, 512, 512), as_strings=True,
                                             print_per_layer_stat=True, verbose=True)
    print('{:<30} {:<8}'.format('Computational complexity: ', macs)) # 49.79 GMac
    print('{:<30} {:<8}'.format('Number of parameters: ', params)) # 6.06 M
"""
"""
import time
start_time = time.time()
inp = torch.randn(1, 32, 512, 512).cuda().to(dtype=torch.float32)
model = Strip_VSSB(dim=32, head_num = 4).cuda().to(dtype=torch.float32)
out = model(inp)
print(out.shape)
print("--- %s seconds ---" % (time.time() - start_time))
pytorch_total_params = sum(p.numel() for p in model.parameters())
print("--- {num} parameters ---".format(num = pytorch_total_params))
pytorch_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("--- {num} trainable parameters ---".format(num = pytorch_trainable_params))
gpu_memmem_usage_bytes = torch.cuda.max_memory_allocated()
print(gpu_memmem_usage_bytes / 1024 / 1024 / 1024)  # 128: 0.84 -> 256: 3.02 -> 512: 12.55
"""

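Note: the Embeddings encoder above downsamples twice with stride-2 convolutions and Embeddings_output upsamples twice with stride-2 transposed convolutions, so Stripformer expects the input height and width to be multiples of 4 for the skip connections to align. A minimal inference sketch follows for illustration only; the deblur_any_size helper is an assumption and not part of this repository.

import torch
import torch.nn.functional as F

# Illustrative helper (an assumption, not part of this repo): reflect-pad H and W up to a
# multiple of 4, run the model, then crop the output back to the original size.
def deblur_any_size(model, img):          # img: (1, 3, H, W), values in [0, 1]
    _, _, h, w = img.shape
    pad_h = (4 - h % 4) % 4
    pad_w = (4 - w % 4) % 4
    padded = F.pad(img, (0, pad_w, 0, pad_h), mode='reflect')
    with torch.no_grad():
        out = model(padded)
    return out[:, :, :h, :w]

# Example usage (assumes a CUDA device and pretrained weights loaded elsewhere):
# model = Stripformer().cuda().eval()
# blurry = torch.rand(1, 3, 711, 1063).cuda()
# sharp = deblur_any_size(model, blurry)
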
models/sota/XYScanNet.py
ADDED
|
@@ -0,0 +1,754 @@
| 1 |
+
import numbers
|
| 2 |
+
import math
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from torch import Tensor
|
| 9 |
+
|
| 10 |
+
from einops import rearrange, repeat
|
| 11 |
+
|
| 12 |
+
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
|
| 16 |
+
except ImportError:
|
| 17 |
+
causal_conv1d_fn, causal_conv1d_update = None, None
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
|
| 21 |
+
except ImportError:
|
| 22 |
+
selective_state_update = None
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
|
| 26 |
+
except ImportError:
|
| 27 |
+
RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def to_3d(x):
|
| 31 |
+
return rearrange(x, 'b c h w -> b (h w) c')
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def to_4d(x, h, w):
|
| 35 |
+
return rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class BiasFree_LayerNorm(nn.Module):
|
| 39 |
+
def __init__(self, normalized_shape):
|
| 40 |
+
super(BiasFree_LayerNorm, self).__init__()
|
| 41 |
+
if isinstance(normalized_shape, numbers.Integral):
|
| 42 |
+
normalized_shape = (normalized_shape,)
|
| 43 |
+
normalized_shape = torch.Size(normalized_shape)
|
| 44 |
+
|
| 45 |
+
assert len(normalized_shape) == 1
|
| 46 |
+
|
| 47 |
+
self.weight = nn.Parameter(torch.ones(normalized_shape))
|
| 48 |
+
self.normalized_shape = normalized_shape
|
| 49 |
+
|
| 50 |
+
def forward(self, x):
|
| 51 |
+
sigma = x.var(-1, keepdim=True, unbiased=False)
|
| 52 |
+
return x / torch.sqrt(sigma + 1e-5) * self.weight
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class WithBias_LayerNorm(nn.Module):
|
| 56 |
+
def __init__(self, normalized_shape):
|
| 57 |
+
super(WithBias_LayerNorm, self).__init__()
|
| 58 |
+
if isinstance(normalized_shape, numbers.Integral):
|
| 59 |
+
normalized_shape = (normalized_shape,)
|
| 60 |
+
normalized_shape = torch.Size(normalized_shape)
|
| 61 |
+
|
| 62 |
+
assert len(normalized_shape) == 1
|
| 63 |
+
|
| 64 |
+
self.weight = nn.Parameter(torch.ones(normalized_shape))
|
| 65 |
+
self.bias = nn.Parameter(torch.zeros(normalized_shape))
|
| 66 |
+
self.normalized_shape = normalized_shape
|
| 67 |
+
|
| 68 |
+
def forward(self, x):
|
| 69 |
+
mu = x.mean(-1, keepdim=True)
|
| 70 |
+
sigma = x.var(-1, keepdim=True, unbiased=False)
|
| 71 |
+
return (x - mu) / torch.sqrt(sigma + 1e-5) * self.weight + self.bias
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class LayerNorm(nn.Module):
|
| 75 |
+
def __init__(self, dim, LayerNorm_type):
|
| 76 |
+
super(LayerNorm, self).__init__()
|
| 77 |
+
if LayerNorm_type == 'BiasFree':
|
| 78 |
+
self.body = BiasFree_LayerNorm(dim)
|
| 79 |
+
else:
|
| 80 |
+
self.body = WithBias_LayerNorm(dim)
|
| 81 |
+
|
| 82 |
+
def forward(self, x):
|
| 83 |
+
h, w = x.shape[-2:]
|
| 84 |
+
return to_4d(self.body(to_3d(x)), h, w)
|
| 85 |
+
|
| 86 |
+
##########################################################################
|
| 87 |
+
def conv(in_channels, out_channels, kernel_size, bias=False, stride = 1):
|
| 88 |
+
return nn.Conv2d(
|
| 89 |
+
in_channels, out_channels, kernel_size,
|
| 90 |
+
padding=(kernel_size//2), bias=bias, stride = stride)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
"""
|
| 94 |
+
Borrow from "https://github.com/state-spaces/mamba.git"
|
| 95 |
+
@article{mamba,
|
| 96 |
+
title={Mamba: Linear-Time Sequence Modeling with Selective State Spaces},
|
| 97 |
+
author={Gu, Albert and Dao, Tri},
|
| 98 |
+
journal={arXiv preprint arXiv:2312.00752},
|
| 99 |
+
year={2023}
|
| 100 |
+
}
|
| 101 |
+
"""
|
| 102 |
+
class Mamba(nn.Module):
|
| 103 |
+
def __init__(
|
| 104 |
+
self,
|
| 105 |
+
d_model,
|
| 106 |
+
d_state=16,
|
| 107 |
+
d_conv=4,
|
| 108 |
+
expand=2,
|
| 109 |
+
dt_rank="auto",
|
| 110 |
+
dt_min=0.001,
|
| 111 |
+
dt_max=0.1,
|
| 112 |
+
dt_init="random",
|
| 113 |
+
dt_scale=1.0,
|
| 114 |
+
dt_init_floor=1e-4,
|
| 115 |
+
conv_bias=True,
|
| 116 |
+
bias=False,
|
| 117 |
+
use_fast_path=True, # Fused kernel options
|
| 118 |
+
layer_idx=None,
|
| 119 |
+
device=None,
|
| 120 |
+
dtype=None,
|
| 121 |
+
):
|
| 122 |
+
factory_kwargs = {"device": device, "dtype": dtype}
|
| 123 |
+
super().__init__()
|
| 124 |
+
self.d_model = d_model
|
| 125 |
+
self.d_state = d_state
|
| 126 |
+
self.d_conv = d_conv
|
| 127 |
+
self.expand = expand
|
| 128 |
+
self.d_inner = int(self.expand * self.d_model)
|
| 129 |
+
self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank
|
| 130 |
+
self.use_fast_path = use_fast_path
|
| 131 |
+
self.layer_idx = layer_idx
|
| 132 |
+
|
| 133 |
+
self.in_proj = nn.Linear(self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs)
|
| 134 |
+
|
| 135 |
+
self.conv1d = nn.Conv1d(
|
| 136 |
+
in_channels=self.d_inner,
|
| 137 |
+
out_channels=self.d_inner,
|
| 138 |
+
bias=conv_bias,
|
| 139 |
+
kernel_size=d_conv,
|
| 140 |
+
groups=self.d_inner,
|
| 141 |
+
padding=d_conv - 1,
|
| 142 |
+
**factory_kwargs,
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
self.activation = "silu"
|
| 146 |
+
self.act = nn.SiLU()
|
| 147 |
+
|
| 148 |
+
self.x_proj = nn.Linear(
|
| 149 |
+
self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs
|
| 150 |
+
)
|
| 151 |
+
self.dt_proj = nn.Linear(self.dt_rank, self.d_inner, bias=True, **factory_kwargs)
|
| 152 |
+
|
| 153 |
+
# Initialize special dt projection to preserve variance at initialization
|
| 154 |
+
dt_init_std = self.dt_rank**-0.5 * dt_scale
|
| 155 |
+
if dt_init == "constant":
|
| 156 |
+
nn.init.constant_(self.dt_proj.weight, dt_init_std)
|
| 157 |
+
elif dt_init == "random":
|
| 158 |
+
nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
|
| 159 |
+
else:
|
| 160 |
+
raise NotImplementedError
|
| 161 |
+
|
| 162 |
+
# Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
|
| 163 |
+
dt = torch.exp(
|
| 164 |
+
torch.rand(self.d_inner, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
|
| 165 |
+
+ math.log(dt_min)
|
| 166 |
+
).clamp(min=dt_init_floor)
|
| 167 |
+
# Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
| 168 |
+
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
| 169 |
+
with torch.no_grad():
|
| 170 |
+
self.dt_proj.bias.copy_(inv_dt)
|
| 171 |
+
# Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit
|
| 172 |
+
self.dt_proj.bias._no_reinit = True
|
| 173 |
+
|
| 174 |
+
# S4D real initialization
|
| 175 |
+
A = repeat(
|
| 176 |
+
torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
|
| 177 |
+
"n -> d n",
|
| 178 |
+
d=self.d_inner,
|
| 179 |
+
).contiguous()
|
| 180 |
+
A_log = torch.log(A) # Keep A_log in fp32
|
| 181 |
+
self.A_log = nn.Parameter(A_log)
|
| 182 |
+
self.A_log._no_weight_decay = True
|
| 183 |
+
|
| 184 |
+
# D "skip" parameter
|
| 185 |
+
self.D = nn.Parameter(torch.ones(self.d_inner, device=device)) # Keep in fp32
|
| 186 |
+
self.D._no_weight_decay = True
|
| 187 |
+
|
| 188 |
+
self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
|
| 189 |
+
|
| 190 |
+
def forward(self, hidden_states, inference_params=None):
|
| 191 |
+
"""
|
| 192 |
+
hidden_states: (B, L, D)
|
| 193 |
+
Returns: same shape as hidden_states
|
| 194 |
+
"""
|
| 195 |
+
batch, seqlen, dim = hidden_states.shape
|
| 196 |
+
|
| 197 |
+
conv_state, ssm_state = None, None
|
| 198 |
+
if inference_params is not None:
|
| 199 |
+
conv_state, ssm_state = self._get_states_from_cache(inference_params, batch)
|
| 200 |
+
if inference_params.seqlen_offset > 0:
|
| 201 |
+
# The states are updated inplace
|
| 202 |
+
out, _, _ = self.step(hidden_states, conv_state, ssm_state)
|
| 203 |
+
return out
|
| 204 |
+
|
| 205 |
+
# We do matmul and transpose BLH -> HBL at the same time
|
| 206 |
+
xz = rearrange(
|
| 207 |
+
self.in_proj.weight @ rearrange(hidden_states, "b l d -> d (b l)"),
|
| 208 |
+
"d (b l) -> b d l",
|
| 209 |
+
l=seqlen,
|
| 210 |
+
)
|
| 211 |
+
if self.in_proj.bias is not None:
|
| 212 |
+
xz = xz + rearrange(self.in_proj.bias.to(dtype=xz.dtype), "d -> d 1")
|
| 213 |
+
|
| 214 |
+
A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
|
| 215 |
+
# In the backward pass we write dx and dz next to each other to avoid torch.cat
|
| 216 |
+
if self.use_fast_path and causal_conv1d_fn is not None and inference_params is None: # Doesn't support outputting the states
|
| 217 |
+
out = mamba_inner_fn(
|
| 218 |
+
xz,
|
| 219 |
+
self.conv1d.weight,
|
| 220 |
+
self.conv1d.bias,
|
| 221 |
+
self.x_proj.weight,
|
| 222 |
+
self.dt_proj.weight,
|
| 223 |
+
self.out_proj.weight,
|
| 224 |
+
self.out_proj.bias,
|
| 225 |
+
A,
|
| 226 |
+
None, # input-dependent B
|
| 227 |
+
None, # input-dependent C
|
| 228 |
+
self.D.float(),
|
| 229 |
+
delta_bias=self.dt_proj.bias.float(),
|
| 230 |
+
delta_softplus=True,
|
| 231 |
+
)
|
| 232 |
+
else:
|
| 233 |
+
x, z = xz.chunk(2, dim=1)
|
| 234 |
+
# Compute short convolution
|
| 235 |
+
if conv_state is not None:
|
| 236 |
+
# If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
|
| 237 |
+
# Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
|
| 238 |
+
conv_state.copy_(F.pad(x, (self.d_conv - x.shape[-1], 0))) # Update state (B D W)
|
| 239 |
+
if causal_conv1d_fn is None:
|
| 240 |
+
x = self.act(self.conv1d(x)[..., :seqlen])
|
| 241 |
+
else:
|
| 242 |
+
assert self.activation in ["silu", "swish"]
|
| 243 |
+
x = causal_conv1d_fn(
|
| 244 |
+
x=x,
|
| 245 |
+
weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
|
| 246 |
+
bias=self.conv1d.bias,
|
| 247 |
+
activation=self.activation,
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
+
# We're careful here about the layout, to avoid extra transposes.
|
| 251 |
+
# We want dt to have d as the slowest moving dimension
|
| 252 |
+
# and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
|
| 253 |
+
x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d")) # (bl d)
|
| 254 |
+
dt, B, C = torch.split(x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1)
|
| 255 |
+
dt = self.dt_proj.weight @ dt.t()
|
| 256 |
+
dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
|
| 257 |
+
B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
|
| 258 |
+
C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
|
| 259 |
+
assert self.activation in ["silu", "swish"]
|
| 260 |
+
y = selective_scan_fn(
|
| 261 |
+
x,
|
| 262 |
+
dt,
|
| 263 |
+
A,
|
| 264 |
+
B,
|
| 265 |
+
C,
|
| 266 |
+
self.D.float(),
|
| 267 |
+
z=z,
|
| 268 |
+
delta_bias=self.dt_proj.bias.float(),
|
| 269 |
+
delta_softplus=True,
|
| 270 |
+
return_last_state=ssm_state is not None,
|
| 271 |
+
)
|
| 272 |
+
if ssm_state is not None:
|
| 273 |
+
y, last_state = y
|
| 274 |
+
ssm_state.copy_(last_state)
|
| 275 |
+
y = rearrange(y, "b d l -> b l d")
|
| 276 |
+
out = self.out_proj(y)
|
| 277 |
+
return out
|
| 278 |
+
|
| 279 |
+
def step(self, hidden_states, conv_state, ssm_state):
|
| 280 |
+
dtype = hidden_states.dtype
|
| 281 |
+
assert hidden_states.shape[1] == 1, "Only support decoding with 1 token at a time for now"
|
| 282 |
+
xz = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
|
| 283 |
+
x, z = xz.chunk(2, dim=-1) # (B D)
|
| 284 |
+
|
| 285 |
+
# Conv step
|
| 286 |
+
if causal_conv1d_update is None:
|
| 287 |
+
conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
|
| 288 |
+
conv_state[:, :, -1] = x
|
| 289 |
+
x = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
|
| 290 |
+
if self.conv1d.bias is not None:
|
| 291 |
+
x = x + self.conv1d.bias
|
| 292 |
+
x = self.act(x).to(dtype=dtype)
|
| 293 |
+
else:
|
| 294 |
+
x = causal_conv1d_update(
|
| 295 |
+
x,
|
| 296 |
+
conv_state,
|
| 297 |
+
rearrange(self.conv1d.weight, "d 1 w -> d w"),
|
| 298 |
+
self.conv1d.bias,
|
| 299 |
+
self.activation,
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
x_db = self.x_proj(x) # (B dt_rank+2*d_state)
|
| 303 |
+
dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
|
| 304 |
+
# Don't add dt_bias here
|
| 305 |
+
dt = F.linear(dt, self.dt_proj.weight) # (B d_inner)
|
| 306 |
+
A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
|
| 307 |
+
|
| 308 |
+
# SSM step
|
| 309 |
+
if selective_state_update is None:
|
| 310 |
+
# Discretize A and B
|
| 311 |
+
dt = F.softplus(dt + self.dt_proj.bias.to(dtype=dt.dtype))
|
| 312 |
+
dA = torch.exp(torch.einsum("bd,dn->bdn", dt, A))
|
| 313 |
+
dB = torch.einsum("bd,bn->bdn", dt, B)
|
| 314 |
+
ssm_state.copy_(ssm_state * dA + rearrange(x, "b d -> b d 1") * dB)
|
| 315 |
+
y = torch.einsum("bdn,bn->bd", ssm_state.to(dtype), C)
|
| 316 |
+
y = y + self.D.to(dtype) * x
|
| 317 |
+
y = y * self.act(z) # (B D)
|
| 318 |
+
else:
|
| 319 |
+
y = selective_state_update(
|
| 320 |
+
ssm_state, x, dt, A, B, C, self.D, z=z, dt_bias=self.dt_proj.bias, dt_softplus=True
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
out = self.out_proj(y)
|
| 324 |
+
return out.unsqueeze(1), conv_state, ssm_state
|
| 325 |
+
|
| 326 |
+
def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
|
| 327 |
+
device = self.out_proj.weight.device
|
| 328 |
+
conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
|
| 329 |
+
conv_state = torch.zeros(
|
| 330 |
+
batch_size, self.d_model * self.expand, self.d_conv, device=device, dtype=conv_dtype
|
| 331 |
+
)
|
| 332 |
+
ssm_dtype = self.dt_proj.weight.dtype if dtype is None else dtype
|
| 333 |
+
# ssm_dtype = torch.float32
|
| 334 |
+
ssm_state = torch.zeros(
|
| 335 |
+
batch_size, self.d_model * self.expand, self.d_state, device=device, dtype=ssm_dtype
|
| 336 |
+
)
|
| 337 |
+
return conv_state, ssm_state
|
| 338 |
+
|
| 339 |
+
def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False):
|
| 340 |
+
assert self.layer_idx is not None
|
| 341 |
+
if self.layer_idx not in inference_params.key_value_memory_dict:
|
| 342 |
+
batch_shape = (batch_size,)
|
| 343 |
+
conv_state = torch.zeros(
|
| 344 |
+
batch_size,
|
| 345 |
+
self.d_model * self.expand,
|
| 346 |
+
self.d_conv,
|
| 347 |
+
device=self.conv1d.weight.device,
|
| 348 |
+
dtype=self.conv1d.weight.dtype,
|
| 349 |
+
)
|
| 350 |
+
ssm_state = torch.zeros(
|
| 351 |
+
batch_size,
|
| 352 |
+
self.d_model * self.expand,
|
| 353 |
+
self.d_state,
|
| 354 |
+
device=self.dt_proj.weight.device,
|
| 355 |
+
dtype=self.dt_proj.weight.dtype,
|
| 356 |
+
# dtype=torch.float32,
|
| 357 |
+
)
|
| 358 |
+
inference_params.key_value_memory_dict[self.layer_idx] = (conv_state, ssm_state)
|
| 359 |
+
else:
|
| 360 |
+
conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_idx]
|
| 361 |
+
# TODO: What if batch size changes between generation, and we reuse the same states?
|
| 362 |
+
if initialize_states:
|
| 363 |
+
conv_state.zero_()
|
| 364 |
+
ssm_state.zero_()
|
| 365 |
+
return conv_state, ssm_state
|
| 366 |
+
|
| 367 |
+
##########################################################################
|
| 368 |
+
## Feed-forward Network
|
| 369 |
+
class FFN(nn.Module):
|
| 370 |
+
def __init__(self, dim, ffn_expansion_factor, bias):
|
| 371 |
+
super(FFN, self).__init__()
|
| 372 |
+
|
| 373 |
+
hidden_features = int(dim*ffn_expansion_factor)
|
| 374 |
+
|
| 375 |
+
self.project_in = nn.Conv2d(dim, hidden_features*2, kernel_size=1, bias=bias)
|
| 376 |
+
|
| 377 |
+
self.dwconv = nn.Conv2d(hidden_features*2, hidden_features*2, kernel_size=3, stride=1, padding=1, groups=hidden_features*2, bias=bias, dilation=1)
|
| 378 |
+
|
| 379 |
+
self.win_size = 8
|
| 380 |
+
|
| 381 |
+
self.modulator = nn.Parameter(torch.ones(self.win_size, self.win_size, dim*2)) # modulator
|
| 382 |
+
|
| 383 |
+
self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1, bias=bias)
|
| 384 |
+
|
| 385 |
+
def forward(self, x):
|
| 386 |
+
b, c, h, w = x.shape
|
| 387 |
+
h1, w1 = h//self.win_size, w//self.win_size
|
| 388 |
+
x = self.project_in(x)
|
| 389 |
+
x = self.dwconv(x)
|
| 390 |
+
x_win = rearrange(x, 'b c (wsh h1) (wsw w1) -> b h1 w1 wsh wsw c', wsh=self.win_size, wsw=self.win_size)
|
| 391 |
+
x_win = x_win * self.modulator
|
| 392 |
+
x = rearrange(x_win, 'b h1 w1 wsh wsw c -> b c (wsh h1) (wsw w1)', wsh=self.win_size, wsw=self.win_size, h1=h1, w1=w1)
|
| 393 |
+
x1, x2 = x.chunk(2, dim=1)
|
| 394 |
+
x = x1 * x2
|
| 395 |
+
x = self.project_out(x)
|
| 396 |
+
return x
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
##########################################################################
|
| 400 |
+
## Gated Depth-wise Feed-forward Network (GDFN)
|
| 401 |
+
class GDFN(nn.Module):
|
| 402 |
+
def __init__(self, dim, ffn_expansion_factor, bias):
|
| 403 |
+
super(GDFN, self).__init__()
|
| 404 |
+
|
| 405 |
+
hidden_features = int(dim*ffn_expansion_factor)
|
| 406 |
+
|
| 407 |
+
self.project_in = nn.Conv2d(dim, hidden_features*2, kernel_size=1, bias=bias)
|
| 408 |
+
|
| 409 |
+
self.dwconv = nn.Conv2d(hidden_features*2, hidden_features*2, kernel_size=3, stride=1, padding=1, groups=hidden_features*2, bias=bias, dilation=1)
|
| 410 |
+
|
| 411 |
+
self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1, bias=bias)
|
| 412 |
+
|
| 413 |
+
def forward(self, x):
|
| 414 |
+
x = self.project_in(x)
|
| 415 |
+
x = self.dwconv(x)
|
| 416 |
+
x1, x2 = x.chunk(2, dim=1)
|
| 417 |
+
x = F.silu(x1) * x2
|
| 418 |
+
x = self.project_out(x)
|
| 419 |
+
return x
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
##########################################################################
|
| 423 |
+
## Overlapped image patch embedding with 3x3 Conv
|
| 424 |
+
class OverlapPatchEmbed(nn.Module):
|
| 425 |
+
def __init__(self, in_c=3, embed_dim=48, bias=False):
|
| 426 |
+
super(OverlapPatchEmbed, self).__init__()
|
| 427 |
+
|
| 428 |
+
self.proj = nn.Conv2d(in_c, embed_dim, kernel_size=3, stride=1, padding=1, bias=bias)
|
| 429 |
+
|
| 430 |
+
def forward(self, x):
|
| 431 |
+
x = self.proj(x)
|
| 432 |
+
|
| 433 |
+
return x
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
##########################################################################
|
| 437 |
+
## Resizing modules
|
| 438 |
+
class Downsample(nn.Module):
|
| 439 |
+
def __init__(self, n_feat):
|
| 440 |
+
super(Downsample, self).__init__()
|
| 441 |
+
|
| 442 |
+
self.body = nn.Sequential(nn.Upsample(scale_factor=0.5, mode='bilinear', align_corners=False),
|
| 443 |
+
nn.Conv2d(n_feat, n_feat * 2, 3, stride=1, padding=1, bias=False))
|
| 444 |
+
|
| 445 |
+
def forward(self, x):
|
| 446 |
+
return self.body(x)
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
class Upsample(nn.Module):
|
| 450 |
+
def __init__(self, n_feat):
|
| 451 |
+
super(Upsample, self).__init__()
|
| 452 |
+
|
| 453 |
+
self.body = nn.Sequential(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
|
| 454 |
+
nn.Conv2d(n_feat, n_feat // 2, 3, stride=1, padding=1, bias=False))
|
| 455 |
+
|
| 456 |
+
def forward(self, x):
|
| 457 |
+
return self.body(x)
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
"""
|
| 461 |
+
Borrow from "https://github.com/pp00704831/Stripformer-ECCV-2022-.git"
|
| 462 |
+
@inproceedings{Tsai2022Stripformer,
|
| 463 |
+
author = {Fu-Jen Tsai and Yan-Tsung Peng and Yen-Yu Lin and Chung-Chi Tsai and Chia-Wen Lin},
|
| 464 |
+
title = {Stripformer: Strip Transformer for Fast Image Deblurring},
|
| 465 |
+
booktitle = {ECCV},
|
| 466 |
+
year = {2022}
|
| 467 |
+
}
|
| 468 |
+
"""
|
| 469 |
+
class Intra_VSSM(nn.Module):
|
| 470 |
+
def __init__(self, dim, vssm_expansion_factor, bias): # gated = True
|
| 471 |
+
super(Intra_VSSM, self).__init__()
|
| 472 |
+
hidden = int(dim*vssm_expansion_factor)
|
| 473 |
+
|
| 474 |
+
self.proj_in = nn.Conv2d(dim, hidden*2, kernel_size=1, bias=bias)
|
| 475 |
+
self.dwconv = nn.Conv2d(hidden*2, hidden*2, kernel_size=3, stride=1, padding=1, groups=hidden*2, bias=bias)
|
| 476 |
+
self.proj_out = nn.Conv2d(hidden, dim, kernel_size=1, bias=bias)
|
| 477 |
+
|
| 478 |
+
self.conv_input = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
|
| 479 |
+
self.fuse_out = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
|
| 480 |
+
self.mamba = Mamba(d_model=hidden // 2)
|
| 481 |
+
|
| 482 |
+
def forward_core(self, x):
|
| 483 |
+
B, C, H, W = x.size()
|
| 484 |
+
|
| 485 |
+
x_input = torch.chunk(self.conv_input(x), 2, dim=1)
|
| 486 |
+
|
| 487 |
+
feature_h = (x_input[0]).permute(0, 2, 3, 1).contiguous()
|
| 488 |
+
feature_h = feature_h.view(B * H, W, C//2)
|
| 489 |
+
|
| 490 |
+
feature_v = (x_input[1]).permute(0, 3, 2, 1).contiguous()
|
| 491 |
+
feature_v = feature_v.view(B * W, H, C//2)
|
| 492 |
+
|
| 493 |
+
if H == W:
|
| 494 |
+
feature = torch.cat((feature_h, feature_v), dim=0) # B * H * 2, W, C//2
|
| 495 |
+
scan_output = self.mamba(feature)
|
| 496 |
+
scan_output = torch.chunk(scan_output, 2, dim=0)
|
| 497 |
+
scan_output_h = scan_output[0]
|
| 498 |
+
scan_output_v = scan_output[1]
|
| 499 |
+
else:
|
| 500 |
+
scan_output_h = self.mamba(feature_h)
|
| 501 |
+
scan_output_v = self.mamba(feature_v)
|
| 502 |
+
|
| 503 |
+
scan_output_h = scan_output_h.view(B, H, W, C//2).permute(0, 3, 1, 2).contiguous()
|
| 504 |
+
scan_output_v = scan_output_v.view(B, W, H, C//2).permute(0, 3, 2, 1).contiguous()
|
| 505 |
+
scan_output = self.fuse_out(torch.cat((scan_output_h, scan_output_v), dim=1))
|
| 506 |
+
|
| 507 |
+
return scan_output
|
| 508 |
+
|
| 509 |
+
def forward(self, x):
|
| 510 |
+
x = self.proj_in(x)
|
| 511 |
+
x, x_ = self.dwconv(x).chunk(2, dim=1)
|
| 512 |
+
x = self.forward_core(x)
|
| 513 |
+
x = F.silu(x_) * x
|
| 514 |
+
x = self.proj_out(x)
|
| 515 |
+
return x
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
class Inter_VSSM(nn.Module):
|
| 519 |
+
def __init__(self, dim, vssm_expansion_factor, bias): # gated = True
|
| 520 |
+
super(Inter_VSSM, self).__init__()
|
| 521 |
+
hidden = int(dim*vssm_expansion_factor)
|
| 522 |
+
|
| 523 |
+
self.proj_in = nn.Conv2d(dim, hidden*2, kernel_size=1, bias=bias)
|
| 524 |
+
self.dwconv = nn.Conv2d(hidden*2, hidden*2, kernel_size=3, stride=1, padding=1, groups=hidden*2, bias=bias)
|
| 525 |
+
self.proj_out = nn.Conv2d(hidden, dim, kernel_size=1, bias=bias)
|
| 526 |
+
|
| 527 |
+
self.avg_pool = nn.AdaptiveAvgPool2d((None,1))
|
| 528 |
+
self.conv_input = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
|
| 529 |
+
self.fuse_out = nn.Conv2d(hidden, hidden, kernel_size=1, padding=0, bias=bias)
|
| 530 |
+
self.mamba = Mamba(d_model=hidden // 2)
|
| 531 |
+
self.sigmoid = nn.Sigmoid()
|
| 532 |
+
|
| 533 |
+
def forward_core(self, x):
|
| 534 |
+
B, C, H, W = x.size()
|
| 535 |
+
|
| 536 |
+
x_input = torch.chunk(self.conv_input(x), 2, dim=1) # B, C, H, W
|
| 537 |
+
|
| 538 |
+
feature_h = x_input[0].permute(0, 2, 1, 3).contiguous() # B, H, C//2, W
|
| 539 |
+
feature_h_score = self.avg_pool(feature_h) # B, H, C//2, 1
|
| 540 |
+
feature_h_score = feature_h_score.view(B, H, -1)
|
| 541 |
+
|
| 542 |
+
feature_v = x_input[1].permute(0, 3, 1, 2).contiguous() # B, W, C//2, H
|
| 543 |
+
feature_v_score = self.avg_pool(feature_v) # B, W, C//2, 1
|
| 544 |
+
feature_v_score = feature_v_score.view(B, W, -1)
|
| 545 |
+
|
| 546 |
+
if H == W:
|
| 547 |
+
feature_score = torch.cat((feature_h_score, feature_v_score), dim=0) # B * 2, W or H, C//2
|
| 548 |
+
scan_score = self.mamba(feature_score)
|
| 549 |
+
scan_score = torch.chunk(scan_score, 2, dim=0)
|
| 550 |
+
scan_score_h = scan_score[0]
|
| 551 |
+
scan_score_v = scan_score[1]
|
| 552 |
+
else:
|
| 553 |
+
scan_score_h = self.mamba(feature_h_score)
|
| 554 |
+
scan_score_v = self.mamba(feature_v_score)
|
| 555 |
+
|
| 556 |
+
scan_score_h = self.sigmoid(scan_score_h)
|
| 557 |
+
scan_score_v = self.sigmoid(scan_score_v)
|
| 558 |
+
feature_h = feature_h*scan_score_h[:,:,:,None]
|
| 559 |
+
feature_v = feature_v*scan_score_v[:,:,:,None]
|
| 560 |
+
feature_h = feature_h.view(B, H, C//2, W).permute(0, 2, 1, 3).contiguous()
|
| 561 |
+
feature_v = feature_v.view(B, W, C//2, H).permute(0, 2, 3, 1).contiguous()
|
| 562 |
+
output = self.fuse_out(torch.cat((feature_h, feature_v), dim=1))
|
| 563 |
+
|
| 564 |
+
return output
|
| 565 |
+
|
| 566 |
+
def forward(self, x):
|
| 567 |
+
x = self.proj_in(x)
|
| 568 |
+
x, x_ = self.dwconv(x).chunk(2, dim=1)
|
| 569 |
+
x = self.forward_core(x)
|
| 570 |
+
x = F.silu(x_) * x
|
| 571 |
+
x = self.proj_out(x)
|
| 572 |
+
return x
|
| 573 |
+
|
| 574 |
+
|
| 575 |
+
##########################################################################
|
| 576 |
+
class Strip_VSSB(nn.Module):
|
| 577 |
+
def __init__(self, dim, vssm_expansion_factor, ffn_expansion_factor, bias=False, ssm=False, LayerNorm_type='WithBias'):
|
| 578 |
+
super(Strip_VSSB, self).__init__()
|
| 579 |
+
self.ssm = ssm
|
| 580 |
+
if self.ssm == True:
|
| 581 |
+
self.norm1_ssm = LayerNorm(dim, LayerNorm_type)
|
| 582 |
+
self.norm2_ssm = LayerNorm(dim, LayerNorm_type)
|
| 583 |
+
self.intra = Intra_VSSM(dim, vssm_expansion_factor, bias)
|
| 584 |
+
self.inter = Inter_VSSM(dim, vssm_expansion_factor, bias)
|
| 585 |
+
self.norm1_ffn = LayerNorm(dim, LayerNorm_type)
|
| 586 |
+
self.norm2_ffn = LayerNorm(dim, LayerNorm_type)
|
| 587 |
+
self.ffn1 = GDFN(dim, ffn_expansion_factor, bias)
|
| 588 |
+
self.ffn2 = GDFN(dim, ffn_expansion_factor, bias)
|
| 589 |
+
|
| 590 |
+
def forward(self, x):
|
| 591 |
+
if self.ssm == True:
|
| 592 |
+
x = x + self.intra(self.norm1_ssm(x))
|
| 593 |
+
x = x + self.ffn1(self.norm1_ffn(x))
|
| 594 |
+
if self.ssm == True:
|
| 595 |
+
x = x + self.inter(self.norm2_ssm(x))
|
| 596 |
+
x = x + self.ffn2(self.norm2_ffn(x))
|
| 597 |
+
|
| 598 |
+
return x
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
##########################################################################
|
| 602 |
+
##---------- Cross-level Feature Fusion by Adding Sigmoid(KL-Div) * Multi-Scale Feat -----------------------
|
| 603 |
+
class CLFF(nn.Module):
|
| 604 |
+
def __init__(self, dim, dim_n1, dim_n2, bias=False):
|
| 605 |
+
super(CLFF, self).__init__()
|
| 606 |
+
|
| 607 |
+
self.conv = nn.Conv2d(dim, dim, kernel_size=1, bias=bias)
|
| 608 |
+
self.conv_n1 = nn.Conv2d(dim_n1, dim, kernel_size=1, bias=bias)
|
| 609 |
+
self.conv_n2 = nn.Conv2d(dim_n2, dim, kernel_size=1, bias=bias)
|
| 610 |
+
self.fuse_out1 = nn.Conv2d(dim*2, dim, kernel_size=1, bias=bias)
|
| 611 |
+
|
| 612 |
+
self.log_sigmoid = nn.LogSigmoid()
|
| 613 |
+
self.sigmoid = nn.Sigmoid()
|
| 614 |
+
|
| 615 |
+
def forward(self, x, n1, n2):
|
| 616 |
+
x_ = self.conv(x)
|
| 617 |
+
n1_ = self.conv_n1(n1)
|
| 618 |
+
n2_ = self.conv_n2(n2)
|
| 619 |
+
kl_n1 = F.kl_div(input=self.log_sigmoid(n1_), target=self.log_sigmoid(x_), log_target=True)
|
| 620 |
+
kl_n2 = F.kl_div(input=self.log_sigmoid(n2_), target=self.log_sigmoid(x_), log_target=True)
|
| 621 |
+
#g = self.sigmoid(x_)
|
| 622 |
+
g1 = self.sigmoid(kl_n1)
|
| 623 |
+
g2 = self.sigmoid(kl_n2)
|
| 624 |
+
#x = (1 + g) * x_ + (1 - g) * (g1 * n1_ + g2 * n2_)
|
| 625 |
+
x = self.fuse_out1(torch.cat((x_, g1 * n1_ + g2 * n2_), dim=1))
|
| 626 |
+
|
| 627 |
+
return x
|
| 628 |
+
|
| 629 |
+
##########################################################################
|
| 630 |
+
##---------- XYScanNet -----------------------
|
| 631 |
+
class XYScanNet(nn.Module):
|
| 632 |
+
def __init__(self,
|
| 633 |
+
inp_channels=3,
|
| 634 |
+
out_channels=3,
|
| 635 |
+
dim = 24, # 48, 72, 96, 120, 144, default: 72
|
| 636 |
+
num_blocks = [3,3,6],
|
| 637 |
+
vssm_expansion_factor = 1, # 1 or 2
|
| 638 |
+
ffn_expansion_factor = 1, # 1 or 3
|
| 639 |
+
bias = False,
|
| 640 |
+
LayerNorm_type = 'WithBias', ## Other option 'BiasFree'
|
| 641 |
+
):
|
| 642 |
+
|
| 643 |
+
super(XYScanNet, self).__init__()
|
| 644 |
+
|
| 645 |
+
self.patch_embed = OverlapPatchEmbed(inp_channels, dim)
|
| 646 |
+
|
| 647 |
+
self.encoder_level1 = nn.Sequential(*[Strip_VSSB(dim=dim, vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 648 |
+
bias=bias, ssm=False, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[0])])
|
| 649 |
+
|
| 650 |
+
self.down1_2 = Downsample(dim) ## From Level 1 to Level 2
|
| 651 |
+
self.encoder_level2 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**1), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 652 |
+
bias=bias, ssm=False, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[1])])
|
| 653 |
+
|
| 654 |
+
self.down2_3 = Downsample(int(dim*2**1)) ## From Level 2 to Level 3
|
| 655 |
+
self.encoder_level3 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**2), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 656 |
+
bias=bias, ssm=False, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[2])])
|
| 657 |
+
|
| 658 |
+
self.decoder_level3 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**2), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 659 |
+
bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[2])])
|
| 660 |
+
|
| 661 |
+
self.up3_2 = Upsample(int(dim*2**2)) ## From Level 3 to Level 2
|
| 662 |
+
self.clff_level2 = CLFF(int(dim*2**1), dim_n1=int(dim*2**0), dim_n2=(dim*2**2), bias=bias)
|
| 663 |
+
self.reduce_chan_level2 = nn.Conv2d(int(dim*2**2), int(dim*2**1), kernel_size=1, bias=bias)
|
| 664 |
+
self.decoder_level2 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**1), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 665 |
+
bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[1])])
|
| 666 |
+
|
| 667 |
+
self.up2_1 = Upsample(int(dim*2**1)) ## From Level 2 to Level 1
|
| 668 |
+
self.clff_level1 = CLFF(int(dim*2**0), dim_n1=int(dim*2**1), dim_n2=(dim*2**2), bias=bias)
|
| 669 |
+
self.reduce_chan_level1 = nn.Conv2d(int(dim*2**1), int(dim*2**0), kernel_size=1, bias=bias)
|
| 670 |
+
self.decoder_level1 = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**0), vssm_expansion_factor=vssm_expansion_factor, ffn_expansion_factor = ffn_expansion_factor,
|
| 671 |
+
bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_blocks[0])])
|
| 672 |
+
|
| 673 |
+
# self.refinement = nn.Sequential(*[Strip_VSSB(dim=int(dim*2**0), expansion_factor=expansion_factor, bias=bias, ssm=True, LayerNorm_type=LayerNorm_type) for i in range(num_refinement_blocks)])
|
| 674 |
+
|
| 675 |
+
self.output = nn.Conv2d(int(dim*2**0), out_channels, kernel_size=3, stride=1, padding=1, bias=bias)
|
| 676 |
+
|
| 677 |
+
def forward(self, inp_img):
|
| 678 |
+
|
| 679 |
+
# Encoder
|
| 680 |
+
inp_enc_level1 = self.patch_embed(inp_img)
|
| 681 |
+
out_enc_level1 = self.encoder_level1(inp_enc_level1)
|
| 682 |
+
out_enc_level1_2 = F.interpolate(out_enc_level1, scale_factor=0.5) # dim*2, lvl1 down-scaled to lvl2
|
| 683 |
+
|
| 684 |
+
inp_enc_level2 = self.down1_2(out_enc_level1)
|
| 685 |
+
out_enc_level2 = self.encoder_level2(inp_enc_level2)
|
| 686 |
+
out_enc_level2_1 = F.interpolate(out_enc_level2, scale_factor=2) # dim*2, lvl2 up-scaled to lvl1
|
| 687 |
+
|
| 688 |
+
inp_enc_level3 = self.down2_3(out_enc_level2)
|
| 689 |
+
out_enc_level3 = self.encoder_level3(inp_enc_level3)
|
| 690 |
+
out_enc_level3_2 = F.interpolate(out_enc_level3, scale_factor=2) # dim*2**2, lvl3 up-scaled to lvl2 (lvl3->lvl2)
|
| 691 |
+
out_enc_level3_1 = F.interpolate(out_enc_level3_2, scale_factor=2) # dim*2**2, lvl3 up-scaled to lvl1 (lvl3->lvl2->lvl1)
|
| 692 |
+
|
| 693 |
+
out_enc_level1 = self.clff_level1(out_enc_level1, out_enc_level2_1, out_enc_level3_1)
|
| 694 |
+
out_enc_level2 = self.clff_level2(out_enc_level2, out_enc_level1_2, out_enc_level3_2)
|
| 695 |
+
|
| 696 |
+
# Decoder
|
| 697 |
+
out_dec_level3_decomp1 = self.decoder_level3(out_enc_level3)
|
| 698 |
+
|
| 699 |
+
inp_dec_level2_decomp1 = self.up3_2(out_dec_level3_decomp1)
|
| 700 |
+
inp_dec_level2_decomp1 = self.reduce_chan_level2(torch.cat((inp_dec_level2_decomp1, out_enc_level2), dim=1))
|
| 701 |
+
out_dec_level2_decomp1 = self.decoder_level2(inp_dec_level2_decomp1)
|
| 702 |
+
|
| 703 |
+
inp_dec_level1_decomp1 = self.up2_1(out_dec_level2_decomp1)
|
| 704 |
+
inp_dec_level1_decomp1 = self.reduce_chan_level1(torch.cat((inp_dec_level1_decomp1, out_enc_level1), dim=1))
|
| 705 |
+
out_dec_level1_decomp1 = self.decoder_level1(inp_dec_level1_decomp1)
|
| 706 |
+
|
| 707 |
+
out_dec_level1_decomp1 = self.output(out_dec_level1_decomp1)
|
| 708 |
+
|
| 709 |
+
out_dec_level1 = out_dec_level1_decomp1 + inp_img
|
| 710 |
+
|
| 711 |
+
|
| 712 |
+
return out_dec_level1, out_dec_level1_decomp1, None
|
| 713 |
+
|
| 714 |
+
#"""
|
| 715 |
+
import time
|
| 716 |
+
start_time = time.time()
|
| 717 |
+
inp = torch.randn(1, 3, 512, 512).cuda()#.to(dtype=torch.float16)
|
| 718 |
+
model = XYScanNet().cuda()#.to(dtype=torch.float16)
|
| 719 |
+
out = model(inp)[0]
|
| 720 |
+
print(out.shape)
|
| 721 |
+
print("--- %s seconds ---" % (time.time() - start_time))
|
| 722 |
+
pytorch_total_params = sum(p.numel() for p in model.parameters())
|
| 723 |
+
print("--- {num} parameters ---".format(num = pytorch_total_params))
|
| 724 |
+
pytorch_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 725 |
+
print("--- {num} trainable parameters ---".format(num = pytorch_trainable_params))
|
| 726 |
+
gpu_memmem_usage_bytes = torch.cuda.max_memory_allocated()
|
| 727 |
+
print(gpu_memmem_usage_bytes / 1024 / 1024 / 1024) # 64: 0.61 128: 2.21 256: 8.56; 512: 33.45
|
| 728 |
+
#"""
|
| 729 |
+
"""
|
| 730 |
+
import torch
|
| 731 |
+
from ptflops import get_model_complexity_info
|
| 732 |
+
|
| 733 |
+
with torch.cuda.device(0):
|
| 734 |
+
net = model
|
| 735 |
+
macs, params = get_model_complexity_info(net, (3, 256, 256), as_strings=True,
|
| 736 |
+
print_per_layer_stat=True, verbose=True)
|
| 737 |
+
print('{:<30} {:<8}'.format('Computational complexity: ', macs)) # 31.97 GMac
|
| 738 |
+
print('{:<30} {:<8}'.format('Number of parameters: ', params)) # 8.37 M
|
| 739 |
+
"""
|
| 740 |
+
"""
|
| 741 |
+
import time
|
| 742 |
+
start_time = time.time()
|
| 743 |
+
inp = torch.randn(1, 128, 64, 64).cuda()#.to(dtype=torch.float16)
|
| 744 |
+
model = Strip_VSSB(dim=128, expansion_factor=1).cuda()#.to(dtype=torch.float16)
|
| 745 |
+
out = model(inp)
|
| 746 |
+
print(out.shape)
|
| 747 |
+
print("--- %s seconds ---" % (time.time() - start_time))
|
| 748 |
+
pytorch_total_params = sum(p.numel() for p in model.parameters())
|
| 749 |
+
print("--- {num} parameters ---".format(num = pytorch_total_params))
|
| 750 |
+
pytorch_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 751 |
+
print("--- {num} trainable parameters ---".format(num = pytorch_trainable_params))
|
| 752 |
+
gpu_memmem_usage_bytes = torch.cuda.max_memory_allocated()
|
| 753 |
+
print(gpu_memmem_usage_bytes / 1024 / 1024 / 1024) # 128: 0.16 256: 0.24 512: 0.65
|
| 754 |
+
"""
|
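Note: the model above returns a tuple (restored image, restored residual, None) and expects inputs normalized to [-0.5, 0.5]. Below is a minimal inference sketch for arbitrary-size images; the reflect padding to a multiple of 8 mirrors what the RealBlur/RWBI prediction scripts further down do. The helper name `deblur_image` and the example path are illustrative, not part of the repository.

```python
# Minimal inference sketch (assumes the XYScanNet class above and a CUDA device).
import cv2
import numpy as np
import torch
import torch.nn.functional as F

def deblur_image(model, img_bgr):
    img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    x = torch.from_numpy(np.transpose(img / 255, (2, 0, 1)).astype('float32')) - 0.5
    x = x.unsqueeze(0).cuda()
    h, w = x.shape[2], x.shape[3]
    factor = 8
    padh = (factor - h % factor) % factor   # pad so H and W are multiples of 8
    padw = (factor - w % factor) % factor
    x = F.pad(x, (0, padw, 0, padh), 'reflect')
    with torch.no_grad():
        out, _, _ = model(x)                # (restored, residual, None)
    out = torch.clamp(out[:, :, :h, :w], -0.5, 0.5) + 0.5
    return out                              # (1, 3, h, w) tensor in [0, 1]

# Hypothetical usage:
# model = XYScanNet().cuda()
# restored = deblur_image(model, cv2.imread('examples/blur1.png'))
```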
out/Results.txt
ADDED
|
@@ -0,0 +1 @@
testing results are created in this folder
predict_GoPro_test_results.py
ADDED
|
@@ -0,0 +1,89 @@
from __future__ import print_function
import numpy as np
import torch
import cv2
import yaml
import os
from torch.autograd import Variable
from models.networks import get_generator
import torchvision
import time
import argparse
import torch.nn.functional as F

def get_args():
    parser = argparse.ArgumentParser('Test an image')
    parser.add_argument('--job_name', default='xyscannet',
                        type=str, help='current job s name')
    return parser.parse_args()

def print_max_gpu_usage():
    """Prints the maximum GPU memory usage in GB."""
    max_memory = torch.cuda.max_memory_allocated()
    max_memory_in_gb = max_memory / (1024 ** 3)  # Convert bytes to GB
    print(f"Maximum GPU memory usage during test: {max_memory_in_gb:.2f} GB")

if __name__ == '__main__':
    # optionally reset gpu
    #torch.cuda.reset_max_memory_allocated()
    args = get_args()
    #with open(os.path.join('config/', args.job_name, 'config_stage2.yaml'), 'r') as cfg:  # change the CFG name to test different models: pretrained, gopro, refined, stage1, stage2
    #    config = yaml.safe_load(cfg)
    with open(os.path.join('config/', args.job_name, 'config_stage2.yaml'), 'r') as cfg:  # change the CFG name to test different models: pretrained, gopro, refined, stage1, stage2
        config = yaml.safe_load(cfg)
    blur_path = '/mnt/g/RESEARCH/PHD/Motion_Deblurred/datasets/GOPRO_/test/testA'
    out_path = os.path.join('results', args.job_name, 'images')
    weights_path = os.path.join('results', args.job_name, 'models', 'best_{}.pth'.format(config['experiment_desc']))  # change the model name to test different phases: final/best
    if not os.path.isdir(out_path):
        os.mkdir(out_path)
    model = get_generator(config['model'])
    model.load_state_dict(torch.load(weights_path))
    model = model.cuda()
    #model.eval()

    test_time = 0
    iteration = 0
    total_image_number = 1111

    # warm-up
    warm_up = 0
    print('Hardware warm-up')
    for file in os.listdir(blur_path):
        for img_name in os.listdir(blur_path + '/' + file):
            warm_up += 1
            img = cv2.imread(blur_path + '/' + file + '/' + img_name)
            img_tensor = torch.from_numpy(np.transpose(img / 255, (2, 0, 1)).astype('float32')) - 0.5
            with torch.no_grad():
                img_tensor = Variable(img_tensor.unsqueeze(0)).cuda()
                result_image, decomp1, decomp2 = model(img_tensor)
                #result_image = model(img_tensor)
            if warm_up == 20:
                break
        break

    for file in os.listdir(blur_path):
        if not os.path.isdir(out_path + '/' + file):
            os.mkdir(out_path + '/' + file)
        for img_name in os.listdir(blur_path + '/' + file):
            img = cv2.imread(blur_path + '/' + file + '/' + img_name)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img_tensor = torch.from_numpy(np.transpose(img / 255, (2, 0, 1)).astype('float32')) - 0.5
            with torch.no_grad():
                iteration += 1
                img_tensor = Variable(img_tensor.unsqueeze(0)).cuda()

                start = time.time()
                result_image, decomp1, decomp2 = model(img_tensor)
                #result_image = model(img_tensor)
                stop = time.time()
                print('Image:{}/{}, CNN Runtime:{:.4f}'.format(iteration, total_image_number, (stop - start)))
                test_time += stop - start
                print('Average Runtime:{:.4f}'.format(test_time / float(iteration)))
                result_image = result_image + 0.5
                out_file_name = out_path + '/' + file + '/' + img_name
                # optionally save image
                torchvision.utils.save_image(result_image, out_file_name)

    # optionally print gpu usage
    #print_max_gpu_usage()
    #torch.cuda.reset_max_memory_allocated()
|
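The GoPro script above only writes restored frames; benchmark numbers are computed separately. As a rough, unofficial sanity check, one can compare a saved output against its sharp counterpart with PSNR. The sketch below assumes the sharp frames live in a `testB` folder mirroring `testA`; both paths and the frame name are placeholders.

```python
# Rough PSNR sanity check for a single saved result (not the official evaluation protocol).
import cv2
import numpy as np

def psnr(a, b, max_val=255.0):
    mse = np.mean((a.astype(np.float64) - b.astype(np.float64)) ** 2)
    return float('inf') if mse == 0 else 10 * np.log10(max_val ** 2 / mse)

# Placeholder paths: adjust to your own result / ground-truth layout.
restored = cv2.imread('results/xyscannet/images/GOPR0384_11_00/000001.png')
sharp = cv2.imread('datasets/GOPRO_/test/testB/GOPR0384_11_00/000001.png')
print('PSNR: {:.2f} dB'.format(psnr(restored, sharp)))
```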
predict_HIDE_test_results.py
ADDED
|
@@ -0,0 +1,69 @@
| 1 |
+
from __future__ import print_function
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
import cv2
|
| 5 |
+
import yaml
|
| 6 |
+
import os
|
| 7 |
+
from torch.autograd import Variable
|
| 8 |
+
from models.networks import get_generator
|
| 9 |
+
import torchvision
|
| 10 |
+
import time
|
| 11 |
+
import argparse
|
| 12 |
+
|
| 13 |
+
def get_args():
|
| 14 |
+
parser = argparse.ArgumentParser('Test an image')
|
| 15 |
+
parser.add_argument('--job_name', default='xyscannet',
|
| 16 |
+
type=str, help='current job s name')
|
| 17 |
+
return parser.parse_args()
|
| 18 |
+
|
| 19 |
+
if __name__ == '__main__':
|
| 20 |
+
args = get_args()
|
| 21 |
+
with open(os.path.join('config/', args.job_name, 'config_stage2.yaml')) as cfg: # change the yaml file to config_pretrained if ablation
|
| 22 |
+
#with open(os.path.join('config/', args.job_name, 'config_stage2.yaml')) as cfg: # change the yaml file to config_pretrained if ablation
|
| 23 |
+
config = yaml.safe_load(cfg)
|
| 24 |
+
blur_path = '/scratch/user/hanzhou1996/datasets/deblur/HIDE/test/testA/'
|
| 25 |
+
out_path = os.path.join('results', args.job_name, 'images_hide')
|
| 26 |
+
weights_path = os.path.join('results', args.job_name, 'models', 'best_XYScanNet_stage2.pth') # change the model name to test different phases: final/best
|
| 27 |
+
if not os.path.isdir(out_path):
|
| 28 |
+
os.mkdir(out_path)
|
| 29 |
+
model = get_generator(config['model'])
|
| 30 |
+
model.load_state_dict(torch.load(weights_path))
|
| 31 |
+
model = model.cuda()
|
| 32 |
+
test_time = 0
|
| 33 |
+
iteration = 0
|
| 34 |
+
total_image_number = 2025
|
| 35 |
+
|
| 36 |
+
# warm up
|
| 37 |
+
warm_up = 0
|
| 38 |
+
print('Hardware warm-up')
|
| 39 |
+
for img_name in os.listdir(blur_path):
|
| 40 |
+
warm_up += 1
|
| 41 |
+
img = cv2.imread(blur_path + '/' + img_name)
|
| 42 |
+
img_tensor = torch.from_numpy(np.transpose(img / 255, (2, 0, 1)).astype('float32')) - 0.5
|
| 43 |
+
with torch.no_grad():
|
| 44 |
+
img_tensor = Variable(img_tensor.unsqueeze(0)).cuda()
|
| 45 |
+
result_image, decomp1, decomp2 = model(img_tensor)
|
| 46 |
+
#result_image = model(img_tensor)
|
| 47 |
+
if warm_up == 20:
|
| 48 |
+
break
|
| 49 |
+
break
|
| 50 |
+
|
| 51 |
+
for img_name in os.listdir(blur_path):
|
| 52 |
+
img = cv2.imread(blur_path + '/' + img_name)
|
| 53 |
+
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
| 54 |
+
img_tensor = torch.from_numpy(np.transpose(img / 255, (2, 0, 1)).astype('float32')) - 0.5
|
| 55 |
+
with torch.no_grad():
|
| 56 |
+
iteration += 1
|
| 57 |
+
img_tensor = Variable(img_tensor.unsqueeze(0)).cuda()
|
| 58 |
+
|
| 59 |
+
start = time.time()
|
| 60 |
+
result_image, decomp1, decomp2 = model(img_tensor)
|
| 61 |
+
#result_image = model(img_tensor)
|
| 62 |
+
stop = time.time()
|
| 63 |
+
|
| 64 |
+
print('Image:{}/{}, CNN Runtime:{:.4f}'.format(iteration, total_image_number, (stop - start)))
|
| 65 |
+
test_time += stop - start
|
| 66 |
+
print('Average Runtime:{:.4f}'.format(test_time / float(iteration)))
|
| 67 |
+
result_image = result_image + 0.5
|
| 68 |
+
out_file_name = out_path + '/' + img_name
|
| 69 |
+
torchvision.utils.save_image(result_image, out_file_name)
|
predict_RWBI_test_results.py
ADDED
|
@@ -0,0 +1,88 @@
| 1 |
+
from __future__ import print_function
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
import cv2
|
| 5 |
+
import yaml
|
| 6 |
+
import os
|
| 7 |
+
from torch.autograd import Variable
|
| 8 |
+
from models.networks import get_generator
|
| 9 |
+
import torchvision
|
| 10 |
+
import time
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
import argparse
|
| 13 |
+
|
| 14 |
+
def get_args():
|
| 15 |
+
parser = argparse.ArgumentParser('Test an image')
|
| 16 |
+
parser.add_argument('--job_name', default='xyscannet',
|
| 17 |
+
type=str, help='current job s name')
|
| 18 |
+
return parser.parse_args()
|
| 19 |
+
|
| 20 |
+
if __name__ == '__main__':
|
| 21 |
+
args = get_args()
|
| 22 |
+
with open(os.path.join('config/', args.job_name, 'config_pretrained.yaml')) as cfg: # change the yaml file to config_pretrained if ablation
|
| 23 |
+
#with open(os.path.join('config/', args.job_name, 'config_stage2.yaml')) as cfg: # change the yaml file to config_pretrained if ablation
|
| 24 |
+
config = yaml.safe_load(cfg)
|
| 25 |
+
blur_path = '/scratch/user/hanzhou1996/datasets/deblur/RWBI/test/testA/'
|
| 26 |
+
out_path = os.path.join('results', args.job_name, 'images_rwbi')
|
| 27 |
+
weights_path = os.path.join('results', args.job_name, 'models', 'best_XYScanNet_stage2.pth') # change the model name to test different phases: final/best
|
| 28 |
+
if not os.path.isdir(out_path):
|
| 29 |
+
os.mkdir(out_path)
|
| 30 |
+
model = get_generator(config['model'])
|
| 31 |
+
model.load_state_dict(torch.load(weights_path))
|
| 32 |
+
model = model.cuda()
|
| 33 |
+
test_time = 0
|
| 34 |
+
iteration = 0
|
| 35 |
+
total_image_number = 1000
|
| 36 |
+
|
| 37 |
+
# warm up
|
| 38 |
+
warm_up = 0
|
| 39 |
+
print('Hardware warm-up')
|
| 40 |
+
for img_name in os.listdir(blur_path):
|
| 41 |
+
warm_up += 1
|
| 42 |
+
img = cv2.imread(blur_path + '/' + img_name)
|
| 43 |
+
img_tensor = torch.from_numpy(np.transpose(img / 255, (2, 0, 1)).astype('float32')) - 0.5
|
| 44 |
+
with torch.no_grad():
|
| 45 |
+
img_tensor = Variable(img_tensor.unsqueeze(0)).cuda()
|
| 46 |
+
factor = 8
|
| 47 |
+
h, w = img_tensor.shape[2], img_tensor.shape[3]
|
| 48 |
+
H, W = ((h + factor) // factor) * factor, ((w + factor) // factor) * factor
|
| 49 |
+
padh = H - h if h % factor != 0 else 0
|
| 50 |
+
padw = W - w if w % factor != 0 else 0
|
| 51 |
+
img_tensor = F.pad(img_tensor, (0, padw, 0, padh), 'reflect')
|
| 52 |
+
H, W = img_tensor.shape[2], img_tensor.shape[3]
|
| 53 |
+
|
| 54 |
+
result_image, decomp1, decomp2 = model(img_tensor)
|
| 55 |
+
if warm_up == 20:
|
| 56 |
+
break
|
| 57 |
+
|
| 58 |
+
for file in os.listdir(blur_path):
|
| 59 |
+
if not os.path.isdir(out_path):
|
| 60 |
+
os.mkdir(out_path)
|
| 61 |
+
img = cv2.imread(blur_path + '/' + file)
|
| 62 |
+
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
| 63 |
+
img_tensor = torch.from_numpy(np.transpose(img / 255, (2, 0, 1)).astype('float32')) - 0.5
|
| 64 |
+
with torch.no_grad():
|
| 65 |
+
iteration += 1
|
| 66 |
+
img_tensor = Variable(img_tensor.unsqueeze(0)).cuda()
|
| 67 |
+
|
| 68 |
+
factor = 8
|
| 69 |
+
h, w = img_tensor.shape[2], img_tensor.shape[3]
|
| 70 |
+
H, W = ((h + factor) // factor) * factor, ((w + factor) // factor) * factor
|
| 71 |
+
padh = H - h if h % factor != 0 else 0
|
| 72 |
+
padw = W - w if w % factor != 0 else 0
|
| 73 |
+
img_tensor = F.pad(img_tensor, (0, padw, 0, padh), 'reflect')
|
| 74 |
+
H, W = img_tensor.shape[2], img_tensor.shape[3]
|
| 75 |
+
|
| 76 |
+
#with torch.autocast(device_type='cuda', dtype=torch.float16):
|
| 77 |
+
start = time.time()
|
| 78 |
+
result_image, decomp1, decomp2 = model(img_tensor)
|
| 79 |
+
stop = time.time()
|
| 80 |
+
|
| 81 |
+
result_image = result_image[:, :, :h, :w]
|
| 82 |
+
|
| 83 |
+
print('Image:{}/{}, CNN Runtime:{:.4f}'.format(iteration, total_image_number, (stop - start)))
|
| 84 |
+
test_time += stop - start
|
| 85 |
+
print('Average Runtime:{:.4f}'.format(test_time / float(iteration)))
|
| 86 |
+
result_image = result_image + 0.5
|
| 87 |
+
out_file_name = out_path + '/' + file
|
| 88 |
+
torchvision.utils.save_image(result_image, out_file_name)
|
predict_RealBlur_J_test_results.py
ADDED
|
@@ -0,0 +1,97 @@
| 1 |
+
from __future__ import print_function
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
import cv2
|
| 5 |
+
import yaml
|
| 6 |
+
import os
|
| 7 |
+
from torch.autograd import Variable
|
| 8 |
+
from models.networks import get_generator
|
| 9 |
+
import torchvision
|
| 10 |
+
import time
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
import argparse
|
| 13 |
+
|
| 14 |
+
def get_args():
|
| 15 |
+
parser = argparse.ArgumentParser('Test an image')
|
| 16 |
+
parser.add_argument('--job_name', default='fsformer_without_fs',
|
| 17 |
+
type=str, help='current job s name')
|
| 18 |
+
return parser.parse_args()
|
| 19 |
+
|
| 20 |
+
if __name__ == '__main__':
|
| 21 |
+
args = get_args()
|
| 22 |
+
with open(os.path.join('config/', args.job_name, 'config_stage2.yaml')) as cfg:
|
| 23 |
+
#with open(os.path.join('config/', args.job_name, 'config_pretrained.yaml')) as cfg:
|
| 24 |
+
config = yaml.safe_load(cfg)
|
| 25 |
+
blur_path = '/scratch/user/hanzhou1996/datasets/deblur/RealBlur_J/test/testA'
|
| 26 |
+
out_path = os.path.join('results', args.job_name, 'images_realj')
|
| 27 |
+
weights_path = os.path.join('results', args.job_name, 'models', 'final_XYScanNet_stage2.pth') # change the model name to test different phases: final/best final_StripMamba_pretrained.pth
|
| 28 |
+
if not os.path.isdir(out_path):
|
| 29 |
+
os.mkdir(out_path)
|
| 30 |
+
model = get_generator(config['model'])
|
| 31 |
+
model.load_state_dict(torch.load(weights_path))
|
| 32 |
+
model = model.cuda()
|
| 33 |
+
test_time = 0
|
| 34 |
+
iteration = 0
|
| 35 |
+
total_image_number = 980
|
| 36 |
+
|
| 37 |
+
# warm up
|
| 38 |
+
warm_up = 0
|
| 39 |
+
print('Hardware warm-up')
|
| 40 |
+
for file in os.listdir(blur_path):
|
| 41 |
+
#if not os.path.isdir(out_path + '/' + file):
|
| 42 |
+
# os.mkdir(out_path + '/' + file)
|
| 43 |
+
img_name = file
|
| 44 |
+
# for img_name in os.listdir(blur_path + '/' + file):
|
| 45 |
+
warm_up += 1
|
| 46 |
+
img = cv2.imread(blur_path + '/' + file)
|
| 47 |
+
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
| 48 |
+
img_tensor = torch.from_numpy(np.transpose(img / 255, (2, 0, 1)).astype('float32')) - 0.5
|
| 49 |
+
with torch.no_grad():
|
| 50 |
+
img_tensor = Variable(img_tensor.unsqueeze(0)).cuda()
|
| 51 |
+
factor = 8
|
| 52 |
+
h, w = img_tensor.shape[2], img_tensor.shape[3]
|
| 53 |
+
H, W = ((h + factor) // factor) * factor, ((w + factor) // factor) * factor
|
| 54 |
+
padh = H - h if h % factor != 0 else 0
|
| 55 |
+
padw = W - w if w % factor != 0 else 0
|
| 56 |
+
img_tensor = F.pad(img_tensor, (0, padw, 0, padh), 'reflect')
|
| 57 |
+
result_image, decomp1, decomp2 = model(img_tensor)
|
| 58 |
+
#result_image = model(img_tensor)
|
| 59 |
+
if warm_up == 20:
|
| 60 |
+
break
|
| 61 |
+
break
|
| 62 |
+
|
| 63 |
+
for file in os.listdir(blur_path):
|
| 64 |
+
#if not os.path.isdir(out_path + '/' + file):
|
| 65 |
+
# os.mkdir(out_path + '/' + file)
|
| 66 |
+
img_name = file
|
| 67 |
+
# for img_name in os.listdir(blur_path + '/' + file):
|
| 68 |
+
img = cv2.imread(blur_path + '/' + file)
|
| 69 |
+
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
| 70 |
+
img_tensor = torch.from_numpy(np.transpose(img / 255, (2, 0, 1)).astype('float32')) - 0.5
|
| 71 |
+
with torch.no_grad():
|
| 72 |
+
iteration += 1
|
| 73 |
+
img_tensor = Variable(img_tensor.unsqueeze(0)).cuda()
|
| 74 |
+
|
| 75 |
+
factor = 8
|
| 76 |
+
h, w = img_tensor.shape[2], img_tensor.shape[3]
|
| 77 |
+
H, W = ((h + factor) // factor) * factor, ((w + factor) // factor) * factor
|
| 78 |
+
padh = H - h if h % factor != 0 else 0
|
| 79 |
+
padw = W - w if w % factor != 0 else 0
|
| 80 |
+
img_tensor = F.pad(img_tensor, (0, padw, 0, padh), 'reflect')
|
| 81 |
+
H, W = img_tensor.shape[2], img_tensor.shape[3]
|
| 82 |
+
|
| 83 |
+
start = time.time()
|
| 84 |
+
_output, decomp1, decomp2 = model(img_tensor)
|
| 85 |
+
#_output = model(img_tensor)
|
| 86 |
+
stop = time.time()
|
| 87 |
+
|
| 88 |
+
result_image = _output[:, :, :h, :w]
|
| 89 |
+
result_image = torch.clamp(result_image, -0.5, 0.5)
|
| 90 |
+
result_image = result_image + 0.5
|
| 91 |
+
|
| 92 |
+
test_time += stop - start
|
| 93 |
+
print('Image:{}/{}, CNN Runtime:{:.4f}'.format(iteration, total_image_number, (stop - start)))
|
| 94 |
+
print('Average Runtime:{:.4f}'.format(test_time / float(iteration)))
|
| 95 |
+
out_file_name = out_path + '/' + img_name
|
| 96 |
+
torchvision.utils.save_image(result_image, out_file_name)
|
| 97 |
+
|
predict_RealBlur_R_test_results.py
ADDED
|
@@ -0,0 +1,96 @@
| 1 |
+
from __future__ import print_function
|
| 2 |
+
import argparse
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
import cv2
|
| 6 |
+
import yaml
|
| 7 |
+
import os
|
| 8 |
+
from torch.autograd import Variable
|
| 9 |
+
from models.networks import get_generator
|
| 10 |
+
import torchvision
|
| 11 |
+
import time
|
| 12 |
+
import torch.nn.functional as F
|
| 13 |
+
|
| 14 |
+
def get_args():
|
| 15 |
+
parser = argparse.ArgumentParser('Test an image')
|
| 16 |
+
parser.add_argument('--job_name', default='xyscannet',
|
| 17 |
+
type=str, help='current job s name')
|
| 18 |
+
return parser.parse_args()
|
| 19 |
+
|
| 20 |
+
if __name__ == '__main__':
|
| 21 |
+
args = get_args()
|
| 22 |
+
with open(os.path.join('config/', args.job_name, 'config_stage2.yaml')) as cfg:
|
| 23 |
+
config = yaml.safe_load(cfg)
|
| 24 |
+
blur_path = '/scratch/user/hanzhou1996/datasets/deblur/RealBlur_R/test/testA'
|
| 25 |
+
out_path = os.path.join('results', args.job_name, 'images_realr')
|
| 26 |
+
weights_path = os.path.join('results', args.job_name, 'models', 'final_XYScanNet_stage2.pth') # change the model name to test different phases: final/best
|
| 27 |
+
if not os.path.isdir(out_path):
|
| 28 |
+
os.mkdir(out_path)
|
| 29 |
+
model = get_generator(config['model'])
|
| 30 |
+
model.load_state_dict(torch.load(weights_path))
|
| 31 |
+
model = model.cuda()
|
| 32 |
+
test_time = 0
|
| 33 |
+
iteration = 0
|
| 34 |
+
total_image_number = 980
|
| 35 |
+
|
| 36 |
+
# warm up
|
| 37 |
+
warm_up = 0
|
| 38 |
+
print('Hardware warm-up')
|
| 39 |
+
for file in os.listdir(blur_path):
|
| 40 |
+
#if not os.path.isdir(out_path + '/' + file):
|
| 41 |
+
# os.mkdir(out_path + '/' + file)
|
| 42 |
+
#for img_name in os.listdir(blur_path + '/' + file):
|
| 43 |
+
img_name = file
|
| 44 |
+
warm_up += 1
|
| 45 |
+
img = cv2.imread(blur_path + '/' + file)
|
| 46 |
+
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
| 47 |
+
img_tensor = torch.from_numpy(np.transpose(img / 255, (2, 0, 1)).astype('float32')) - 0.5
|
| 48 |
+
with torch.no_grad():
|
| 49 |
+
img_tensor = Variable(img_tensor.unsqueeze(0)).cuda()
|
| 50 |
+
factor = 8
|
| 51 |
+
h, w = img_tensor.shape[2], img_tensor.shape[3]
|
| 52 |
+
H, W = ((h + factor) // factor) * factor, ((w + factor) // factor) * factor
|
| 53 |
+
padh = H - h if h % factor != 0 else 0
|
| 54 |
+
padw = W - w if w % factor != 0 else 0
|
| 55 |
+
img_tensor = F.pad(img_tensor, (0, padw, 0, padh), 'reflect')
|
| 56 |
+
result_image, decomp1, decomp2 = model(img_tensor)
|
| 57 |
+
#result_image = model(img_tensor)
|
| 58 |
+
if warm_up == 20:
|
| 59 |
+
break
|
| 60 |
+
break
|
| 61 |
+
|
| 62 |
+
for file in os.listdir(blur_path):
|
| 63 |
+
#if not os.path.isdir(out_path + '/' + file):
|
| 64 |
+
# os.mkdir(out_path + '/' + file)
|
| 65 |
+
#for img_name in os.listdir(blur_path + '/' + file):
|
| 66 |
+
img_name = file
|
| 67 |
+
img = cv2.imread(blur_path + '/' + file)
|
| 68 |
+
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
| 69 |
+
img_tensor = torch.from_numpy(np.transpose(img / 255, (2, 0, 1)).astype('float32')) - 0.5
|
| 70 |
+
with torch.no_grad():
|
| 71 |
+
iteration += 1
|
| 72 |
+
img_tensor = Variable(img_tensor.unsqueeze(0)).cuda()
|
| 73 |
+
|
| 74 |
+
factor = 8
|
| 75 |
+
h, w = img_tensor.shape[2], img_tensor.shape[3]
|
| 76 |
+
H, W = ((h + factor) // factor) * factor, ((w + factor) // factor) * factor
|
| 77 |
+
padh = H - h if h % factor != 0 else 0
|
| 78 |
+
padw = W - w if w % factor != 0 else 0
|
| 79 |
+
img_tensor = F.pad(img_tensor, (0, padw, 0, padh), 'reflect')
|
| 80 |
+
H, W = img_tensor.shape[2], img_tensor.shape[3]
|
| 81 |
+
|
| 82 |
+
start = time.time()
|
| 83 |
+
_output, decomp1, decomp2 = model(img_tensor)
|
| 84 |
+
#_output = model(img_tensor)
|
| 85 |
+
stop = time.time()
|
| 86 |
+
|
| 87 |
+
result_image = _output[:, :, :h, :w]
|
| 88 |
+
result_image = torch.clamp(result_image, -0.5, 0.5)
|
| 89 |
+
result_image = result_image + 0.5
|
| 90 |
+
|
| 91 |
+
test_time += stop - start
|
| 92 |
+
print('Image:{}/{}, CNN Runtime:{:.4f}'.format(iteration, total_image_number, (stop - start)))
|
| 93 |
+
print('Average Runtime:{:.4f}'.format(test_time / float(iteration)))
|
| 94 |
+
out_file_name = out_path + '/' + img_name
|
| 95 |
+
torchvision.utils.save_image(result_image, out_file_name)
|
| 96 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
gradio==4.44.1
spaces
torch==2.1.2
torchvision==0.16.2
transformers==4.46.3
einops==0.8.1
PyYAML==6.0.2
opencv-python-headless==4.10.0.84
numpy==1.26.4
pillow==10.4.0
mamba-ssm==2.2.2
results/xyscannetp_gopro/models/best_XYScanNet_stage2.pth
ADDED
|
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:da56c2c8ccb0c7cfc86c81431e9b7c2681c2109ca056d56d9454ba4aeb6c07e0
size 254328477
schedulers.py
ADDED
|
@@ -0,0 +1,59 @@
import math

from torch.optim import lr_scheduler


class WarmRestart(lr_scheduler.CosineAnnealingLR):
    """This class implements Stochastic Gradient Descent with Warm Restarts (SGDR): https://arxiv.org/abs/1608.03983.

    Set the learning rate of each parameter group using a cosine annealing schedule. When last_epoch=-1, sets initial lr as lr.
    This can't support scheduler.step(epoch); please keep epoch=None.
    """

    def __init__(self, optimizer, T_max=30, T_mult=1, eta_min=0, last_epoch=-1):
        """implements SGDR

        Parameters:
        ----------
        T_max : int
            Maximum number of epochs.
        T_mult : int
            Multiplicative factor of T_max.
        eta_min : int
            Minimum learning rate. Default: 0.
        last_epoch : int
            The index of last epoch. Default: -1.
        """
        self.T_mult = T_mult
        super().__init__(optimizer, T_max, eta_min, last_epoch)

    def get_lr(self):
        if self.last_epoch == self.T_max:
            self.last_epoch = 0
            self.T_max *= self.T_mult
        return [self.eta_min + (base_lr - self.eta_min) * (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2 for
                base_lr in self.base_lrs]


class LinearDecay(lr_scheduler._LRScheduler):
    """This class implements LinearDecay.
    """

    def __init__(self, optimizer, num_epochs, start_epoch=0, min_lr=0, last_epoch=-1):
        """implements LinearDecay

        Parameters:
        ----------

        """
        self.num_epochs = num_epochs
        self.start_epoch = start_epoch
        self.min_lr = min_lr
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        if self.last_epoch < self.start_epoch:
            return self.base_lrs
        return [base_lr - ((base_lr - self.min_lr) / self.num_epochs) * (self.last_epoch - self.start_epoch) for
                base_lr in self.base_lrs]
|
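For context, a minimal usage sketch for the two schedulers above; the training scripts in this repo actually use torch's CosineAnnealingLR, and the hyperparameter values below are illustrative only.

```python
# Illustrative usage of WarmRestart / LinearDecay; values are placeholders.
from torch import nn, optim
from schedulers import WarmRestart, LinearDecay  # assumes this file is importable as 'schedulers'

net = nn.Conv2d(3, 3, 3)
optimizer = optim.Adam(net.parameters(), lr=1e-4)

scheduler = WarmRestart(optimizer, T_max=30, T_mult=1, eta_min=1e-7)   # cosine schedule restarting every 30 epochs
# scheduler = LinearDecay(optimizer, num_epochs=1000, start_epoch=500, min_lr=1e-7)  # linear ramp-down after epoch 500

for epoch in range(90):
    # ... one training epoch ...
    scheduler.step()   # keep epoch=None, as the WarmRestart docstring notes
```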
train_XYScanNet_stage1.py
ADDED
|
@@ -0,0 +1,182 @@
| 1 |
+
import logging
|
| 2 |
+
from functools import partial
|
| 3 |
+
import os
|
| 4 |
+
import cv2
|
| 5 |
+
import torch
|
| 6 |
+
import torch.optim as optim
|
| 7 |
+
import tqdm
|
| 8 |
+
import yaml
|
| 9 |
+
from joblib import cpu_count
|
| 10 |
+
from torch.utils.data import DataLoader
|
| 11 |
+
import random
|
| 12 |
+
from dataset import PairedDataset
|
| 13 |
+
from metric_counter import MetricCounter
|
| 14 |
+
|
| 15 |
+
from models.losses import get_loss
|
| 16 |
+
from models.models import get_model
|
| 17 |
+
from models.networks import get_nets
|
| 18 |
+
from util import util
|
| 19 |
+
import numpy as np
|
| 20 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 21 |
+
cv2.setNumThreads(0)
|
| 22 |
+
|
| 23 |
+
import argparse
|
| 24 |
+
parser = argparse.ArgumentParser(description='Image motion deblurring evaluation on GoPro/HIDE')
|
| 25 |
+
parser.add_argument('--job_name', default='xyscannet',
|
| 26 |
+
type=str, help='current job s name')
|
| 27 |
+
args = parser.parse_args()
|
| 28 |
+
|
| 29 |
+
class Trainer:
|
| 30 |
+
def __init__(self, config, train: DataLoader, val: DataLoader):
|
| 31 |
+
self.config = config
|
| 32 |
+
self.train_dataset = train
|
| 33 |
+
self.val_dataset = val
|
| 34 |
+
self.metric_counter = MetricCounter(config['experiment_desc'])
|
| 35 |
+
|
| 36 |
+
def train(self):
|
| 37 |
+
self._init_params()
|
| 38 |
+
start_epoch = 0
|
| 39 |
+
print("The current job is: ", args.job_name)
|
| 40 |
+
model_dir = os.path.join('results/', args.job_name, 'models')
|
| 41 |
+
util.mkdir(model_dir)
|
| 42 |
+
if os.path.exists(os.path.join(model_dir, 'last_XYScanNet_stage1.pth')):
|
| 43 |
+
print('resume learning')
|
| 44 |
+
training_state = (torch.load(os.path.join(model_dir, 'last_XYScanNet_stage1.pth')))
|
| 45 |
+
start_epoch = training_state['epoch'] + 1
|
| 46 |
+
new_weight = self.netG.state_dict()
|
| 47 |
+
new_weight.update(training_state['model_state'])
|
| 48 |
+
self.netG.load_state_dict(new_weight)
|
| 49 |
+
new_optimizer = self.optimizer_G.state_dict()
|
| 50 |
+
new_optimizer.update(training_state['optimizer_state'])
|
| 51 |
+
self.optimizer_G.load_state_dict(new_optimizer)
|
| 52 |
+
new_scheduler = self.scheduler_G.state_dict()
|
| 53 |
+
new_scheduler.update(training_state['scheduler_state'])
|
| 54 |
+
self.scheduler_G.load_state_dict(new_scheduler)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
for epoch in range(start_epoch, config['num_epochs']):
|
| 58 |
+
self._run_epoch(epoch)
|
| 59 |
+
if epoch % 30 == 0 or epoch == (config['num_epochs']-1):
|
| 60 |
+
self._validate(epoch)
|
| 61 |
+
self.scheduler_G.step()
|
| 62 |
+
|
| 63 |
+
scheduler_state = self.scheduler_G.state_dict()
|
| 64 |
+
training_state = {'epoch': epoch, 'model_state': self.netG.state_dict(),
|
| 65 |
+
'scheduler_state': scheduler_state, 'optimizer_state': self.optimizer_G.state_dict()}
|
| 66 |
+
if self.metric_counter.update_best_model():
|
| 67 |
+
torch.save(training_state['model_state'],
|
| 68 |
+
os.path.join(model_dir, 'best_{}.pth'.format(self.config['experiment_desc'])))
|
| 69 |
+
|
| 70 |
+
if epoch % 300 == 0:
|
| 71 |
+
torch.save(training_state,
|
| 72 |
+
os.path.join(model_dir, 'last_{}_{}.pth'.format(self.config['experiment_desc'], epoch)))
|
| 73 |
+
|
| 74 |
+
if epoch == (config['num_epochs']-1):
|
| 75 |
+
torch.save(training_state['model_state'],
|
| 76 |
+
os.path.join(model_dir, 'final_{}.pth'.format(self.config['experiment_desc'])))
|
| 77 |
+
|
| 78 |
+
torch.save(training_state,
|
| 79 |
+
os.path.join(model_dir, 'last_{}.pth'.format(self.config['experiment_desc'])))
|
| 80 |
+
|
| 81 |
+
logging.debug("Experiment Name: %s, Epoch: %d, Loss: %s" % (
|
| 82 |
+
self.config['experiment_desc'], epoch, self.metric_counter.loss_message()))
|
| 83 |
+
|
| 84 |
+
def _run_epoch(self, epoch):
|
| 85 |
+
self.metric_counter.clear()
|
| 86 |
+
for param_group in self.optimizer_G.param_groups:
|
| 87 |
+
lr = param_group['lr']
|
| 88 |
+
|
| 89 |
+
epoch_size = config.get('train_batches_per_epoch') or len(self.train_dataset)
|
| 90 |
+
tq = tqdm.tqdm(self.train_dataset)
|
| 91 |
+
tq.set_description('Epoch {}, lr {}'.format(epoch, lr))
|
| 92 |
+
i = 0
|
| 93 |
+
for data in tq:
|
| 94 |
+
inputs, targets = self.model.get_input(data)
|
| 95 |
+
outputs, decomp1, decomp2 = self.netG(inputs)
|
| 96 |
+
self.optimizer_G.zero_grad()
|
| 97 |
+
loss_G = self.criterionG(outputs, targets, inputs)
|
| 98 |
+
loss_G.backward()
|
| 99 |
+
self.optimizer_G.step()
|
| 100 |
+
self.metric_counter.add_losses(loss_G.item())
|
| 101 |
+
curr_psnr, curr_ssim, img_for_vis = self.model.get_images_and_metrics(inputs, outputs, targets)
|
| 102 |
+
self.metric_counter.add_metrics(curr_psnr, curr_ssim)
|
| 103 |
+
tq.set_postfix(loss=self.metric_counter.loss_message())
|
| 104 |
+
if not i:
|
| 105 |
+
self.metric_counter.add_image(img_for_vis, tag='train')
|
| 106 |
+
i += 1
|
| 107 |
+
if i > len(self.train_dataset):
|
| 108 |
+
break
|
| 109 |
+
tq.close()
|
| 110 |
+
self.metric_counter.write_to_tensorboard(epoch)
|
| 111 |
+
|
| 112 |
+
def _validate(self, epoch):
|
| 113 |
+
self.metric_counter.clear()
|
| 114 |
+
epoch_size = config.get('val_batches_per_epoch') or len(self.val_dataset)
|
| 115 |
+
tq = tqdm.tqdm(self.val_dataset)
|
| 116 |
+
tq.set_description('Validation')
|
| 117 |
+
i = 0
|
| 118 |
+
for data in tq:
|
| 119 |
+
with torch.no_grad():
|
| 120 |
+
inputs, targets = self.model.get_input(data)
|
| 121 |
+
outputs, decomp1, decomp2 = self.netG(inputs)
|
| 122 |
+
loss_G = self.criterionG(outputs, targets, inputs)
|
| 123 |
+
self.metric_counter.add_losses(loss_G.item())
|
| 124 |
+
curr_psnr, curr_ssim, img_for_vis = self.model.get_images_and_metrics(inputs, outputs, targets)
|
| 125 |
+
self.metric_counter.add_metrics(curr_psnr, curr_ssim)
|
| 126 |
+
if not i:
|
| 127 |
+
self.metric_counter.add_image(img_for_vis, tag='val')
|
| 128 |
+
i += 1
|
| 129 |
+
if i > len(self.train_dataset):
|
| 130 |
+
break
|
| 131 |
+
tq.close()
|
| 132 |
+
self.metric_counter.write_to_tensorboard(epoch, validation=True)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _get_optim(self, params):
|
| 136 |
+
if self.config['optimizer']['name'] == 'adam':
|
| 137 |
+
optimizer = optim.Adam(params, lr=self.config['optimizer']['lr'])
|
| 138 |
+
elif self.config['optimizer']['name'] == 'adamw':
|
| 139 |
+
optimizer = optim.AdamW(params, lr=0.001, weight_decay=0.001, betas=(0.9,0.9))
|
| 140 |
+
else:
|
| 141 |
+
raise ValueError("Optimizer [%s] not recognized." % self.config['optimizer']['name'])
|
| 142 |
+
return optimizer
|
| 143 |
+
|
| 144 |
+
def _get_scheduler(self, optimizer):
|
| 145 |
+
if self.config['scheduler']['name'] == 'cosine':
|
| 146 |
+
scheduler = CosineAnnealingLR(optimizer, T_max=self.config['num_epochs'], eta_min=self.config['scheduler']['min_lr'])
|
| 147 |
+
else:
|
| 148 |
+
raise ValueError("Scheduler [%s] not recognized." % self.config['scheduler']['name'])
|
| 149 |
+
return scheduler
|
| 150 |
+
|
| 151 |
+
def _init_params(self):
|
| 152 |
+
self.criterionG = get_loss(self.config['model'])
|
| 153 |
+
self.netG = get_nets(self.config['model'])
|
| 154 |
+
self.netG.cuda()
|
| 155 |
+
self.model = get_model(self.config['model'])
|
| 156 |
+
self.optimizer_G = self._get_optim(filter(lambda p: p.requires_grad, self.netG.parameters()))
|
| 157 |
+
self.scheduler_G = self._get_scheduler(self.optimizer_G)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
if __name__ == '__main__':
|
| 161 |
+
with open(os.path.join('config/', args.job_name, 'config_stage1.yaml'), 'r') as f:
|
| 162 |
+
config = yaml.safe_load(f)
|
| 163 |
+
|
| 164 |
+
# setup
|
| 165 |
+
torch.backends.cudnn.enabled = True
|
| 166 |
+
torch.backends.cudnn.benchmark = True
|
| 167 |
+
|
| 168 |
+
# set random seed
|
| 169 |
+
seed = 666
|
| 170 |
+
torch.manual_seed(seed)
|
| 171 |
+
torch.cuda.manual_seed(seed)
|
| 172 |
+
random.seed(seed)
|
| 173 |
+
np.random.seed(seed)
|
| 174 |
+
|
| 175 |
+
batch_size = config.pop('batch_size')
|
| 176 |
+
get_dataloader = partial(DataLoader, batch_size=batch_size, num_workers=cpu_count(), shuffle=True, drop_last=False)
|
| 177 |
+
|
| 178 |
+
datasets = map(config.pop, ('train', 'val'))
|
| 179 |
+
datasets = map(PairedDataset.from_config, datasets)
|
| 180 |
+
train, val = map(get_dataloader, datasets)
|
| 181 |
+
trainer = Trainer(config, train=train, val=val)
|
| 182 |
+
trainer.train()
|
train_XYScanNet_stage2.py
ADDED
|
@@ -0,0 +1,182 @@
| 1 |
+
import logging
|
| 2 |
+
from functools import partial
|
| 3 |
+
import os
|
| 4 |
+
import cv2
|
| 5 |
+
import torch
|
| 6 |
+
import torch.optim as optim
|
| 7 |
+
import tqdm
|
| 8 |
+
import yaml
|
| 9 |
+
from joblib import cpu_count
|
| 10 |
+
from torch.utils.data import DataLoader
|
| 11 |
+
import random
|
| 12 |
+
from dataset import PairedDataset
|
| 13 |
+
from metric_counter import MetricCounter
|
| 14 |
+
from models.losses import get_loss
|
| 15 |
+
from models.models import get_model
|
| 16 |
+
from models.networks import get_nets
|
| 17 |
+
import numpy as np
|
| 18 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 19 |
+
cv2.setNumThreads(0)
|
| 20 |
+
|
| 21 |
+
import argparse
|
| 22 |
+
parser = argparse.ArgumentParser(description='Image motion deblurring evaluation on GoPro/HIDE')
|
| 23 |
+
parser.add_argument('--job_name', default='xyscannet',
|
| 24 |
+
type=str, help='current job s name')
|
| 25 |
+
args = parser.parse_args()
|
| 26 |
+
|
| 27 |
+
class Trainer:
|
| 28 |
+
def __init__(self, config, train: DataLoader, val: DataLoader):
|
| 29 |
+
self.config = config
|
| 30 |
+
self.train_dataset = train
|
| 31 |
+
self.val_dataset = val
|
| 32 |
+
self.metric_counter = MetricCounter(config['experiment_desc'])
|
| 33 |
+
|
| 34 |
+
def train(self):
|
| 35 |
+
self._init_params()
|
| 36 |
+
start_epoch = 0
|
| 37 |
+
print("The current job is: ", args.job_name)
|
| 38 |
+
model_dir = os.path.join('results/', args.job_name, 'models')
|
| 39 |
+
if os.path.exists(os.path.join(model_dir, 'last_XYScanNet_stage2.pth')):
|
| 40 |
+
print('resume learning')
|
| 41 |
+
training_state = (torch.load(os.path.join(model_dir, 'last_XYScanNet_stage2.pth')))
|
| 42 |
+
start_epoch = training_state['epoch'] + 1
|
| 43 |
+
new_weight = self.netG.state_dict()
|
| 44 |
+
new_weight.update(training_state['model_state'])
|
| 45 |
+
self.netG.load_state_dict(new_weight)
|
| 46 |
+
new_optimizer = self.optimizer_G.state_dict()
|
| 47 |
+
new_optimizer.update(training_state['optimizer_state'])
|
| 48 |
+
self.optimizer_G.load_state_dict(new_optimizer)
|
| 49 |
+
new_scheduler = self.scheduler_G.state_dict()
|
| 50 |
+
new_scheduler.update(training_state['scheduler_state'])
|
| 51 |
+
self.scheduler_G.load_state_dict(new_scheduler)
|
| 52 |
+
else:
|
| 53 |
+
print('load_weights_stage1')
|
| 54 |
+
training_state = (torch.load(os.path.join(model_dir, 'final_XYScanNet_stage1.pth')))
|
| 55 |
+
new_weight = self.netG.state_dict()
|
| 56 |
+
new_weight.update(training_state)
|
| 57 |
+
self.netG.load_state_dict(new_weight)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
for epoch in range(start_epoch, config['num_epochs']):
|
| 61 |
+
self._run_epoch(epoch)
|
| 62 |
+
if epoch % 30 == 0 or epoch == (config['num_epochs']-1):
|
| 63 |
+
self._validate(epoch)
|
| 64 |
+
self.scheduler_G.step()
|
| 65 |
+
|
| 66 |
+
scheduler_state = self.scheduler_G.state_dict()
|
| 67 |
+
training_state = {'epoch': epoch, 'model_state': self.netG.state_dict(),
|
| 68 |
+
'scheduler_state': scheduler_state, 'optimizer_state': self.optimizer_G.state_dict()}
|
| 69 |
+
if self.metric_counter.update_best_model():
|
| 70 |
+
torch.save(training_state['model_state'],
|
| 71 |
+
os.path.join(model_dir, 'best_{}.pth'.format(self.config['experiment_desc'])))
|
| 72 |
+
if epoch % 200 == 0:
|
| 73 |
+
torch.save(training_state,
|
| 74 |
+
os.path.join(model_dir, 'last_{}_{}.pth'.format(self.config['experiment_desc'], epoch)))
|
| 75 |
+
|
| 76 |
+
if epoch == (config['num_epochs']-1):
|
| 77 |
+
torch.save(training_state['model_state'],
|
| 78 |
+
os.path.join(model_dir, 'final_{}.pth'.format(self.config['experiment_desc'])))
|
| 79 |
+
|
| 80 |
+
torch.save(training_state,
|
| 81 |
+
os.path.join(model_dir, 'last_{}.pth'.format(self.config['experiment_desc'])))
|
| 82 |
+
logging.debug("Experiment Name: %s, Epoch: %d, Loss: %s" % (
|
| 83 |
+
self.config['experiment_desc'], epoch, self.metric_counter.loss_message()))
|
| 84 |
+
|
| 85 |
+
def _run_epoch(self, epoch):
|
| 86 |
+
self.metric_counter.clear()
|
| 87 |
+
for param_group in self.optimizer_G.param_groups:
|
| 88 |
+
lr = param_group['lr']
|
| 89 |
+
|
| 90 |
+
epoch_size = config.get('train_batches_per_epoch') or len(self.train_dataset)
|
| 91 |
+
tq = tqdm.tqdm(self.train_dataset)
|
| 92 |
+
tq.set_description('Epoch {}, lr {}'.format(epoch, lr))
|
| 93 |
+
i = 0
|
| 94 |
+
for data in tq:
|
| 95 |
+
inputs, targets = self.model.get_input(data)
|
| 96 |
+
outputs, decomp1, decomp2 = self.netG(inputs)
|
| 97 |
+
#outputs = self.netG(inputs)
|
| 98 |
+
self.optimizer_G.zero_grad()
|
| 99 |
+
loss_G = self.criterionG(outputs, targets, inputs)
|
| 100 |
+
loss_G.backward()
|
| 101 |
+
self.optimizer_G.step()
|
| 102 |
+
self.metric_counter.add_losses(loss_G.item())
|
| 103 |
+
curr_psnr, curr_ssim, img_for_vis = self.model.get_images_and_metrics(inputs, outputs, targets)
|
| 104 |
+
self.metric_counter.add_metrics(curr_psnr, curr_ssim)
|
| 105 |
+
tq.set_postfix(loss=self.metric_counter.loss_message())
|
| 106 |
+
if not i:
|
| 107 |
+
self.metric_counter.add_image(img_for_vis, tag='train')
|
| 108 |
+
i += 1
|
| 109 |
+
if i > len(self.train_dataset):
|
| 110 |
+
break
|
| 111 |
+
tq.close()
|
| 112 |
+
self.metric_counter.write_to_tensorboard(epoch)
|
| 113 |
+
|
| 114 |
+
def _validate(self, epoch):
|
| 115 |
+
self.metric_counter.clear()
|
| 116 |
+
epoch_size = config.get('val_batches_per_epoch') or len(self.val_dataset)
|
| 117 |
+
tq = tqdm.tqdm(self.val_dataset)
|
| 118 |
+
tq.set_description('Validation')
|
| 119 |
+
i = 0
|
| 120 |
+
for data in tq:
|
| 121 |
+
with torch.no_grad():
|
| 122 |
+
inputs, targets = self.model.get_input(data)
|
| 123 |
+
outputs, decomp1, decomp2 = self.netG(inputs)
|
| 124 |
+
#outputs = self.netG(inputs)
|
| 125 |
+
loss_G = self.criterionG(outputs, targets, inputs)
|
| 126 |
+
self.metric_counter.add_losses(loss_G.item())
|
| 127 |
+
curr_psnr, curr_ssim, img_for_vis = self.model.get_images_and_metrics(inputs, outputs, targets)
|
| 128 |
+
self.metric_counter.add_metrics(curr_psnr, curr_ssim)
|
| 129 |
+
if not i:
|
| 130 |
+
self.metric_counter.add_image(img_for_vis, tag='val')
|
| 131 |
+
i += 1
|
| 132 |
+
if i > len(self.train_dataset):
|
| 133 |
+
break
|
| 134 |
+
tq.close()
|
| 135 |
+
self.metric_counter.write_to_tensorboard(epoch, validation=True)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _get_optim(self, params):
|
| 139 |
+
if self.config['optimizer']['name'] == 'adam':
|
| 140 |
+
optimizer = optim.Adam(params, lr=self.config['optimizer']['lr'])
|
| 141 |
+
else:
|
| 142 |
+
raise ValueError("Optimizer [%s] not recognized." % self.config['optimizer']['name'])
|
| 143 |
+
return optimizer
|
| 144 |
+
|
| 145 |
+
def _get_scheduler(self, optimizer):
|
| 146 |
+
if self.config['scheduler']['name'] == 'cosine':
|
| 147 |
+
scheduler = CosineAnnealingLR(optimizer, T_max=self.config['num_epochs'], eta_min=self.config['scheduler']['min_lr'])
|
| 148 |
+
else:
|
| 149 |
+
raise ValueError("Scheduler [%s] not recognized." % self.config['scheduler']['name'])
|
| 150 |
+
return scheduler
|
| 151 |
+
|
| 152 |
+
def _init_params(self):
|
| 153 |
+
self.criterionG = get_loss(self.config['model'])
|
| 154 |
+
self.netG = get_nets(self.config['model'])
|
| 155 |
+
self.netG.cuda()
|
| 156 |
+
self.model = get_model(self.config['model'])
|
| 157 |
+
self.optimizer_G = self._get_optim(filter(lambda p: p.requires_grad, self.netG.parameters()))
|
| 158 |
+
self.scheduler_G = self._get_scheduler(self.optimizer_G)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
if __name__ == '__main__':
|
| 162 |
+
with open(os.path.join('config/', args.job_name, 'config_stage2.yaml'), 'r') as f:
|
| 163 |
+
config = yaml.safe_load(f)
|
| 164 |
+
# setup
|
| 165 |
+
torch.backends.cudnn.enabled = True
|
| 166 |
+
torch.backends.cudnn.benchmark = True
|
| 167 |
+
|
| 168 |
+
# set random seed
|
| 169 |
+
seed = 666
|
| 170 |
+
torch.manual_seed(seed)
|
| 171 |
+
torch.cuda.manual_seed(seed)
|
| 172 |
+
random.seed(seed)
|
| 173 |
+
np.random.seed(seed)
|
| 174 |
+
|
| 175 |
+
batch_size = config.pop('batch_size')
|
| 176 |
+
get_dataloader = partial(DataLoader, batch_size=batch_size, num_workers=cpu_count(), shuffle=True, drop_last=False)
|
| 177 |
+
|
| 178 |
+
datasets = map(config.pop, ('train', 'val'))
|
| 179 |
+
datasets = map(PairedDataset.from_config, datasets)
|
| 180 |
+
train, val = map(get_dataloader, datasets)
|
| 181 |
+
trainer = Trainer(config, train=train, val=val)
|
| 182 |
+
trainer.train()
|
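Taken together, the two training scripts form a pipeline: both read config/<job_name>/config_stage{1,2}.yaml and write checkpoints under results/<job_name>/models/, and stage 2 initializes from final_XYScanNet_stage1.pth unless a stage-2 checkpoint already exists. Below is a sketch of the config keys the Trainer actually reads; it is inferred from the script above, not copied from the shipped YAML files, and all values are placeholders.

```python
# Config keys consumed by the Trainer above (inferred sketch; placeholder values).
config_sketch = {
    'experiment_desc': 'XYScanNet_stage2',        # used in checkpoint names (best_/final_/last_*.pth)
    'num_epochs': 1000,
    'batch_size': 8,
    'optimizer': {'name': 'adam', 'lr': 1e-4},
    'scheduler': {'name': 'cosine', 'min_lr': 1e-7},
    'model': {},                                   # passed to get_nets / get_loss / get_model
    'train': {},                                   # PairedDataset.from_config(...) arguments
    'val': {},
    'train_batches_per_epoch': None,               # optional caps read via config.get(...)
    'val_batches_per_epoch': None,
}
```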
util/__init__.py
ADDED
|
File without changes
|
util/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (143 Bytes). View file
|
|
|
util/__pycache__/__init__.cpython-36.pyc
ADDED
|
Binary file (126 Bytes). View file
|
|
|
util/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (145 Bytes). View file
|
|
|