mt-cly committed
Commit 909940e · 1 Parent(s): a6d2ec4
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ dist/gscuda-0.0.0-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,31 @@
1
+ # Python build and package directories
2
+ build/
3
+ gscuda.egg-info/
4
+
5
+ # Additional common Python ignore patterns
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+ *.so
10
+ *.egg
11
+ *.egg-info/
12
+
13
+ # IDE and editor files
14
+ .vscode/
15
+ .idea/
16
+ *.swp
17
+ *.swo
18
+ *~
19
+
20
+ # OS generated files
21
+ .DS_Store
22
+ .DS_Store?
23
+ ._*
24
+ .Spotlight-V100
25
+ .Trashes
26
+ ehthumbs.db
27
+ Thumbs.db
28
+
29
+ # Gradio cache
30
+ .gradio/
31
+ .setup_complete
README.md CHANGED
@@ -1,13 +1,16 @@
  ---
  title: GSASR
- emoji: 👀
- colorFrom: blue
+ emoji: 🌖
+ colorFrom: pink
  colorTo: yellow
  sdk: gradio
- sdk_version: 5.33.0
+ sdk_version: 4.44.1
+ python_version: 3.10
  app_file: app.py
  pinned: false
+ # suggested_hardware: zero-a10g
  license: mit
+ short_description: GSASR(2d gaussian for arbitrary-scale super-resolution)
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,359 @@
1
+
2
+
3
+ import torch
4
+ import numpy as np
5
+ import gradio as gr
6
+ from PIL import Image
7
+ import math
8
+ import torch.nn.functional as F
9
+ import os
10
+ import tempfile
11
+ import time
12
+ import threading
13
+
14
+ from utils.hatropeamp import HATNOUP_ROPE_AMP
15
+ from utils.fea2gsropeamp import Fea2GS_ROPE_AMP
16
+ from utils.edsrbaseline import EDSRNOUP
17
+ from utils.hatropeamp import HATNOUP_ROPE_AMP
18
+ from utils.rdn import RDNNOUP
19
+ from utils.swinir import SwinIRNOUP
20
+ from utils.fea2gsropeamp import Fea2GS_ROPE_AMP
21
+ from utils.gaussian_splatting import generate_2D_gaussian_splatting_step
22
+ from utils.split_and_joint_image import split_and_joint_image
23
+ from huggingface_hub import hf_hub_download
24
+ import subprocess
25
+ import sys
26
+ import spaces
27
+
28
+
29
+
30
+ # Device setup
31
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
32
+
33
+ # Global stop flag for interrupting inference
34
+ stop_inference = False
35
+ inference_lock = threading.Lock()
36
+
37
+ def load_model(
38
+ pretrained_model_name_or_path: str = "mutou0308/GSASR",
39
+ model_name: str = "HATL_SA1B",
40
+ device: str | torch.device = "cuda"
41
+ ):
42
+ enc_path = hf_hub_download(
43
+ repo_id=pretrained_model_name_or_path, filename=os.path.join(model_name, 'encoder.pth')
44
+ )
45
+ dec_path = hf_hub_download(
46
+ repo_id=pretrained_model_name_or_path, filename=os.path.join(model_name, 'decoder.pth')
47
+ )
48
+
49
+ enc_weight = torch.load(enc_path, weights_only=True)['params_ema']
50
+ dec_weight = torch.load(dec_path, weights_only=True)['params_ema']
51
+
52
+ if model_name in ['EDSR_DIV2K', 'EDSR_DF2K']:
53
+ encoder = EDSRNOUP()
54
+ decoder = Fea2GS_ROPE_AMP()
55
+ elif model_name in ['RDN_DIV2K', 'RDN_DF2K']:
56
+ encoder = RDNNOUP()
57
+ decoder = Fea2GS_ROPE_AMP(num_crossattn_blocks = 2)
58
+ elif model_name in ['SwinIR_DIV2K', 'SwinIR_DF2K']:
59
+ encoder = SwinIRNOUP()
60
+ decoder = Fea2GS_ROPE_AMP(num_crossattn_blocks=2, num_crossattn_layers=4, num_gs_seed=256, window_size=16)
61
+ elif model_name in ['HATL_SA1B']:
62
+ encoder = HATNOUP_ROPE_AMP()
63
+ decoder = Fea2GS_ROPE_AMP(channel=192, num_crossattn_blocks=4, num_crossattn_layers=4, num_selfattn_blocks=8, num_selfattn_layers=6,
64
+ num_gs_seed=256, window_size=16)
65
+ else:
66
+ raise ValueError(f"args.model-{model_name} must be in ['EDSR_DIV2K', 'EDSR_DF2K', 'RDN_DIV2K', 'RDN_DF2K', 'SwinIR_DIV2K', 'SwinIR_DF2K', 'HATL_SA1B']")
67
+
68
+ encoder.load_state_dict(enc_weight, strict=True)
69
+ decoder.load_state_dict(dec_weight, strict=True)
70
+ encoder.eval()
71
+ decoder.eval()
72
+ encoder = encoder.to(device)
73
+ decoder = decoder.to(device)
74
+ return encoder, decoder
75
+
76
+
77
+ def preprocess(x, denominator=16):
78
+ """Preprocess image to ensure dimensions are multiples of denominator"""
79
+ _, c, h, w = x.shape
80
+ if h % denominator > 0:
81
+ pad_h = denominator - h % denominator
82
+ else:
83
+ pad_h = 0
84
+ if w % denominator > 0:
85
+ pad_w = denominator - w % denominator
86
+ else:
87
+ pad_w = 0
88
+ x_new = F.pad(x, (0, pad_w, 0, pad_h), 'reflect')
89
+ return x_new
90
+
91
+ def postprocess(x, gt_size_h, gt_size_w):
92
+ """Post-process by cropping to target size"""
93
+ x_new = x[:, :, :gt_size_h, :gt_size_w]
94
+ return x_new
95
+
96
+ def should_use_tile(image_height, image_width, threshold=1024):
97
+ """Determine if tile processing should be used based on image resolution"""
98
+ return max(image_height, image_width) > threshold
99
+
100
+ def set_stop_flag():
101
+ """Set the global stop flag to interrupt inference"""
102
+ global stop_inference
103
+ with inference_lock:
104
+ stop_inference = True
105
+ return "🛑 Stopping inference...", gr.update(interactive=False)
106
+
107
+ def reset_stop_flag():
108
+ """Reset the global stop flag"""
109
+ global stop_inference
110
+ with inference_lock:
111
+ stop_inference = False
112
+
113
+ def check_stop_flag():
114
+ """Check if inference should be stopped"""
115
+ global stop_inference
116
+ with inference_lock:
117
+ return stop_inference
118
+
119
+ @spaces.GPU
120
+ def super_resolution_inference(image, scale=4.0):
121
+ """Super-resolution inference function with automatic tile processing"""
122
+
123
+ # Check if gscuda setup has been run
124
+ setup_marker = ".setup_complete"
125
+ if not os.path.exists(setup_marker):
126
+ print("First run detected, installing dependencies...")
127
+ try:
128
+ # subprocess.check_call(["pip", "install", "-e", "."])
129
+ subprocess.check_call(["pip", "install", "dist/gscuda-0.0.0-cp310-cp310-linux_x86_64.whl"])
130
+ # Create marker file to indicate setup is complete
131
+ with open(setup_marker, "w") as f:
132
+ f.write("Setup completed")
133
+ print("Setup completed successfully!")
134
+ except subprocess.CalledProcessError as e:
135
+ return None, f"❌ Setup failed with error: {e}", None
136
+
137
+
138
+
139
+ if image is None:
140
+ return None, "Please upload an image", None
141
+
142
+ # Load model
143
+ encoder, decoder = load_model(model_name="HATL_SA1B")
144
+
145
+ # Reset stop flag at the beginning
146
+ reset_stop_flag()
147
+
148
+ # Fixed parameters
149
+ tile_overlap = 16 # Fixed overlap size
150
+ crop_size = 8 # Fixed crop size
151
+ tile_size = 1024 # Fixed tile size for large images
152
+
153
+ try:
154
+ # Check for interruption
155
+ if check_stop_flag():
156
+ return None, "❌ Inference interrupted", None
157
+
158
+ # Convert PIL image to numpy array
159
+ img_np = np.array(image)
160
+ if len(img_np.shape) == 3:
161
+ img_np = img_np[:, :, [2, 1, 0]] # RGB to BGR
162
+
163
+ # Convert to tensor
164
+ img = torch.from_numpy(np.transpose(img_np.astype(np.float32) / 255., (2, 0, 1))).float()
165
+ img = img.unsqueeze(0).to(device)
166
+
167
+ # Check for interruption
168
+ if check_stop_flag():
169
+ return None, "❌ Inference interrupted", None
170
+
171
+ # Calculate target size
172
+ gt_size = [math.floor(scale * img.shape[2]), math.floor(scale * img.shape[3])]
173
+
174
+ # Determine if tile processing should be used
175
+ use_tile = should_use_tile(img.shape[2], img.shape[3])
176
+
177
+ # Force AMP mixed precision
178
+ with torch.inference_mode():
179
+ with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
180
+ # Check for interruption before main processing
181
+ if check_stop_flag():
182
+ return None, "❌ Inference interrupted", None
183
+
184
+ if use_tile:
185
+ # Use tile processing
186
+ assert tile_size % 16 == 0, f"tile_size-{tile_size} must be divisible by 16"
187
+ assert 2 * tile_overlap < tile_size, f"2 * tile_overlap must be less than tile_size"
188
+ assert 2 * crop_size <= tile_overlap, f"2 * crop_size must be less than or equal to tile_overlap"
189
+
190
+ with torch.no_grad():
191
+ output = split_and_joint_image(
192
+ lq=img,
193
+ scale_factor=scale,
194
+ split_size=tile_size,
195
+ overlap_size=tile_overlap,
196
+ model_g=encoder,
197
+ model_fea2gs=decoder,
198
+ crop_size=crop_size,
199
+ scale_modify=torch.tensor([scale, scale]),
200
+ default_step_size=1.2,
201
+ cuda_rendering=True,
202
+ mode='scale_modify',
203
+ if_dmax=True,
204
+ dmax_mode='fix',
205
+ dmax=0.1
206
+ )
207
+ else:
208
+ # Direct processing without tiles
209
+ lq_pad = preprocess(img, 16) # denominator=16 for HATL
210
+ gt_size_pad = torch.tensor([math.floor(scale * lq_pad.shape[2]),
211
+ math.floor(scale * lq_pad.shape[3])])
212
+ gt_size_pad = gt_size_pad.unsqueeze(0)
213
+
214
+ with torch.no_grad():
215
+ # Check for interruption before encoder
216
+ if check_stop_flag():
217
+ return None, "❌ Inference interrupted", None
218
+
219
+ # Encoder output
220
+ encoder_output = encoder(lq_pad) # b,c,h,w
221
+
222
+ # Check for interruption before decoder
223
+ if check_stop_flag():
224
+ return None, "❌ Inference interrupted", None
225
+
226
+ scale_vector = torch.tensor(scale, dtype=torch.float32).unsqueeze(0).to(device)
227
+
228
+ # Decoder output
229
+ batch_gs_parameters = decoder(encoder_output, scale_vector)
230
+ gs_parameters = batch_gs_parameters[0, :]
231
+
232
+ # Check for interruption before gaussian rendering
233
+ if check_stop_flag():
234
+ return None, "❌ Inference interrupted", None
235
+
236
+ # Gaussian rendering
237
+ b_output = generate_2D_gaussian_splatting_step(
238
+ gs_parameters=gs_parameters,
239
+ sr_size=gt_size_pad[0],
240
+ scale=scale,
241
+ sample_coords=None,
242
+ scale_modify=torch.tensor([scale, scale]),
243
+ default_step_size=1.2,
244
+ cuda_rendering=True,
245
+ mode='scale_modify',
246
+ if_dmax=True,
247
+ dmax_mode='fix',
248
+ dmax=0.1
249
+ )
250
+ output = b_output.unsqueeze(0)
251
+
252
+ # Check for interruption before post-processing
253
+ if check_stop_flag():
254
+ return None, "❌ Inference interrupted", None
255
+
256
+ # Post-processing
257
+ output = postprocess(output, gt_size[0], gt_size[1])
258
+
259
+ # Convert back to PIL image format
260
+ output = output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
261
+ output = np.transpose(output[[2, 1, 0], :, :], (1, 2, 0)) # BGR to RGB
262
+ output = (output * 255.0).round().astype(np.uint8)
263
+
264
+ # Convert to PIL image
265
+ output_pil = Image.fromarray(output)
266
+
267
+ # Generate result information
268
+ original_size = f"{img.shape[3]}x{img.shape[2]}"
269
+ output_size = f"{output.shape[1]}x{output.shape[0]}"
270
+ tile_info = f"Tile processing enabled (size: {tile_size})" if use_tile else "Direct processing (no tiles)"
271
+ result_info = f"✅ Processing completed successfully!\nOriginal size: {original_size}\nSuper-resolution size: {output_size}\nScale factor: {scale:.2f}x\nProcessing mode: {tile_info}\nAMP acceleration: Force enabled\nOverlap size: {tile_overlap}\nCrop size: {crop_size}"
272
+
273
+ return output_pil, result_info, output_pil
274
+
275
+ except Exception as e:
276
+ if check_stop_flag():
277
+ return None, "❌ Inference interrupted", None
278
+ return None, f"❌ Error during processing: {str(e)}", None
279
+
280
+ def predict(image, scale):
281
+ """Gradio prediction function"""
282
+ output_image, info, download_image = super_resolution_inference(image, scale)
283
+
284
+ # If processing successful, save image for download
285
+ if output_image is not None:
286
+ # Create temporary filename
287
+ timestamp = int(time.time())
288
+ temp_filename = f"GSASR_SR_result_{scale}x_{timestamp}.png"
289
+ temp_path = os.path.join(tempfile.gettempdir(), temp_filename)
290
+
291
+ # Save image
292
+ output_image.save(temp_path, "PNG")
293
+
294
+ return output_image, temp_path, "✅ Ready", gr.update(interactive=True)
295
+ else:
296
+ return output_image, None, info if info else "❌ Processing failed", gr.update(interactive=True)
297
+
298
+ # Create Gradio interface
299
+ with gr.Blocks(title="🚀 GSASR (2D Gaussian Splatting Super-Resolution)") as demo:
300
+ gr.Markdown("# **🚀 GSASR (Generalized and efficient 2d gaussian splatting for arbitrary-scale super-resolution)**")
301
+ gr.Markdown("Official demo for GSASR. Please refer to our [paper](https://arxiv.org/pdf/2501.06838), [project page](https://mt-cly.github.io/GSASR.github.io/), and [github](https://github.com/ChrisDud0257/GSASR) for more details.")
302
+
303
+ with gr.Row():
304
+ with gr.Column():
305
+ input_image = gr.Image(type="pil", label="Input Image")
306
+
307
+ # Scale parameters
308
+ with gr.Group():
309
+ gr.Markdown("### SR Scale")
310
+ scale_slider = gr.Slider(minimum=1.0, maximum=30.0, value=4.0, step=0.1, label="SR Scale")
311
+
312
+ # Control buttons
313
+ with gr.Row():
314
+ submit_btn = gr.Button("🚀 Start Super-Resolution", variant="primary")
315
+ stop_btn = gr.Button("🛑 Stop Inference", variant="stop")
316
+
317
+ with gr.Column():
318
+ output_image = gr.Image(type="pil", label="Super-Resolution Result")
319
+
320
+ # Status display
321
+ status_text = gr.Textbox(label="Status", value="✅ Ready", interactive=False)
322
+
323
+ # Download component
324
+ with gr.Group():
325
+ gr.Markdown("### 📥 Download Super-Resolution Result")
326
+ download_btn = gr.File(visible=True)
327
+
328
+ # Event handlers
329
+ submit_event = submit_btn.click(
330
+ fn=predict,
331
+ inputs=[input_image, scale_slider],
332
+ outputs=[output_image, download_btn, status_text, stop_btn]
333
+ )
334
+
335
+ stop_btn.click(
336
+ fn=set_stop_flag,
337
+ inputs=[],
338
+ outputs=[status_text, stop_btn],
339
+ cancels=[submit_event]
340
+ )
341
+
342
+ # Example images
343
+ gr.Markdown("### 📚 Example Images")
344
+ gr.Markdown("Try these examples with different scales:")
345
+
346
+ gr.Examples(
347
+ examples=[
348
+ ["assets/0846x4.png", 1.5],
349
+ ["assets/0892x4.png", 2.8],
350
+ ["assets/0873x4_cropped_120x120.png", 30.0]
351
+ ],
352
+ inputs=[input_image, scale_slider],
353
+ examples_per_page=3,
354
+ cache_examples=False,
355
+ label="Examples"
356
+ )
357
+
358
+ if __name__ == "__main__":
359
+ demo.launch(share=True, server_name="0.0.0.0")
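
Note (not part of the commit): a minimal sketch of running the same non-tiled pipeline that app.py wires into Gradio, assuming this Space's code and requirements (including the gscuda wheel from dist/ and the gradio/spaces packages), the pretrained weights on the mutou0308/GSASR hub repo, and a CUDA device are available. Names mirror app.py; treat it as an illustration, not an official entry point.

import math
import numpy as np
import torch
from PIL import Image

from app import load_model, preprocess, postprocess  # reuses the helpers defined above
from utils.gaussian_splatting import generate_2D_gaussian_splatting_step

device = torch.device('cuda')
encoder, decoder = load_model(model_name="HATL_SA1B", device=device)

scale = 4.0
img = np.array(Image.open("assets/0846x4.png").convert("RGB"))[:, :, [2, 1, 0]]  # RGB -> BGR, as in app.py
lq = torch.from_numpy(np.transpose(img.astype(np.float32) / 255., (2, 0, 1))).unsqueeze(0).to(device)

gt_size = [math.floor(scale * lq.shape[2]), math.floor(scale * lq.shape[3])]
with torch.inference_mode(), torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
    lq_pad = preprocess(lq, 16)                      # pad H and W to multiples of 16
    gt_size_pad = torch.tensor([[math.floor(scale * lq_pad.shape[2]),
                                 math.floor(scale * lq_pad.shape[3])]])
    feat = encoder(lq_pad)                           # b,c,h,w features
    scale_vec = torch.tensor(scale, dtype=torch.float32).unsqueeze(0).to(device)
    gs_parameters = decoder(feat, scale_vec)[0, :]   # per-image 2D Gaussian parameters
    sr = generate_2D_gaussian_splatting_step(
        gs_parameters=gs_parameters, sr_size=gt_size_pad[0], scale=scale,
        sample_coords=None, scale_modify=torch.tensor([scale, scale]),
        default_step_size=1.2, cuda_rendering=True, mode='scale_modify',
        if_dmax=True, dmax_mode='fix', dmax=0.1).unsqueeze(0)
    sr = postprocess(sr, gt_size[0], gt_size[1])     # crop away the padded border

out = sr.squeeze().float().cpu().clamp_(0, 1).numpy()
out = np.transpose(out[[2, 1, 0], :, :], (1, 2, 0))  # BGR -> RGB
Image.fromarray((out * 255.0).round().astype(np.uint8)).save("sr_result.png")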
assets/0846x4.png ADDED

Git LFS Details

  • SHA256: 1ed26d96cbd5885f73dfdffbebbe9a048276036bf050c435a4da190199a932a0
  • Pointer size: 131 Bytes
  • Size of remote file: 262 kB
assets/0873.png ADDED

Git LFS Details

  • SHA256: 3a76a1452be69f0a04bddaeffa825bff46027d7155bb24479fab93c12db9bd73
  • Pointer size: 131 Bytes
  • Size of remote file: 197 kB
assets/0873x4.png ADDED

Git LFS Details

  • SHA256: 2c034622b96885845c4438c25a5248afc2a5bff00b89734917189f292e57754f
  • Pointer size: 131 Bytes
  • Size of remote file: 331 kB
assets/0873x4_cropped_120x120.png ADDED

Git LFS Details

  • SHA256: b21380583809cce487b5129f93451148dc9954d9d4ebebcdbb824fdbdc1198a3
  • Pointer size: 130 Bytes
  • Size of remote file: 32.4 kB
assets/0892x4.png ADDED

Git LFS Details

  • SHA256: e95aebc62748c232bfc5942ad506e5d2d31323b7d10cb977a10287065293ce0b
  • Pointer size: 131 Bytes
  • Size of remote file: 315 kB
assets/Screenshot_cropped_180x100.png ADDED

Git LFS Details

  • SHA256: 30241bfe51891e2c19c3fe9b949dde8dee50d4baddcbd5e1612befed111543f8
  • Pointer size: 130 Bytes
  • Size of remote file: 48.2 kB
dist/gscuda-0.0.0-cp310-cp310-linux_x86_64.whl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:832b5f0cd6cd078e39a8bf68c481488cf606ec9633591d4d981794338a3f2b29
3
+ size 90122
requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124
2
+ torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu124
3
+ torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
4
+ # gradio==5.32.0
5
+ gradio==5.23.0
6
+ huggingface-hub==0.32.3
7
+ pillow==11.2.1
8
+ numpy==1.23.0
9
+ einops==0.8.1
10
+ opencv-python==4.11.0.86
11
+ pydantic==2.10.6
12
+ # dist/gscuda-0.0.0-cp310-cp310-linux_x86_64.whl
setup.py ADDED
@@ -0,0 +1,26 @@
1
+ from setuptools import setup
2
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension
3
+ import os
4
+ import torch
5
+
6
+ print("Building gscuda")
7
+ # Source files are assumed to live under the gs_cuda directory
8
+ file_path = "utils/gs_cuda_dmax"
9
+
10
+ setup(
11
+ name="gscuda", # module name
12
+ ext_modules=[
13
+ CUDAExtension(
14
+ name="gscuda", # importable directly as a module
15
+ sources=[
16
+ os.path.join(file_path, "gswrapper.cpp"),
17
+ os.path.join(file_path, "gs.cu")
18
+ ],
19
+ # Set the runtime library path (optional)
20
+ library_dirs=[os.path.join(os.path.dirname(torch.__file__), 'lib')],
21
+ )
22
+ ],
23
+ cmdclass={
24
+ "build_ext": BuildExtension
25
+ },
26
+ )
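
Note (not part of the commit): setup.py compiles utils/gs_cuda_dmax/gswrapper.cpp and gs.cu into the gscuda extension; the prebuilt wheel under dist/ is what app.py installs on first run. A hedged smoke test after building (e.g. with pip wheel .) or installing the wheel, assuming a CUDA build of torch matching the wheel's cp310 ABI:

import torch
import gscuda  # the compiled CUDA extension named in setup.py

from utils.gs_cuda_dmax.gswrapper import GSCUDA  # autograd wrapper used by utils/gaussian_splatting.py

print("CUDA available:", torch.cuda.is_available())
print("gscuda loaded from:", gscuda.__file__)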
utils/edsrbaseline.py ADDED
@@ -0,0 +1,113 @@
1
+ import collections.abc
2
+ import math
3
+ import torch
4
+ import torchvision
5
+ import warnings
6
+ from itertools import repeat
7
+ from torch import nn as nn
8
+ from torch.nn import functional as F
9
+ from torch.nn import init as init
10
+ from torch.nn.modules.batchnorm import _BatchNorm
11
+
12
+ @torch.no_grad()
13
+ def default_init_weights(module_list, scale=1, bias_fill=0, **kwargs):
14
+ """Initialize network weights.
15
+
16
+ Args:
17
+ module_list (list[nn.Module] | nn.Module): Modules to be initialized.
18
+ scale (float): Scale initialized weights, especially for residual
19
+ blocks. Default: 1.
20
+ bias_fill (float): The value to fill bias. Default: 0
21
+ kwargs (dict): Other arguments for initialization function.
22
+ """
23
+ if not isinstance(module_list, list):
24
+ module_list = [module_list]
25
+ for module in module_list:
26
+ for m in module.modules():
27
+ if isinstance(m, nn.Conv2d):
28
+ init.kaiming_normal_(m.weight, **kwargs)
29
+ m.weight.data *= scale
30
+ if m.bias is not None:
31
+ m.bias.data.fill_(bias_fill)
32
+ elif isinstance(m, nn.Linear):
33
+ init.kaiming_normal_(m.weight, **kwargs)
34
+ m.weight.data *= scale
35
+ if m.bias is not None:
36
+ m.bias.data.fill_(bias_fill)
37
+ elif isinstance(m, _BatchNorm):
38
+ init.constant_(m.weight, 1)
39
+ if m.bias is not None:
40
+ m.bias.data.fill_(bias_fill)
41
+
42
+ def make_layer(basic_block, num_basic_block, **kwarg):
43
+ """Make layers by stacking the same blocks.
44
+
45
+ Args:
46
+ basic_block (nn.module): nn.module class for basic block.
47
+ num_basic_block (int): number of blocks.
48
+
49
+ Returns:
50
+ nn.Sequential: Stacked blocks in nn.Sequential.
51
+ """
52
+ layers = []
53
+ for _ in range(num_basic_block):
54
+ layers.append(basic_block(**kwarg))
55
+ return nn.Sequential(*layers)
56
+
57
+ class ResidualBlockNoBN(nn.Module):
58
+ """Residual block without BN.
59
+
60
+ Args:
61
+ num_feat (int): Channel number of intermediate features.
62
+ Default: 64.
63
+ res_scale (float): Residual scale. Default: 1.
64
+ pytorch_init (bool): If set to True, use pytorch default init,
65
+ otherwise, use default_init_weights. Default: False.
66
+ """
67
+
68
+ def __init__(self, num_feat=64, res_scale=1, pytorch_init=False):
69
+ super(ResidualBlockNoBN, self).__init__()
70
+ self.res_scale = res_scale
71
+ self.conv1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1, bias=True)
72
+ self.conv2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1, bias=True)
73
+ self.relu = nn.ReLU(inplace=True)
74
+
75
+ if not pytorch_init:
76
+ default_init_weights([self.conv1, self.conv2], 0.1)
77
+
78
+ def forward(self, x):
79
+ identity = x
80
+ out = self.conv2(self.relu(self.conv1(x)))
81
+ return identity + out * self.res_scale
82
+
83
+
84
+
85
+ class EDSRNOUP(nn.Module):
86
+ def __init__(self,
87
+ num_in_ch=3,
88
+ num_out_ch=3,
89
+ num_feat=64,
90
+ num_block=16,
91
+ upscale=4,
92
+ res_scale=1):
93
+ super(EDSRNOUP, self).__init__()
94
+
95
+ self.conv_first = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
96
+ self.body = make_layer(ResidualBlockNoBN, num_block, num_feat=num_feat, res_scale=res_scale, pytorch_init=True)
97
+ self.conv_after_body = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
98
+
99
+
100
+ def forward(self, x):
101
+
102
+ x = self.conv_first(x)
103
+ res = self.conv_after_body(self.body(x))
104
+ x = res + x
105
+
106
+ return res
107
+
108
+
109
+ if __name__ == '__main__':
110
+ x = torch.randn(8,3,48,48)
111
+ model = EDSRNOUP(num_in_ch=3, num_out_ch=3)
112
+ y = model(x)
113
+ print(y.shape)
utils/fea2gsropeamp.py ADDED
@@ -0,0 +1,749 @@
1
+ from __future__ import absolute_import
2
+ from __future__ import print_function
3
+ from __future__ import division
4
+
5
+ import warnings
6
+ import math
7
+ import copy
8
+ from einops import rearrange
9
+ import torch
10
+ from torch import nn
11
+ import torch.nn.functional as F
12
+ from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_, kaiming_normal_
13
+ from einops import rearrange
14
+ from torch.utils.checkpoint import checkpoint
15
+ from functools import partial
16
+ from typing import Any, Optional, Tuple
17
+ import numpy as np
18
+
19
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
20
+ # From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/weight_init.py
21
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
22
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
23
+ def norm_cdf(x):
24
+ # Computes standard normal cumulative distribution function
25
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
26
+
27
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
28
+ warnings.warn(
29
+ 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. '
30
+ 'The distribution of values may be incorrect.',
31
+ stacklevel=2)
32
+
33
+ with torch.no_grad():
34
+ # Values are generated by using a truncated uniform distribution and
35
+ # then using the inverse CDF for the normal distribution.
36
+ # Get upper and lower cdf values
37
+ low = norm_cdf((a - mean) / std)
38
+ up = norm_cdf((b - mean) / std)
39
+
40
+ # Uniformly fill tensor with values from [low, up], then translate to
41
+ # [2l-1, 2u-1].
42
+ tensor.uniform_(2 * low - 1, 2 * up - 1)
43
+
44
+ # Use inverse cdf transform for normal distribution to get truncated
45
+ # standard normal
46
+ tensor.erfinv_()
47
+
48
+ # Transform to proper mean, std
49
+ tensor.mul_(std * math.sqrt(2.))
50
+ tensor.add_(mean)
51
+
52
+ # Clamp to ensure it's in the proper range
53
+ tensor.clamp_(min=a, max=b)
54
+ return tensor
55
+
56
+
57
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
58
+ r"""Fills the input Tensor with values drawn from a truncated
59
+ normal distribution.
60
+
61
+ From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/weight_init.py
62
+
63
+ The values are effectively drawn from the
64
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
65
+ with values outside :math:`[a, b]` redrawn until they are within
66
+ the bounds. The method used for generating the random values works
67
+ best when :math:`a \leq \text{mean} \leq b`.
68
+
69
+ Args:
70
+ tensor: an n-dimensional `torch.Tensor`
71
+ mean: the mean of the normal distribution
72
+ std: the standard deviation of the normal distribution
73
+ a: the minimum cutoff value
74
+ b: the maximum cutoff value
75
+
76
+ Examples:
77
+ >>> w = torch.empty(3, 5)
78
+ >>> nn.init.trunc_normal_(w)
79
+ """
80
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
81
+
82
+ def init_t_xy(end_x: int, end_y: int, zero_center=False):
83
+ t = torch.arange(end_x * end_y, dtype=torch.float32)
84
+ t_x = (t % end_x).float()
85
+ t_y = torch.div(t, end_x, rounding_mode='floor').float()
86
+
87
+ return t_x, t_y
88
+
89
+ def init_random_2d_freqs(head_dim: int, num_heads: int, theta: float = 10.0, rotate: bool = True):
90
+ freqs_x = []
91
+ freqs_y = []
92
+ theta = theta
93
+ mag = 1 / (theta ** (torch.arange(0, head_dim, 4)[: (head_dim // 4)].float() / head_dim))
94
+ for i in range(num_heads):
95
+ angles = torch.rand(1) * 2 * torch.pi if rotate else torch.zeros(1)
96
+ fx = torch.cat([mag * torch.cos(angles), mag * torch.cos(torch.pi/2 + angles)], dim=-1)
97
+ fy = torch.cat([mag * torch.sin(angles), mag * torch.sin(torch.pi/2 + angles)], dim=-1)
98
+ freqs_x.append(fx)
99
+ freqs_y.append(fy)
100
+ freqs_x = torch.stack(freqs_x, dim=0)
101
+ freqs_y = torch.stack(freqs_y, dim=0)
102
+ freqs = torch.stack([freqs_x, freqs_y], dim=0)
103
+ return freqs
104
+
105
+ def compute_cis(freqs, t_x, t_y):
106
+ N = t_x.shape[0]
107
+ # No float 16 for this range
108
+ with torch.cuda.amp.autocast(enabled=False):
109
+ freqs_x = (t_x.unsqueeze(-1) @ freqs[0].unsqueeze(-2))
110
+ freqs_y = (t_y.unsqueeze(-1) @ freqs[1].unsqueeze(-2))
111
+ freqs_cis = torch.polar(torch.ones_like(freqs_x), freqs_x + freqs_y)
112
+
113
+ return freqs_cis
114
+
115
+
116
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
117
+ ndim = x.ndim
118
+ assert 0 <= 1 < ndim
119
+ # assert freqs_cis.shape == (x.shape[-2], x.shape[-1])
120
+ # print(f"freqs_cis shape is {freqs_cis.shape}, x shape is {x.shape}")
121
+ if freqs_cis.shape == (x.shape[-2], x.shape[-1]):
122
+ shape = [d if i >= ndim-2 else 1 for i, d in enumerate(x.shape)]
123
+ elif freqs_cis.shape == (x.shape[-3], x.shape[-2], x.shape[-1]):
124
+ shape = [d if i >= ndim-3 else 1 for i, d in enumerate(x.shape)]
125
+
126
+ return freqs_cis.view(*shape)
127
+
128
+ def apply_rotary_emb(
129
+ xq: torch.Tensor,
130
+ xk: torch.Tensor,
131
+ freqs_cis: torch.Tensor,
132
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
133
+ # print(f"xq shape is {xq.shape}, xq.shape[:-1] is {xq.shape[:-1]}")
134
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
135
+ # print(f"xq_ shape is {xq_.shape}")
136
+ xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
137
+ freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
138
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
139
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
140
+ return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device)
141
+
142
+ def apply_rotary_emb_single(x, freqs_cis):
143
+ x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
144
+ seq_len = x_.shape[2]
145
+ freqs_cis = freqs_cis[:, :seq_len, :]
146
+ freqs_cis = freqs_cis.unsqueeze(0).expand_as(x_)
147
+ x_out = torch.view_as_real(x_ * freqs_cis).flatten(3)
148
+ return x_out.type_as(x).to(x.device)
149
+
150
+ def window_partition(x, window_size):
151
+ # x is the feature from net_g
152
+ b, c, h, w = x.shape
153
+ windows = rearrange(x, 'b c (h_count dh) (w_count dw) -> (b h_count w_count) (dh dw) c', dh=window_size,
154
+ dw=window_size)
155
+ # h_count = h // window_size
156
+ # w_count = w // window_size
157
+ # windows = x.reshape(b,c,h_count, window_size, w_count, window_size)
158
+ # windows = windows.permute(0,1,2,4,3,5) #b,c,h_count,w_count,window_size,window_size
159
+ # windows = windows.reshape(b,c,h_count*w_count, window_size * window_size)
160
+ # windows = windows.permute(0,2,3,1) #b,h_count*w_count, window_size*window_size,c
161
+ # windows = windows.reshape(-1, window_size*window_size, c)
162
+
163
+ return windows
164
+
165
+
166
+ def with_pos_embed(tensor, pos):
167
+ return tensor if pos is None else tensor + pos
168
+
169
+
170
+ class MLP(nn.Module):
171
+ def __init__(self, in_features, hidden_features, out_features, act_layer=nn.ReLU):
172
+ super(MLP, self).__init__()
173
+ self.fc1 = nn.Linear(in_features, hidden_features)
174
+ self.act = act_layer()
175
+ self.fc2 = nn.Linear(hidden_features, out_features)
176
+
177
+ def forward(self, x):
178
+ x = self.fc1(x)
179
+ x = self.act(x)
180
+ x = self.fc2(x)
181
+ return x
182
+
183
+ class WindowCrossAttn(nn.Module):
184
+ def __init__(self, dim=180, num_heads=6, window_size=12, num_gs_seed=2304, rope_mixed = True, rope_theta = 10.0):
185
+ super(WindowCrossAttn, self).__init__()
186
+ self.dim = dim
187
+ self.num_heads = num_heads
188
+ self.window_size = window_size
189
+ self.num_gs_seed = num_gs_seed
190
+ self.num_gs_seed_sqrt = int(math.sqrt(num_gs_seed))
191
+
192
+
193
+ self.rope_mixed = rope_mixed
194
+
195
+ t_x, t_y = init_t_xy(end_x=max(self.num_gs_seed_sqrt, self.window_size), end_y=max(self.num_gs_seed_sqrt, self.window_size))
196
+ self.register_buffer('rope_t_x', t_x)
197
+ self.register_buffer('rope_t_y', t_y)
198
+
199
+ freqs = init_random_2d_freqs(
200
+ head_dim=self.dim // self.num_heads, num_heads=self.num_heads, theta=rope_theta,
201
+ rotate=self.rope_mixed
202
+ )
203
+ if self.rope_mixed:
204
+ self.rope_freqs = nn.Parameter(freqs, requires_grad=True)
205
+ else:
206
+ self.register_buffer('rope_freqs', freqs)
207
+ freqs_cis = compute_cis(self.rope_freqs, self.rope_t_x, self.rope_t_y)
208
+ self.rope_freqs_cis = freqs_cis
209
+
210
+ self.qhead = nn.Linear(dim, dim, bias=True)
211
+ self.khead = nn.Linear(dim, dim, bias=True)
212
+ self.vhead = nn.Linear(dim, dim, bias=True)
213
+
214
+ self.proj = nn.Linear(dim, dim)
215
+
216
+
217
+ def forward(self, gs, feat):
218
+ # gs shape: b*h_count*w_count, num_gs, c the input gs here should already include pos embedding and scale embedding
219
+ # feat shape: b*h_count*w_count, dh*dw, c dh=dw=window_size
220
+ b_, num_gs, c = gs.shape
221
+ b_, n, c = feat.shape
222
+
223
+ q = self.qhead(gs) # b_, num_gs_, c
224
+ q = q.reshape(b_, num_gs, self.num_heads, c // self.num_heads)
225
+ q = q.permute(0, 2, 1, 3) # b_, num_heads, n, c // num_heads
226
+
227
+ k = self.khead(feat) # b_, n_, c
228
+ k = k.reshape(b_, n, self.num_heads, c // self.num_heads)
229
+ k = k.permute(0, 2, 1, 3) # b_, num_heads, n, c // num_heads
230
+
231
+ v = self.vhead(feat) # b_, n_, c
232
+ v = v.reshape(b_, n, self.num_heads, c // self.num_heads)
233
+ v = v.permute(0, 2, 1, 3) # b_, num_heads, n, c // num_heads
234
+
235
+ ###### Apply rotary position embedding
236
+ if self.rope_mixed:
237
+ freqs_cis = compute_cis(self.rope_freqs, self.rope_t_x, self.rope_t_y)
238
+ else:
239
+ freqs_cis = self.rope_freqs_cis.to(gs.device)
240
+ q = apply_rotary_emb_single(q, freqs_cis)
241
+ k = apply_rotary_emb_single(k, freqs_cis)
242
+ #########
243
+
244
+ attn = F.scaled_dot_product_attention(q, k, v)
245
+
246
+ x = attn.transpose(1, 2).reshape(b_, num_gs, c)
247
+
248
+ x = self.proj(x)
249
+
250
+ return x
251
+
252
+
253
+ class WindowCrossAttnLayer(nn.Module):
254
+ def __init__(self, dim=180, num_heads=6, window_size=12, shift_size=0, num_gs_seed=2308, rope_mixed = True, rope_theta = 10.0):
255
+ super(WindowCrossAttnLayer, self).__init__()
256
+
257
+ self.gs_cross_attn_scale = nn.MultiheadAttention(dim, num_heads, batch_first=True)
258
+
259
+ self.norm1 = nn.LayerNorm(dim)
260
+ self.norm2 = nn.LayerNorm(dim)
261
+ self.norm3 = nn.LayerNorm(dim)
262
+ self.norm4 = nn.LayerNorm(dim)
263
+ self.shift_size = shift_size
264
+ self.window_size = window_size
265
+
266
+ self.window_cross_attn = WindowCrossAttn(dim=dim, num_heads=num_heads, window_size=window_size,
267
+ num_gs_seed=num_gs_seed, rope_mixed = rope_mixed, rope_theta = rope_theta)
268
+ self.mlp_crossattn_scale = MLP(in_features=dim, hidden_features=dim, out_features=dim)
269
+ self.mlp_crossattn_feature = MLP(in_features=dim, hidden_features=dim, out_features=dim)
270
+
271
+ def forward(self, x, query_pos, feat, scale_embedding):
272
+ # gs shape: b*h_count*w_count, num_gs, c
273
+ # query_pos shape: b*h_count*w_count, num_gs, c
274
+ # feat shape: b,c,h,w
275
+ # scale_embedding shape: b*h_count*w_count, 1, c
276
+
277
+ ###GS cross attn with scale embedding
278
+ resi = x
279
+ x = self.norm1(x)
280
+ # print(f"x: {x.shape} {x.device}, query_pos: {query_pos.shape}, {query_pos.device}, scale_embedding: {scale_embedding.shape}, {scale_embedding.device}")
281
+ x, _ = self.gs_cross_attn_scale(with_pos_embed(x, query_pos), scale_embedding, scale_embedding)
282
+ x = resi + x
283
+
284
+ ###FFN
285
+ resi = x
286
+ x = self.norm2(x)
287
+ x = self.mlp_crossattn_scale(x)
288
+ x = resi + x
289
+
290
+ ###cross attention for Q,K,V
291
+ resi = x
292
+ x = self.norm3(x)
293
+ if self.shift_size > 0:
294
+ shift_feat = torch.roll(feat, shifts=(-self.shift_size, -self.shift_size), dims=(2, 3))
295
+ else:
296
+ shift_feat = feat
297
+ shift_feat = window_partition(shift_feat, self.window_size) # b*h_count*w_count, dh*dw, c dh=dw=window_size
298
+ x = self.window_cross_attn(with_pos_embed(x, query_pos),
299
+ shift_feat) # b*h_count*w_count, num_gs, c dh=dw=window_size
300
+ x = resi + x
301
+
302
+ ###FFN
303
+ resi = x
304
+ x = self.norm4(x)
305
+ x = self.mlp_crossattn_feature(x)
306
+ x = resi + x
307
+
308
+ return x
309
+
310
+
311
+ class WindowCrossAttnBlock(nn.Module):
312
+ def __init__(self, dim=180, window_size=12, num_heads=6, num_layers=4, num_gs_seed=230, rope_mixed = True, rope_theta = 10.0):
313
+ super(WindowCrossAttnBlock, self).__init__()
314
+
315
+ self.num_gs_seed_sqrt = int(math.sqrt(num_gs_seed))
316
+
317
+ self.mlp = nn.Sequential(
318
+ nn.Linear(dim, dim),
319
+ nn.ReLU(),
320
+ nn.Linear(dim, dim)
321
+ )
322
+ self.norm = nn.LayerNorm(dim)
323
+ self.blocks = nn.ModuleList([
324
+ WindowCrossAttnLayer(
325
+ dim=dim,
326
+ num_heads=num_heads,
327
+ window_size=window_size,
328
+ shift_size=0 if i % 2 == 0 else window_size // 2,
329
+ num_gs_seed=num_gs_seed,
330
+ rope_mixed = rope_mixed, rope_theta = rope_theta) for i in range(num_layers)
331
+ ])
332
+ self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
333
+
334
+ def forward(self, x, query_pos, feat, scale_embedding, h_count, w_count):
335
+ resi = x
336
+ x = self.norm(x)
337
+ for block in self.blocks:
338
+ x = block(x, query_pos, feat, scale_embedding)
339
+ x = self.mlp(x)
340
+
341
+ x = rearrange(x, '(b m n) (h w) c -> b c (m h) (n w)', m=h_count, n=w_count, h=self.num_gs_seed_sqrt)
342
+ x = self.conv(x)
343
+ x = rearrange(x, 'b c (m h) (n w) -> (b m n) (h w) c', m=h_count, n=w_count, h=self.num_gs_seed_sqrt)
344
+
345
+ x = resi + x
346
+ return x
347
+
348
+
349
+ class GSSelfAttn(nn.Module):
350
+ def __init__(self, dim=180, num_heads=6, num_gs_seed_sqrt = 12, rope_mixed = True, rope_theta=10.0):
351
+ super(GSSelfAttn, self).__init__()
352
+ self.dim = dim
353
+ self.num_heads = num_heads
354
+ self.num_gs_seed_sqrt = num_gs_seed_sqrt
355
+
356
+ self.proj = nn.Linear(dim, dim)
357
+ self.rope_mixed = rope_mixed
358
+
359
+ t_x, t_y = init_t_xy(end_x=self.num_gs_seed_sqrt, end_y=self.num_gs_seed_sqrt)
360
+ self.register_buffer('rope_t_x', t_x)
361
+ self.register_buffer('rope_t_y', t_y)
362
+
363
+ freqs = init_random_2d_freqs(
364
+ head_dim=self.dim // self.num_heads, num_heads=self.num_heads, theta=rope_theta,
365
+ rotate=self.rope_mixed
366
+ )
367
+ if self.rope_mixed:
368
+ self.rope_freqs = nn.Parameter(freqs, requires_grad=True)
369
+ else:
370
+ self.register_buffer('rope_freqs', freqs)
371
+ freqs_cis = compute_cis(self.rope_freqs, self.rope_t_x, self.rope_t_y)
372
+ self.rope_freqs_cis = freqs_cis
373
+
374
+ self.qhead = nn.Linear(dim, dim, bias=True)
375
+ self.khead = nn.Linear(dim, dim, bias=True)
376
+ self.vhead = nn.Linear(dim, dim, bias=True)
377
+
378
+ def forward(self, gs):
379
+ # gs shape: b*h_count*w_count, num_gs, c
380
+ # pos shape: b*h_count*w_count, num_gs, c
381
+ b_, num_gs, c = gs.shape
382
+
383
+ q = self.qhead(gs)
384
+ q = q.reshape(b_, num_gs, self.num_heads, c // self.num_heads)
385
+ q = q.permute(0, 2, 1, 3) # b_, num_heads, n, c // num_heads
386
+
387
+ k = self.khead(gs)
388
+ k = k.reshape(b_, num_gs, self.num_heads, c // self.num_heads)
389
+ k = k.permute(0, 2, 1, 3) # b_, num_heads, n, c // num_heads
390
+
391
+ v = self.vhead(gs)
392
+ v = v.reshape(b_, num_gs, self.num_heads, c // self.num_heads)
393
+ v = v.permute(0, 2, 1, 3) # b_, num_heads, n, c // num_heads
394
+
395
+ ###### Apply rotary position embedding
396
+ if self.rope_mixed:
397
+ freqs_cis = compute_cis(self.rope_freqs, self.rope_t_x, self.rope_t_y)
398
+ else:
399
+ freqs_cis = self.rope_freqs_cis.to(gs.device)
400
+ q, k = apply_rotary_emb(q, k, freqs_cis)
401
+ #########
402
+
403
+ attn = F.scaled_dot_product_attention(q, k, v)
404
+
405
+ attn = attn.transpose(1, 2).reshape(b_, num_gs, c)
406
+
407
+
408
+ attn = self.proj(attn)
409
+
410
+ return attn
411
+
412
+
413
+ class GSSelfAttnLayer(nn.Module):
414
+ def __init__(self, dim=180, num_heads=6, num_gs_seed_sqrt = 12, shift_size = 0, rope_mixed = True, rope_theta=10.0):
415
+ super(GSSelfAttnLayer, self).__init__()
416
+
417
+ self.norm1 = nn.LayerNorm(dim)
418
+ self.norm2 = nn.LayerNorm(dim)
419
+ self.norm3 = nn.LayerNorm(dim)
420
+ self.norm4 = nn.LayerNorm(dim)
421
+
422
+ self.gs_self_attn = GSSelfAttn(dim = dim, num_heads = num_heads, num_gs_seed_sqrt = num_gs_seed_sqrt, rope_mixed = rope_mixed, rope_theta=rope_theta)
423
+
424
+ self.mlp_selfattn = MLP(in_features=dim, hidden_features=dim, out_features=dim)
425
+
426
+ self.num_gs_seed_sqrt = num_gs_seed_sqrt
427
+ self.shift_size = shift_size
428
+
429
+ self.gs_cross_attn_scale = nn.MultiheadAttention(dim, num_heads, batch_first=True)
430
+
431
+ self.mlp_crossattn = MLP(in_features=dim, hidden_features=dim, out_features=dim)
432
+
433
+ def forward(self, gs, pos, h_count, w_count, scale_embedding):
434
+ # gs shape:b*h_count*w_count, num_gs_seed, channel
435
+ # pos shape: b*h_count*w_count, num_gs_seed, channel
436
+ # scale_embedding shape: b*h_count*w_count, 1, channel
437
+
438
+ # gs cross attn with scale_embedding
439
+ resi = gs
440
+ gs = self.norm3(gs)
441
+ gs, _ = self.gs_cross_attn_scale(with_pos_embed(gs, pos), scale_embedding, scale_embedding)
442
+ gs = gs + resi
443
+
444
+ # FFN
445
+ resi = gs
446
+ gs = self.norm4(gs)
447
+ gs = self.mlp_crossattn(gs)
448
+ gs = gs + resi
449
+
450
+ resi = gs
451
+ gs = self.norm1(gs)
452
+
453
+ #### shift gs
454
+ if self.shift_size > 0:
455
+ shift_gs = rearrange(gs, '(b m n) (h w) c -> b (m h) (n w) c', m=h_count, n=w_count, h=self.num_gs_seed_sqrt, w = self.num_gs_seed_sqrt)
456
+ shift_gs = torch.roll(shift_gs, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
457
+ shift_gs = rearrange(shift_gs, 'b (m h) (n w) c -> (b m n) (h w) c', m=h_count, n=w_count, h=self.num_gs_seed_sqrt, w = self.num_gs_seed_sqrt)
458
+ else:
459
+ shift_gs = gs
460
+
461
+ #### gs self attention
462
+ gs = self.gs_self_attn(shift_gs)
463
+
464
+ #### shift gs back
465
+ if self.shift_size > 0:
466
+ shift_gs = rearrange(gs, '(b m n) (h w) c -> b (m h) (n w) c', m=h_count, n=w_count, h=self.num_gs_seed_sqrt, w = self.num_gs_seed_sqrt)
467
+ shift_gs = torch.roll(shift_gs, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
468
+ shift_gs = rearrange(shift_gs, 'b (m h) (n w) c -> (b m n) (h w) c', m=h_count, n=w_count, h=self.num_gs_seed_sqrt, w = self.num_gs_seed_sqrt)
469
+ else:
470
+ shift_gs = gs
471
+
472
+ gs = shift_gs + resi
473
+
474
+ #FFN
475
+ resi = gs
476
+ gs = self.norm2(gs)
477
+ gs = self.mlp_selfattn(gs)
478
+ gs = gs + resi
479
+ return gs
480
+
481
+
482
+ class GSSelfAttnBlock(nn.Module):
483
+ def __init__(self, dim=180, num_heads=6, num_selfattn_layers=4, num_gs_seed_sqrt = 12, rope_mixed = True, rope_theta=10.0):
484
+ super(GSSelfAttnBlock, self).__init__()
485
+ self.num_gs_seed_sqrt = num_gs_seed_sqrt
486
+
487
+ self.mlp = nn.Sequential(
488
+ nn.Linear(dim, dim),
489
+ nn.ReLU(),
490
+ nn.Linear(dim, dim)
491
+ )
492
+ self.norm = nn.LayerNorm(dim)
493
+ self.blocks = nn.ModuleList([
494
+ GSSelfAttnLayer(
495
+ dim = dim,
496
+ num_heads = num_heads,
497
+ num_gs_seed_sqrt=num_gs_seed_sqrt,
498
+ shift_size=0 if i % 2 == 0 else num_gs_seed_sqrt // 2,
499
+ rope_mixed = rope_mixed, rope_theta=rope_theta
500
+ ) for i in range(num_selfattn_layers)
501
+ ])
502
+
503
+ self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
504
+
505
+ def forward(self, gs, pos, h_count, w_count, scale_embedding):
506
+ resi = gs
507
+ gs = self.norm(gs)
508
+ for block in self.blocks:
509
+ gs = block(gs, pos, h_count, w_count, scale_embedding)
510
+
511
+ gs = self.mlp(gs)
512
+ gs = rearrange(gs, '(b m n) (h w) c -> b c (m h) (n w)', m=h_count, n=w_count, h=self.num_gs_seed_sqrt)
513
+ gs = self.conv(gs)
514
+ gs = rearrange(gs, 'b c (m h) (n w) -> (b m n) (h w) c', m=h_count, n=w_count, h=self.num_gs_seed_sqrt)
515
+ gs = gs + resi
516
+ return gs
517
+
518
+ class Fea2GS_ROPE_AMP(nn.Module):
519
+ def __init__(self, inchannel=64, channel=192, num_heads=6, num_crossattn_blocks=1, num_crossattn_layers=2, num_selfattn_blocks = 6, num_selfattn_layers = 6,
520
+ num_gs_seed=144, gs_up_factor=1.0, window_size=12, img_range=1.0, shuffle_scale1 = 2, shuffle_scale2 = 2, use_checkpoint = False,
521
+ rope_mixed = True, rope_theta = 10.0):
522
+ """
523
+ Args:
524
+ gs_repeat_factor: the ratio of GS embeddings to pixels along width & height; the decoder generates
525
+ (h * gs_repeat_factor) * (w * gs_repeat_factor) GS embeddings, so higher values mean more GS embeddings.
526
+ gs_up_factor: how many 2D Gaussians are generated by one Gaussian embedding.
527
+ """
528
+ super(Fea2GS_ROPE_AMP, self).__init__()
529
+ self.channel = channel
530
+ self.nhead = num_heads
531
+ self.gs_up_factor = gs_up_factor
532
+ self.num_gs_seed = num_gs_seed
533
+ self.window_size = window_size
534
+ self.img_range = img_range
535
+ self.use_checkpoint = use_checkpoint
536
+
537
+ self.num_gs_seed_sqrt = int(math.sqrt(num_gs_seed))
538
+ self.gs_up_factor_sqrt = int(math.sqrt(gs_up_factor))
539
+
540
+ self.shuffle_scale1 = shuffle_scale1
541
+ self.shuffle_scale2 = shuffle_scale2
542
+
543
+ # shared gaussian embedding and its pos embedding
544
+ self.gs_embedding = nn.Parameter(torch.randn(self.num_gs_seed, channel), requires_grad=True)
545
+ self.pos_embedding = nn.Parameter(torch.randn(self.num_gs_seed, channel), requires_grad=True)
546
+
547
+ self.img_feat_proj = nn.Sequential(
548
+ nn.Conv2d(inchannel, channel, 3, 1, 1),
549
+ nn.ReLU(),
550
+ nn.Conv2d(channel, channel, 3, 1, 1)
551
+ )
552
+
553
+ self.window_crossattn_blocks = nn.ModuleList([
554
+ WindowCrossAttnBlock(dim=channel,
555
+ window_size=window_size,
556
+ num_heads=num_heads,
557
+ num_layers=num_crossattn_layers,
558
+ num_gs_seed=num_gs_seed, rope_mixed = rope_mixed, rope_theta = rope_theta) for i in range(num_crossattn_blocks)
559
+ ])
560
+
561
+ self.gs_selfattn_blocks = nn.ModuleList([
562
+ GSSelfAttnBlock(dim=channel,
563
+ num_heads=num_heads,
564
+ num_selfattn_layers=num_selfattn_layers,
565
+ num_gs_seed_sqrt=self.num_gs_seed_sqrt,
566
+ rope_mixed = rope_mixed, rope_theta=rope_theta
567
+ ) for i in range(num_selfattn_blocks)
568
+ ])
569
+
570
+ # GS sigma_x, sigma_y
571
+ self.mlp_block_sigma = nn.Sequential(
572
+ nn.Linear(channel, channel),
573
+ nn.ReLU(),
574
+ nn.Linear(channel, channel * 4),
575
+ nn.ReLU(),
576
+ nn.Linear(channel * 4, int(2 * gs_up_factor))
577
+ )
578
+
579
+ # GS rho
580
+ self.mlp_block_rho = nn.Sequential(
581
+ nn.Linear(channel, channel),
582
+ nn.ReLU(),
583
+ nn.Linear(channel, channel * 4),
584
+ nn.ReLU(),
585
+ nn.Linear(channel * 4, int(1 * gs_up_factor))
586
+ )
587
+
588
+ # GS alpha
589
+ self.mlp_block_alpha = nn.Sequential(
590
+ nn.Linear(channel, channel),
591
+ nn.ReLU(),
592
+ nn.Linear(channel, channel * 4),
593
+ nn.ReLU(),
594
+ nn.Linear(channel * 4, int(1 * gs_up_factor))
595
+ )
596
+
597
+ # GS RGB values
598
+ self.mlp_block_rgb = nn.Sequential(
599
+ nn.Linear(channel, channel),
600
+ nn.ReLU(),
601
+ nn.Linear(channel, channel * 4),
602
+ nn.ReLU(),
603
+ nn.Linear(channel * 4, int(3 * gs_up_factor))
604
+ )
605
+
606
+ # GS mean_x, mean_y
607
+ self.mlp_block_mean = nn.Sequential(
608
+ nn.Linear(channel, channel),
609
+ nn.ReLU(),
610
+ nn.Linear(channel, channel * 4),
611
+ nn.ReLU(),
612
+ nn.Linear(channel * 4, int(2 * gs_up_factor))
613
+ )
614
+
615
+ self.scale_mlp = nn.Sequential(
616
+ nn.Linear(1, channel * 4),
617
+ nn.ReLU(),
618
+ nn.Linear(channel * 4, channel)
619
+ )
620
+
621
+ self.UPNet = nn.Sequential(
622
+ nn.Conv2d(channel, channel * self.shuffle_scale1 * self.shuffle_scale1, 3, 1, 1),
623
+ nn.PixelShuffle(self.shuffle_scale1),
624
+ nn.Conv2d(channel, channel * self.shuffle_scale2 * self.shuffle_scale2, 3, 1, 1),
625
+ nn.PixelShuffle(self.shuffle_scale2)
626
+ )
627
+
628
+ self.conv_final = nn.Conv2d(channel, channel, 3, 1, 1)
629
+
630
+ @staticmethod
631
+ def get_N_reference_points(h, w, device='cuda'):
632
+ # step_y = 1/(h+1)
633
+ # step_x = 1/(w+1)
634
+ step_y = 1 / h
635
+ step_x = 1 / w
636
+ ref_y, ref_x = torch.meshgrid(torch.linspace(step_y / 2, 1 - step_y / 2, h, dtype=torch.float32, device=device),
637
+ torch.linspace(step_x / 2, 1 - step_x / 2, w, dtype=torch.float32, device=device))
638
+ reference_points = torch.stack((ref_x.reshape(-1), ref_y.reshape(-1)), -1)
639
+ reference_points = reference_points[None, :, None]
640
+ return reference_points
641
+
642
+ def forward(self, srcs, scale):
643
+ '''
644
+ using deformable detr decoder for cross attention
645
+ Args:
646
+ query: (batch_size, num_query, dim)
647
+ query_pos: (batch_size, num_query, dim)
648
+ srcs: (batch_size, dim, h1, w1)
649
+ '''
650
+ b, c, h, w = srcs.shape ###srcs is pad to the size that could be divided by window_size
651
+ query = self.gs_embedding.unsqueeze(0).unsqueeze(1).repeat(b, (h // self.window_size) * (w // self.window_size),
652
+ 1, 1) # b, h_count*w_count, num_gs_seed, channel
653
+ query = query.reshape(b * (h // self.window_size) * (w // self.window_size), -1,
654
+ self.channel) # b*h_count*w_count, num_gs_seed, channel
655
+
656
+ scale = 1 / scale
657
+ scale = scale.unsqueeze(1) # b*1
658
+ scale_embedding = self.scale_mlp(scale) # b*channel
659
+ scale_embedding = scale_embedding.unsqueeze(1).unsqueeze(2).repeat(1, (h // self.window_size) * (
660
+ w // self.window_size), self.num_gs_seed, 1) # b, h_count*w_count, num_gs_seed, channel
661
+ scale_embedding = scale_embedding.reshape(b * (h // self.window_size) * (w // self.window_size), -1,
662
+ self.channel) # b*h_count*w_count, num_gs_seed, channel
663
+
664
+ query_pos = self.pos_embedding.unsqueeze(0).unsqueeze(1).repeat(b, (h // self.window_size) * (
665
+ w // self.window_size), 1, 1) # b, h_count*w_count, num_gs_seed, channel
666
+
667
+ feat = self.img_feat_proj(srcs) # b*channel*h*w
668
+
669
+ query_pos = query_pos.reshape(b * (h // self.window_size) * (w // self.window_size), -1,
670
+ self.channel) # b*h_count*w_count, num_gs_seed, channel
671
+
672
+ for block in self.window_crossattn_blocks:
673
+ if self.use_checkpoint:
674
+ query = checkpoint(block, query, query_pos, feat, scale_embedding, h // self.window_size, w // self.window_size)
675
+ else:
676
+ query = block(query, query_pos, feat, scale_embedding, h // self.window_size, w // self.window_size) # b*h_count*w_count, num_gs_seed, channel
677
+
678
+ resi = query
679
+ for block in self.gs_selfattn_blocks:
680
+ if self.use_checkpoint:
681
+ query = checkpoint(block, query, query_pos, h // self.window_size, w // self.window_size, scale_embedding)
682
+ else:
683
+ query = block(query, query_pos, h // self.window_size, w // self.window_size, scale_embedding)
684
+
685
+
686
+ query = rearrange(query, '(b m n) (h w) c -> b c (m h) (n w)', m=h // self.window_size, n=w // self.window_size,
687
+ h=self.num_gs_seed_sqrt)
688
+ query = self.conv_final(query)
689
+
690
+
691
+ resi = rearrange(resi, '(b m n) (h w) c -> b c (m h) (n w)', m=h // self.window_size, n=w // self.window_size,
692
+ h=self.num_gs_seed_sqrt)
693
+
694
+ query = query + resi
695
+ query = self.UPNet(query)
696
+ query = query.permute(0,2,3,1)
697
+
698
+ # query = rearrange(query, '(b m n) (h w) c -> b m h n w c', m=h // self.window_size, n=w // self.window_size,
699
+ # h=self.num_gs_seed_sqrt)
700
+
701
+ query_sigma = self.mlp_block_sigma(query).reshape(b, -1, 2)
702
+ query_rho = self.mlp_block_rho(query).reshape(b, -1, 1)
703
+ query_alpha = self.mlp_block_alpha(query).reshape(b, -1, 1)
704
+ query_rgb = self.mlp_block_rgb(query).reshape(b, -1, 3)
705
+ query_mean = self.mlp_block_mean(query).reshape(b, -1, 2)
706
+
707
+ query_mean = query_mean / torch.tensor(
708
+ [self.num_gs_seed_sqrt * (w // self.window_size) * self.shuffle_scale1 * self.shuffle_scale2,
709
+ self.num_gs_seed_sqrt * (h // self.window_size) * self.shuffle_scale1 * self.shuffle_scale2])[
710
+ None, None].to(query_mean.device) # b, h_count*w_count*num_gs_seed, 2
711
+
712
+ reference_offset = self.get_N_reference_points(self.num_gs_seed_sqrt * (h // self.window_size) * self.shuffle_scale1 * self.shuffle_scale2,
713
+ self.num_gs_seed_sqrt * (w // self.window_size) * self.shuffle_scale1 * self.shuffle_scale2, srcs.device)
714
+ query_mean = query_mean + reference_offset.reshape(1, -1, 2)
715
+
716
+ query = torch.cat([query_sigma, query_rho, query_alpha, query_rgb, query_mean],
717
+ dim=-1) # b, h_count*w_count*num_gs_seed, 9
718
+
719
+ return query
720
+
721
+
722
+ if __name__ == '__main__':
723
+ srcs = torch.randn(6, 64, 64, 64, requires_grad = True).cuda()
724
+ scale = torch.randn(6).cuda()
725
+ decoder = Fea2GS_ROPE_AMP(inchannel=64, channel=192, num_heads=6,
726
+ num_crossattn_blocks=1, num_crossattn_layers=2,
727
+ num_selfattn_blocks = 6, num_selfattn_layers = 6,
728
+ num_gs_seed=256, gs_up_factor=1.0, window_size=16,
729
+ img_range=1.0, shuffle_scale1 = 2, shuffle_scale2 = 2).cuda()
730
+ import time
731
+
732
+ for i in range(10):
733
+ torch.cuda.synchronize()
734
+ time1 = time.time()
735
+ # with torch.autocast(device_type = 'cuda'):
736
+ y = decoder(srcs, scale)
737
+ torch.cuda.synchronize()
738
+ time2 = time.time()
739
+ print(f"decoder time is {time2 - time1}")
740
+ print(y.shape)
741
+
742
+ torch.cuda.synchronize()
743
+ time3 = time.time()
744
+ y.sum().backward()
745
+ torch.cuda.synchronize()
746
+ time4 = time.time()
747
+ print(f"backward time is {time4 - time3}")
748
+
749
+
utils/gaussian_splatting.py ADDED
@@ -0,0 +1,265 @@
1
+ import torch
2
+ import numpy as np
3
+ import torch.nn.functional as F
4
+ import math
5
+ import torch.nn as nn
6
+
7
+ import torchvision.utils
8
+ from torchvision.utils import save_image
9
+
10
+
11
+ def rendering_python(sigma_x, sigma_y, rho, coords, colours_with_alpha, sr_size, step_size, device):
12
+ sr_h, sr_w = sr_size[0], sr_size[1]
13
+ num_gs = sigma_x.shape[0]
14
+
15
+ sigma_x = sigma_x[...,None]
16
+ sigma_y = sigma_y[...,None]
17
+ rho = rho[...,None]
18
+ covariance = torch.stack(
19
+ [torch.stack([sigma_x**2, rho*sigma_x*sigma_y], dim=-1),
20
+ torch.stack([rho*sigma_x*sigma_y, sigma_y**2], dim=-1)],
21
+ dim=-2
22
+ )
23
+
24
+ # Check for positive semi-definiteness
25
+ determinant = (sigma_x**2) * (sigma_y**2) - (rho * sigma_x * sigma_y)**2
26
+ if (determinant < 0).any():
27
+ raise ValueError("Covariance matrix must be positive semi-definite")
28
+
29
+ inv_covariance = torch.inverse(covariance)
30
+
31
+ # Sampling progress
32
+ num_step = int(10 * 2 / step_size)
33
+ ax_h_batch = torch.tensor([i * step_size for i in range(num_step)]).to(device)[None]
34
+ ax_h_batch -= ax_h_batch.mean()
35
+ ax_w_batch = torch.tensor([i * step_size for i in range(num_step)]).to(device)[None]
36
+ ax_w_batch -= ax_w_batch.mean()
37
+
38
+ # Expanding dims for broadcasting
39
+ ax_batch_expanded_x = ax_h_batch.unsqueeze(-1).expand(-1, -1, num_step)
40
+ ax_batch_expanded_y = ax_w_batch.unsqueeze(1).expand(-1, num_step, -1)
41
+
42
+ # Creating a batch-wise meshgrid using broadcasting
43
+ xx, yy = ax_batch_expanded_x, ax_batch_expanded_y
44
+
45
+ xy = torch.stack([xx, yy], dim=-1)
46
+
47
+ max_buffer = 2000
48
+ final_image = torch.zeros((3, sr_h, sr_w), device=device)
49
+ for i in range(num_gs // max_buffer + 1):
50
+ # print('processing gs buffer id:', i, num_gs // max_buffer )
51
+ s_idx, e_idx = i * max_buffer, min((i + 1) * max_buffer, num_gs)
52
+ buffer_size = e_idx - s_idx
53
+ if buffer_size == 0:
54
+ break
55
+ # print(f"buffer_size is {buffer_size}")
56
+ buff_inv_covariance = inv_covariance[s_idx:e_idx]
57
+ buff_covariance = covariance[s_idx:e_idx]
58
+ buffer_pixel_coords = coords[s_idx:e_idx]
59
+ buffer_alpha = colours_with_alpha[s_idx:e_idx].unsqueeze(-1).unsqueeze(-1)
60
+
61
+ z = torch.einsum('b...i,b...ij,b...j->b...', xy, -0.5 * buff_inv_covariance, xy)
62
+ kernel = torch.exp(z) / (2 * torch.tensor(np.pi, device=device) * torch.sqrt(torch.det(buff_covariance)).view(buffer_size, 1, 1))
63
+
64
+ kernel_max = kernel.max(dim=-1, keepdim=True)[0].max(dim=-2, keepdim=True)[0]
65
+ kernel_normalized = kernel / (kernel_max + 1e-4)
66
+ kernel_reshaped = kernel_normalized.repeat(1, 3, 1).view(buffer_size * 3, num_step, num_step)
67
+ kernel_reshaped = kernel_reshaped.unsqueeze(0).reshape(buffer_size, 3, num_step, num_step)
68
+
69
+ b, c, h, w = kernel_reshaped.shape
70
+
71
+ # Create a batch of 2D affine matrices
72
+ theta = torch.zeros(b, 2, 3, dtype=torch.float32, device=device)
73
+ theta[:, 0, 0] = 1 * sr_w / num_step
74
+ theta[:, 1, 1] = 1 * sr_h / num_step
75
+ theta[:, 0, 2] = -buffer_pixel_coords[:, 0] * sr_w / num_step # !!!!!!!! note -1
76
+ theta[:, 1, 2] = -buffer_pixel_coords[:, 1] * sr_h / num_step # !!!!!!!! note -1
77
+
78
+ grid = F.affine_grid(theta, size=(b, c, sr_h, sr_w), align_corners=False) # !!!!! align_corners=False
79
+ kernel_reshaped_translated = F.grid_sample(kernel_reshaped, grid,
80
+ align_corners=False) # !!!! align_corners=False
81
+ buffer_final_image = buffer_alpha * kernel_reshaped_translated
82
+ final_image += buffer_final_image.sum(0)
83
+
84
+ return final_image
85
+
86
+ def rendering_cuda(sigma_x, sigma_y, rho, coords, colours_with_alpha, sr_size, step_size, device):
87
+ from utils.gs_cuda.gswrapper import GSCUDA
88
+ sigmas = torch.cat([sigma_y/step_size*2/(sr_size[1] - 1), sigma_x/step_size*2/(sr_size[0] - 1), rho], dim=-1).contiguous() # (gs num, 3)
89
+ coords[:, 0] = (coords[:, 0] + 1 - 1/sr_size[1]) * sr_size[1] / (sr_size[1] - 1) - 1.0
90
+ coords[:, 1] = (coords[:, 1] + 1 - 1/sr_size[0]) * sr_size[0] / (sr_size[0] - 1) - 1.0
91
+ colours_with_alpha = colours_with_alpha.contiguous() # (gs num, 3)
92
+ rendered_img = torch.zeros(sr_size[0], sr_size[1], 3).to(device).type(torch.float32).contiguous()
93
+ # with torch.no_grad():
94
+ # final_image = GSCUDA.apply(sigmas, coords, colours_with_alpha, rendered_img)
95
+ # final_image = (torch.sum(sigmas)+torch.sum(coords)+torch.sum(colours_with_alpha))*final_image
96
+ final_image = GSCUDA.apply(sigmas, coords, colours_with_alpha, rendered_img)
97
+ final_image = final_image.permute(2, 0, 1).contiguous()
98
+ return final_image
99
+
100
+ def rendering_cuda_buffer(sigma_x, sigma_y, rho, coords, colours_with_alpha, sr_size, step_size, device, buffer_size = 1000000):
101
+ from utils.gs_cuda.gswrapper import GSCUDA
102
+ sigmas = torch.cat([sigma_y/step_size*2/(sr_size[1] - 1), sigma_x/step_size*2/(sr_size[0] - 1), rho], dim=-1).contiguous() # (gs num, 3)
103
+ coords[:, 0] = (coords[:, 0] + 1 - 1/sr_size[1]) * sr_size[1] / (sr_size[1] - 1) - 1.0
104
+ coords[:, 1] = (coords[:, 1] + 1 - 1/sr_size[0]) * sr_size[0] / (sr_size[0] - 1) - 1.0
105
+ colours_with_alpha = colours_with_alpha.contiguous() # (gs num, 3)
106
+ final_image = torch.zeros(sr_size[0], sr_size[1], 3).to(device).type(torch.float32).contiguous()
107
+
108
+ # buffer
109
+ buffer_num = len(sigma_x)// buffer_size+1
110
+ for buffer_id in range(buffer_num):
111
+ # print(f'processing{buffer_id+1}/{buffer_num}')
112
+ idx_start, idx_end = buffer_id * buffer_size, (buffer_id+1) * buffer_size
113
+ final_image = GSCUDA.apply(sigmas[idx_start:idx_end], coords[idx_start:idx_end],
114
+ colours_with_alpha[idx_start:idx_end], final_image)
115
+ # final_image += buffer_image
116
+ final_image = final_image.permute(2, 0, 1).contiguous()
117
+ return final_image
118
+
119
+ def rendering_cuda_dmax(sigma_x, sigma_y, rho, coords, colours_with_alpha, sr_size, step_size, device, dmax=1):
120
+ from utils.gs_cuda_dmax.gswrapper import GSCUDA
121
+ sigmas = torch.cat([sigma_y/step_size*2/(sr_size[1] - 1), sigma_x/step_size*2/(sr_size[0] - 1), rho], dim=-1).contiguous() # (gs num, 3)
122
+ coords[:, 0] = (coords[:, 0] + 1 - 1/sr_size[1]) * sr_size[1] / (sr_size[1] - 1) - 1.0
123
+ coords[:, 1] = (coords[:, 1] + 1 - 1/sr_size[0]) * sr_size[0] / (sr_size[0] - 1) - 1.0
124
+ colours_with_alpha = colours_with_alpha.contiguous() # (gs num, 3)
125
+ rendered_img = torch.zeros(sr_size[0], sr_size[1], 3).to(device).type(torch.float32).contiguous()
126
+ # with torch.no_grad():
127
+ # final_image = GSCUDA.apply(sigmas, coords, colours_with_alpha, rendered_img, dmax)
128
+ # final_image = (torch.sum(sigmas)+torch.sum(coords)+torch.sum(colours_with_alpha))*final_image
129
+ final_image = GSCUDA.apply(sigmas, coords, colours_with_alpha, rendered_img, dmax)
130
+ final_image = final_image.permute(2, 0, 1).contiguous()
131
+ return final_image
132
+
133
+ def rendering_cuda_dmax_buffer(sigma_x, sigma_y, rho, coords, colours_with_alpha, sr_size, step_size, device, dmax=1, buffer_size = 1000000):
134
+ from utils.gs_cuda_dmax.gswrapper import GSCUDA
135
+ sigmas = torch.cat([sigma_y/step_size*2/(sr_size[1] - 1), sigma_x/step_size*2/(sr_size[0] - 1), rho], dim=-1).contiguous() # (gs num, 3)
136
+ coords[:, 0] = (coords[:, 0] + 1 - 1/sr_size[1]) * sr_size[1] / (sr_size[1] - 1) - 1.0
137
+ coords[:, 1] = (coords[:, 1] + 1 - 1/sr_size[0]) * sr_size[0] / (sr_size[0] - 1) - 1.0
138
+ colours_with_alpha = colours_with_alpha.contiguous() # (gs num, 3)
139
+
140
+ final_image = torch.zeros(sr_size[0], sr_size[1], 3).to(device).type(torch.float32).contiguous()
141
+ # with torch.no_grad():
142
+ # final_image = GSCUDA.apply(sigmas, coords, colours_with_alpha, rendered_img, dmax)
143
+ # final_image = (torch.sum(sigmas)+torch.sum(coords)+torch.sum(colours_with_alpha))*final_image
144
+
145
+ # buffer
146
+ buffer_num = len(sigma_x)// buffer_size+1
147
+ for buffer_id in range(buffer_num):
148
+ # print(f'processing{buffer_id+1}/{buffer_num}')
149
+ idx_start, idx_end = buffer_id * buffer_size, (buffer_id+1) * buffer_size
150
+ final_image = GSCUDA.apply(sigmas[idx_start:idx_end], coords[idx_start:idx_end],
151
+ colours_with_alpha[idx_start:idx_end], final_image, dmax)
152
+ # final_image += buffer_image
153
+
154
+ final_image = final_image.permute(2, 0, 1).contiguous()
155
+ return final_image
156
+
157
+
158
+ def generate_2D_gaussian_splatting_step(sr_size, gs_parameters, scale, scale_modify,
159
+ sample_coords = None, default_step_size = 1.2,
160
+ cuda_rendering=True, mode = 'scale_modify',
161
+ if_dmax = True,
162
+ dmax_mode = 'fix',
163
+ dmax = 25):
164
+
165
+ # set step_size according to scale factor
166
+ if mode == 'scale':
167
+ final_scale = scale
168
+ elif mode == 'scale_modify':
169
+ assert scale_modify[0] == scale_modify[1], f"scale_modify values must be equal, got {scale_modify}"
170
+ final_scale = scale_modify[0]
171
+ step_size = default_step_size/ final_scale
172
+
173
+ # prepare gaussian properties
174
+ sigma_x = 0.99999 * torch.sigmoid(gs_parameters[:, 0:1]) + 1e-6
175
+ sigma_y = 0.99999 * torch.sigmoid(gs_parameters[:, 1:2]) + 1e-6
176
+ rho = 0.999999 * torch.tanh(gs_parameters[:, 2:3])
177
+ alpha = torch.sigmoid(gs_parameters[:, 3:4])
178
+ colours = torch.sigmoid(gs_parameters[:, 4:7])
179
+ coords = (gs_parameters[:, 7:9] * 2 - 1)
180
+ colours_with_alpha = colours * alpha
181
+
182
+
183
+ ## todo for save GS parameters
184
+ # GS_parameters = torch.cat([sigma_x, sigma_y, rho, alpha, colours, coords], dim = 1)
185
+ # torch.save(GS_parameters.cpu(), "/home/notebook/code/personal/S9053766/chendu/myprojects/GSSR_20240606/results/0804_48*48.pt")
186
+ # print(f"GS_parameter shape is {GS_parameters.shape}")
187
+ # print(f"-------")
188
+
189
+ # todo for visualization the position of Gaussian
190
+ # select = (torch.randn_like(alpha[..., 0])>2.5)
191
+ # colours_with_alpha[select, 0] = 1
192
+ # colours_with_alpha[select, 1] = 0
193
+ # colours_with_alpha[select, 2] = 0
194
+ # todo for visualization the shape of Gaussian
195
+ # sigma_x = torch.ones_like(sigma_x)*0.05
196
+ # sigma_y = torch.ones_like(sigma_y)*0.05
197
+ # rho = torch.ones_like(rho) * 0
198
+ # colours_with_alpha = torch.ones_like(colours_with_alpha)*0.5
199
+
200
+ # rendering
201
+ if cuda_rendering:
202
+ if if_dmax:
203
+ if dmax_mode == 'dynamic':
204
+ dmax = (dmax + 2) / min(sr_size[0], sr_size[1])
205
+ elif dmax_mode == 'fix':
206
+ pass
207
+ else:
208
+ raise ValueError(f"dmax_mode-{dmax_mode} must be fix or dynamic")
209
+ final_image = rendering_cuda_dmax(sigma_x, sigma_y, rho, coords, colours_with_alpha, sr_size, step_size, dmax=dmax, device=sigma_x.device)
210
+ else:
211
+ final_image = rendering_cuda(sigma_x, sigma_y, rho, coords, colours_with_alpha, sr_size, step_size, device=sigma_x.device)
212
+ else:
213
+ final_image = rendering_python(sigma_x, sigma_y, rho, coords, colours_with_alpha, sr_size, step_size, device=sigma_x.device)
214
+ if sample_coords is not None:
215
+ sample_RGB_values = [final_image[:, coord[0], coord[1]] for coord in sample_coords]
216
+ final_image = torch.stack(sample_RGB_values, dim = 1)
217
+ return final_image
218
+
219
+ def generate_2D_gaussian_splatting_step_buffer(sr_size, gs_parameters, scale, scale_modify,
220
+ sample_coords = None, default_step_size = 1.2,
221
+ cuda_rendering=True, mode = 'scale_modify',
222
+ if_dmax = True,
223
+ dmax_mode = 'fix',
224
+ dmax = 25,
225
+ buffer_size = 4000000):
226
+
227
+ # set step_size according to scale factor
228
+ if mode == 'scale':
229
+ final_scale = scale
230
+ elif mode == 'scale_modify':
231
+ assert scale_modify[0] == scale_modify[1], f"scale_modify values must be equal, got {scale_modify}"
232
+ final_scale = scale_modify[0]
233
+ step_size = default_step_size/ final_scale
234
+
235
+ # prepare gaussian properties
236
+ sigma_x = 0.99999 * torch.sigmoid(gs_parameters[:, 0:1]) + 1e-6
237
+ sigma_y = 0.99999 * torch.sigmoid(gs_parameters[:, 1:2]) + 1e-6
238
+ rho = 0.999999 * torch.tanh(gs_parameters[:, 2:3])
239
+ alpha = torch.sigmoid(gs_parameters[:, 3:4])
240
+ colours = torch.sigmoid(gs_parameters[:, 4:7])
241
+ coords = (gs_parameters[:, 7:9] * 2 - 1)
242
+ colours_with_alpha = colours * alpha
243
+
244
+ # rendering
245
+ if cuda_rendering:
246
+ if if_dmax:
247
+ if dmax_mode == 'dynamic':
248
+ dmax = (dmax + 2) / min(sr_size[0], sr_size[1])
249
+ elif dmax_mode == 'fix':
250
+ pass
251
+ else:
252
+ raise ValueError(f"dmax_mode-{dmax_mode} must be fix or dynamic")
253
+ final_image = rendering_cuda_dmax_buffer(sigma_x, sigma_y, rho, coords, colours_with_alpha,
254
+ sr_size, step_size, dmax=dmax, device=sigma_x.device,
255
+ buffer_size = buffer_size)
256
+ else:
257
+ final_image = rendering_cuda_buffer(sigma_x, sigma_y, rho, coords, colours_with_alpha,
258
+ sr_size, step_size, device=sigma_x.device,
259
+ buffer_size = buffer_size)
260
+ else:
261
+ final_image = rendering_python(sigma_x, sigma_y, rho, coords, colours_with_alpha, sr_size, step_size, device=sigma_x.device)
262
+ if sample_coords is not None:
263
+ sample_RGB_values = [final_image[:, coord[0], coord[1]] for coord in sample_coords]
264
+ final_image = torch.stack(sample_RGB_values, dim = 1)
265
+ return final_image
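Note: the renderer above can be exercised on its own. The sketch below is illustrative only; it assumes nothing beyond the 9-channel gs_parameters layout that generate_2D_gaussian_splatting_step slices out (sigma_x, sigma_y, rho, alpha, RGB, xy), the helper name demo_render is hypothetical, and cuda_rendering=False selects the pure-PyTorch rendering_python path so no compiled extension is needed.

import torch

def demo_render(num_gs=512, sr_size=(96, 96), scale=4.0, device='cpu'):
    # Raw (pre-activation) Gaussian parameters; sigmoid/tanh are applied inside.
    gs_parameters = torch.randn(num_gs, 9, device=device)
    # scale_modify must hold two equal values (see the assert above).
    img = generate_2D_gaussian_splatting_step(
        sr_size, gs_parameters, scale, scale_modify=(scale, scale),
        cuda_rendering=False, mode='scale_modify')
    return img  # (3, H, W) tensor

# img = demo_render()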
utils/gs_cuda/check.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from gswrapper import gaussiansplatting_render
3
+
4
+ def torch_version(sigmas, coords, colors, image_size):
5
+ h, w = image_size
6
+ c = colors.shape[-1]
7
+
8
+ if h >= 50 or w >= 50:
9
+ print(f'warning: too large values for h({h}), w({w}); the torch version will be slow')
10
+
11
+ rendered_img = torch.zeros(h, w, c).to(colors.device).to(torch.float32)
12
+
13
+ for hi in range(h):
14
+ for wi in range(w):
15
+ curh = 2*hi/(h-1)-1.0
16
+ curw = 2*wi/(w-1)-1.0
17
+
18
+ v = (curw-coords[:,0])**2/sigmas[:,0]**2
19
+ v -= (2*sigmas[:,2])*(curw-coords[:,0])*(curh-coords[:,1])/sigmas[:,0]/sigmas[:,1]
20
+ v += (curh-coords[:,1])**2/sigmas[:,1]**2
21
+ v *= -1.0/(2.0*(1-sigmas[:,2]**2))
22
+ v = torch.exp(v)
23
+
24
+ for ci in range(c):
25
+ rendered_img[hi, wi, ci] = torch.sum(v*colors[:, ci])
26
+
27
+ return rendered_img
28
+
29
+
30
+ if __name__ == "__main__":
31
+ s = 40 # the number of gs
32
+ image_size = (49, 49)
33
+
34
+ for _ in range(1):
35
+ print(f"--------------------------- begins --------------------------------")
36
+
37
+ sigmas = 0.999*torch.rand(s, 3).to(torch.float32).to("cuda")
38
+ # sigmas[:,:2] = 5*sigmas[:, :2]
39
+ coords = 2*torch.rand(s, 2).to(torch.float32).to("cuda")-1.0
40
+ colors = torch.rand(s, 3).to(torch.float32).to("cuda")
41
+
42
+ # sigmas = torch.Tensor([[0.9196, 0.3979, 0.7784]]).to(torch.float32).to("cuda")
43
+ # coords = torch.Tensor([[-0.0469, -0.1726]]).to(torch.float32).to("cuda")
44
+ # colors = torch.Tensor([[0.3775, 0.2346, 0.1513]]).to(torch.float32).to("cuda")
45
+ # colors = torch.ones_like(coords[:,0:1])
46
+
47
+ print(f"sigmas: {sigmas}, \ncoords:{coords}, \ncolors:{colors}")
48
+
49
+ # --- check forward ---
50
+ with torch.no_grad():
51
+ rendered_img_th = torch_version(sigmas,coords,colors,image_size)
52
+ rendered_img_cuda = gaussiansplatting_render(sigmas,coords,colors,image_size)
53
+
54
+ #
55
+ distance = (rendered_img_th-rendered_img_cuda)**2
56
+ print(f"check forward - torch: {rendered_img_th[:2,:2,0]}")
57
+ print(f"check forward - cuda: {rendered_img_cuda[:2,:2,0]}")
58
+ print(f"check forward - distance: {distance[:2, :2, 0]}")
59
+ print(f"check forward - sum: {torch.sum(distance)}\n")
60
+ # --- ends ---
61
+
62
+ # --- check backward ---
63
+ sigmas.requires_grad_(True)
64
+ coords.requires_grad_(True)
65
+ colors.requires_grad_(True)
66
+ # sigmas.retain_grad()
67
+ # coords.retain_grad()
68
+ # colors.retain_grad()
69
+ weight = torch.rand_like(rendered_img_th) # make each pixel has different grads
70
+
71
+ sigmas.grad = None
72
+ coords.grad = None
73
+ colors.grad = None
74
+ rendered_img_th = torch_version(sigmas,coords,colors,image_size)
75
+ loss_th = torch.sum(weight*rendered_img_th)
76
+ loss_th.backward()
77
+
78
+ sigmas_grad_th = sigmas.grad
79
+ coords_grad_th = coords.grad
80
+ colors_grad_th = colors.grad
81
+
82
+ sigmas.grad = None
83
+ coords.grad = None
84
+ colors.grad = None
85
+ rendered_img_cuda = gaussiansplatting_render(sigmas,coords,colors,image_size)
86
+ loss_cuda = torch.sum(weight*rendered_img_cuda)
87
+ # loss_cuda = torch.sum(rendered_img_cuda)
88
+ loss_cuda.backward()
89
+
90
+ sigmas_grad_cuda = sigmas.grad
91
+ coords_grad_cuda = coords.grad
92
+ colors_grad_cuda = colors.grad
93
+
94
+ distance_sigmas_grad = (sigmas_grad_th-sigmas_grad_cuda)**2
95
+ distance_coords_grad = (coords_grad_th-coords_grad_cuda)**2
96
+ distance_colors_grad = (colors_grad_th-colors_grad_cuda)**2
97
+
98
+ print(f"check backward - sigmas - torch: {sigmas_grad_th[:2]}")
99
+ print(f"check backward - sigmas - cuda: {sigmas_grad_cuda[:2]}")
100
+ print(f"check backward - sigmas - distance: {distance_sigmas_grad[:2]}")
101
+ print(f"check backward - sigmas - sum: {torch.sum(distance_sigmas_grad)}\n")
102
+
103
+ print(f"check backward - coords - torch: {coords_grad_th[:2]}")
104
+ print(f"check backward - coords - cuda: {coords_grad_cuda[:2]}")
105
+ print(f"check backward - coords - distance: {distance_coords_grad[:2]}")
106
+ print(f"check backward - coords - sum: {torch.sum(distance_coords_grad)}\n")
107
+
108
+ print(f"check backward - colors - torch: {colors_grad_th[:2]}")
109
+ print(f"check backward - colors - cuda: {colors_grad_cuda[:2]}")
110
+ print(f"check backward - colors - distance: {distance_colors_grad[:2]}")
111
+ print(f"check backward - colors - sum: {torch.sum(distance_colors_grad)}\n")
112
+
113
+ print(f"--------------------------- ends --------------------------------\n\n")
114
+
115
+
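Note: since the squared distances printed above scale with the magnitude of the rendered values, a relative-error summary can make the forward/backward comparison easier to read. A small sketch follows; the helper name rel_error is illustrative and not part of this file.

import torch

def rel_error(a: torch.Tensor, b: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    # Maximum element-wise relative difference between two tensors.
    return (torch.abs(a - b) / (torch.abs(a) + torch.abs(b) + eps)).max()

# e.g. rel_error(rendered_img_th, rendered_img_cuda) after the forward check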
utils/gs_cuda/gs.cu ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <stdio.h>
2
+ #include <cmath>
3
+ #include <curand_kernel.h>
4
+
5
+ #define PI 3.1415926536
6
+ #define PI2 6.2831853072
7
+
8
+ extern "C"
9
+ __global__ void _gs_render_cuda(
10
+ const float *sigmas,
11
+ const float *coords,
12
+ const float *colors,
13
+ float *rendered_img,
14
+ const int s, // gs num
15
+ const int h,
16
+ const int w,
17
+ const int c
18
+ ){
19
+
20
+ int index = blockIdx.x*blockDim.x + threadIdx.x;
21
+ int curw = index % w;
22
+ int curh = int((index-curw)/w);
23
+ if(curw >= w || curh >=h){
24
+ return;
25
+ }
26
+
27
+ float curw_f = 2.0*curw/(w-1) - 1.0;
28
+ float curh_f = 2.0*curh/(h-1) - 1.0;
29
+
30
+ // printf("index:%d, curw:%d, curh:%d, curw_f:%f, curh_f:%f\n",index,curw,curh,curw_f,curh_f);
31
+
32
+ for(int si=0; si<s; si++){
33
+
34
+ // compute the 2d gs value
35
+ float sigma_x = sigmas[si*3+0];
36
+ float sigma_y = sigmas[si*3+1];
37
+ float rho = sigmas[si*3+2];
38
+ float x = coords[si*2+0];
39
+ float y = coords[si*2+1];
40
+
41
+ //
42
+ float one_div_one_minus_rho2 = 1.0 / (1-rho*rho) ;
43
+ float one_div_sigma_x = 1.0 / sigma_x;
44
+ float one_div_sigma_y = 1.0 / sigma_y;
45
+ float d_x = curw_f - x;
46
+ float d_y = curh_f - y;
47
+
48
+ float v = one_div_sigma_x*one_div_sigma_x*d_x*d_x;
49
+ v -= 2*rho*d_x*d_y*one_div_sigma_x*one_div_sigma_y;
50
+ v += d_y*d_y*one_div_sigma_y*one_div_sigma_y;
51
+ v *= -one_div_one_minus_rho2 / 2.0;
52
+ v = exp(v);
53
+ // since v is normalized by its max later, this constant factor is omitted (result is unchanged)
54
+ // v *= one_div_sigma_x * one_div_sigma_y * pow(one_div_one_minus_rho2, 0.5) / PI2 ;
55
+ // printf("si:%d, sigma_x: %f, sigma_y:%f, rho:%f, x:%f, y:%f, v:%f\n", si, sigma_x, sigma_y, rho, x,y,v);
56
+
57
+ for(int ci=0; ci<c; ci++){
58
+ rendered_img[(curh*w+curw)*c+ci] += v*colors[si*3+ci];
59
+ }
60
+ }
61
+ }
62
+
63
+
64
+ void _gs_render(
65
+ const float *sigmas,
66
+ const float *coords,
67
+ const float *colors,
68
+ float *rendered_img,
69
+ const int s,
70
+ const int h,
71
+ const int w,
72
+ const int c
73
+ ) {
74
+
75
+ int threads=64;
76
+ dim3 grid( h*w, 1);
77
+ dim3 block( threads, 1);
78
+ _gs_render_cuda<<<grid, block>>>(sigmas, coords, colors, rendered_img, s, h, w, c);
79
+ }
80
+
81
+ extern "C"
82
+ __global__ void _gs_render_backward_cuda(
83
+ const float *sigmas,
84
+ const float *coords,
85
+ const float *colors,
86
+ const float *grads,
87
+ float *grads_sigmas,
88
+ float *grads_coords,
89
+ float *grads_colors,
90
+ const int s, // gs num
91
+ const int h,
92
+ const int w,
93
+ const int c
94
+ ){
95
+
96
+ int curs = blockIdx.x*blockDim.x + threadIdx.x;
97
+ if(curs >= s){
98
+ return ;
99
+ }
100
+
101
+ // obtain parameters of gs
102
+ float sigma_x = sigmas[curs*3+0];
103
+ float sigma_y = sigmas[curs*3+1];
104
+ float rho = sigmas[curs*3+2];
105
+ float x = coords[curs*2+0];
106
+ float y = coords[curs*2+1];
107
+ float cr = colors[curs*3+0];
108
+ float cg = colors[curs*3+1];
109
+ float cb = colors[curs*3+2];
110
+
111
+ //
112
+ float w1 = -0.5 / (1-rho*rho) ;
113
+ float w2 = 1.0 / (sigma_x*sigma_x);
114
+ float w3 = 1.0 / (sigma_x*sigma_y);
115
+ float w4 = 1.0 / (sigma_y*sigma_y);
116
+ float od_sx = 1.0 / sigma_x;
117
+ float od_sy = 1.0 / sigma_y;
118
+
119
+ // init
120
+ float _gr=0.0, _gg=0.0, _gb=0.0;
121
+ float _gx=0.0, _gy=0.0;
122
+ float _gsx=0.0, _gsy=0.0, _gsr=0.0;
123
+
124
+ for(int hi = 0; hi < h; hi++){
125
+ for( int wi=0; wi < w; wi++){
126
+
127
+ float curw_f = 2.0*wi/(w-1) - 1.0;
128
+ float curh_f = 2.0*hi/(h-1) - 1.0;
129
+
130
+ // obtain grad to p^t_r, p^t_g, p^t_b
131
+ float gptr = grads[(hi*w+wi)*c+0]; // grad of loss to P^t_r
132
+ float gptg = grads[(hi*w+wi)*c+1];
133
+ float gptb = grads[(hi*w+wi)*c+2];
134
+
135
+ // compute the 2d gs value
136
+
137
+ float d_x = curw_f - x; // distance along x axis
138
+ float d_y = curh_f - y;
139
+ float d = w2*d_x*d_x - 2*rho*w3*d_x*d_y + w4*d_y*d_y;
140
+ float v = w1*d;
141
+ v = exp(v);
142
+ // printf("si:%d, sigma_x: %f, sigma_y:%f, rho:%f, x:%f, y:%f, v:%f\n", si, sigma_x, sigma_y, rho, x,y,v);
143
+
144
+ // compute grad of colors
145
+ _gr += v*gptr;
146
+ _gg += v*gptg;
147
+ _gb += v*gptb;
148
+
149
+ // compute grad of coords
150
+ float gpt = gptr*cr+gptg*cg+gptb*cb;
151
+ float v_2_w1 = v*2*w1;
152
+
153
+ float g_vst_to_gsx = v_2_w1*(-w2*d_x+rho*w3*d_y); // grad of v^{st} to G^s_x
154
+ _gx += gpt*g_vst_to_gsx;
155
+ float g_vst_to_gsy = v_2_w1*(-w4*d_y+rho*w3*d_x); // grad of v^{st} to G^s_y
156
+ _gy += gpt*g_vst_to_gsy;
157
+
158
+ // compute grad of sigmas
159
+ float g_vst_to_gsigx = v_2_w1*od_sx* (w3*rho*d_x*d_y - w2*d_x*d_x);
160
+ _gsx += gpt*g_vst_to_gsigx;
161
+ float g_vst_to_gsigy = v_2_w1*od_sy* (w3*rho*d_x*d_y - w4*d_y*d_y);
162
+ _gsy += gpt*g_vst_to_gsigy;
163
+ float g_vst_to_rho = -v_2_w1*(2*w1*rho*d+w3*d_x*d_y);
164
+ _gsr += gpt*g_vst_to_rho;
165
+ }
166
+ }
167
+
168
+ // write the values
169
+ grads_sigmas[curs*3+0] = _gsx;
170
+ grads_sigmas[curs*3+1] = _gsy;
171
+ grads_sigmas[curs*3+2] = _gsr;
172
+ grads_coords[curs*2+0] = _gx;
173
+ grads_coords[curs*2+1] = _gy;
174
+ grads_colors[curs*3+0] = _gr;
175
+ grads_colors[curs*3+1] = _gg;
176
+ grads_colors[curs*3+2] = _gb;
177
+
178
+ }
179
+
180
+ void _gs_render_backward(
181
+ const float *sigmas,
182
+ const float *coords,
183
+ const float *colors,
184
+ const float *grads, // (h, w, c)
185
+ float *grads_sigmas,
186
+ float *grads_coords,
187
+ float *grads_colors,
188
+ const int s,
189
+ const int h,
190
+ const int w,
191
+ const int c
192
+ ) {
193
+
194
+ int threads=64;
195
+ dim3 grid(s, 1);
196
+ dim3 block( threads, 1);
197
+ _gs_render_backward_cuda<<<grid, block>>>(sigmas, coords, colors, grads, grads_sigmas, grads_coords, grads_colors, s, h, w, c);
198
+ }
199
+
utils/gs_cuda/gs.h ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ void _gs_render(
2
+ const float *sigmas,
3
+ const float *coords,
4
+ const float *colors,
5
+ float *rendered_img,
6
+ const int s,
7
+ const int h,
8
+ const int w,
9
+ const int c
10
+ );
11
+
12
+ void _gs_render_backward(
13
+ const float *sigmas,
14
+ const float *coords,
15
+ const float *colors,
16
+ const float *grads,
17
+ float *grads_sigmas,
18
+ float *grads_coords,
19
+ float *grads_colors,
20
+ const int s,
21
+ const int h,
22
+ const int w,
23
+ const int c
24
+ );
utils/gs_cuda/gswrapper.cpp ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "gs.h"
2
+ #include <torch/extension.h>
3
+ #include <c10/cuda/CUDAGuard.h>
4
+
5
+ #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
6
+ #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
7
+ #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
8
+
9
+ void gs_render(
10
+ torch::Tensor &sigmas,
11
+ torch::Tensor &coords,
12
+ torch::Tensor &colors,
13
+ torch::Tensor &rendered_img,
14
+ const int s,
15
+ const int h,
16
+ const int w,
17
+ const int c
18
+ ){
19
+
20
+ CHECK_INPUT(sigmas);
21
+ CHECK_INPUT(coords);
22
+ CHECK_INPUT(colors);
23
+ CHECK_INPUT(rendered_img);
24
+
25
+ // run the kernel on the same CUDA device as the input tensors
26
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(sigmas));
27
+
28
+ _gs_render(
29
+ (const float *) sigmas.data_ptr(),
30
+ (const float *) coords.data_ptr(),
31
+ (const float *) colors.data_ptr(),
32
+ (float *) rendered_img.data_ptr(),
33
+ s, h, w, c);
34
+ }
35
+
36
+ void gs_render_backward(
37
+ torch::Tensor &sigmas,
38
+ torch::Tensor &coords,
39
+ torch::Tensor &colors,
40
+ torch::Tensor &grads,
41
+ torch::Tensor &grads_sigmas,
42
+ torch::Tensor &grads_coords,
43
+ torch::Tensor &grads_colors,
44
+ const int s,
45
+ const int h,
46
+ const int w,
47
+ const int c
48
+ ){
49
+
50
+ CHECK_INPUT(sigmas);
51
+ CHECK_INPUT(coords);
52
+ CHECK_INPUT(colors);
53
+ CHECK_INPUT(grads);
54
+ CHECK_INPUT(grads_sigmas);
55
+ CHECK_INPUT(grads_coords);
56
+ CHECK_INPUT(grads_colors);
57
+
58
+
59
+ // run the kernel on the same CUDA device as the input tensors
60
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(sigmas));
61
+
62
+ _gs_render_backward(
63
+ (const float *) sigmas.data_ptr(),
64
+ (const float *) coords.data_ptr(),
65
+ (const float *) colors.data_ptr(),
66
+ (const float *) grads.data_ptr(),
67
+ (float *) grads_sigmas.data_ptr(),
68
+ (float *) grads_coords.data_ptr(),
69
+ (float *) grads_colors.data_ptr(),
70
+ s, h, w, c);
71
+ }
72
+
73
+ PYBIND11_MODULE( TORCH_EXTENSION_NAME, m) {
74
+ m.def( "gs_render",
75
+ &gs_render,
76
+ "cuda forward wrapper");
77
+ m.def( "gs_render_backward",
78
+ &gs_render_backward,
79
+ "cuda backward wrapper");
80
+ }
utils/gs_cuda/gswrapper.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from torch.utils.cpp_extension import load
4
+ from torch.autograd import Function
5
+ from torch.autograd.function import once_differentiable
6
+
7
+ build_path = os.path.join(os.path.split(os.path.abspath(__file__))[0], 'build')
8
+ os.makedirs(build_path, exist_ok=True)
9
+
10
+ file_path = os.path.split(os.path.abspath(__file__))[0]
11
+ GSWrapper = load(
12
+ name="gscuda",
13
+ # sources=["gs_cuda/gswrapper.cpp", "gs_cuda/gs.cu"],
14
+ sources=[os.path.join(file_path, "gswrapper.cpp"),
15
+ os.path.join(file_path, "gs.cu")],
16
+ build_directory=build_path,
17
+ verbose=True)
18
+
19
+ class GSCUDA(Function):
20
+
21
+ @staticmethod
22
+ def forward(ctx, sigmas, coords, colors, rendered_img):
23
+ ctx.save_for_backward(sigmas, coords, colors)
24
+ h, w, c = rendered_img.shape
25
+ s = sigmas.shape[0]
26
+ GSWrapper.gs_render(sigmas, coords, colors, rendered_img, s, h, w, c)
27
+ return rendered_img
28
+
29
+ @staticmethod
30
+ @once_differentiable
31
+ def backward(ctx, grad_output):
32
+ sigmas, coords, colors = ctx.saved_tensors
33
+ h, w, c = grad_output.shape
34
+ s = sigmas.shape[0]
35
+ grads_sigmas = torch.zeros_like(sigmas)
36
+ grads_coords = torch.zeros_like(coords)
37
+ grads_colors = torch.zeros_like(colors)
38
+ GSWrapper.gs_render_backward(sigmas, coords, colors, grad_output.contiguous(), grads_sigmas, grads_coords, grads_colors, s, h, w, c)
39
+ return (grads_sigmas, grads_coords, grads_colors, None)
40
+
41
+ def gaussiansplatting_render(sigmas, coords, colors, image_size):
42
+ sigmas = sigmas.contiguous() # (gs num, 3)
43
+ coords = coords.contiguous() # (gs num, 2)
44
+ colors = colors.contiguous() # (gs num, c)
45
+ h, w = image_size[:2]
46
+ c = colors.shape[-1]
47
+ rendered_img = torch.zeros(h, w, c).to(colors.device).to(torch.float32)
48
+ return GSCUDA.apply(sigmas, coords, colors, rendered_img)
49
+
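Note: gswrapper.py JIT-compiles the extension with torch.utils.cpp_extension.load at import time. The prebuilt wheel tracked in dist/ (gscuda-0.0.0-cp310-cp310-linux_x86_64.whl, see .gitattributes) points at an ahead-of-time build as an alternative. A minimal setup.py sketch for that, assuming the same two sources are compiled into a module named gscuda, could look like the following; it is not part of this commit.

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name="gscuda",
    version="0.0.0",
    ext_modules=[
        CUDAExtension(
            name="gscuda",
            # same sources as the JIT load() call in gswrapper.py
            sources=["utils/gs_cuda/gswrapper.cpp", "utils/gs_cuda/gs.cu"],
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)

Built with `python setup.py bdist_wheel`, this would produce a wheel under dist/ that can be installed instead of compiling on first import.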
utils/gs_cuda/mylineprofiler.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import sys
4
+ import timeit
5
+ import tokenize
6
+ import torch
7
+ import psutil
8
+ import inspect
9
+ from loguru import logger
10
+ from prettytable import PrettyTable
11
+
12
+ # implement by xtudbxk
13
+ # github: https://github.com/xtudbxk/lineprofiler
14
+ class MyLineProfiler():
15
+ def __init__(self, base='ms', cuda_sync=True, gpuids=(0,), warmup=0, warmup_lineno=-1):
16
+
17
+ if base == 'ms':
18
+ self.base_n = 1000
19
+ elif base == 's':
20
+ self.base_n = 1
21
+ else:
22
+ logger.warning(f'Unsupported base - {base}, using "s" instead'); self.base_n = 1; base = 's'
23
+
24
+ self.base = base
25
+ self.cuda_sync = cuda_sync
26
+ self.gpuids = gpuids
27
+ self.warmup = warmup
28
+ self.warmup_counter = warmup
29
+ # we should wait this line execute warup_counter times
30
+ # before recording the stats
31
+ self.warmup_lineno = warmup_lineno
32
+
33
+ # for time profiling
34
+ self._times = {}
35
+ self._func_name = None
36
+ self._func_filename = None
37
+ self._last_time = -1
38
+ self._last_lineno = -1
39
+ self._func_hit_count = 0
40
+ self._func_firstlineno = 0
41
+
42
+ # for memory profiling
43
+ self._process = psutil.Process(os.getpid())
44
+ self._memory = {}
45
+ self._last_memory = 0
46
+
47
+ # for cuda memory profiling
48
+ self._gpu_memory = {}
49
+ self._gpu_last_memory = 0
50
+
51
+ def __trace_func__(self, frame, event, arg):
52
+ # print(f'in {frame.f_code.co_filename} func {frame.f_code.co_name} line {frame.f_lineno}, event - {event}')
53
+
54
+ # check if run into the decorated func
55
+ if self._func_firstlineno == frame.f_code.co_firstlineno and frame.f_code.co_name == self._func_name and frame.f_code.co_filename == self._func_filename:
56
+
57
+ # --- obtain info for current hit ---
58
+ # cuda related
59
+ if self.cuda_sync is True:
60
+ torch.cuda.synchronize()
61
+
62
+ current_time = timeit.default_timer()
63
+ memory = self._process.memory_info().rss
64
+ gpu_memory = torch.cuda.memory_allocated()
65
+ # --- ends ---
66
+
67
+ # --- initialize the info on the first hit of this line ---
68
+ if frame.f_lineno not in self._times: # first hit time for this line
69
+ self._times[frame.f_lineno] = {'hit':0, 'time': 0}
70
+ self._memory[frame.f_lineno] = 0
71
+ self._gpu_memory[frame.f_lineno] = 0
72
+ # --- ends ---
73
+
74
+ # --- record info before call the decorated func ---
75
+ # 'call' - before call the func
76
+ if event == 'call':
77
+ self._last_time = current_time
78
+ self._last_lineno = frame.f_lineno
79
+ self._last_memory = memory
80
+ self._last_gpu_memory = gpu_memory
81
+
82
+ if self.warmup_lineno < 0:
83
+ self.warmup_counter -= 1
84
+ if self.warmup_counter < 0:
85
+ self._func_hit_count += 1
86
+ # --- ends ---
87
+
88
+ # 'line' - after excuting the line
89
+ # 'return' - return from the function
90
+ if event == 'line' or event == 'return':
91
+
92
+ if event == 'line' and self.warmup_counter < 0:
93
+ self._times[frame.f_lineno]['hit'] += 1
94
+
95
+
96
+ # --- obtain the memory and time consumed by this line ---
97
+ if self.warmup_counter < 0:
98
+ self._times[self._last_lineno]['time'] += current_time - self._last_time
99
+ self._memory[self._last_lineno] += memory - self._last_memory
100
+ self._gpu_memory[self._last_lineno] += gpu_memory - self._gpu_last_memory
101
+ # --- ends ---
102
+
103
+ if self.cuda_sync is True:
104
+ torch.cuda.synchronize()
105
+
106
+ self._last_time = timeit.default_timer()
107
+ self._last_memory = memory
108
+ self._gpu_last_memory = gpu_memory
109
+ self._last_lineno = frame.f_lineno
110
+
111
+ return self.__trace_func__
112
+
113
+ def decorate(self, func):
114
+ if self._func_name is not None:
115
+ logger.warning(f'Only one function can be decorated; already decorated "{self._func_name}"')
116
+ self._func_name = func.__name__
117
+ self._func_filename = func.__code__.co_filename
118
+ self._func_firstlineno = func.__code__.co_firstlineno
119
+
120
+ def _f(*args, **kwargs):
121
+ origin_trace_func = sys.gettrace()
122
+ sys.settrace(self.__trace_func__)
123
+ ret = func(*args, **kwargs)
124
+ sys.settrace(origin_trace_func)
125
+ return ret
126
+ return _f
127
+
128
+ def _get_table(self):
129
+
130
+ if len(self._times) <= 0:
131
+ logger.warning(f"un recorded datas, please ensure the function is executed")
132
+ return None
133
+
134
+ # --- load the source code ---
135
+ with open(self._func_filename, 'r') as f:
136
+ source_lines = [line.strip('\n') for line in f.readlines()]
137
+ code_str = "\n".join(source_lines)
138
+
139
+ def_lineno = min(self._times.keys())
140
+ final_lineno = max(self._times.keys())
141
+
142
+ # remove the additional blank content
143
+ pre_blank_count = len(source_lines[def_lineno-1]) - len(source_lines[def_lineno-1].lstrip(' ').lstrip('\t'))
144
+ # --- ends ---
145
+
146
+ # --- analyze the source code and collect info for multi-line statements ---
147
+ new_logic_linenos = [token.start[0] for token in tokenize.generate_tokens(
148
+ io.StringIO(code_str).readline) if token.type == 4]
149
+ # --- ends ---
150
+
151
+ # --- merge the stats multi-line code ---
152
+ sorted_linenos = [lineno for lineno in self._times.keys()]
153
+ sorted_linenos.sort(key=int)
154
+
155
+ lineno_cache = []
156
+ for lineno in sorted_linenos:
157
+ if lineno not in new_logic_linenos:
158
+ lineno_cache.append(lineno)
159
+ else:
160
+ # we should merge its info to the prev_lineno
161
+ if len(lineno_cache) <= 0:
162
+ continue
163
+ else:
164
+ lineno_cache.append(lineno)
165
+ first_lineno = lineno_cache[0]
166
+ for prev_lineno in lineno_cache[1:]:
167
+ self._times[first_lineno]["hit"] = min(self._times[first_lineno]["hit"], self._times[prev_lineno]["hit"])
168
+ self._times[first_lineno]["time"] += self._times[prev_lineno]["time"]
169
+ del self._times[prev_lineno]
170
+
171
+ self._memory[first_lineno] += self._memory[prev_lineno]
172
+ del self._memory[prev_lineno]
173
+
174
+ self._gpu_memory[first_lineno] += self._gpu_memory[prev_lineno]
175
+ del self._gpu_memory[prev_lineno]
176
+ lineno_cache = []
177
+ # --- ends ---
178
+
179
+ # --- initialize the pretty table for output ---
180
+ table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak'])
181
+ # --- ends ---
182
+
183
+ # --- compute some statistics ---
184
+ total_hit = 0 # for compute the hit percentage
185
+ total_time = 0
186
+ for lineno, stats in self._times.items():
187
+ if lineno == def_lineno: continue
188
+ total_hit += stats['hit']
189
+ total_time += stats['time']
190
+
191
+ total_memory = sum([m for l,m in self._memory.items()]) / 1024 / 1024
192
+ total_gpu_memory = sum([m for l,m in self._gpu_memory.items()]) / 1024 / 1024
193
+ # --- ends ---
194
+
195
+ peak_cpu_memory = 0
196
+ peak_gpu_memory = 0
197
+ for lineno in range(def_lineno, final_lineno+1):
198
+ if lineno not in self._times:
199
+ # the comment line, empty line or merged line from multi-lines code
200
+ table.add_row([lineno, '-', '-', '-', '-', '-', '-',f'{peak_cpu_memory:5.3f} MB', '-', f'{peak_gpu_memory:5.3f} MB'])
201
+ else:
202
+ stats = self._times[lineno]
203
+ if lineno == def_lineno:
204
+ table.add_row([lineno, self._func_hit_count, f'{total_time*self.base_n:.4f} {self.base}', f'{total_time/self._func_hit_count*self.base_n:.4f} {self.base}', '-', '-', f'{total_memory:5.3f} MB', 'baseline', f'{total_gpu_memory:5.3f} MB', 'baseline'])
205
+ else:
206
+
207
+ line_result = [lineno, stats['hit'],
208
+ f'{stats["time"]*self.base_n:.4f} {self.base}',
209
+ f'{stats["time"]/stats["hit"]*self.base_n:.4f} {self.base}' if stats['hit'] > 0 else 'nan',
210
+ f'{stats["hit"]/total_hit*100:.3f}%' if total_hit > 0 else 'nan',
211
+ f'{stats["time"]/total_time*100:.3f}%'] if total_time > 0 else 'nan'
212
+
213
+ line_result += [f'{self._memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB']
214
+ peak_cpu_memory = peak_cpu_memory + self._memory[lineno]/1024/1024
215
+ line_result += [f'{peak_cpu_memory:5.3f} MB']
216
+
217
+ line_result += [f'{self._gpu_memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB']
218
+ peak_gpu_memory = peak_gpu_memory + self._gpu_memory[lineno]/1024/1024
219
+ line_result += [f'{peak_gpu_memory:5.3f} MB']
220
+
221
+ table.add_row(line_result)
222
+
223
+ table.add_column('sources', [source_lines[i-1][pre_blank_count:] if len(source_lines[i-1])>pre_blank_count else '' for i in range(def_lineno, final_lineno+1)], 'l')
224
+ return table
225
+
226
+ def print(self, filename=None, mode="w"):
227
+ introducation = '''
228
+ 1. The first row of the table reports the overall results of the whole function; the following rows report per-line statistics.
229
+ 2. `hit perc` and `time perc` stand for `hit percentage` and `time percentage`.
230
+ 3. For memory there are four columns: `mem inc`, `mem peak`, `gpu mem inc` and `gpu mem peak`, i.e. the CPU/GPU memory increase and peak. All results are collected in the last run. The increase columns give the memory growth attributed to each line (the first row refers to the whole function); the per-line numbers can be far smaller than the first row, since Python may release unused memory only after the function returns. The peak column is a running sum of the increase values of the lines above it, indicating the possible maximum memory usage inside the function.
231
+ 4. For any issue, please contact us via https://github.com/xtudbxk/lineprofiler or zhengqiang.zhang@hotmail.com
232
+ '''
233
+ print(introducation)
234
+
235
+ table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak'])
236
+ table = self._get_table()
237
+ print(table)
238
+ if filename is not None:
239
+ with open(filename, mode) as f:
240
+ f.write(introducation)
241
+ f.write(f"args - base={self.base}, cuda_sync={self.cuda_sync}, gpuids={self.gpuids}, warmup={self.warmup}\n")
242
+ f.write(str(table))
243
+
244
+ if __name__ == '__main__':
245
+ import numpy as np
246
+ def mytest(h='hello',
247
+ xx="xx"):
248
+
249
+ h = h + 'world'
250
+ a = []
251
+ for _ in range(200):
252
+ # a = np.zeros((1000, 1000), dtype=np.float32)
253
+ a.append(np.zeros((1000, 1000), dtype=np.float32))
254
+ a.append(
255
+ np.zeros((1000, 1000),
256
+ dtype=np.float32))
257
+ # print(a[0,0])
258
+ print(h)
259
+
260
+ profiler = MyLineProfiler(cuda_sync=False, warmup=2)
261
+ mytest = profiler.decorate(mytest)
262
+ for _ in range(5):
263
+ mytest()
264
+ profiler.print()
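Note: the profiler above can be pointed at the splatting renderer in the same way as the __main__ example. The sketch below is illustrative only; the import paths are assumed from the file layout of this commit, and cuda_sync is enabled only when a GPU is present.

import torch
from utils.gaussian_splatting import generate_2D_gaussian_splatting_step
from utils.gs_cuda.mylineprofiler import MyLineProfiler

profiler = MyLineProfiler(cuda_sync=torch.cuda.is_available())
render = profiler.decorate(generate_2D_gaussian_splatting_step)

# random pre-activation Gaussian parameters, pure-PyTorch rendering path
gs_parameters = torch.randn(1024, 9)
for _ in range(3):
    render((64, 64), gs_parameters, 2.0, scale_modify=(2.0, 2.0), cuda_rendering=False)
profiler.print("render_profile.log")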
utils/gs_cuda/profile.log ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ 1. The first line of table reports the overall results of the whole function and the following lines reports the statistics of each line in the function.
3
+ 2. The `hit perc` and `time perc` represent `hit percentage` and `time percentage`.
4
+ 3. For memory, there exists four categories `mem inc`, `mem peak`, `gpu mem inc` and `gpu mem peak`. They denotes `cpu memory increasement`, `cpu memory peak`, `gpu memory increasement` and `gpu memory peak`. All the results are collected in the last run. The number in the increasement field denots the increasement of corresponding memory of each line (the first line is related to the whole function). Sometimes, the number of each line is far less of the number of the first line, which is valid since python may auto release the unused memory after the function execution. The number of each line in the peak filed is a simple sum of the numbers of above lines in the increasement field, which is used to demonstrate the possible maxinum memory usage in the function.
5
+ 4. For any issue, please concact us via https://github.com/xtudbxk/lineprofiler or zhengqiang.zhang@hotmail.com
6
+ args - base=ms, cuda_sync=True, gpuids=(0,), warmup=0
7
+ +--------+------+------------+--------------+----------+-----------+----------+----------+-------------+--------------+-----------------------------------------------------------------------------+
8
+ | lineno | hits | time | time per hit | hit perc | time perc | mem inc | mem peak | gpu mem inc | gpu mem peak | sources |
9
+ +--------+------+------------+--------------+----------+-----------+----------+----------+-------------+--------------+-----------------------------------------------------------------------------+
10
+ | 41 | 1 | 76.8299 ms | 76.8299 ms | - | - | 0.902 MB | baseline | 3.500 MB | baseline | def gaussiansplatting_render(sigmas, coords, colors, image_size): |
11
+ | 42 | 1 | 0.0353 ms | 0.0353 ms | 14.286% | 0.046% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | sigmas = sigmas.contiguous() # (gs num, 3) |
12
+ | 43 | 1 | 0.0078 ms | 0.0078 ms | 14.286% | 0.010% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | coords = coords.contiguous() # (gs num, 2) |
13
+ | 44 | 1 | 0.0063 ms | 0.0063 ms | 14.286% | 0.008% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | colors = colors.contiguous() # (gs num, c) |
14
+ | 45 | 1 | 0.0063 ms | 0.0063 ms | 14.286% | 0.008% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | h, w = image_size[:2] |
15
+ | 46 | 1 | 0.0093 ms | 0.0093 ms | 14.286% | 0.012% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | c = colors.shape[-1] |
16
+ | 47 | 1 | 1.8306 ms | 1.8306 ms | 14.286% | 2.383% | 0.438 MB | 0.438 MB | 3.000 MB | 3.000 MB | rendered_img = torch.zeros(h, w, c).to(colors.device).to(torch.float32) |
17
+ | 48 | 1 | 74.9344 ms | 74.9344 ms | 14.286% | 97.533% | 0.465 MB | 0.902 MB | 0.000 MB | 3.000 MB | return GSCUDA.apply(sigmas, coords, colors, rendered_img) |
18
+ +--------+------+------------+--------------+----------+-----------+----------+----------+-------------+--------------+-----------------------------------------------------------------------------+
19
+ 1. The first line of table reports the overall results of the whole function and the following lines reports the statistics of each line in the function.
20
+ 2. The `hit perc` and `time perc` represent `hit percentage` and `time percentage`.
21
+ 3. For memory, there exists four categories `mem inc`, `mem peak`, `gpu mem inc` and `gpu mem peak`. They denotes `cpu memory increasement`, `cpu memory peak`, `gpu memory increasement` and `gpu memory peak`. All the results are collected in the last run. The number in the increasement field denots the increasement of corresponding memory of each line (the first line is related to the whole function). Sometimes, the number of each line is far less of the number of the first line, which is valid since python may auto release the unused memory after the function execution. The number of each line in the peak filed is a simple sum of the numbers of above lines in the increasement field, which is used to demonstrate the possible maxinum memory usage in the function.
22
+ 4. For any issue, please concact us via https://github.com/xtudbxk/lineprofiler or zhengqiang.zhang@hotmail.com
23
+ args - base=ms, cuda_sync=True, gpuids=(0,), warmup=0
24
+ +--------+------+--------------+--------------+----------+-----------+----------+----------+-------------+--------------+-----------------------------------------------------------------------------+
25
+ | lineno | hits | time | time per hit | hit perc | time perc | mem inc | mem peak | gpu mem inc | gpu mem peak | sources |
26
+ +--------+------+--------------+--------------+----------+-----------+----------+----------+-------------+--------------+-----------------------------------------------------------------------------+
27
+ | 41 | 1 | 1175.7406 ms | 1175.7406 ms | - | - | 0.777 MB | baseline | 12.000 MB | baseline | def gaussiansplatting_render(sigmas, coords, colors, image_size): |
28
+ | 42 | 1 | 0.0304 ms | 0.0304 ms | 14.286% | 0.003% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | sigmas = sigmas.contiguous() # (gs num, 3) |
29
+ | 43 | 1 | 0.0069 ms | 0.0069 ms | 14.286% | 0.001% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | coords = coords.contiguous() # (gs num, 2) |
30
+ | 44 | 1 | 0.0064 ms | 0.0064 ms | 14.286% | 0.001% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | colors = colors.contiguous() # (gs num, c) |
31
+ | 45 | 1 | 0.0065 ms | 0.0065 ms | 14.286% | 0.001% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | h, w = image_size[:2] |
32
+ | 46 | 1 | 0.0099 ms | 0.0099 ms | 14.286% | 0.001% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | c = colors.shape[-1] |
33
+ | 47 | 1 | 1.2594 ms | 1.2594 ms | 14.286% | 0.107% | 0.133 MB | 0.133 MB | 3.000 MB | 3.000 MB | rendered_img = torch.zeros(h, w, c).to(colors.device).to(torch.float32) |
34
+ | 48 | 1 | 1174.4211 ms | 1174.4211 ms | 14.286% | 99.888% | 0.645 MB | 0.777 MB | 0.000 MB | 3.000 MB | return GSCUDA.apply(sigmas, coords, colors, rendered_img) |
35
+ +--------+------+--------------+--------------+----------+-----------+----------+----------+-------------+--------------+-----------------------------------------------------------------------------+
36
+ 1. The first line of table reports the overall results of the whole function and the following lines reports the statistics of each line in the function.
37
+ 2. The `hit perc` and `time perc` represent `hit percentage` and `time percentage`.
38
+ 3. For memory, there exists four categories `mem inc`, `mem peak`, `gpu mem inc` and `gpu mem peak`. They denotes `cpu memory increasement`, `cpu memory peak`, `gpu memory increasement` and `gpu memory peak`. All the results are collected in the last run. The number in the increasement field denots the increasement of corresponding memory of each line (the first line is related to the whole function). Sometimes, the number of each line is far less of the number of the first line, which is valid since python may auto release the unused memory after the function execution. The number of each line in the peak filed is a simple sum of the numbers of above lines in the increasement field, which is used to demonstrate the possible maxinum memory usage in the function.
39
+ 4. For any issue, please concact us via https://github.com/xtudbxk/lineprofiler or zhengqiang.zhang@hotmail.com
40
+ args - base=ms, cuda_sync=True, gpuids=(0,), warmup=0
41
+ +--------+------+---------------+--------------+----------+-----------+-----------+-----------+-------------+--------------+-----------------------------------------------------------------------------+
42
+ | lineno | hits | time | time per hit | hit perc | time perc | mem inc | mem peak | gpu mem inc | gpu mem peak | sources |
43
+ +--------+------+---------------+--------------+----------+-----------+-----------+-----------+-------------+--------------+-----------------------------------------------------------------------------+
44
+ | 41 | 10 | 11844.9229 ms | 1184.4923 ms | - | - | 20.227 MB | baseline | 15.000 MB | baseline | def gaussiansplatting_render(sigmas, coords, colors, image_size): |
45
+ | 42 | 10 | 0.1342 ms | 0.0134 ms | 14.286% | 0.001% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | sigmas = sigmas.contiguous() # (gs num, 3) |
46
+ | 43 | 10 | 0.0654 ms | 0.0065 ms | 14.286% | 0.001% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | coords = coords.contiguous() # (gs num, 2) |
47
+ | 44 | 10 | 0.0618 ms | 0.0062 ms | 14.286% | 0.001% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | colors = colors.contiguous() # (gs num, c) |
48
+ | 45 | 10 | 0.0710 ms | 0.0071 ms | 14.286% | 0.001% | 0.000 MB | 0.000 MB | 0.000 MB | 0.000 MB | h, w = image_size[:2] |
49
+ | 46 | 10 | 0.0803 ms | 0.0080 ms | 14.286% | 0.001% | 0.062 MB | 0.062 MB | 0.000 MB | 0.000 MB | c = colors.shape[-1] |
50
+ | 47 | 10 | 7.2555 ms | 0.7256 ms | 14.286% | 0.061% | 19.105 MB | 19.168 MB | 30.000 MB | 30.000 MB | rendered_img = torch.zeros(h, w, c).to(colors.device).to(torch.float32) |
51
+ | 48 | 10 | 11837.2547 ms | 1183.7255 ms | 14.286% | 99.935% | 1.059 MB | 20.227 MB | 0.000 MB | 30.000 MB | return GSCUDA.apply(sigmas, coords, colors, rendered_img) |
52
+ +--------+------+---------------+--------------+----------+-----------+-----------+-----------+-------------+--------------+-----------------------------------------------------------------------------+
53
+ 1. The first line of table reports the overall results of the whole function and the following lines reports the statistics of each line in the function.
54
+ 2. The `hit perc` and `time perc` represent `hit percentage` and `time percentage`.
55
+ 3. For memory, there exists four categories `mem inc`, `mem peak`, `gpu mem inc` and `gpu mem peak`. They denotes `cpu memory increasement`, `cpu memory peak`, `gpu memory increasement` and `gpu memory peak`. All the results are collected in the last run. The number in the increasement field denots the increasement of corresponding memory of each line (the first line is related to the whole function). Sometimes, the number of each line is far less of the number of the first line, which is valid since python may auto release the unused memory after the function execution. The number of each line in the peak filed is a simple sum of the numbers of above lines in the increasement field, which is used to demonstrate the possible maxinum memory usage in the function.
56
+ 4. For any issue, please concact us via https://github.com/xtudbxk/lineprofiler or zhengqiang.zhang@hotmail.com
57
+ args - base=ms, cuda_sync=True, gpuids=(0,), warmup=0
58
+ +--------+------+---------------+--------------+----------+-----------+-----------+-----------+-------------+--------------+-----------------------------------------------------------------------------+
59
+ | lineno | hits | time | time per hit | hit perc | time perc | mem inc | mem peak | gpu mem inc | gpu mem peak | sources |
60
+ +--------+------+---------------+--------------+----------+-----------+-----------+-----------+-------------+--------------+-----------------------------------------------------------------------------+
61
+ | 41 | 10 | 11855.0900 ms | 1185.5090 ms | - | - | 20.242 MB | baseline | 15.000 MB | baseline | def gaussiansplatting_render(sigmas, coords, colors, image_size): |
62
+ | 42 | 10 | 0.1263 ms | 0.0126 ms | 14.286% | 0.001% | 0.078 MB | 0.078 MB | 0.000 MB | 0.000 MB | sigmas = sigmas.contiguous() # (gs num, 3) |
63
+ | 43 | 10 | 0.0632 ms | 0.0063 ms | 14.286% | 0.001% | 0.000 MB | 0.078 MB | 0.000 MB | 0.000 MB | coords = coords.contiguous() # (gs num, 2) |
64
+ | 44 | 10 | 0.0588 ms | 0.0059 ms | 14.286% | 0.000% | 0.000 MB | 0.078 MB | 0.000 MB | 0.000 MB | colors = colors.contiguous() # (gs num, c) |
65
+ | 45 | 10 | 0.0626 ms | 0.0063 ms | 14.286% | 0.001% | 0.000 MB | 0.078 MB | 0.000 MB | 0.000 MB | h, w = image_size[:2] |
66
+ | 46 | 10 | 0.0747 ms | 0.0075 ms | 14.286% | 0.001% | 0.000 MB | 0.078 MB | 0.000 MB | 0.000 MB | c = colors.shape[-1] |
67
+ | 47 | 10 | 7.0497 ms | 0.7050 ms | 14.286% | 0.059% | 19.078 MB | 19.156 MB | 30.000 MB | 30.000 MB | rendered_img = torch.zeros(h, w, c).to(colors.device).to(torch.float32) |
68
+ | 48 | 10 | 11847.6547 ms | 1184.7655 ms | 14.286% | 99.937% | 0.820 MB | 19.977 MB | 0.000 MB | 30.000 MB | return GSCUDA.apply(sigmas, coords, colors, rendered_img) |
69
+ +--------+------+---------------+--------------+----------+-----------+-----------+-----------+-------------+--------------+-----------------------------------------------------------------------------+
utils/gs_cuda/profile.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import torch
3
+ import numpy as np
4
+ import torch.nn.functional as F
5
+ from torchvision.utils import save_image
6
+ from gswrapper import gaussiansplatting_render
7
+
8
+ def generate_2D_gaussian_splatting(kernel_size, sigma_x, sigma_y, rho, coords,
9
+ colours, image_size=(256, 256, 3), device="cuda"):
10
+
11
+ batch_size = colours.shape[0]
12
+
13
+ sigma_x = sigma_x.view(batch_size, 1, 1)
14
+ sigma_y = sigma_y.view(batch_size, 1, 1)
15
+ rho = rho.view(batch_size, 1, 1)
16
+
17
+ covariance = torch.stack(
18
+ [torch.stack([sigma_x**2, rho*sigma_x*sigma_y], dim=-1),
19
+ torch.stack([rho*sigma_x*sigma_y, sigma_y**2], dim=-1)],
20
+ dim=-2
21
+ )
22
+
23
+ # Check for positive semi-definiteness
24
+ # determinant = (sigma_x**2) * (sigma_y**2) - (rho * sigma_x * sigma_y)**2
25
+ # if (determinant <= 0).any():
26
+ # raise ValueError("Covariance matrix must be positive semi-definite")
27
+
28
+ inv_covariance = torch.inverse(covariance)
29
+
30
+ # Choosing quite a broad range for the distribution [-5,5] to avoid any clipping
31
+ start = torch.tensor([-5.0], device=device).view(-1, 1)
32
+ end = torch.tensor([5.0], device=device).view(-1, 1)
33
+ base_linspace = torch.linspace(0, 1, steps=kernel_size, device=device)
34
+ ax_batch = start + (end - start) * base_linspace
35
+
36
+ # Expanding dims for broadcasting
37
+ ax_batch_expanded_x = ax_batch.unsqueeze(-1).expand(-1, -1, kernel_size)
38
+ ax_batch_expanded_y = ax_batch.unsqueeze(1).expand(-1, kernel_size, -1)
39
+
40
+ # Creating a batch-wise meshgrid using broadcasting
41
+ xx, yy = ax_batch_expanded_x, ax_batch_expanded_y # (batchsize, kernelsize, kernelsize)
42
+
43
+ xy = torch.stack([xx, yy], dim=-1) # (batchsize, kernelsize, kernelsize, 2)
44
+ z = torch.einsum('b...i,b...ij,b...j->b...', xy, -0.5 * inv_covariance, xy) # (batchsize, kernelsize, kernelsize, 2)
45
+ kernel = torch.exp(z) / (2 * torch.tensor(np.pi, device=device) * torch.sqrt(torch.det(covariance)).view(batch_size, 1, 1)) # (batchsize, kernelsize, kernelsize)
46
+
47
+
48
+ kernel_max_1, _ = kernel.max(dim=-1, keepdim=True) # Find max along the last dimension
49
+ kernel_max_2, _ = kernel_max_1.max(dim=-2, keepdim=True) # Find max along the second-to-last dimension
50
+ kernel_normalized = kernel / kernel_max_2 # (batchsize, kernelsize, kernelsize)
51
+
52
+
53
+ kernel_reshaped = kernel_normalized.repeat(1, 3, 1).view(batch_size * 3, kernel_size, kernel_size)
54
+ kernel_rgb = kernel_reshaped.unsqueeze(0).reshape(batch_size, 3, kernel_size, kernel_size) # (batchsize, 3, kernelsize, kernelsize)
55
+
56
+ # Calculating the padding needed to match the image size
57
+ pad_h = image_size[0] - kernel_size
58
+ pad_w = image_size[1] - kernel_size
59
+
60
+ if pad_h < 0 or pad_w < 0:
61
+ raise ValueError("Kernel size should be smaller or equal to the image size.")
62
+
63
+ # Adding padding to make kernel size equal to the image size
64
+ padding = (pad_w // 2, pad_w // 2 + pad_w % 2, # padding left and right
65
+ pad_h // 2, pad_h // 2 + pad_h % 2) # padding top and bottom
66
+
67
+ kernel_rgb_padded = torch.nn.functional.pad(kernel_rgb, padding, "constant", 0) # (batchsize, 3, h, w)
68
+
69
+ # Extracting shape information
70
+ b, c, h, w = kernel_rgb_padded.shape
71
+
72
+ # Create a batch of 2D affine matrices
73
+ theta = torch.zeros(b, 2, 3, dtype=torch.float32, device=device)
74
+ theta[:, 0, 0] = 1.0
75
+ theta[:, 1, 1] = 1.0
76
+ theta[:, :, 2] = -coords # (b, 2) - the offset of the gaussian splatting
77
+
78
+ # Creating grid and performing grid sampling
79
+ grid = F.affine_grid(theta, size=(b, c, h, w), align_corners=True) # (b, 3, h, w)
80
+ # grid_y = torch.linspace(-1, 1, steps=h, device=device).reshape(1, h, 1, 1).repeat(1, 1, w, 1)
81
+ # grid_x = torch.linspace(-1, 1, steps=w, device=device).reshape(1, 1, w, 1).repeat(1, h, 1, 1)
82
+ # grid = torch.cat([grid_x, grid_y], dim=-1)
83
+ # grid = grid - coords.reshape(-1, 1, 1, 2)
84
+
85
+ kernel_rgb_padded_translated = F.grid_sample(kernel_rgb_padded, grid, align_corners=True) # (b, 3, h, w)
86
+
87
+ rgb_values_reshaped = colours.unsqueeze(-1).unsqueeze(-1)
88
+
89
+ final_image_layers = rgb_values_reshaped * kernel_rgb_padded_translated
90
+ final_image = final_image_layers.sum(dim=0)
91
+ # final_image = torch.clamp(final_image, 0, 1)
92
+ final_image = final_image.permute(1,2,0)
93
+
94
+ return final_image
95
+
96
+
97
+ if __name__ == "__main__":
98
+ from mylineprofiler import MyLineProfiler
99
+ profiler_th = MyLineProfiler(cuda_sync=True)
100
+ generate_2D_gaussian_splatting = profiler_th.decorate(generate_2D_gaussian_splatting)
101
+ profiler_cuda = MyLineProfiler(cuda_sync=True)
102
+ gaussiansplatting_render = profiler_cuda.decorate(gaussiansplatting_render)
103
+
104
+
105
+ # --- test ---
106
+ s = int(512 * 512)
107
+ # s = 5
108
+ image_size = (512, 512, 3)
109
+
110
+ sigmas = 0.2*torch.rand(s, 3).to(torch.float32).to("cuda")
111
+ sigmas[:,:2] = 5*sigmas[:, :2]
112
+ coords = 2*torch.rand(s, 2).to(torch.float32).to("cuda")-1.0
113
+ colors = torch.rand(s, 3).to(torch.float32).to("cuda")
114
+
115
+ # --- torch version ---
116
+ import gc
117
+ # gc.collect()
118
+ # torch.cuda.empty_cache()
119
+ # for _ in range(1):
120
+ # img_python = generate_2D_gaussian_splatting(128, sigmas[:,1], sigmas[:,0], sigmas[:,2], coords, colors, image_size)
121
+ # profiler_th.print("profile.log", "w")
122
+ # cv2.imwrite("th.png", 255.0*img_python.detach().clamp(0,1).cpu().numpy())
123
+ # --- ends ---
124
+
125
+ # --- cuda version ---
126
+ sigmas[:, 0] = sigmas[:, 0]
127
+ sigmas[:, 1] = sigmas[:, 1]
128
+ gc.collect()
129
+ torch.cuda.empty_cache()
130
+ for _ in range(10):
131
+ with torch.no_grad():
132
+ img_cuda = gaussiansplatting_render(sigmas, coords, colors, image_size)
133
+
134
+ profiler_cuda.print("profile.log", "a")
135
+ cv2.imwrite("cuda.png", 255.0*img_cuda.detach().clamp(0,1).cpu().numpy())
136
+ # --- ends ---
137
+ pass
utils/gs_cuda_dmax/__init__.py ADDED
File without changes
utils/gs_cuda_dmax/check.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from gswrapper import gaussiansplatting_render
3
+
4
+ def torch_version(sigmas, coords, colors, image_size, dmax=100):
5
+ h, w = image_size
6
+ c = colors.shape[-1]
7
+
8
+ if h >= 50 or w >= 50:
9
+ logger.warning(f'h({h}) or w({w}) is large; the torch reference version will be slow')
10
+
11
+ rendered_img = torch.zeros(h, w, c).to(colors.device).to(torch.float32)
12
+
13
+ for hi in range(h):
14
+ for wi in range(w):
15
+ curh = 2*hi/(h-1)-1.0
16
+ curw = 2*wi/(w-1)-1.0
17
+
18
+ v = (curw-coords[:,0])**2/sigmas[:,0]**2
19
+ v -= (2*sigmas[:,2])*(curw-coords[:,0])*(curh-coords[:,1])/sigmas[:,0]/sigmas[:,1]
20
+ v += (curh-coords[:,1])**2/sigmas[:,1]**2
21
+ v *= -1.0/(2.0*(1-sigmas[:,2]**2))
22
+ v = torch.exp(v)
23
+
24
+ mask_w = abs(curw-coords[:,0]) <= dmax
25
+ mask_h = abs(curh-coords[:,1]) <= dmax
26
+ mask = torch.logical_and(mask_w, mask_h)
27
+
28
+ for ci in range(c):
29
+ rendered_img[hi, wi, ci] = torch.sum((v*colors[:, ci])[mask])
30
+
31
+ return rendered_img
32
+
33
+
34
+ if __name__ == "__main__":
35
+ s = 4 # the number of gs
36
+ image_size = (10, 10)
37
+
38
+ for _ in range(1):
39
+ print(f"--------------------------- begins --------------------------------")
40
+
41
+ sigmas = 0.999*torch.rand(s, 3).to(torch.float32).to("cuda")
42
+ sigmas[:,:2] = 5*sigmas[:, :2]
43
+ coords = 2*torch.rand(s, 2).to(torch.float32).to("cuda")-1.0
44
+ colors = torch.rand(s, 3).to(torch.float32).to("cuda")
45
+ # colors = torch.rand(s, 5).to(torch.float32).to("cuda")
46
+ dmax = 0.5
47
+
48
+ # sigmas = torch.Tensor([[0.9196, 0.3979, 0.7784]]).to(torch.float32).to("cuda")
49
+ # coords = torch.Tensor([[-0.0469, -0.1726]]).to(torch.float32).to("cuda")
50
+ # colors = torch.Tensor([[0.3775, 0.2346, 0.1513]]).to(torch.float32).to("cuda")
51
+ # colors = torch.ones_like(coords[:,0:1])
52
+
53
+ print(f"sigmas: {sigmas}, \ncoords:{coords}, \ncolors:{colors}\ndmax:{dmax}")
54
+
55
+ # --- check forward ---
56
+ with torch.no_grad():
57
+ rendered_img_th = torch_version(sigmas,coords,colors,image_size,dmax)
58
+ rendered_img_cuda = gaussiansplatting_render(sigmas,coords,colors,image_size,dmax)
59
+
60
+ #
61
+ distance = (rendered_img_th-rendered_img_cuda)**2
62
+ print(f"check forward - torch: {rendered_img_th[:2,:2,0]}")
63
+ print(f"check forward - cuda: {rendered_img_cuda[:2,:2,0]}")
64
+ print(f"check forward - distance: {distance[:2, :2, 0]}")
65
+ print(f"check forward - sum: {torch.sum(distance)}\n")
66
+ # --- ends ---
67
+
68
+ # --- check backward ---
69
+ sigmas.requires_grad_(True)
70
+ coords.requires_grad_(True)
71
+ colors.requires_grad_(True)
72
+ # sigmas.retain_grad()
73
+ # coords.retain_grad()
74
+ # colors.retain_grad()
75
+ weight = torch.rand_like(rendered_img_th) # make each pixel has different grads
76
+
77
+ sigmas.grad = None
78
+ coords.grad = None
79
+ colors.grad = None
80
+ rendered_img_th = torch_version(sigmas,coords,colors,image_size,dmax)
81
+ loss_th = torch.sum(weight*rendered_img_th)
82
+ # loss_th = torch.sum(rendered_img_th)
83
+ loss_th.backward()
84
+
85
+ sigmas_grad_th = sigmas.grad
86
+ coords_grad_th = coords.grad
87
+ colors_grad_th = colors.grad
88
+
89
+ sigmas.grad = None
90
+ coords.grad = None
91
+ colors.grad = None
92
+ rendered_img_cuda = gaussiansplatting_render(sigmas,coords,colors,image_size,dmax)
93
+ loss_cuda = torch.sum(weight*rendered_img_cuda)
94
+ # loss_cuda = torch.sum(rendered_img_cuda)
95
+ loss_cuda.backward()
96
+
97
+ sigmas_grad_cuda = sigmas.grad
98
+ coords_grad_cuda = coords.grad
99
+ colors_grad_cuda = colors.grad
100
+
101
+ distance_sigmas_grad = (sigmas_grad_th-sigmas_grad_cuda)**2
102
+ distance_coords_grad = (coords_grad_th-coords_grad_cuda)**2
103
+ distance_colors_grad = (colors_grad_th-colors_grad_cuda)**2
104
+
105
+ print(f"check backward - sigmas - torch: {sigmas_grad_th[:2]}")
106
+ print(f"check backward - sigmas - cuda: {sigmas_grad_cuda[:2]}")
107
+ print(f"check backward - sigmas - distance: {distance_sigmas_grad[:2]}")
108
+ print(f"check backward - sigmas - sum: {torch.sum(distance_sigmas_grad)}\n")
109
+
110
+ print(f"check backward - coords - torch: {coords_grad_th[:2]}")
111
+ print(f"check backward - coords - cuda: {coords_grad_cuda[:2]}")
112
+ print(f"check backward - coords - distance: {distance_coords_grad[:2]}")
113
+ print(f"check backward - coords - sum: {torch.sum(distance_coords_grad)}\n")
114
+
115
+ print(f"check backward - colors - torch: {colors_grad_th[:2]}")
116
+ print(f"check backward - colors - cuda: {colors_grad_cuda[:2]}")
117
+ print(f"check backward - colors - distance: {distance_colors_grad[:2]}")
118
+ print(f"check backward - colors - sum: {torch.sum(distance_colors_grad)}\n")
119
+
120
+ print(f"--------------------------- ends --------------------------------\n\n")
121
+
122
+
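check.py above validates the CUDA renderer against the slow per-pixel torch reference by printing element-wise squared distances for the forward image and for the gradients of sigmas, coords and colors. The same comparison can be collapsed into assertions; the sketch below is only an illustrative variant of those printed checks and assumes the torch_version and gaussiansplatting_render functions from this directory.

import torch
from check import torch_version
from gswrapper import gaussiansplatting_render

def assert_parity(sigmas, coords, colors, image_size=(10, 10), dmax=0.5, atol=1e-4):
    for t in (sigmas, coords, colors):
        t.requires_grad_(True)
        t.grad = None
    ref = torch_version(sigmas, coords, colors, image_size, dmax)
    cud = gaussiansplatting_render(sigmas, coords, colors, image_size, dmax)
    assert torch.allclose(ref, cud, atol=atol), "forward mismatch"
    weight = torch.rand_like(ref)              # distinct per-pixel grads, as in check.py
    (weight * ref).sum().backward()
    grads_ref = [t.grad.clone() for t in (sigmas, coords, colors)]
    for t in (sigmas, coords, colors):
        t.grad = None
    (weight * cud).sum().backward()
    for g_ref, t in zip(grads_ref, (sigmas, coords, colors)):
        assert torch.allclose(g_ref, t.grad, atol=atol), "backward mismatch"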
utils/gs_cuda_dmax/gs copy.cu ADDED
@@ -0,0 +1,212 @@
1
+ #include <stdio.h>
2
+ #include <cmath>
3
+
4
+ #define PI 3.1415926536
5
+ #define PI2 6.2831853072
6
+
7
+ __global__ void _gs_render_cuda(
8
+ const float *sigmas,
9
+ const float *coords,
10
+ const float *colors,
11
+ float *rendered_img,
12
+ const int s, // gs num
13
+ const int h,
14
+ const int w,
15
+ const int c,
16
+ const float dmax
17
+ ){
18
+
19
+ int index = blockIdx.x*blockDim.x + threadIdx.x;
20
+ int curw = index % w;
21
+ int curh = int((index-curw)/w);
22
+ if(curw >= w || curh >=h){
23
+ return;
24
+ }
25
+
26
+ float curw_f = 2.0*curw/(w-1) - 1.0;
27
+ float curh_f = 2.0*curh/(h-1) - 1.0;
28
+
29
+ // printf("index:%d, curw:%d, curh:%d, curw_f:%f, curh_f:%f\n",index,curw,curh,curw_f,curh_f);
30
+
31
+ for(int si=0; si<s; si++){
32
+
33
+ // compute the 2d gs value
34
+ float sigma_x = sigmas[si*3+0];
35
+ float sigma_y = sigmas[si*3+1];
36
+ float rho = sigmas[si*3+2];
37
+ float x = coords[si*2+0];
38
+ float y = coords[si*2+1];
39
+
40
+ //
41
+ float one_div_one_minus_rho2 = 1.0 / (1-rho*rho) ;
42
+ float one_div_sigma_x = 1.0 / sigma_x;
43
+ float one_div_sigma_y = 1.0 / sigma_y;
44
+ float d_x = curw_f - x;
45
+ float d_y = curh_f - y;
46
+
47
+ if(d_x > dmax || d_x < -dmax || d_y > dmax || d_y < -dmax){
48
+ continue;
49
+ }
50
+
51
+ float v = one_div_sigma_x*one_div_sigma_x*d_x*d_x;
52
+ v -= 2*rho*d_x*d_y*one_div_sigma_x*one_div_sigma_y;
53
+ v += d_y*d_y*one_div_sigma_y*one_div_sigma_y;
54
+ v *= -one_div_one_minus_rho2 / 2.0;
55
+ v = exp(v);
56
+ // since we normalize v by its maximum, this step is removed so both paths give an equal result
57
+ // v *= one_div_sigma_x * one_div_sigma_y * pow(one_div_one_minus_rho2, 0.5) / PI2 ;
58
+ // printf("si:%d, sigma_x: %f, sigma_y:%f, rho:%f, x:%f, y:%f, v:%f\n", si, sigma_x, sigma_y, rho, x,y,v);
59
+
60
+ for(int ci=0; ci<c; ci++){
61
+ rendered_img[(curh*w+curw)*c+ci] += v*colors[si*c+ci];
62
+ }
63
+ }
64
+
65
+ }
66
+
67
+
68
+ void _gs_render(
69
+ const float *sigmas,
70
+ const float *coords,
71
+ const float *colors,
72
+ float *rendered_img,
73
+ const int s,
74
+ const int h,
75
+ const int w,
76
+ const int c,
77
+ const float dmax
78
+ ) {
79
+
80
+ int threads=16;
81
+ dim3 grid( h*w, 1);
82
+ dim3 block( threads, 1);
83
+ _gs_render_cuda<<<grid, block>>>(sigmas, coords, colors, rendered_img, s, h, w, c, dmax);
84
+ }
85
+
86
+
87
+ __global__ void _gs_render_backward_cuda(
88
+ const float *sigmas,
89
+ const float *coords,
90
+ const float *colors,
91
+ const float *grads,
92
+ float *grads_sigmas,
93
+ float *grads_coords,
94
+ float *grads_colors,
95
+ const int s, // gs num
96
+ const int h,
97
+ const int w,
98
+ const int c,
99
+ const float dmax
100
+
101
+ ){
102
+
103
+ int curs = blockIdx.x*blockDim.x + threadIdx.x;
104
+ if(curs >= s){
105
+ return ;
106
+ }
107
+
108
+ // obtain parameters of gs
109
+ float sigma_x = sigmas[curs*3+0];
110
+ float sigma_y = sigmas[curs*3+1];
111
+ float rho = sigmas[curs*3+2];
112
+ float x = coords[curs*2+0];
113
+ float y = coords[curs*2+1];
114
+ float cr = colors[curs*3+0];
115
+ float cg = colors[curs*3+1];
116
+ float cb = colors[curs*3+2];
117
+
118
+ //
119
+ float w1 = -0.5 / (1-rho*rho) ;
120
+ float w2 = 1.0 / (sigma_x*sigma_x);
121
+ float w3 = 1.0 / (sigma_x*sigma_y);
122
+ float w4 = 1.0 / (sigma_y*sigma_y);
123
+ float od_sx = 1.0 / sigma_x;
124
+ float od_sy = 1.0 / sigma_y;
125
+
126
+ // init
127
+ float _gr=0.0, _gg=0.0, _gb=0.0;
128
+ float _gx=0.0, _gy=0.0;
129
+ float _gsx=0.0, _gsy=0.0, _gsr=0.0;
130
+
131
+ for(int hi = 0; hi < h; hi++){
132
+ for( int wi=0; wi < w; wi++){
133
+
134
+ float curw_f = 2.0*wi/(w-1) - 1.0;
135
+ float curh_f = 2.0*hi/(h-1) - 1.0;
136
+
137
+ // obtain grad to p^t_r, p^t_g, p^t_b
138
+ float gptr = grads[(hi*w+wi)*c+0]; // grad of loss to P^t_r
139
+ float gptg = grads[(hi*w+wi)*c+1];
140
+ float gptb = grads[(hi*w+wi)*c+2];
141
+
142
+ // compute the 2d gs value
143
+
144
+ float d_x = curw_f - x; // distance along x axis
145
+ float d_y = curh_f - y;
146
+ // if(d_x > dmax || d_x < -dmax || d_y > dmax || d_y < -dmax){
147
+ // continue;
148
+ // }
149
+ // printf("here");
150
+
151
+ float d = w2*d_x*d_x - 2*rho*w3*d_x*d_y + w4*d_y*d_y;
152
+ float v = w1*d;
153
+ v = exp(v);
154
+ // printf("si:%d, sigma_x: %f, sigma_y:%f, rho:%f, x:%f, y:%f, v:%f\n", si, sigma_x, sigma_y, rho, x,y,v);
155
+
156
+ // compute grad of colors
157
+ _gr += v*gptr;
158
+ _gg += v*gptg;
159
+ _gb += v*gptb;
160
+
161
+ // compute grad of coords
162
+ float gpt = gptr*cr+gptg*cg+gptb*cb;
163
+ float v_2_w1 = v*2*w1;
164
+
165
+ float g_vst_to_gsx = v_2_w1*(-w2*d_x+rho*w3*d_y); // grad of v^{st} to G^s_x
166
+ _gx += gpt*g_vst_to_gsx;
167
+ float g_vst_to_gsy = v_2_w1*(-w4*d_y+rho*w3*d_x); // grad of v^{st} to G^s_y
168
+ _gy += gpt*g_vst_to_gsy;
169
+
170
+ // compute grad of sigmas
171
+ float g_vst_to_gsigx = v_2_w1*od_sx* (w3*rho*d_x*d_y - w2*d_x*d_x);
172
+ _gsx += gpt*g_vst_to_gsigx;
173
+ float g_vst_to_gsigy = v_2_w1*od_sy* (w3*rho*d_x*d_y - w4*d_y*d_y);
174
+ _gsy += gpt*g_vst_to_gsigy;
175
+ float g_vst_to_rho = -v_2_w1*(2*w1*rho*d+w3*d_x*d_y);
176
+ _gsr += gpt*g_vst_to_rho;
177
+ }
178
+ }
179
+
180
+ // write the values
181
+ grads_sigmas[curs*3+0] = _gsx;
182
+ grads_sigmas[curs*3+1] = _gsy;
183
+ grads_sigmas[curs*3+2] = _gsr;
184
+ grads_coords[curs*2+0] = _gx;
185
+ grads_coords[curs*2+1] = _gy;
186
+ grads_colors[curs*3+0] = _gr;
187
+ grads_colors[curs*3+1] = _gg;
188
+ grads_colors[curs*3+2] = _gb;
189
+
190
+ }
191
+
192
+ void _gs_render_backward(
193
+ const float *sigmas,
194
+ const float *coords,
195
+ const float *colors,
196
+ const float *grads, // (h, w, c)
197
+ float *grads_sigmas,
198
+ float *grads_coords,
199
+ float *grads_colors,
200
+ const int s,
201
+ const int h,
202
+ const int w,
203
+ const int c,
204
+ const float dmax
205
+ ) {
206
+
207
+ int threads=16;
208
+ dim3 grid(s, 1);
209
+ dim3 block( threads, 1);
210
+ _gs_render_backward_cuda<<<grid, block>>>(sigmas, coords, colors, grads, grads_sigmas, grads_coords, grads_colors, s, h, w, c, dmax);
211
+ }
212
+
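For reference, both the CUDA kernels and the torch reference in check.py evaluate, for a pixel t at normalized coordinates (x_t, y_t) in [-1, 1] and a Gaussian s with parameters (sigma_x, sigma_y, rho) and center (x_s, y_s), the unnormalized value

$$v^{st} = \exp\left(-\frac{1}{2(1-\rho^2)}\left[\frac{(x_t-x_s)^2}{\sigma_x^2} - \frac{2\rho\,(x_t-x_s)(y_t-y_s)}{\sigma_x\sigma_y} + \frac{(y_t-y_s)^2}{\sigma_y^2}\right]\right)$$

and accumulate $P_t = \sum_s v^{st} c_s$ over the Gaussians whose center satisfies $|x_t-x_s| \le d_{\max}$ and $|y_t-y_s| \le d_{\max}$. The bivariate-normal prefactor $1/(2\pi\sigma_x\sigma_y\sqrt{1-\rho^2})$ is intentionally dropped (it is the commented-out line above), since the torch path normalizes each kernel by its maximum; $d_{\max}$ simply truncates Gaussians far from the pixel.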
utils/gs_cuda_dmax/gs.backup.cu ADDED
@@ -0,0 +1,188 @@
1
+ #include <stdio.h>
2
+ #include <cmath>
3
+
4
+ #define PI 3.1415926536
5
+ #define PI2 6.2831853072
6
+
7
+ __global__ void _gs_render_cuda(
8
+ const float *sigmas,
9
+ const float *coords,
10
+ const float *colors,
11
+ float *rendered_img,
12
+ const int s, // gs num
13
+ const int h,
14
+ const int w,
15
+ const int c,
16
+ const float dmax
17
+ ){
18
+
19
+ int index = blockIdx.x*blockDim.x + threadIdx.x;
20
+ int curw = index % w;
21
+ int curh = int((index-curw)/w);
22
+ if(curw >= w || curh >=h){
23
+ return;
24
+ }
25
+
26
+ float curw_f = 2.0*curw/(w-1) - 1.0;
27
+ float curh_f = 2.0*curh/(h-1) - 1.0;
28
+
29
+ // printf("index:%d, curw:%d, curh:%d, curw_f:%f, curh_f:%f\n",index,curw,curh,curw_f,curh_f);
30
+
31
+ for(int si=0; si<s; si++){
32
+
33
+ // compute the 2d gs value
34
+ float sigma_x = sigmas[si*3+0];
35
+ float sigma_y = sigmas[si*3+1];
36
+ float rho = sigmas[si*3+2];
37
+ float x = coords[si*2+0];
38
+ float y = coords[si*2+1];
39
+
40
+ //
41
+ float one_div_one_minus_rho2 = 1.0 / (1-rho*rho) ;
42
+ float one_div_sigma_x = 1.0 / sigma_x;
43
+ float one_div_sigma_y = 1.0 / sigma_y;
44
+ float d_x = curw_f - x;
45
+ float d_y = curh_f - y;
46
+
47
+ if(d_x > dmax || d_x < -dmax || d_y > dmax || d_y < -dmax){
48
+ continue;
49
+ }
50
+
51
+ float v = one_div_sigma_x*one_div_sigma_x*d_x*d_x;
52
+ v -= 2*rho*d_x*d_y*one_div_sigma_x*one_div_sigma_y;
53
+ v += d_y*d_y*one_div_sigma_y*one_div_sigma_y;
54
+ v *= -one_div_one_minus_rho2 / 2.0;
55
+ v = exp(v);
56
+ // since we normalize v by its maximum, this step is removed so both paths give an equal result
57
+ // v *= one_div_sigma_x * one_div_sigma_y * pow(one_div_one_minus_rho2, 0.5) / PI2 ;
58
+ // printf("si:%d, sigma_x: %f, sigma_y:%f, rho:%f, x:%f, y:%f, v:%f\n", si, sigma_x, sigma_y, rho, x,y,v);
59
+
60
+ for(int ci=0; ci<c; ci++){
61
+ rendered_img[(curh*w+curw)*c+ci] += v*colors[si*c+ci];
62
+ }
63
+ }
64
+
65
+ }
66
+
67
+
68
+ void _gs_render(
69
+ const float *sigmas,
70
+ const float *coords,
71
+ const float *colors,
72
+ float *rendered_img,
73
+ const int s,
74
+ const int h,
75
+ const int w,
76
+ const int c,
77
+ const float dmax
78
+ ) {
79
+
80
+ int threads=16;
81
+ dim3 grid( h*w, 1);
82
+ dim3 block( threads, 1);
83
+ _gs_render_cuda<<<grid, block>>>(sigmas, coords, colors, rendered_img, s, h, w, c, dmax);
84
+ }
85
+
86
+ __global__ void _gs_render_backward_cuda(
87
+ const float *sigmas,
88
+ const float *coords,
89
+ const float *colors,
90
+ const float *grads,
91
+ float *grads_sigmas,
92
+ float *grads_coords,
93
+ float *grads_colors,
94
+ const int s, // gs num
95
+ const int h,
96
+ const int w,
97
+ const int c,
98
+ const float dmax
99
+ ){
100
+
101
+ int curs = blockIdx.x*blockDim.x + threadIdx.x;
102
+ if(curs >= s){
103
+ return ;
104
+ }
105
+
106
+ // obtain parameters of gs
107
+ float sigma_x = sigmas[curs*3+0];
108
+ float sigma_y = sigmas[curs*3+1];
109
+ float rho = sigmas[curs*3+2];
110
+ float x = coords[curs*2+0];
111
+ float y = coords[curs*2+1];
112
+
113
+ //
114
+ float w1 = -0.5 / (1-rho*rho) ;
115
+ float w2 = 1.0 / (sigma_x*sigma_x);
116
+ float w3 = 1.0 / (sigma_x*sigma_y);
117
+ float w4 = 1.0 / (sigma_y*sigma_y);
118
+ float od_sx = 1.0 / sigma_x;
119
+ float od_sy = 1.0 / sigma_y;
120
+
121
+ // init
122
+ for(int hi = 0; hi < h; hi++){
123
+ for( int wi=0; wi < w; wi++){
124
+
125
+ float curw_f = 2.0*wi/(w-1) - 1.0;
126
+ float curh_f = 2.0*hi/(h-1) - 1.0;
127
+
128
+ // compute the 2d gs value
129
+ float d_x = curw_f - x; // distance along x axis
130
+ float d_y = curh_f - y;
131
+ if(d_x > dmax || d_x < -dmax || d_y > dmax || d_y < -dmax){
132
+ continue;
133
+ }
134
+ float d = w2*d_x*d_x - 2*rho*w3*d_x*d_y + w4*d_y*d_y;
135
+ float v = w1*d;
136
+ v = exp(v);
137
+ // printf("si:%d, sigma_x: %f, sigma_y:%f, rho:%f, x:%f, y:%f, v:%f\n", si, sigma_x, sigma_y, rho, x,y,v);
138
+
139
+ // compute grad of coords
140
+ float v_2_w1 = v*2*w1;
141
+ float g_vst_to_gsx = v_2_w1*(-w2*d_x+rho*w3*d_y); // grad of v^{st} to G^s_x
142
+ float g_vst_to_gsy = v_2_w1*(-w4*d_y+rho*w3*d_x); // grad of v^{st} to G^s_y
143
+
144
+ // compute grad of sigmas
145
+ float g_vst_to_gsigx = v_2_w1*od_sx* (w3*rho*d_x*d_y - w2*d_x*d_x);
146
+ float g_vst_to_gsigy = v_2_w1*od_sy* (w3*rho*d_x*d_y - w4*d_y*d_y);
147
+ float g_vst_to_rho = -v_2_w1*(2*w1*rho*d+w3*d_x*d_y);
148
+
149
+ for(int ci=0; ci<c; ci++){
150
+ float _gptc = grads[(hi*w+wi)*c+ci];
151
+ float _gpt = _gptc*colors[curs*c+ci];
152
+
153
+ grads_colors[curs*c+ci] += v*_gptc;
154
+
155
+ grads_coords[curs*2+0] += _gpt*g_vst_to_gsx;
156
+ grads_coords[curs*2+1] += _gpt*g_vst_to_gsy;
157
+
158
+ grads_sigmas[curs*3+0] += _gpt*g_vst_to_gsigx;
159
+ grads_sigmas[curs*3+1] += _gpt*g_vst_to_gsigy;
160
+ grads_sigmas[curs*3+2] += _gpt*g_vst_to_rho;
161
+ }
162
+
163
+ }
164
+ }
165
+
166
+ }
167
+
168
+ void _gs_render_backward(
169
+ const float *sigmas,
170
+ const float *coords,
171
+ const float *colors,
172
+ const float *grads, // (h, w, c)
173
+ float *grads_sigmas,
174
+ float *grads_coords,
175
+ float *grads_colors,
176
+ const int s,
177
+ const int h,
178
+ const int w,
179
+ const int c,
180
+ const float dmax
181
+ ) {
182
+
183
+ int threads=16;
184
+ dim3 grid(s, 1);
185
+ dim3 block( threads, 1);
186
+ _gs_render_backward_cuda<<<grid, block>>>(sigmas, coords, colors, grads, grads_sigmas, grads_coords, grads_colors, s, h, w, c, dmax);
187
+ }
188
+
utils/gs_cuda_dmax/gs.cu ADDED
@@ -0,0 +1,187 @@
1
+ #include <stdio.h>
2
+ #include <cmath>
3
+
4
+ #define PI 3.1415926536
5
+ #define PI2 6.2831853072
6
+
7
+ __global__ void _gs_render_cuda(
8
+ const float *sigmas,
9
+ const float *coords,
10
+ const float *colors,
11
+ float *rendered_img,
12
+ const int s, // gs num
13
+ const int h,
14
+ const int w,
15
+ const int c,
16
+ const float dmax
17
+ ){
18
+
19
+ int curs = blockIdx.x*blockDim.x + threadIdx.x;
20
+ if(curs >= s){
21
+ return;
22
+ }
23
+
24
+ float sigma_x = sigmas[curs*3+0];
25
+ float sigma_y = sigmas[curs*3+1];
26
+ float rho = sigmas[curs*3+2];
27
+ float x = coords[curs*2+0];
28
+ float y = coords[curs*2+1];
29
+ float r = colors[curs*3];
30
+ float g = colors[curs*3+1];
31
+ float b = colors[curs*3+2];
32
+
33
+ float negative_half_one_div_one_minus_rho2 = -0.5 / (1-rho*rho);
34
+ float one_div_sigma_x_2 = 1.0 / sigma_x / sigma_x;
35
+ float one_div_sigma_y_2 = 1.0 / sigma_y / sigma_y;
36
+ float two_rho_div_sigma_x_one_div_sigma_y = 2*rho / sigma_x / sigma_y;
37
+
38
+ for(int hi=0; hi<h; hi++){
39
+ float curh_f = 2.0*hi/(h-1) - 1.0;
40
+ float d_y = curh_f - y;
41
+ if(d_y > dmax || d_y < -dmax){
42
+ continue;
43
+ }
44
+
45
+ for(int wi=0; wi<w; wi++){
46
+ float curw_f = 2.0*wi/(w-1) - 1.0;
47
+ float d_x = curw_f - x;
48
+ if(d_x > dmax || d_x < -dmax){
49
+ continue;
50
+ }
51
+
52
+ float v = one_div_sigma_x_2*d_x*d_x;
53
+ v -= two_rho_div_sigma_x_one_div_sigma_y*d_x*d_y;
54
+ v += one_div_sigma_y_2*d_y*d_y;
55
+ v *= negative_half_one_div_one_minus_rho2;
56
+ v = exp(v);
57
+
58
+ atomicAdd(&rendered_img[(hi*w+wi)*c+0], v*r);
59
+ atomicAdd(&rendered_img[(hi*w+wi)*c+1], v*g);
60
+ atomicAdd(&rendered_img[(hi*w+wi)*c+2], v*b);
61
+ }
62
+ }
63
+
64
+ }
65
+
66
+
67
+ void _gs_render(
68
+ const float *sigmas,
69
+ const float *coords,
70
+ const float *colors,
71
+ float *rendered_img,
72
+ const int s,
73
+ const int h,
74
+ const int w,
75
+ const int c,
76
+ const float dmax
77
+ ) {
78
+
79
+ int threads=64;
80
+ dim3 grid(int(s/threads)+1);
81
+ dim3 block(threads);
82
+ _gs_render_cuda<<<grid, block>>>(sigmas, coords, colors, rendered_img, s, h, w, c, dmax);
83
+ }
84
+
85
+ __global__ void _gs_render_backward_cuda(
86
+ const float *sigmas,
87
+ const float *coords,
88
+ const float *colors,
89
+ const float *grads,
90
+ float *grads_sigmas,
91
+ float *grads_coords,
92
+ float *grads_colors,
93
+ const int s, // gs num
94
+ const int h,
95
+ const int w,
96
+ const int c,
97
+ const float dmax
98
+ ){
99
+
100
+ int curs = blockIdx.x*blockDim.x + threadIdx.x;
101
+ if(curs >= s){
102
+ return ;
103
+ }
104
+
105
+ // obtain parameters of gs
106
+ float sigma_x = sigmas[curs*3+0];
107
+ float sigma_y = sigmas[curs*3+1];
108
+ float rho = sigmas[curs*3+2];
109
+ float x = coords[curs*2+0];
110
+ float y = coords[curs*2+1];
111
+
112
+ //
113
+ float w1 = -0.5 / (1-rho*rho) ;
114
+ float w2 = 1.0 / (sigma_x*sigma_x);
115
+ float w3 = 1.0 / (sigma_x*sigma_y);
116
+ float w4 = 1.0 / (sigma_y*sigma_y);
117
+ float od_sx = 1.0 / sigma_x;
118
+ float od_sy = 1.0 / sigma_y;
119
+
120
+ // init
121
+ for(int hi = 0; hi < h; hi++){
122
+ for( int wi=0; wi < w; wi++){
123
+
124
+ float curw_f = 2.0*wi/(w-1) - 1.0;
125
+ float curh_f = 2.0*hi/(h-1) - 1.0;
126
+
127
+ // compute the 2d gs value
128
+ float d_x = curw_f - x; // distance along x axis
129
+ float d_y = curh_f - y;
130
+ if(d_x > dmax || d_x < -dmax || d_y > dmax || d_y < -dmax){
131
+ continue;
132
+ }
133
+ float d = w2*d_x*d_x - 2*rho*w3*d_x*d_y + w4*d_y*d_y;
134
+ float v = w1*d;
135
+ v = exp(v);
136
+ // printf("si:%d, sigma_x: %f, sigma_y:%f, rho:%f, x:%f, y:%f, v:%f\n", si, sigma_x, sigma_y, rho, x,y,v);
137
+
138
+ // compute grad of coords
139
+ float v_2_w1 = v*2*w1;
140
+ float g_vst_to_gsx = v_2_w1*(-w2*d_x+rho*w3*d_y); // grad of v^{st} to G^s_x
141
+ float g_vst_to_gsy = v_2_w1*(-w4*d_y+rho*w3*d_x); // grad of v^{st} to G^s_y
142
+
143
+ // compute grad of sigmas
144
+ float g_vst_to_gsigx = v_2_w1*od_sx* (w3*rho*d_x*d_y - w2*d_x*d_x);
145
+ float g_vst_to_gsigy = v_2_w1*od_sy* (w3*rho*d_x*d_y - w4*d_y*d_y);
146
+ float g_vst_to_rho = -v_2_w1*(2*w1*rho*d+w3*d_x*d_y);
147
+
148
+ for(int ci=0; ci<c; ci++){
149
+ float _gptc = grads[(hi*w+wi)*c+ci];
150
+ float _gpt = _gptc*colors[curs*c+ci];
151
+
152
+ grads_colors[curs*c+ci] += v*_gptc;
153
+
154
+ grads_coords[curs*2+0] += _gpt*g_vst_to_gsx;
155
+ grads_coords[curs*2+1] += _gpt*g_vst_to_gsy;
156
+
157
+ grads_sigmas[curs*3+0] += _gpt*g_vst_to_gsigx;
158
+ grads_sigmas[curs*3+1] += _gpt*g_vst_to_gsigy;
159
+ grads_sigmas[curs*3+2] += _gpt*g_vst_to_rho;
160
+ }
161
+
162
+ }
163
+ }
164
+
165
+ }
166
+
167
+ void _gs_render_backward(
168
+ const float *sigmas,
169
+ const float *coords,
170
+ const float *colors,
171
+ const float *grads, // (h, w, c)
172
+ float *grads_sigmas,
173
+ float *grads_coords,
174
+ float *grads_colors,
175
+ const int s,
176
+ const int h,
177
+ const int w,
178
+ const int c,
179
+ const float dmax
180
+ ) {
181
+
182
+ int threads=64;
183
+ dim3 grid(s, 1);
184
+ dim3 block( threads, 1);
185
+ _gs_render_backward_cuda<<<grid, block>>>(sigmas, coords, colors, grads, grads_sigmas, grads_coords, grads_colors, s, h, w, c, dmax);
186
+ }
187
+
utils/gs_cuda_dmax/gs.h ADDED
@@ -0,0 +1,26 @@
1
+ void _gs_render(
2
+ const float *sigmas,
3
+ const float *coords,
4
+ const float *colors,
5
+ float *rendered_img,
6
+ const int s,
7
+ const int h,
8
+ const int w,
9
+ const int c,
10
+ const float dmax
11
+ );
12
+
13
+ void _gs_render_backward(
14
+ const float *sigmas,
15
+ const float *coords,
16
+ const float *colors,
17
+ const float *grads,
18
+ float *grads_sigmas,
19
+ float *grads_coords,
20
+ float *grads_colors,
21
+ const int s,
22
+ const int h,
23
+ const int w,
24
+ const int c,
25
+ const float dmax
26
+ );
utils/gs_cuda_dmax/gswrapper.cpp ADDED
@@ -0,0 +1,82 @@
1
+ #include "gs.h"
2
+ #include <torch/extension.h>
3
+ #include <c10/cuda/CUDAGuard.h>
4
+
5
+ #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
6
+ #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
7
+ #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
8
+
9
+ void gs_render(
10
+ torch::Tensor &sigmas,
11
+ torch::Tensor &coords,
12
+ torch::Tensor &colors,
13
+ torch::Tensor &rendered_img,
14
+ const int s,
15
+ const int h,
16
+ const int w,
17
+ const int c,
18
+ const float dmax
19
+ ){
20
+
21
+ CHECK_INPUT(sigmas);
22
+ CHECK_INPUT(coords);
23
+ CHECK_INPUT(colors);
24
+ CHECK_INPUT(rendered_img);
25
+
26
+ // run the kernel on the same CUDA device as the input tensors
27
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(sigmas));
28
+
29
+ _gs_render(
30
+ (const float *) sigmas.data_ptr(),
31
+ (const float *) coords.data_ptr(),
32
+ (const float *) colors.data_ptr(),
33
+ (float *) rendered_img.data_ptr(),
34
+ s, h, w, c, dmax);
35
+ }
36
+
37
+ void gs_render_backward(
38
+ torch::Tensor &sigmas,
39
+ torch::Tensor &coords,
40
+ torch::Tensor &colors,
41
+ torch::Tensor &grads,
42
+ torch::Tensor &grads_sigmas,
43
+ torch::Tensor &grads_coords,
44
+ torch::Tensor &grads_colors,
45
+ const int s,
46
+ const int h,
47
+ const int w,
48
+ const int c,
49
+ const float dmax
50
+ ){
51
+
52
+ CHECK_INPUT(sigmas);
53
+ CHECK_INPUT(coords);
54
+ CHECK_INPUT(colors);
55
+ CHECK_INPUT(grads);
56
+ CHECK_INPUT(grads_sigmas);
57
+ CHECK_INPUT(grads_coords);
58
+ CHECK_INPUT(grads_colors);
59
+
60
+
61
+ // run the kernel on the same CUDA device as the input tensors
62
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(sigmas));
63
+
64
+ _gs_render_backward(
65
+ (const float *) sigmas.data_ptr(),
66
+ (const float *) coords.data_ptr(),
67
+ (const float *) colors.data_ptr(),
68
+ (const float *) grads.data_ptr(),
69
+ (float *) grads_sigmas.data_ptr(),
70
+ (float *) grads_coords.data_ptr(),
71
+ (float *) grads_colors.data_ptr(),
72
+ s, h, w, c, dmax);
73
+ }
74
+
75
+ PYBIND11_MODULE( TORCH_EXTENSION_NAME, m) {
76
+ m.def( "gs_render",
77
+ &gs_render,
78
+ "cuda forward wrapper");
79
+ m.def( "gs_render_backward",
80
+ &gs_render_backward,
81
+ "cuda backward wrapper");
82
+ }
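gswrapper.cpp only checks inputs, sets the CUDA device guard and registers the pybind bindings; the kernels themselves are in gs.cu. The Python wrapper that follows imports a prebuilt gscuda module, but it also carries a commented-out torch.utils.cpp_extension.load call, which suggests the extension can instead be JIT-compiled from these two sources. A minimal sketch of that route, assuming nvcc and the CUDA toolkit are available:

import os
from torch.utils.cpp_extension import load

here = os.path.dirname(os.path.abspath(__file__))
gscuda = load(
    name="gscuda",
    sources=[os.path.join(here, "gswrapper.cpp"),
             os.path.join(here, "gs.cu")],
    verbose=True,
)  # exposes gscuda.gs_render and gscuda.gs_render_backward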
utils/gs_cuda_dmax/gswrapper.py ADDED
@@ -0,0 +1,63 @@
1
+ import os
2
+ import torch
3
+ from torch.utils.cpp_extension import load
4
+ from torch.autograd import Function
5
+ from torch.autograd.function import once_differentiable
6
+
7
+ #
8
+ build_path = os.path.join(os.path.split(os.path.abspath(__file__))[0], 'build')
9
+ os.makedirs(build_path, exist_ok=True)
10
+
11
+ file_path = os.path.split(os.path.abspath(__file__))[0]
12
+ # GSWrapper = load(
13
+ # name="gscuda",
14
+ # # sources=["gs_cuda/gswrapper.cpp", "gs_cuda/gs.cu"],
15
+ # sources=[os.path.join(file_path, "gswrapper.cpp"),
16
+ # os.path.join(file_path, "gs.cu")],
17
+ # build_directory=build_path,
18
+ # verbose=True)
19
+
20
+ import gscuda
21
+ GSWrapper = gscuda
22
+
23
+ class GSCUDA(Function):
24
+
25
+ @staticmethod
26
+ def forward(ctx, sigmas, coords, colors, rendered_img, dmax):
27
+ ctx.save_for_backward(sigmas, coords, colors)
28
+ ctx.dmax = dmax
29
+ h, w, c = rendered_img.shape
30
+ s = sigmas.shape[0]
31
+ GSWrapper.gs_render(sigmas, coords, colors, rendered_img, s, h, w, c, dmax)
32
+ return rendered_img
33
+
34
+ @staticmethod
35
+ @once_differentiable
36
+ def backward(ctx, grad_output):
37
+ sigmas, coords, colors = ctx.saved_tensors
38
+ dmax = ctx.dmax
39
+ h, w, c = grad_output.shape
40
+ s = sigmas.shape[0]
41
+ grads_sigmas = torch.zeros_like(sigmas)
42
+ grads_coords = torch.zeros_like(coords)
43
+ grads_colors = torch.zeros_like(colors)
44
+ GSWrapper.gs_render_backward(sigmas, coords, colors, grad_output.contiguous(), grads_sigmas, grads_coords, grads_colors, s, h, w, c, dmax)
45
+ return (grads_sigmas, grads_coords, grads_colors, None, None)
46
+
47
+ def gaussiansplatting_render(sigmas, coords, colors, image_size, dmax=100):
48
+ sigmas = sigmas.contiguous() # (gs num, 3)
49
+ coords = coords.contiguous() # (gs num, 2)
50
+ colors = colors.contiguous() # (gs num, c)
51
+ h, w = image_size[:2]
52
+ c = colors.shape[-1]
53
+ rendered_img = torch.zeros(h, w, c).to(colors.device).to(torch.float32)
54
+ return GSCUDA.apply(sigmas, coords, colors, rendered_img, dmax)
55
+
56
+ if __name__ == "__main__":
57
+ sigmas = torch.randn(10, 3).cuda()
58
+ coords = torch.randn(10, 2).cuda()
59
+ colors = torch.randn(10, 3).cuda()
60
+ image_size = (100, 100)
61
+ dmax = 0.1
62
+ rendered_img = gaussiansplatting_render(sigmas, coords, colors, image_size, dmax)
63
+ print(rendered_img.shape)
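Since GSCUDA.backward returns gradients for sigmas, coords and colors, gaussiansplatting_render can be used like any differentiable torch op. A small usage sketch (shapes follow the comments above; the loss is arbitrary and only for illustration):

import torch
from gswrapper import gaussiansplatting_render

s = 1024                                                                   # number of gaussians
sigmas = (0.05 + 0.5 * torch.rand(s, 3, device="cuda")).requires_grad_(True)  # (sigma_x, sigma_y, rho)
coords = (2 * torch.rand(s, 2, device="cuda") - 1).requires_grad_(True)       # centers in [-1, 1]
colors = torch.rand(s, 3, device="cuda").requires_grad_(True)

img = gaussiansplatting_render(sigmas, coords, colors, (64, 64), dmax=0.5)    # (64, 64, 3)
img.mean().backward()  # fills sigmas.grad, coords.grad and colors.grad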
utils/gs_cuda_dmax/mylineprofiler.py ADDED
@@ -0,0 +1,264 @@
1
+ import os
2
+ import io
3
+ import sys
4
+ import timeit
5
+ import tokenize
6
+ import torch
7
+ import psutil
8
+ import inspect
9
+ from loguru import logger
10
+ from prettytable import PrettyTable
11
+
12
+ # implement by xtudbxk
13
+ # github: https://github.com/xtudbxk/lineprofiler
14
+ class MyLineProfiler():
15
+ def __init__(self, base='ms', cuda_sync=True, gpuids=(0,), warmup=0, warmup_lineno=-1):
16
+
17
+ if base == 'ms':
18
+ self.base_n = 1000
19
+ elif base == 's':
20
+ self.base_n = 1
21
+ else:
22
+ logger.warning(f'Unsupported base - {base}, using "s" instead')
+ self.base_n = 1 # fall back to seconds so self.base_n is always defined
23
+
24
+ self.base = base
25
+ self.cuda_sync = cuda_sync
26
+ self.gpuids = gpuids
27
+ self.warmup = warmup
28
+ self.warmup_counter = warmup
29
+ # we should wait until this line has executed warmup_counter times
30
+ # before recording the stats
31
+ self.warmup_lineno = warmup_lineno
32
+
33
+ # for time profiling
34
+ self._times = {}
35
+ self._func_name = None
36
+ self._func_filename = None
37
+ self._last_time = -1
38
+ self._last_lineno = -1
39
+ self._func_hit_count = 0
40
+ self._func_firstlineno = 0
41
+
42
+ # for memory profiling
43
+ self._process = psutil.Process(os.getpid())
44
+ self._memory = {}
45
+ self._last_memory = 0
46
+
47
+ # for cuda memory profiling
48
+ self._gpu_memory = {}
49
+ self._gpu_last_memory = 0
50
+
51
+ def __trace_func__(self, frame, event, arg):
52
+ # print(f'in {frame.f_code.co_filename} func {frame.f_code.co_name} line {frame.f_lineno}, event - {event}')
53
+
54
+ # check if run into the decorated func
55
+ if self._func_firstlineno == frame.f_code.co_firstlineno and frame.f_code.co_name == self._func_name and frame.f_code.co_filename == self._func_filename:
56
+
57
+ # --- obtain info for current hit ---
58
+ # cuda related
59
+ if self.cuda_sync is True:
60
+ torch.cuda.synchronize()
61
+
62
+ current_time = timeit.default_timer()
63
+ memory = self._process.memory_info().rss
64
+ gpu_memory = torch.cuda.memory_allocated()
65
+ # --- ends ---
66
+
67
+ # --- initialize the info on the first hit ---
68
+ if frame.f_lineno not in self._times: # first hit time for this line
69
+ self._times[frame.f_lineno] = {'hit':0, 'time': 0}
70
+ self._memory[frame.f_lineno] = 0
71
+ self._gpu_memory[frame.f_lineno] = 0
72
+ # --- ends ---
73
+
74
+ # --- record info before calling the decorated func ---
75
+ # 'call' - before call the func
76
+ if event == 'call':
77
+ self._last_time = current_time
78
+ self._last_lineno = frame.f_lineno
79
+ self._last_memory = memory
80
+ self._last_gpu_memory = gpu_memory
81
+
82
+ if self.warmup_lineno < 0:
83
+ self.warmup_counter -= 1
84
+ if self.warmup_counter < 0:
85
+ self._func_hit_count += 1
86
+ # --- ends ---
87
+
88
+ # 'line' - after executing the line
89
+ # 'return' - return from the function
90
+ if event == 'line' or event == 'return':
91
+
92
+ if event == 'line' and self.warmup_counter < 0:
93
+ self._times[frame.f_lineno]['hit'] += 1
94
+
95
+
96
+ # --- obtain the memory and time consumed by this line ---
97
+ if self.warmup_counter < 0:
98
+ self._times[self._last_lineno]['time'] += current_time - self._last_time
99
+ self._memory[self._last_lineno] += memory - self._last_memory
100
+ self._gpu_memory[self._last_lineno] += gpu_memory - self._gpu_last_memory
101
+ # --- ends ---
102
+
103
+ if self.cuda_sync is True:
104
+ torch.cuda.synchronize()
105
+
106
+ self._last_time = timeit.default_timer()
107
+ self._last_memory = memory
108
+ self._gpu_last_memory = gpu_memory
109
+ self._last_lineno = frame.f_lineno
110
+
111
+ return self.__trace_func__
112
+
113
+ def decorate(self, func):
114
+ if self._func_name is not None:
115
+ logger.warning(f'Only one function can be decorated; already decorated "{self._func_name}"')
116
+ self._func_name = func.__name__
117
+ self._func_filename = func.__code__.co_filename
118
+ self._func_firstlineno = func.__code__.co_firstlineno
119
+
120
+ def _f(*args, **kwargs):
121
+ origin_trace_func = sys.gettrace()
122
+ sys.settrace(self.__trace_func__)
123
+ ret = func(*args, **kwargs)
124
+ sys.settrace(origin_trace_func)
125
+ return ret
126
+ return _f
127
+
128
+ def _get_table(self):
129
+
130
+ if len(self._times) <= 0:
131
+ logger.warning(f"un recorded datas, please ensure the function is executed")
132
+ return None
133
+
134
+ # --- load the source code ---
135
+ with open(self._func_filename, 'r') as f:
136
+ source_lines = [line.strip('\n') for line in f.readlines()]
137
+ code_str = "\n".join(source_lines)
138
+
139
+ def_lineno = min(self._times.keys())
140
+ final_lineno = max(self._times.keys())
141
+
142
+ # remove the additional blank content
143
+ pre_blank_count = len(source_lines[def_lineno-1]) - len(source_lines[def_lineno-1].lstrip(' ').lstrip('\t'))
144
+ # --- ends ---
145
+
146
+ # --- analyze the source code and collect info for multi-line statements ---
147
+ new_logic_linenos = [token.start[0] for token in tokenize.generate_tokens(
148
+ io.StringIO(code_str).readline) if token.type == 4]
149
+ # --- ends ---
150
+
151
+ # --- merge the stats multi-line code ---
152
+ sorted_linenos = [lineno for lineno in self._times.keys()]
153
+ sorted_linenos.sort(key=int)
154
+
155
+ lineno_cache = []
156
+ for lineno in sorted_linenos:
157
+ if lineno not in new_logic_linenos:
158
+ lineno_cache.append(lineno)
159
+ else:
160
+ # we should merge its info to the prev_lineno
161
+ if len(lineno_cache) <= 0:
162
+ continue
163
+ else:
164
+ lineno_cache.append(lineno)
165
+ first_lineno = lineno_cache[0]
166
+ for prev_lineno in lineno_cache[1:]:
167
+ self._times[first_lineno]["hit"] = min(self._times[first_lineno]["hit"], self._times[prev_lineno]["hit"])
168
+ self._times[first_lineno]["time"] += self._times[prev_lineno]["time"]
169
+ del self._times[prev_lineno]
170
+
171
+ self._memory[first_lineno] += self._memory[prev_lineno]
172
+ del self._memory[prev_lineno]
173
+
174
+ self._gpu_memory[first_lineno] += self._gpu_memory[prev_lineno]
175
+ del self._gpu_memory[prev_lineno]
176
+ lineno_cache = []
177
+ # --- ends ---
178
+
179
+ # --- initialize the pretty table for output ---
180
+ table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak'])
181
+ # --- ends ---
182
+
183
+ # --- compute some statistics ---
184
+ total_hit = 0 # to compute the hit percentage
185
+ total_time = 0
186
+ for lineno, stats in self._times.items():
187
+ if lineno == def_lineno: continue
188
+ total_hit += stats['hit']
189
+ total_time += stats['time']
190
+
191
+ total_memory = sum([m for l,m in self._memory.items()]) / 1024 / 1024
192
+ total_gpu_memory = sum([m for l,m in self._gpu_memory.items()]) / 1024 / 1024
193
+ # --- ends ---
194
+
195
+ peak_cpu_memory = 0
196
+ peak_gpu_memory = 0
197
+ for lineno in range(def_lineno, final_lineno+1):
198
+ if lineno not in self._times:
199
+ # the comment line, empty line or merged line from multi-lines code
200
+ table.add_row([lineno, '-', '-', '-', '-', '-', '-',f'{peak_cpu_memory:5.3f} MB', '-', f'{peak_gpu_memory:5.3f} MB'])
201
+ else:
202
+ stats = self._times[lineno]
203
+ if lineno == def_lineno:
204
+ table.add_row([lineno, self._func_hit_count, f'{total_time*self.base_n:.4f} {self.base}', f'{total_time/self._func_hit_count*self.base_n:.4f} {self.base}', '-', '-', f'{total_memory:5.3f} MB', 'baseline', f'{total_gpu_memory:5.3f} MB', 'baseline'])
205
+ else:
206
+
207
+ line_result = [lineno, stats['hit'],
208
+ f'{stats["time"]*self.base_n:.4f} {self.base}',
209
+ f'{stats["time"]/stats["hit"]*self.base_n:.4f} {self.base}' if stats['hit'] > 0 else 'nan',
210
+ f'{stats["hit"]/total_hit*100:.3f}%' if total_hit > 0 else 'nan',
211
+ f'{stats["time"]/total_time*100:.3f}%'] if total_time > 0 else 'nan'
212
+
213
+ line_result += [f'{self._memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB']
214
+ peak_cpu_memory = peak_cpu_memory + self._memory[lineno]/1024/1024
215
+ line_result += [f'{peak_cpu_memory:5.3f} MB']
216
+
217
+ line_result += [f'{self._gpu_memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB']
218
+ peak_gpu_memory = peak_gpu_memory + self._gpu_memory[lineno]/1024/1024
219
+ line_result += [f'{peak_gpu_memory:5.3f} MB']
220
+
221
+ table.add_row(line_result)
222
+
223
+ table.add_column('sources', [source_lines[i-1][pre_blank_count:] if len(source_lines[i-1])>pre_blank_count else '' for i in range(def_lineno, final_lineno+1)], 'l')
224
+ return table
225
+
226
+ def print(self, filename=None, mode="w"):
227
+ introduction = '''
228
+ 1. The first row of the table reports the overall result of the whole function; the following rows report the statistics of each line in the function.
229
+ 2. `hit perc` and `time perc` stand for `hit percentage` and `time percentage`.
230
+ 3. For memory there are four columns: `mem inc`, `mem peak`, `gpu mem inc` and `gpu mem peak`, i.e. the CPU/GPU memory increase and peak. All values are collected in the last run. The increase column gives the memory added by each line (the first row refers to the whole function); the per-line numbers may sum to far less than the function total, since Python may release unused memory while the function runs. The peak column is the running sum of the increases above it and indicates the maximum memory the function may need.
231
+ 4. For any issue, please contact us via https://github.com/xtudbxk/lineprofiler or zhengqiang.zhang@hotmail.com
232
+ '''
233
+ print(introduction)
234
+
235
+ table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak'])
236
+ table = self._get_table()
237
+ print(table)
238
+ if filename is not None:
239
+ with open(filename, mode) as f:
240
+ f.write(introduction)
241
+ f.write(f"args - base={self.base}, cuda_sync={self.cuda_sync}, gpuids={self.gpuids}, warmup={self.warmup}\n")
242
+ f.write(str(table))
243
+
244
+ if __name__ == '__main__':
245
+ import numpy as np
246
+ def mytest(h='hello',
247
+ xx="xx"):
248
+
249
+ h = h + 'world'
250
+ a = []
251
+ for _ in range(200):
252
+ # a = np.zeros((1000, 1000), dtype=np.float32)
253
+ a.append(np.zeros((1000, 1000), dtype=np.float32))
254
+ a.append(
255
+ np.zeros((1000, 1000),
256
+ dtype=np.float32))
257
+ # print(a[0,0])
258
+ print(h)
259
+
260
+ profiler = MyLineProfiler(cuda_sync=False, warmup=2)
261
+ mytest = profiler.decorate(mytest)
262
+ for _ in range(5):
263
+ mytest()
264
+ profiler.print()
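The __main__ block above is the intended usage pattern: build one profiler, decorate exactly one function, run it a few times, then print. In this repo the same pattern wraps the renderers in profile.py below. A compact example (my_func stands in for any function of interest; warmup discards the first runs, and cuda_sync brackets every line with torch.cuda.synchronize() so GPU time is attributed to the line that launched it):

from mylineprofiler import MyLineProfiler

profiler = MyLineProfiler(base='ms', cuda_sync=True, warmup=1)
my_func = profiler.decorate(my_func)   # only one function may be decorated
for _ in range(5):
    my_func()
profiler.print("profile.log", "w")     # prints the table and writes it to profile.log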
utils/gs_cuda_dmax/profile.py ADDED
@@ -0,0 +1,142 @@
1
+ import cv2
2
+ import torch
3
+ import numpy as np
4
+ import torch.nn.functional as F
5
+
6
+ from gswrapper import gaussiansplatting_render
7
+
8
+ def generate_2D_gaussian_splatting(kernel_size, sigma_x, sigma_y, rho, coords,
9
+ colours, image_size=(256, 256, 3), device="cuda"):
10
+
11
+ batch_size = colours.shape[0]
12
+
13
+ sigma_x = sigma_x.view(batch_size, 1, 1)
14
+ sigma_y = sigma_y.view(batch_size, 1, 1)
15
+ rho = rho.view(batch_size, 1, 1)
16
+
17
+ covariance = torch.stack(
18
+ [torch.stack([sigma_x**2, rho*sigma_x*sigma_y], dim=-1),
19
+ torch.stack([rho*sigma_x*sigma_y, sigma_y**2], dim=-1)],
20
+ dim=-2
21
+ )
22
+
23
+ # Check for positive semi-definiteness
24
+ # determinant = (sigma_x**2) * (sigma_y**2) - (rho * sigma_x * sigma_y)**2
25
+ # if (determinant <= 0).any():
26
+ # raise ValueError("Covariance matrix must be positive semi-definite")
27
+
28
+ inv_covariance = torch.inverse(covariance)
29
+
30
+ # Choosing quite a broad range for the distribution [-5,5] to avoid any clipping
31
+ start = torch.tensor([-5.0], device=device).view(-1, 1)
32
+ end = torch.tensor([5.0], device=device).view(-1, 1)
33
+ base_linspace = torch.linspace(0, 1, steps=kernel_size, device=device)
34
+ ax_batch = start + (end - start) * base_linspace
35
+
36
+ # Expanding dims for broadcasting
37
+ ax_batch_expanded_x = ax_batch.unsqueeze(-1).expand(-1, -1, kernel_size)
38
+ ax_batch_expanded_y = ax_batch.unsqueeze(1).expand(-1, kernel_size, -1)
39
+
40
+ # Creating a batch-wise meshgrid using broadcasting
41
+ xx, yy = ax_batch_expanded_x, ax_batch_expanded_y # (batchsize, kernelsize, kernelsize)
42
+
43
+ xy = torch.stack([xx, yy], dim=-1) # (batchsize, kernelsize, kernelsize, 2)
44
+ z = torch.einsum('b...i,b...ij,b...j->b...', xy, -0.5 * inv_covariance, xy) # (batchsize, kernelsize, kernelsize)
45
+ kernel = torch.exp(z) / (2 * torch.tensor(np.pi, device=device) * torch.sqrt(torch.det(covariance)).view(batch_size, 1, 1)) # (batchsize, kernelsize, kernelsize)
46
+
47
+
48
+ kernel_max_1, _ = kernel.max(dim=-1, keepdim=True) # Find max along the last dimension
49
+ kernel_max_2, _ = kernel_max_1.max(dim=-2, keepdim=True) # Find max along the second-to-last dimension
50
+ kernel_normalized = kernel / kernel_max_2 # (batchsize, kernelsize, kernelsize)
51
+
52
+
53
+ kernel_reshaped = kernel_normalized.repeat(1, 3, 1).view(batch_size * 3, kernel_size, kernel_size)
54
+ kernel_rgb = kernel_reshaped.unsqueeze(0).reshape(batch_size, 3, kernel_size, kernel_size) # (batchsize, 3, kernelsize, kernelsize)
55
+
56
+ # Calculating the padding needed to match the image size
57
+ pad_h = image_size[0] - kernel_size
58
+ pad_w = image_size[1] - kernel_size
59
+
60
+ if pad_h < 0 or pad_w < 0:
61
+ raise ValueError("Kernel size should be smaller or equal to the image size.")
62
+
63
+ # Adding padding to make kernel size equal to the image size
64
+ padding = (pad_w // 2, pad_w // 2 + pad_w % 2, # padding left and right
65
+ pad_h // 2, pad_h // 2 + pad_h % 2) # padding top and bottom
66
+
67
+ kernel_rgb_padded = torch.nn.functional.pad(kernel_rgb, padding, "constant", 0) # (batchsize, 3, h, w)
68
+
69
+ # Extracting shape information
70
+ b, c, h, w = kernel_rgb_padded.shape
71
+
72
+ # Create a batch of 2D affine matrices
73
+ theta = torch.zeros(b, 2, 3, dtype=torch.float32, device=device)
74
+ theta[:, 0, 0] = 1.0
75
+ theta[:, 1, 1] = 1.0
76
+ theta[:, :, 2] = -coords # (b, 2) - the offset of gaussian splatting
77
+
78
+ # Creating grid and performing grid sampling
79
+ grid = F.affine_grid(theta, size=(b, c, h, w), align_corners=True) # (b, h, w, 2)
80
+ # grid_y = torch.linspace(-1, 1, steps=h, device=device).reshape(1, h, 1, 1).repeat(1, 1, w, 1)
81
+ # grid_x = torch.linspace(-1, 1, steps=w, device=device).reshape(1, 1, w, 1).repeat(1, h, 1, 1)
82
+ # grid = torch.cat([grid_x, grid_y], dim=-1)
83
+ # grid = grid - coords.reshape(-1, 1, 1, 2)
84
+
85
+ kernel_rgb_padded_translated = F.grid_sample(kernel_rgb_padded, grid, align_corners=True) # (b, 3, h, w)
86
+
87
+ rgb_values_reshaped = colours.unsqueeze(-1).unsqueeze(-1)
88
+
89
+ final_image_layers = rgb_values_reshaped * kernel_rgb_padded_translated
90
+ final_image = final_image_layers.sum(dim=0)
91
+ # final_image = torch.clamp(final_image, 0, 1)
92
+ final_image = final_image.permute(1,2,0)
93
+
94
+ return final_image
95
+
96
+
97
+ if __name__ == "__main__":
98
+ from mylineprofiler import MyLineProfiler
99
+ profiler_th = MyLineProfiler(cuda_sync=True)
100
+ generate_2D_gaussian_splatting = profiler_th.decorate(generate_2D_gaussian_splatting)
101
+ profiler_cuda = MyLineProfiler(cuda_sync=True)
102
+ gaussiansplatting_render = profiler_cuda.decorate(gaussiansplatting_render)
103
+
104
+
105
+ # --- test ---
106
+ # s = 1000
107
+ s = 5
108
+ # image_size = (512, 512, 3)
109
+ image_size = (511, 511, 3)
110
+ # image_size = (256, 512, 3)
111
+ # image_size = (256, 256, 3)
112
+
113
+ sigmas = 0.999*torch.rand(s, 3).to(torch.float32).to("cuda")
114
+ sigmas[:,:2] = 5*sigmas[:, :2]
115
+ coords = 2*torch.rand(s, 2).to(torch.float32).to("cuda")-1.0
116
+ colors = torch.rand(s, 3).to(torch.float32).to("cuda")
117
+
118
+ # --- torch version ---
119
+ import gc
120
+ gc.collect()
121
+ torch.cuda.empty_cache()
122
+ for _ in range(20):
123
+ img = generate_2D_gaussian_splatting(101, sigmas[:,1], sigmas[:,0], sigmas[:,2], coords, colors, image_size)
124
+ profiler_th.print("profile.log", "w")
125
+ cv2.imwrite("th.png", 255.0 * img.detach().clamp(0, 1).cpu().numpy())
126
+ # --- ends ---
127
+
128
+ # --- cuda version ---
129
+ _stepsize_of_gs_th = 10 / (101-1)
130
+ _stepsize_of_gs_cuda_w = 2 / (image_size[1]-1)
131
+ _stepsize_of_gs_cuda_h = 2 / (image_size[0]-1)
132
+ sigmas[:, 0] = sigmas[:, 0] * _stepsize_of_gs_cuda_w / _stepsize_of_gs_th
133
+ sigmas[:, 1] = sigmas[:, 1] * _stepsize_of_gs_cuda_h / _stepsize_of_gs_th
134
+ dmax = 101/2*_stepsize_of_gs_cuda_w
135
+ gc.collect()
136
+ torch.cuda.empty_cache()
137
+ for _ in range(20):
138
+ img = gaussiansplatting_render(sigmas, coords, colors, image_size, dmax)
139
+
140
+ profiler_cuda.print("profile.log", "a")
141
+ cv2.imwrite("cuda.png", 255.0 * img.detach().clamp(0, 1).cpu().numpy())
142
+ # --- ends ---
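The rescaling block above (sigmas[:, 0] = sigmas[:, 0] * _stepsize_of_gs_cuda_w / _stepsize_of_gs_th, dmax = 101/2*_stepsize_of_gs_cuda_w) is what makes the two renderers comparable: the torch reference samples each Gaussian on its own kernel_size x kernel_size grid spanning [-5, 5], while the CUDA kernel works directly in the image's normalized [-1, 1] coordinates, so the sigmas must be scaled by the ratio of the two grid step sizes and dmax set to half the kernel extent. A small helper capturing that arithmetic (illustrative only; the name is not from the repo):

def torch_to_cuda_params(sigmas, kernel_size, image_size):
    step_th = 10.0 / (kernel_size - 1)        # step of the torch kernel grid over [-5, 5]
    step_w = 2.0 / (image_size[1] - 1)        # step of the CUDA pixel grid along x
    step_h = 2.0 / (image_size[0] - 1)        # step of the CUDA pixel grid along y
    sigmas = sigmas.clone()
    sigmas[:, 0] *= step_w / step_th          # sigma_x in normalized image units
    sigmas[:, 1] *= step_h / step_th          # sigma_y likewise
    dmax = kernel_size / 2 * step_w           # truncate at half the kernel extent
    return sigmas, dmax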
utils/hatropeamp.py ADDED
@@ -0,0 +1,1156 @@
1
+ import math
+ import warnings # used by _no_grad_trunc_normal_ below
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.utils.checkpoint import checkpoint
5
+ import torch.nn.functional as F
6
+ import collections.abc
7
+ from itertools import repeat
8
+
9
+ from functools import partial
10
+ from typing import Any, Optional, Tuple
11
+
12
+ from einops import rearrange
13
+
14
+ # From PyTorch
15
+ def _ntuple(n):
16
+
17
+ def parse(x):
18
+ if isinstance(x, collections.abc.Iterable):
19
+ return x
20
+ return tuple(repeat(x, n))
21
+
22
+ return parse
23
+
24
+
25
+ to_1tuple = _ntuple(1)
26
+ to_2tuple = _ntuple(2)
27
+ to_3tuple = _ntuple(3)
28
+ to_4tuple = _ntuple(4)
29
+ to_ntuple = _ntuple
30
+
31
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
32
+ # From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/weight_init.py
33
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
34
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
35
+ def norm_cdf(x):
36
+ # Computes standard normal cumulative distribution function
37
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
38
+
39
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
40
+ warnings.warn(
41
+ 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. '
42
+ 'The distribution of values may be incorrect.',
43
+ stacklevel=2)
44
+
45
+ with torch.no_grad():
46
+ # Values are generated by using a truncated uniform distribution and
47
+ # then using the inverse CDF for the normal distribution.
48
+ # Get upper and lower cdf values
49
+ low = norm_cdf((a - mean) / std)
50
+ up = norm_cdf((b - mean) / std)
51
+
52
+ # Uniformly fill tensor with values from [low, up], then translate to
53
+ # [2l-1, 2u-1].
54
+ tensor.uniform_(2 * low - 1, 2 * up - 1)
55
+
56
+ # Use inverse cdf transform for normal distribution to get truncated
57
+ # standard normal
58
+ tensor.erfinv_()
59
+
60
+ # Transform to proper mean, std
61
+ tensor.mul_(std * math.sqrt(2.))
62
+ tensor.add_(mean)
63
+
64
+ # Clamp to ensure it's in the proper range
65
+ tensor.clamp_(min=a, max=b)
66
+ return tensor
67
+
68
+
69
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
70
+ r"""Fills the input Tensor with values drawn from a truncated
71
+ normal distribution.
72
+
73
+ From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/weight_init.py
74
+
75
+ The values are effectively drawn from the
76
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
77
+ with values outside :math:`[a, b]` redrawn until they are within
78
+ the bounds. The method used for generating the random values works
79
+ best when :math:`a \leq \text{mean} \leq b`.
80
+
81
+ Args:
82
+ tensor: an n-dimensional `torch.Tensor`
83
+ mean: the mean of the normal distribution
84
+ std: the standard deviation of the normal distribution
85
+ a: the minimum cutoff value
86
+ b: the maximum cutoff value
87
+
88
+ Examples:
89
+ >>> w = torch.empty(3, 5)
90
+ >>> nn.init.trunc_normal_(w)
91
+ """
92
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
93
+
94
+ def init_t_xy(end_x: int, end_y: int, zero_center=False):
95
+ t = torch.arange(end_x * end_y, dtype=torch.float32)
96
+ t_x = (t % end_x).float()
97
+ t_y = torch.div(t, end_x, rounding_mode='floor').float()
98
+
99
+ return t_x, t_y
100
+
101
+ def init_random_2d_freqs(head_dim: int, num_heads: int, theta: float = 10.0, rotate: bool = True):
102
+ freqs_x = []
103
+ freqs_y = []
104
+ theta = theta
105
+ mag = 1 / (theta ** (torch.arange(0, head_dim, 4)[: (head_dim // 4)].float() / head_dim))
106
+ for i in range(num_heads):
107
+ angles = torch.rand(1) * 2 * torch.pi if rotate else torch.zeros(1)
108
+ fx = torch.cat([mag * torch.cos(angles), mag * torch.cos(torch.pi/2 + angles)], dim=-1)
109
+ fy = torch.cat([mag * torch.sin(angles), mag * torch.sin(torch.pi/2 + angles)], dim=-1)
110
+ freqs_x.append(fx)
111
+ freqs_y.append(fy)
112
+ freqs_x = torch.stack(freqs_x, dim=0)
113
+ freqs_y = torch.stack(freqs_y, dim=0)
114
+ freqs = torch.stack([freqs_x, freqs_y], dim=0)
115
+ return freqs
116
+
117
+ def compute_cis(freqs, t_x, t_y):
118
+ N = t_x.shape[0]
119
+ # No float 16 for this range
120
+ with torch.cuda.amp.autocast(enabled=False):
121
+ freqs_x = (t_x.unsqueeze(-1) @ freqs[0].unsqueeze(-2))
122
+ freqs_y = (t_y.unsqueeze(-1) @ freqs[1].unsqueeze(-2))
123
+ freqs_cis = torch.polar(torch.ones_like(freqs_x), freqs_x + freqs_y)
124
+
125
+ return freqs_cis
126
+
127
+
128
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
129
+ ndim = x.ndim
130
+ assert 0 <= 1 < ndim
131
+ # assert freqs_cis.shape == (x.shape[-2], x.shape[-1])
132
+ # print(f"freqs_cis shape is {freqs_cis.shape}, x shape is {x.shape}")
133
+ if freqs_cis.shape == (x.shape[-2], x.shape[-1]):
134
+ shape = [d if i >= ndim-2 else 1 for i, d in enumerate(x.shape)]
135
+ elif freqs_cis.shape == (x.shape[-3], x.shape[-2], x.shape[-1]):
136
+ shape = [d if i >= ndim-3 else 1 for i, d in enumerate(x.shape)]
137
+
138
+ return freqs_cis.view(*shape)
139
+
140
+ def apply_rotary_emb(
141
+ xq: torch.Tensor,
142
+ xk: torch.Tensor,
143
+ freqs_cis: torch.Tensor,
144
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
145
+ # print(f"xq shape is {xq.shape}, xq.shape[:-1] is {xq.shape[:-1]}")
146
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
147
+ # print(f"xq_ shape is {xq_.shape}")
148
+ xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
149
+ freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
150
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
151
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
152
+ return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device)
153
+
154
+ def apply_rotary_emb_single(x, freqs_cis):
155
+ x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
156
+ seq_len = x_.shape[2]
157
+ freqs_cis = freqs_cis[:, :seq_len, :]
158
+ freqs_cis = freqs_cis.unsqueeze(0).expand_as(x_)
159
+ x_out = torch.view_as_real(x_ * freqs_cis).flatten(3)
160
+ return x_out.type_as(x).to(x.device)
161
+
162
+ def drop_path(x, drop_prob: float = 0., training: bool = False):
163
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
164
+
165
+ From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
166
+ """
167
+ if drop_prob == 0. or not training:
168
+ return x
169
+ keep_prob = 1 - drop_prob
170
+ shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
171
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
172
+ random_tensor.floor_() # binarize
173
+ output = x.div(keep_prob) * random_tensor
174
+ return output
175
+
176
+
177
+ class DropPath(nn.Module):
178
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
179
+
180
+ From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
181
+ """
182
+
183
+ def __init__(self, drop_prob=None):
184
+ super(DropPath, self).__init__()
185
+ self.drop_prob = drop_prob
186
+
187
+ def forward(self, x):
188
+ return drop_path(x, self.drop_prob, self.training)
189
+
190
+
191
+ class ChannelAttention(nn.Module):
192
+ """Channel attention used in RCAN.
193
+ Args:
194
+ num_feat (int): Channel number of intermediate features.
195
+ squeeze_factor (int): Channel squeeze factor. Default: 16.
196
+ """
197
+
198
+ def __init__(self, num_feat, squeeze_factor=16):
199
+ super(ChannelAttention, self).__init__()
200
+ self.attention = nn.Sequential(
201
+ nn.AdaptiveAvgPool2d(1),
202
+ nn.Conv2d(num_feat, num_feat // squeeze_factor, 1, padding=0),
203
+ nn.ReLU(inplace=True),
204
+ nn.Conv2d(num_feat // squeeze_factor, num_feat, 1, padding=0),
205
+ nn.Sigmoid())
206
+
207
+ def forward(self, x):
208
+ y = self.attention(x)
209
+ return x * y
210
+
211
+
212
+ class CAB(nn.Module):
213
+
214
+ def __init__(self, num_feat, compress_ratio=3, squeeze_factor=30):
215
+ super(CAB, self).__init__()
216
+
217
+ self.cab = nn.Sequential(
218
+ nn.Conv2d(num_feat, num_feat // compress_ratio, 3, 1, 1),
219
+ nn.GELU(),
220
+ nn.Conv2d(num_feat // compress_ratio, num_feat, 3, 1, 1),
221
+ ChannelAttention(num_feat, squeeze_factor)
222
+ )
223
+
224
+ def forward(self, x):
225
+ return self.cab(x)
226
+
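# A minimal shape sketch for CAB (illustrative, not from the original file): the 3x3 convs squeeze
# the channels by compress_ratio and the channel attention rescales them per channel, so the block
# is shape-preserving.
# >>> cab = CAB(num_feat=96, compress_ratio=3, squeeze_factor=30)
# >>> cab(torch.randn(1, 96, 16, 16)).shape
# torch.Size([1, 96, 16, 16])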
227
+
228
+ class Mlp(nn.Module):
229
+
230
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
231
+ super().__init__()
232
+ out_features = out_features or in_features
233
+ hidden_features = hidden_features or in_features
234
+ self.fc1 = nn.Linear(in_features, hidden_features)
235
+ self.act = act_layer()
236
+ self.fc2 = nn.Linear(hidden_features, out_features)
237
+ self.drop = nn.Dropout(drop)
238
+
239
+ def forward(self, x):
240
+ x = self.fc1(x)
241
+ x = self.act(x)
242
+ x = self.drop(x)
243
+ x = self.fc2(x)
244
+ x = self.drop(x)
245
+ return x
246
+
247
+
248
+ def window_partition(x, window_size):
249
+ """
250
+ Args:
251
+ x: (b, h, w, c)
252
+ window_size (int): window size
253
+
254
+ Returns:
255
+ windows: (num_windows*b, window_size, window_size, c)
256
+ """
257
+ b, h, w, c = x.shape
258
+ x = x.view(b, h // window_size, window_size, w // window_size, window_size, c)
259
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, c)
260
+ return windows
261
+
262
+
263
+ def window_reverse(windows, window_size, h, w):
264
+ """
265
+ Args:
266
+ windows: (num_windows*b, window_size, window_size, c)
267
+ window_size (int): Window size
268
+ h (int): Height of image
269
+ w (int): Width of image
270
+
271
+ Returns:
272
+ x: (b, h, w, c)
273
+ """
274
+ b = int(windows.shape[0] / (h * w / window_size / window_size))
275
+ x = windows.view(b, h // window_size, w // window_size, window_size, window_size, -1)
276
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)
277
+ return x
278
+
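# A minimal round-trip sketch (illustrative, not from the original file): window_reverse inverts
# window_partition when h and w are multiples of window_size, which the attention blocks below rely on.
# >>> x = torch.randn(2, 32, 32, 96)                     # (b, h, w, c)
# >>> wins = window_partition(x, 16)                     # (2*2*2, 16, 16, 96)
# >>> torch.equal(window_reverse(wins, 16, 32, 32), x)
# True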
279
+
280
+ class WindowAttention(nn.Module):
281
+ r""" Window based multi-head self attention (W-MSA) module with relative position bias.
282
+ It supports both shifted and non-shifted windows.
283
+
284
+ Args:
285
+ dim (int): Number of input channels.
286
+ window_size (tuple[int]): The height and width of the window.
287
+ num_heads (int): Number of attention heads.
288
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
289
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
290
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
291
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
292
+ """
293
+
294
+ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0., rope_mixed = True, rope_theta=10.0):
295
+
296
+ super().__init__()
297
+ self.dim = dim
298
+ self.window_size = window_size # Wh, Ww
299
+ self.num_heads = num_heads
300
+ head_dim = dim // num_heads
301
+
302
+ self.rope_mixed = rope_mixed
303
+ t_x, t_y = init_t_xy(end_x=self.window_size[1], end_y=self.window_size[0])
304
+ self.register_buffer('rope_t_x', t_x)
305
+ self.register_buffer('rope_t_y', t_y)
306
+
307
+ freqs = init_random_2d_freqs(
308
+ head_dim=self.dim // self.num_heads, num_heads=self.num_heads, theta=rope_theta,
309
+ rotate=self.rope_mixed
310
+ )
311
+ if self.rope_mixed:
312
+ self.rope_freqs = nn.Parameter(freqs, requires_grad=True)
313
+ else:
314
+ self.register_buffer('rope_freqs', freqs)
315
+ freqs_cis = compute_cis(self.rope_freqs, self.rope_t_x, self.rope_t_y)
316
+ self.rope_freqs_cis = freqs_cis
317
+
318
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
319
+ self.attn_drop = nn.Dropout(attn_drop)
320
+ self.proj = nn.Linear(dim, dim)
321
+
322
+ self.proj_drop = nn.Dropout(proj_drop)
323
+
324
+
325
+ def forward(self, x, rpi, mask=None):
326
+ """
327
+ Args:
328
+ x: input features with shape of (num_windows*b, n, c)
329
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
330
+ """
331
+ b_, n, c = x.shape
332
+ qkv = self.qkv(x).reshape(b_, n, 3, self.num_heads, c // self.num_heads).permute(2, 0, 3, 1, 4).contiguous()
333
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
334
+
335
+ ###### Apply rotary position embedding
336
+ if self.rope_mixed:
337
+ freqs_cis = compute_cis(self.rope_freqs, self.rope_t_x, self.rope_t_y)
338
+ else:
339
+ freqs_cis = self.rope_freqs_cis.to(x.device)
340
+ q, k = apply_rotary_emb(q, k, freqs_cis)
341
+ #########
342
+
343
+ attn = F.scaled_dot_product_attention(q, k, v)
344
+
345
+ attn = attn.transpose(1, 2).reshape(b_, n, c)
346
+
347
+ x = self.proj(attn)
348
+ x = self.proj_drop(x)
349
+ return x
350
+
351
+
352
+ class HAB(nn.Module):
353
+ r""" Hybrid Attention Block.
354
+
355
+ Args:
356
+ dim (int): Number of input channels.
357
+ input_resolution (tuple[int]): Input resolution.
358
+ num_heads (int): Number of attention heads.
359
+ window_size (int): Window size.
360
+ shift_size (int): Shift size for SW-MSA.
361
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
362
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
363
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
364
+ drop (float, optional): Dropout rate. Default: 0.0
365
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
366
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
367
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
368
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
369
+ """
370
+
371
+ def __init__(self,
372
+ dim,
373
+ input_resolution,
374
+ num_heads,
375
+ window_size=7,
376
+ shift_size=0,
377
+ compress_ratio=3,
378
+ squeeze_factor=30,
379
+ conv_scale=0.01,
380
+ mlp_ratio=4.,
381
+ qkv_bias=True,
382
+ qk_scale=None,
383
+ drop=0.,
384
+ attn_drop=0.,
385
+ drop_path=0.,
386
+ act_layer=nn.GELU,
387
+ norm_layer=nn.LayerNorm,
388
+ rope_mixed = True, rope_theta=10.0):
389
+ super().__init__()
390
+ self.dim = dim
391
+ self.input_resolution = input_resolution
392
+ self.num_heads = num_heads
393
+ self.window_size = window_size
394
+ self.shift_size = shift_size
395
+ self.mlp_ratio = mlp_ratio
396
+ if min(self.input_resolution) <= self.window_size:
397
+ # if window size is larger than input resolution, we don't partition windows
398
+ self.shift_size = 0
399
+ self.window_size = min(self.input_resolution)
400
+ assert 0 <= self.shift_size < self.window_size, 'shift_size must be in [0, window_size)'
401
+
402
+ self.norm1 = norm_layer(dim)
403
+ self.attn = WindowAttention(
404
+ dim,
405
+ window_size=to_2tuple(self.window_size),
406
+ num_heads=num_heads,
407
+ qkv_bias=qkv_bias,
408
+ qk_scale=qk_scale,
409
+ attn_drop=attn_drop,
410
+ proj_drop=drop,
411
+ rope_mixed = rope_mixed, rope_theta=rope_theta)
412
+
413
+ self.conv_scale = conv_scale
414
+ self.conv_block = CAB(num_feat=dim, compress_ratio=compress_ratio, squeeze_factor=squeeze_factor)
415
+
416
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
417
+ self.norm2 = norm_layer(dim)
418
+ mlp_hidden_dim = int(dim * mlp_ratio)
419
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
420
+
421
+ def forward(self, x, x_size, rpi_sa, attn_mask):
422
+ h, w = x_size
423
+ b, _, c = x.shape
424
+ # assert seq_len == h * w, "input feature has wrong size"
425
+
426
+ shortcut = x
427
+ x = self.norm1(x)
428
+ x = x.view(b, h, w, c)
429
+
430
+ # Conv_X
431
+ conv_x = self.conv_block(x.permute(0, 3, 1, 2).contiguous())
432
+ conv_x = conv_x.permute(0, 2, 3, 1).contiguous().view(b, h * w, c)
433
+
434
+ # cyclic shift
435
+ if self.shift_size > 0:
436
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
437
+ attn_mask = attn_mask
438
+ else:
439
+ shifted_x = x
440
+ attn_mask = None
441
+
442
+ # partition windows
443
+ x_windows = window_partition(shifted_x, self.window_size) # nw*b, window_size, window_size, c
444
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, c) # nw*b, window_size*window_size, c
445
+
446
+ # W-MSA/SW-MSA (compatible with testing on images whose sizes are multiples of the window size)
447
+ attn_windows = self.attn(x_windows, rpi=rpi_sa, mask=attn_mask)
448
+
449
+ # merge windows
450
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, c)
451
+ shifted_x = window_reverse(attn_windows, self.window_size, h, w) # b h' w' c
452
+
453
+ # reverse cyclic shift
454
+ if self.shift_size > 0:
455
+ attn_x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
456
+ else:
457
+ attn_x = shifted_x
458
+ attn_x = attn_x.view(b, h * w, c)
459
+
460
+ # FFN
461
+ x = shortcut + self.drop_path(attn_x) + conv_x * self.conv_scale
462
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
463
+
464
+ return x
465
+
466
+
467
+ class PatchMerging(nn.Module):
468
+ r""" Patch Merging Layer.
469
+
470
+ Args:
471
+ input_resolution (tuple[int]): Resolution of input feature.
472
+ dim (int): Number of input channels.
473
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
474
+ """
475
+
476
+ def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
477
+ super().__init__()
478
+ self.input_resolution = input_resolution
479
+ self.dim = dim
480
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
481
+ self.norm = norm_layer(4 * dim)
482
+
483
+ def forward(self, x):
484
+ """
485
+ x: b, h*w, c
486
+ """
487
+ h, w = self.input_resolution
488
+ b, seq_len, c = x.shape
489
+ assert seq_len == h * w, 'input feature has wrong size'
490
+ assert h % 2 == 0 and w % 2 == 0, f'x size ({h}*{w}) is not even.'
491
+
492
+ x = x.view(b, h, w, c)
493
+
494
+ x0 = x[:, 0::2, 0::2, :] # b h/2 w/2 c
495
+ x1 = x[:, 1::2, 0::2, :] # b h/2 w/2 c
496
+ x2 = x[:, 0::2, 1::2, :] # b h/2 w/2 c
497
+ x3 = x[:, 1::2, 1::2, :] # b h/2 w/2 c
498
+ x = torch.cat([x0, x1, x2, x3], -1) # b h/2 w/2 4*c
499
+ x = x.view(b, -1, 4 * c) # b h/2*w/2 4*c
500
+
501
+ x = self.norm(x)
502
+ x = self.reduction(x)
503
+
504
+ return x
505
+
506
+
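# A minimal shape sketch for PatchMerging (illustrative, not from the original file; the class is
# kept here but HATNOUP_ROPE_AMP below passes downsample=None, so it is unused in this pipeline):
# it halves the spatial resolution and doubles the channel dimension.
# >>> pm = PatchMerging(input_resolution=(32, 32), dim=96)
# >>> pm(torch.randn(2, 32 * 32, 96)).shape
# torch.Size([2, 256, 192])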
507
+ class OCAB(nn.Module):
508
+ # overlapping cross-attention block
509
+
510
+ def __init__(self, dim,
511
+ input_resolution,
512
+ window_size,
513
+ overlap_ratio,
514
+ num_heads,
515
+ qkv_bias=True,
516
+ qk_scale=None,
517
+ mlp_ratio=2,
518
+ norm_layer=nn.LayerNorm,
519
+ rope_mixed = True, rope_theta = 10.0
520
+ ):
521
+
522
+ super().__init__()
523
+ self.dim = dim
524
+ self.input_resolution = input_resolution
525
+ self.window_size = window_size
526
+ self.num_heads = num_heads
527
+ head_dim = dim // num_heads
528
+ self.rope_mixed = rope_mixed
529
+
530
+ self.overlap_win_size = int(window_size * overlap_ratio) + window_size
531
+
532
+ self.norm1 = norm_layer(dim)
533
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
534
+ self.unfold = nn.Unfold(kernel_size=(self.overlap_win_size, self.overlap_win_size), stride=window_size, padding=(self.overlap_win_size-window_size)//2)
535
+
536
+ t_x, t_y = init_t_xy(end_x=max(self.window_size, self.overlap_win_size), end_y=max(self.window_size, self.overlap_win_size))
537
+ self.register_buffer('rope_t_x', t_x)
538
+ self.register_buffer('rope_t_y', t_y)
539
+
540
+ freqs = init_random_2d_freqs(
541
+ head_dim=self.dim // self.num_heads, num_heads=self.num_heads, theta=rope_theta,
542
+ rotate=self.rope_mixed
543
+ )
544
+ if self.rope_mixed:
545
+ self.rope_freqs = nn.Parameter(freqs, requires_grad=True)
546
+ else:
547
+ self.register_buffer('rope_freqs', freqs)
548
+ freqs_cis = compute_cis(self.rope_freqs, self.rope_t_x, self.rope_t_y)
549
+ self.rope_freqs_cis = freqs_cis
550
+
551
+
552
+ self.proj = nn.Linear(dim,dim)
553
+
554
+ self.norm2 = norm_layer(dim)
555
+ mlp_hidden_dim = int(dim * mlp_ratio)
556
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=nn.GELU)
557
+
558
+ def forward(self, x, x_size, rpi):
559
+ h, w = x_size
560
+ b, _, c = x.shape
561
+
562
+ shortcut = x
563
+ x = self.norm1(x)
564
+ x = x.view(b, h, w, c)
565
+
566
+ qkv = self.qkv(x).reshape(b, h, w, 3, c).permute(3, 0, 4, 1, 2).contiguous() # 3, b, c, h, w
567
+ q = qkv[0].permute(0, 2, 3, 1).contiguous() # b, h, w, c
568
+ kv = torch.cat((qkv[1], qkv[2]), dim=1) # b, 2*c, h, w
569
+
570
+ # partition windows
571
+ q_windows = window_partition(q, self.window_size) # nw*b, window_size, window_size, c
572
+ q_windows = q_windows.view(-1, self.window_size * self.window_size, c) # nw*b, window_size*window_size, c
573
+
574
+ kv_windows = self.unfold(kv) # b, c*w*w, nw
575
+ kv_windows = rearrange(kv_windows, 'b (nc ch owh oww) nw -> nc (b nw) (owh oww) ch', nc=2, ch=c, owh=self.overlap_win_size, oww=self.overlap_win_size).contiguous() # 2, nw*b, ow*ow, c
576
+ k_windows, v_windows = kv_windows[0], kv_windows[1] # nw*b, ow*ow, c
577
+
578
+ b_, nq, _ = q_windows.shape
579
+ _, n, _ = k_windows.shape
580
+ # print(f"nq is {nq}, n is {n}")
581
+ d = self.dim // self.num_heads
582
+ q = q_windows.reshape(b_, nq, self.num_heads, d).permute(0, 2, 1, 3).contiguous() # nw*b, nH, nq, d
583
+ k = k_windows.reshape(b_, n, self.num_heads, d).permute(0, 2, 1, 3).contiguous() # nw*b, nH, n, d
584
+ v = v_windows.reshape(b_, n, self.num_heads, d).permute(0, 2, 1, 3).contiguous() # nw*b, nH, n, d
585
+
586
+ ###### Apply rotary position embedding
587
+ if self.rope_mixed:
588
+ freqs_cis = compute_cis(self.rope_freqs, self.rope_t_x, self.rope_t_y)
589
+ else:
590
+ freqs_cis = self.rope_freqs_cis.to(x.device)
591
+ q = apply_rotary_emb_single(q, freqs_cis)
592
+ k = apply_rotary_emb_single(k, freqs_cis)
593
+ #########
594
+
595
+ attn = F.scaled_dot_product_attention(q, k, v)
596
+ attn_windows = attn.transpose(1, 2).reshape(b_, nq, self.dim)
597
+
598
+ # merge windows
599
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, self.dim)
600
+ x = window_reverse(attn_windows, self.window_size, h, w) # b h w c
601
+ x = x.view(b, h * w, self.dim)
602
+
603
+ x = self.proj(x) + shortcut
604
+
605
+ x = x + self.mlp(self.norm2(x))
606
+ return x
607
+
608
+
609
+ class AttenBlocks(nn.Module):
610
+ """ A series of attention blocks for one RHAG.
611
+
612
+ Args:
613
+ dim (int): Number of input channels.
614
+ input_resolution (tuple[int]): Input resolution.
615
+ depth (int): Number of blocks.
616
+ num_heads (int): Number of attention heads.
617
+ window_size (int): Local window size.
618
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
619
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
620
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
621
+ drop (float, optional): Dropout rate. Default: 0.0
622
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
623
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
624
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
625
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
626
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
627
+ """
628
+
629
+ def __init__(self,
630
+ dim,
631
+ input_resolution,
632
+ depth,
633
+ num_heads,
634
+ window_size,
635
+ compress_ratio,
636
+ squeeze_factor,
637
+ conv_scale,
638
+ overlap_ratio,
639
+ mlp_ratio=4.,
640
+ qkv_bias=True,
641
+ qk_scale=None,
642
+ drop=0.,
643
+ attn_drop=0.,
644
+ drop_path=0.,
645
+ norm_layer=nn.LayerNorm,
646
+ downsample=None,
647
+ use_checkpoint=False,
648
+ rope_mixed = True, rope_theta=10.0):
649
+
650
+ super().__init__()
651
+ self.dim = dim
652
+ self.input_resolution = input_resolution
653
+ self.depth = depth
654
+ self.use_checkpoint = use_checkpoint
655
+
656
+ # build blocks
657
+ self.blocks = nn.ModuleList([
658
+ HAB(
659
+ dim=dim,
660
+ input_resolution=input_resolution,
661
+ num_heads=num_heads,
662
+ window_size=window_size,
663
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
664
+ compress_ratio=compress_ratio,
665
+ squeeze_factor=squeeze_factor,
666
+ conv_scale=conv_scale,
667
+ mlp_ratio=mlp_ratio,
668
+ qkv_bias=qkv_bias,
669
+ qk_scale=qk_scale,
670
+ drop=drop,
671
+ attn_drop=attn_drop,
672
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
673
+ norm_layer=norm_layer,
674
+ rope_mixed = rope_mixed, rope_theta=rope_theta) for i in range(depth)
675
+ ])
676
+
677
+ # OCAB
678
+ self.overlap_attn = OCAB(
679
+ dim=dim,
680
+ input_resolution=input_resolution,
681
+ window_size=window_size,
682
+ overlap_ratio=overlap_ratio,
683
+ num_heads=num_heads,
684
+ qkv_bias=qkv_bias,
685
+ qk_scale=qk_scale,
686
+ mlp_ratio=mlp_ratio,
687
+ norm_layer=norm_layer,
688
+ rope_mixed = rope_mixed, rope_theta = rope_theta)
689
+
690
+
691
+ # patch merging layer
692
+ if downsample is not None:
693
+ self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
694
+ else:
695
+ self.downsample = None
696
+
697
+ def forward(self, x, x_size, params):
698
+ for blk in self.blocks:
699
+ x = blk(x, x_size, params['rpi_sa'], params['attn_mask'])
700
+
701
+
702
+ x = self.overlap_attn(x, x_size, params['rpi_oca'])
703
+
704
+
705
+ if self.downsample is not None:
706
+ x = self.downsample(x)
707
+ return x
708
+
709
+
710
+ class RHAG(nn.Module):
711
+ """Residual Hybrid Attention Group (RHAG).
712
+
713
+ Args:
714
+ dim (int): Number of input channels.
715
+ input_resolution (tuple[int]): Input resolution.
716
+ depth (int): Number of blocks.
717
+ num_heads (int): Number of attention heads.
718
+ window_size (int): Local window size.
719
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
720
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
721
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
722
+ drop (float, optional): Dropout rate. Default: 0.0
723
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
724
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
725
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
726
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
727
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
728
+ img_size: Input image size.
729
+ patch_size: Patch size.
730
+ resi_connection: The convolutional block before residual connection.
731
+ """
732
+
733
+ def __init__(self,
734
+ dim,
735
+ input_resolution,
736
+ depth,
737
+ num_heads,
738
+ window_size,
739
+ compress_ratio,
740
+ squeeze_factor,
741
+ conv_scale,
742
+ overlap_ratio,
743
+ mlp_ratio=4.,
744
+ qkv_bias=True,
745
+ qk_scale=None,
746
+ drop=0.,
747
+ attn_drop=0.,
748
+ drop_path=0.,
749
+ norm_layer=nn.LayerNorm,
750
+ downsample=None,
751
+ use_checkpoint=False,
752
+ img_size=224,
753
+ patch_size=4,
754
+ resi_connection='1conv',
755
+ rope_mixed = True, rope_theta=10.0):
756
+ super(RHAG, self).__init__()
757
+
758
+ self.dim = dim
759
+ self.input_resolution = input_resolution
760
+
761
+ self.residual_group = AttenBlocks(
762
+ dim=dim,
763
+ input_resolution=input_resolution,
764
+ depth=depth,
765
+ num_heads=num_heads,
766
+ window_size=window_size,
767
+ compress_ratio=compress_ratio,
768
+ squeeze_factor=squeeze_factor,
769
+ conv_scale=conv_scale,
770
+ overlap_ratio=overlap_ratio,
771
+ mlp_ratio=mlp_ratio,
772
+ qkv_bias=qkv_bias,
773
+ qk_scale=qk_scale,
774
+ drop=drop,
775
+ attn_drop=attn_drop,
776
+ drop_path=drop_path,
777
+ norm_layer=norm_layer,
778
+ downsample=downsample,
779
+ use_checkpoint=use_checkpoint,
780
+ rope_mixed = rope_mixed, rope_theta=rope_theta)
781
+
782
+ if resi_connection == '1conv':
783
+ self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
784
+ elif resi_connection == 'identity':
785
+ self.conv = nn.Identity()
786
+
787
+ self.patch_embed = PatchEmbed(
788
+ img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)
789
+
790
+ self.patch_unembed = PatchUnEmbed(
791
+ img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)
792
+
793
+ def forward(self, x, x_size, params):
794
+ return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size, params), x_size))) + x
795
+
796
+
797
+ class PatchEmbed(nn.Module):
798
+ r""" Image to Patch Embedding
799
+
800
+ Args:
801
+ img_size (int): Image size. Default: 224.
802
+ patch_size (int): Patch token size. Default: 4.
803
+ in_chans (int): Number of input image channels. Default: 3.
804
+ embed_dim (int): Number of linear projection output channels. Default: 96.
805
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
806
+ """
807
+
808
+ def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
809
+ super().__init__()
810
+ img_size = to_2tuple(img_size)
811
+ patch_size = to_2tuple(patch_size)
812
+ patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
813
+ self.img_size = img_size
814
+ self.patch_size = patch_size
815
+ self.patches_resolution = patches_resolution
816
+ self.num_patches = patches_resolution[0] * patches_resolution[1]
817
+
818
+ self.in_chans = in_chans
819
+ self.embed_dim = embed_dim
820
+
821
+ if norm_layer is not None:
822
+ self.norm = norm_layer(embed_dim)
823
+ else:
824
+ self.norm = None
825
+
826
+ def forward(self, x):
827
+ x = x.flatten(2).transpose(1, 2) # b Ph*Pw c
828
+ if self.norm is not None:
829
+ x = self.norm(x)
830
+ return x
831
+
832
+
833
+ class PatchUnEmbed(nn.Module):
834
+ r""" Image to Patch Unembedding
835
+
836
+ Args:
837
+ img_size (int): Image size. Default: 224.
838
+ patch_size (int): Patch token size. Default: 4.
839
+ in_chans (int): Number of input image channels. Default: 3.
840
+ embed_dim (int): Number of linear projection output channels. Default: 96.
841
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
842
+ """
843
+
844
+ def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
845
+ super().__init__()
846
+ img_size = to_2tuple(img_size)
847
+ patch_size = to_2tuple(patch_size)
848
+ patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
849
+ self.img_size = img_size
850
+ self.patch_size = patch_size
851
+ self.patches_resolution = patches_resolution
852
+ self.num_patches = patches_resolution[0] * patches_resolution[1]
853
+
854
+ self.in_chans = in_chans
855
+ self.embed_dim = embed_dim
856
+
857
+ def forward(self, x, x_size):
858
+ x = x.transpose(1, 2).contiguous().view(x.shape[0], self.embed_dim, x_size[0], x_size[1]) # b Ph*Pw c
859
+ return x
860
+
861
+
862
+ class Upsample(nn.Sequential):
863
+ """Upsample module.
864
+
865
+ Args:
866
+ scale (int): Scale factor. Supported scales: 2^n and 3.
867
+ num_feat (int): Channel number of intermediate features.
868
+ """
869
+
870
+ def __init__(self, scale, num_feat):
871
+ m = []
872
+ if (scale & (scale - 1)) == 0: # scale = 2^n
873
+ for _ in range(int(math.log(scale, 2))):
874
+ m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
875
+ m.append(nn.PixelShuffle(2))
876
+ elif scale == 3:
877
+ m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
878
+ m.append(nn.PixelShuffle(3))
879
+ else:
880
+ raise ValueError(f'scale {scale} is not supported. ' 'Supported scales: 2^n and 3.')
881
+ super(Upsample, self).__init__(*m)
882
+
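# A minimal shape sketch for Upsample (illustrative, not from the original file; the pixelshuffle
# tail of HATNOUP_ROPE_AMP below is commented out, so this module is kept for reference): a
# power-of-two scale doubles the spatial size repeatedly while keeping num_feat channels.
# >>> up = Upsample(scale=4, num_feat=64)
# >>> up(torch.randn(1, 64, 48, 48)).shape
# torch.Size([1, 64, 192, 192])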
883
+
884
+
885
+
886
+ class HATNOUP_ROPE_AMP(nn.Module):
887
+ def __init__(self,
888
+ img_size=64,
889
+ patch_size=1,
890
+ in_chans=3,
891
+ embed_dim=192,
892
+ depths=(6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6),
893
+ num_heads=(6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6),
894
+ window_size=16,
895
+ compress_ratio=3,
896
+ squeeze_factor=32,
897
+ conv_scale=0.01,
898
+ overlap_ratio=0.5,
899
+ mlp_ratio=2,
900
+ qkv_bias=True,
901
+ qk_scale=None,
902
+ drop_rate=0.,
903
+ attn_drop_rate=0.,
904
+ drop_path_rate=0.1,
905
+ norm_layer=nn.LayerNorm,
906
+ ape=False,
907
+ patch_norm=True,
908
+ use_checkpoint=False,
909
+ upscale=4,
910
+ img_range=1.,
911
+ upsampler='pixelshuffle',
912
+ resi_connection='1conv',
913
+ rope_mixed = True,
914
+ rope_theta=10.0,
915
+ **kwargs):
916
+ super(HATNOUP_ROPE_AMP, self).__init__()
917
+
918
+ self.window_size = window_size
919
+ self.shift_size = window_size // 2
920
+ self.overlap_ratio = overlap_ratio
921
+
922
+ num_in_ch = in_chans
923
+ num_out_ch = in_chans
924
+ num_feat = 64
925
+ self.img_range = img_range
926
+ if in_chans == 3:
927
+ rgb_mean = (0.4488, 0.4371, 0.4040)
928
+ self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
929
+ else:
930
+ self.mean = torch.zeros(1, 1, 1, 1)
931
+ self.upscale = upscale
932
+ self.upsampler = upsampler
933
+
934
+ # relative position index
935
+ relative_position_index_SA = self.calculate_rpi_sa()
936
+ relative_position_index_OCA = self.calculate_rpi_oca()
937
+ self.register_buffer('relative_position_index_SA', relative_position_index_SA)
938
+ self.register_buffer('relative_position_index_OCA', relative_position_index_OCA)
939
+
940
+ # ------------------------- 1, shallow feature extraction ------------------------- #
941
+ self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
942
+
943
+ # ------------------------- 2, deep feature extraction ------------------------- #
944
+ self.num_layers = len(depths)
945
+ self.embed_dim = embed_dim
946
+ self.ape = ape
947
+ self.patch_norm = patch_norm
948
+ self.num_features = embed_dim
949
+ self.mlp_ratio = mlp_ratio
950
+
951
+ # split image into non-overlapping patches
952
+ self.patch_embed = PatchEmbed(
953
+ img_size=img_size,
954
+ patch_size=patch_size,
955
+ in_chans=embed_dim,
956
+ embed_dim=embed_dim,
957
+ norm_layer=norm_layer if self.patch_norm else None)
958
+ num_patches = self.patch_embed.num_patches
959
+ patches_resolution = self.patch_embed.patches_resolution
960
+ self.patches_resolution = patches_resolution
961
+
962
+ # merge non-overlapping patches into image
963
+ self.patch_unembed = PatchUnEmbed(
964
+ img_size=img_size,
965
+ patch_size=patch_size,
966
+ in_chans=embed_dim,
967
+ embed_dim=embed_dim,
968
+ norm_layer=norm_layer if self.patch_norm else None)
969
+
970
+ # absolute position embedding
971
+ if self.ape:
972
+ self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
973
+ trunc_normal_(self.absolute_pos_embed, std=.02)
974
+
975
+ self.pos_drop = nn.Dropout(p=drop_rate)
976
+
977
+ # stochastic depth
978
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
979
+
980
+ # build Residual Hybrid Attention Groups (RHAG)
981
+ self.layers = nn.ModuleList()
982
+ for i_layer in range(self.num_layers):
983
+ layer = RHAG(
984
+ dim=embed_dim,
985
+ input_resolution=(patches_resolution[0], patches_resolution[1]),
986
+ depth=depths[i_layer],
987
+ num_heads=num_heads[i_layer],
988
+ window_size=window_size,
989
+ compress_ratio=compress_ratio,
990
+ squeeze_factor=squeeze_factor,
991
+ conv_scale=conv_scale,
992
+ overlap_ratio=overlap_ratio,
993
+ mlp_ratio=self.mlp_ratio,
994
+ qkv_bias=qkv_bias,
995
+ qk_scale=qk_scale,
996
+ drop=drop_rate,
997
+ attn_drop=attn_drop_rate,
998
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], # no impact on SR results
999
+ norm_layer=norm_layer,
1000
+ downsample=None,
1001
+ use_checkpoint=use_checkpoint,
1002
+ img_size=img_size,
1003
+ patch_size=patch_size,
1004
+ resi_connection=resi_connection,
1005
+ rope_mixed = rope_mixed, rope_theta=rope_theta)
1006
+ self.layers.append(layer)
1007
+ self.norm = norm_layer(self.num_features)
1008
+
1009
+ self.use_checkpoint = use_checkpoint
1010
+
1011
+ # build the last conv layer in deep feature extraction
1012
+ if resi_connection == '1conv':
1013
+ self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
1014
+ elif resi_connection == 'identity':
1015
+ self.conv_after_body = nn.Identity()
1016
+
1017
+ # ------------------------- 3, high quality image reconstruction ------------------------- #
1018
+ if self.upsampler == 'pixelshuffle':
1019
+ # for classical SR
1020
+ self.conv_before_upsample = nn.Sequential(
1021
+ nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True))
1022
+ # self.upsample = Upsample(upscale, num_feat)
1023
+ # self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
1024
+
1025
+ self.apply(self._init_weights)
1026
+
1027
+ def _init_weights(self, m):
1028
+ if isinstance(m, nn.Linear):
1029
+ trunc_normal_(m.weight, std=.02)
1030
+ if isinstance(m, nn.Linear) and m.bias is not None:
1031
+ nn.init.constant_(m.bias, 0)
1032
+ elif isinstance(m, nn.LayerNorm):
1033
+ nn.init.constant_(m.bias, 0)
1034
+ nn.init.constant_(m.weight, 1.0)
1035
+
1036
+ def calculate_rpi_sa(self):
1037
+ # calculate relative position index for SA
1038
+ coords_h = torch.arange(self.window_size)
1039
+ coords_w = torch.arange(self.window_size)
1040
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
1041
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
1042
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
1043
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
1044
+ relative_coords[:, :, 0] += self.window_size - 1 # shift to start from 0
1045
+ relative_coords[:, :, 1] += self.window_size - 1
1046
+ relative_coords[:, :, 0] *= 2 * self.window_size - 1
1047
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
1048
+ return relative_position_index
1049
+
1050
+ def calculate_rpi_oca(self):
1051
+ # calculate relative position index for OCA
1052
+ window_size_ori = self.window_size
1053
+ window_size_ext = self.window_size + int(self.overlap_ratio * self.window_size)
1054
+
1055
+ coords_h = torch.arange(window_size_ori)
1056
+ coords_w = torch.arange(window_size_ori)
1057
+ coords_ori = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, ws, ws
1058
+ coords_ori_flatten = torch.flatten(coords_ori, 1) # 2, ws*ws
1059
+
1060
+ coords_h = torch.arange(window_size_ext)
1061
+ coords_w = torch.arange(window_size_ext)
1062
+ coords_ext = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, wse, wse
1063
+ coords_ext_flatten = torch.flatten(coords_ext, 1) # 2, wse*wse
1064
+
1065
+ relative_coords = coords_ext_flatten[:, None, :] - coords_ori_flatten[:, :, None] # 2, ws*ws, wse*wse
1066
+
1067
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # ws*ws, wse*wse, 2
1068
+ relative_coords[:, :, 0] += window_size_ori - window_size_ext + 1 # shift to start from 0
1069
+ relative_coords[:, :, 1] += window_size_ori - window_size_ext + 1
1070
+
1071
+ relative_coords[:, :, 0] *= window_size_ori + window_size_ext - 1
1072
+ relative_position_index = relative_coords.sum(-1)
1073
+ return relative_position_index
1074
+
1075
+ def calculate_mask(self, x_size):
1076
+ # calculate attention mask for SW-MSA
1077
+ h, w = x_size
1078
+ img_mask = torch.zeros((1, h, w, 1)) # 1 h w 1
1079
+ h_slices = (slice(0, -self.window_size), slice(-self.window_size,
1080
+ -self.shift_size), slice(-self.shift_size, None))
1081
+ w_slices = (slice(0, -self.window_size), slice(-self.window_size,
1082
+ -self.shift_size), slice(-self.shift_size, None))
1083
+ cnt = 0
1084
+ for h in h_slices:
1085
+ for w in w_slices:
1086
+ img_mask[:, h, w, :] = cnt
1087
+ cnt += 1
1088
+
1089
+ mask_windows = window_partition(img_mask, self.window_size) # nw, window_size, window_size, 1
1090
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
1091
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
1092
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
1093
+
1094
+ return attn_mask
1095
+
1096
+ @torch.jit.ignore
1097
+ def no_weight_decay(self):
1098
+ return {'absolute_pos_embed'}
1099
+
1100
+ @torch.jit.ignore
1101
+ def no_weight_decay_keywords(self):
1102
+ return {'relative_position_bias_table'}
1103
+
1104
+ def forward_features(self, x):
1105
+ x_size = (x.shape[2], x.shape[3])
1106
+
1107
+ # Calculate attention mask and relative position index in advance to speed up inference.
1108
+ # The original code is very time-consuming for large window size.
1109
+ attn_mask = self.calculate_mask(x_size).to(x.device)
1110
+ params = {'attn_mask': attn_mask, 'rpi_sa': self.relative_position_index_SA, 'rpi_oca': self.relative_position_index_OCA}
1111
+
1112
+ x = self.patch_embed(x)
1113
+ if self.ape:
1114
+ x = x + self.absolute_pos_embed
1115
+ x = self.pos_drop(x)
1116
+
1117
+ for layer in self.layers:
1118
+ x = layer(x, x_size, params)
1119
+
1120
+ x = self.norm(x) # b seq_len c
1121
+ x = self.patch_unembed(x, x_size)
1122
+
1123
+ return x
1124
+
1125
+ def forward(self, x):
1126
+ # self.mean = self.mean.type_as(x)
1127
+ # x = (x - self.mean) * self.img_range
1128
+
1129
+ if self.upsampler == 'pixelshuffle':
1130
+ # for classical SR
1131
+ x = self.conv_first(x)
1132
+ if self.use_checkpoint:
1133
+ x = self.conv_after_body(checkpoint(self.forward_features, x)) + x
1134
+ else:
1135
+ x = self.conv_after_body(self.forward_features(x)) + x
1136
+ x = self.conv_before_upsample(x)
1137
+ # x = self.conv_last(self.upsample(x))
1138
+
1139
+ # x = x / self.img_range + self.mean
1140
+
1141
+ return x
1142
+
1143
+
1144
+ if __name__ == '__main__':
1145
+ srcs = torch.randn(8, 3, 64, 64).cuda()
1146
+ encoder = HATNOUP_ROPE_AMP(upscale=4, in_chans=3, img_size=64, window_size=16, compress_ratio=3, squeeze_factor=32, conv_scale=0.01, overlap_ratio=0.5,
1147
+ img_range=1.,
1148
+ depths=(6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6),
1149
+ embed_dim=192,
1150
+ num_heads=(6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6),
1151
+ mlp_ratio=2,
1152
+ upsampler='pixelshuffle',
1153
+ resi_connection='1conv',
1154
+ use_checkpoint=False).cuda()
1155
+ feature = encoder(srcs)
1156
+ pass
utils/rdn.py ADDED
@@ -0,0 +1,120 @@
1
+ import collections.abc
2
+ import math
3
+ import torch
4
+ import torchvision
5
+ import warnings
6
+ from distutils.version import LooseVersion
7
+ from itertools import repeat
8
+ from torch import nn as nn
9
+ from torch.nn import functional as F
10
+ from torch.nn import init as init
11
+ from torch.nn.modules.batchnorm import _BatchNorm
12
+
13
+ class RDB_Conv(nn.Module):
14
+ def __init__(self, inChannels, growRate, kSize=3):
15
+ super(RDB_Conv, self).__init__()
16
+ Cin = inChannels
17
+ G = growRate
18
+ self.conv = nn.Sequential(*[
19
+ nn.Conv2d(Cin, G, kSize, padding=(kSize-1)//2, stride=1),
20
+ nn.ReLU()
21
+ ])
22
+
23
+ def forward(self, x):
24
+ out = self.conv(x)
25
+ return torch.cat((x, out), 1)
26
+
27
+ class RDB(nn.Module):
28
+ def __init__(self, growRate0, growRate, nConvLayers, kSize=3):
29
+ super(RDB, self).__init__()
30
+ G0 = growRate0
31
+ G = growRate
32
+ C = nConvLayers
33
+
34
+ convs = []
35
+ for c in range(C):
36
+ convs.append(RDB_Conv(G0 + c*G, G))
37
+ self.convs = nn.Sequential(*convs)
38
+
39
+ # Local Feature Fusion
40
+ self.LFF = nn.Conv2d(G0 + C*G, G0, 1, padding=0, stride=1)
41
+
42
+ def forward(self, x):
43
+ return self.LFF(self.convs(x)) + x
44
+
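# A minimal shape sketch for RDB (illustrative, not from the original file): each RDB_Conv grows
# the channel count by G via concatenation, and the 1x1 LFF fuses it back to G0 before the
# residual add, so the block is shape-preserving.
# >>> rdb = RDB(growRate0=64, growRate=64, nConvLayers=8)
# >>> rdb(torch.randn(1, 64, 48, 48)).shape
# torch.Size([1, 64, 48, 48])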
45
+ class RDNNOUP(nn.Module):
46
+ def __init__(self, G0 = 64, kSize = 3, r = 4, n_colors = 3, RDNconfig = 'B',
47
+ no_upsampling = True, img_range = 1.0):
48
+ super(RDNNOUP, self).__init__()
49
+
50
+ self.no_upsampling = no_upsampling
51
+ self.img_range = img_range
52
+
53
+ # number of RDB blocks, conv layers, out channels
54
+ self.D, C, G = {
55
+ 'A': (20, 6, 32),
56
+ 'B': (16, 8, 64),
57
+ }[RDNconfig]
58
+
59
+ # Shallow feature extraction net
60
+ self.SFENet1 = nn.Conv2d(n_colors, G0, kSize, padding=(kSize-1)//2, stride=1)
61
+ self.SFENet2 = nn.Conv2d(G0, G0, kSize, padding=(kSize-1)//2, stride=1)
62
+
63
+ # Residual dense blocks and dense feature fusion
64
+ self.RDBs = nn.ModuleList()
65
+ for i in range(self.D):
66
+ self.RDBs.append(
67
+ RDB(growRate0 = G0, growRate = G, nConvLayers = C)
68
+ )
69
+
70
+ # Global Feature Fusion
71
+ self.GFF = nn.Sequential(*[
72
+ nn.Conv2d(self.D * G0, G0, 1, padding=0, stride=1),
73
+ nn.Conv2d(G0, G0, kSize, padding=(kSize-1)//2, stride=1)
74
+ ])
75
+
76
+ if no_upsampling:
77
+ self.out_dim = G0
78
+ else:
79
+ self.out_dim = n_colors
80
+ # Up-sampling net
81
+ if r == 2 or r == 3:
82
+ self.UPNet = nn.Sequential(*[
83
+ nn.Conv2d(G0, G * r * r, kSize, padding=(kSize-1)//2, stride=1),
84
+ nn.PixelShuffle(r),
85
+ nn.Conv2d(G, n_colors, kSize, padding=(kSize-1)//2, stride=1)
86
+ ])
87
+ elif r == 4:
88
+ self.UPNet = nn.Sequential(*[
89
+ nn.Conv2d(G0, G * 4, kSize, padding=(kSize-1)//2, stride=1),
90
+ nn.PixelShuffle(2),
91
+ nn.Conv2d(G, G * 4, kSize, padding=(kSize-1)//2, stride=1),
92
+ nn.PixelShuffle(2),
93
+ nn.Conv2d(G, n_colors, kSize, padding=(kSize-1)//2, stride=1)
94
+ ])
95
+ else:
96
+ raise ValueError("scale must be 2 or 3 or 4.")
97
+
98
+ def forward(self, x):
99
+ x = x * self.img_range
100
+ f__1 = self.SFENet1(x)
101
+ x = self.SFENet2(f__1)
102
+
103
+ RDBs_out = []
104
+ for i in range(self.D):
105
+ x = self.RDBs[i](x)
106
+ RDBs_out.append(x)
107
+
108
+ x = self.GFF(torch.cat(RDBs_out,1))
109
+ x += f__1
110
+
111
+ if self.no_upsampling:
112
+ return x
113
+ else:
114
+ return self.UPNet(x)
115
+
116
+ if __name__ == '__main__':
117
+ x = torch.randn(8,3,48,48)
118
+ model = RDNNOUP()
119
+ y = model(x)
120
+ print(y.shape)
utils/split_and_joint_image.py ADDED
@@ -0,0 +1,232 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import math
4
+
5
+ from utils.gaussian_splatting import generate_2D_gaussian_splatting_step, generate_2D_gaussian_splatting_step_buffer
6
+
7
+
8
+ ### If GPU memory is limited, use the following code to tile the input LR image during inference
9
+ # def split_and_joint_image(lq, scale_factor, model_g, model_fea2gs, scale_modify, split_size = 48,
10
+ # overlap_size = 8,
11
+ # crop_size = 4,
12
+ # default_step_size = 1.2, mode = 'scale_modify',
13
+ # cuda_rendering = True,
14
+ # if_dmax = False,
15
+ # dmax_mode = 'fix',
16
+ # dmax = 0.1):
17
+ # h_lq, w_lq = lq.shape[-2:]
18
+
19
+ # assert overlap_size > 0 and overlap_size < split_size // 2, f"overlap size is wrong"
20
+
21
+ # tile_nums_h = math.ceil((h_lq - overlap_size) / (split_size - overlap_size))
22
+ # tile_nums_w = math.ceil((w_lq - overlap_size) / (split_size - overlap_size))
23
+
24
+ # pad_h_lq = tile_nums_h * (split_size - overlap_size) + overlap_size - h_lq
25
+ # pad_w_lq = tile_nums_w * (split_size - overlap_size) + overlap_size - w_lq
26
+
27
+ # lq_pad = F.pad(input=lq, pad=(0, pad_w_lq, 0, pad_h_lq), mode='reflect')
28
+
29
+ # split_size_sr = math.ceil(split_size * scale_factor)
30
+ # sr_tile_list = []
31
+ # for h_num in range(tile_nums_h):
32
+ # for w_num in range(tile_nums_w):
33
+ # tile_lq_position_start_h = h_num * (split_size - overlap_size)
34
+ # tile_lq_position_start_w = w_num * (split_size - overlap_size)
35
+ # tile_lq_position_end_h = tile_lq_position_start_h + split_size
36
+ # tile_lq_position_end_w = tile_lq_position_start_w + split_size
37
+
38
+ # input_tile = lq_pad[:,:, tile_lq_position_start_h:tile_lq_position_end_h, tile_lq_position_start_w:tile_lq_position_end_w]
39
+
40
+ # model_g_output = model_g(input_tile)
41
+
42
+ # scale_vector = scale_modify[0].unsqueeze(0).to(model_g_output.device)
43
+ # batch_gs_parameters = model_fea2gs(model_g_output, scale_vector)
44
+
45
+ # gs_parameters = batch_gs_parameters[0, :]
46
+ # b_output = generate_2D_gaussian_splatting_step(sr_size=torch.tensor([split_size_sr, split_size_sr]), gs_parameters=gs_parameters,
47
+ # lq=input_tile[0, :], scale=scale_factor, sample_coords=None,
48
+ # scale_modify = scale_modify,
49
+ # default_step_size = default_step_size, mode = mode,
50
+ # cuda_rendering = cuda_rendering,
51
+ # if_dmax = if_dmax,
52
+ # dmax_mode = dmax_mode,
53
+ # dmax = dmax)
54
+ # sr_tile_list.append(b_output.unsqueeze(0))
55
+
56
+ # tile_sr_h = sr_tile_list[0].shape[2]
57
+ # tile_sr_w = sr_tile_list[0].shape[3]
58
+
59
+ # assert tile_sr_w == split_size_sr and tile_sr_h == split_size_sr, \
60
+ # f'tile_sr_h-{tile_sr_w}, tile_sr_w-{tile_sr_w}, split_size_sr-{split_size_sr} is not the same'
61
+
62
+ # overlap_sr = math.ceil(overlap_size * scale_factor)
63
+
64
+ # sr_pad = torch.zeros(lq.shape[0], lq.shape[1],
65
+ # math.ceil(lq_pad.shape[2] * scale_factor),
66
+ # math.ceil(lq_pad.shape[3] * scale_factor),
67
+ # device=lq.device)
68
+
69
+ # idx = 0
70
+ # for h_num in range(tile_nums_h):
71
+ # for w_num in range(tile_nums_w):
72
+ # tile_sr_position_start_w = w_num * (split_size_sr - overlap_sr)
73
+ # tile_sr_position_end_w = tile_sr_position_start_w + split_size_sr
74
+ # tile_sr_position_start_h = h_num * (split_size_sr - overlap_sr)
75
+ # tile_sr_position_end_h = tile_sr_position_start_h + split_size_sr
76
+ # if h_num == 0 and w_num == 0:
77
+ # sr_pad[:, :, tile_sr_position_start_h:tile_sr_position_end_h,
78
+ # tile_sr_position_start_w:tile_sr_position_end_w] = sr_tile_list[idx]
79
+ # elif h_num == 0 and w_num !=0:
80
+ # sr_pad[:, :, tile_sr_position_start_h:tile_sr_position_end_h,
81
+ # tile_sr_position_start_w+crop_size:tile_sr_position_end_w] = sr_tile_list[idx][:,:,:,crop_size:]
82
+ # elif h_num != 0 and w_num ==0:
83
+ # sr_pad[:, :, tile_sr_position_start_h+crop_size:tile_sr_position_end_h,
84
+ # tile_sr_position_start_w:tile_sr_position_end_w] = sr_tile_list[idx][:,:,crop_size:,:]
85
+ # else:
86
+ # sr_pad[:,:,tile_sr_position_start_h+crop_size:tile_sr_position_end_h,
87
+ # tile_sr_position_start_w+crop_size:tile_sr_position_end_w] = sr_tile_list[idx][:,:,crop_size:,crop_size:]
88
+ # idx = idx + 1
89
+
90
+ # print(f"sr_pad shape is {sr_pad.shape}")
91
+
92
+ # # sr_final = sr_pad[:,:, 0:math.ceil(h_lq * scale_factor), 0: math.ceil(w_lq * scale_factor)]
93
+ # sr_final = sr_pad
94
+
95
+ # return sr_final
96
+
97
+
98
+ def split_and_joint_image(lq, scale_factor, split_size,
99
+ overlap_size, model_g, model_fea2gs,
100
+ scale_modify, crop_size = 2,
101
+ default_step_size = 1.2, mode = 'scale_modify',
102
+ cuda_rendering = True,
103
+ if_dmax = False,
104
+ dmax_mode = 'fix',
105
+ dmax = 25):
106
+ h_lq, w_lq = lq.shape[-2:]
107
+
108
+ # assert h_lq > split_size, f'h_lq-{h_lq} should be larger than split_size-{split_size}, please do not use tile_process, or decrease the split_size'
109
+ # assert w_lq > split_size, f'w_lq-{w_lq} should be larger than split_size-{split_size}, please do not use tile_process, or decrease the split_size'
110
+
111
+ assert overlap_size > 0 and overlap_size < split_size // 2, f"overlap_size-{overlap_size} should be larger than 0 and smaller than split_size // 2-{split_size // 2}"
112
+
113
+ tile_nums_h = math.ceil((h_lq - overlap_size) / (split_size - overlap_size))
114
+ tile_nums_w = math.ceil((w_lq - overlap_size) / (split_size - overlap_size))
115
+
116
+ pad_h_lq = tile_nums_h * (split_size - overlap_size) + overlap_size - h_lq
117
+ pad_w_lq = tile_nums_w * (split_size - overlap_size) + overlap_size - w_lq
118
+
119
+ assert pad_h_lq < h_lq, f'pad_h_lq-{pad_h_lq} should be smaller than h_lq-{h_lq}, please decrease the split_size-{split_size}'
120
+ assert pad_w_lq < w_lq, f'pad_w_lq-{pad_w_lq} should be smaller than w_lq-{w_lq}, please decrease the split_size-{split_size}'
121
+
122
+ lq_pad = F.pad(input=lq, pad=(0, pad_w_lq, 0, pad_h_lq), mode='reflect')
123
+ # lq_pad = F.pad(input=lq, pad=(0, pad_w_lq, 0, pad_h_lq), mode='constant', value=0)
124
+
125
+ split_size_sr = math.ceil(split_size * scale_factor)
126
+ sr_tile_list = []
127
+ for h_num in range(tile_nums_h):
128
+ for w_num in range(tile_nums_w):
129
+ tile_lq_position_start_h = h_num * (split_size - overlap_size)
130
+ tile_lq_position_start_w = w_num * (split_size - overlap_size)
131
+ tile_lq_position_end_h = tile_lq_position_start_h + split_size
132
+ tile_lq_position_end_w = tile_lq_position_start_w + split_size
133
+
134
+ input_tile = lq_pad[:,:, tile_lq_position_start_h:tile_lq_position_end_h, tile_lq_position_start_w:tile_lq_position_end_w]
135
+
136
+ model_g_output = model_g(input_tile)
137
+
138
+ scale_vector = scale_modify[0].unsqueeze(0).to(model_g_output.device)
139
+ batch_gs_parameters = model_fea2gs(model_g_output, scale_vector)
140
+
141
+
142
+ gs_parameters = batch_gs_parameters[0, :]
143
+ b_output = generate_2D_gaussian_splatting_step(sr_size=torch.tensor([split_size_sr, split_size_sr]), gs_parameters=gs_parameters,
144
+ scale=scale_factor, sample_coords=None,
145
+ scale_modify = scale_modify,
146
+ default_step_size = default_step_size, mode = mode,
147
+ cuda_rendering = cuda_rendering,
148
+ if_dmax = if_dmax,
149
+ dmax_mode = dmax_mode,
150
+ dmax = dmax)
151
+ sr_tile_list.append(b_output.unsqueeze(0))
152
+
153
+ tile_sr_h = sr_tile_list[0].shape[2]
154
+ tile_sr_w = sr_tile_list[0].shape[3]
155
+
156
+ assert tile_sr_w == split_size_sr and tile_sr_h == split_size_sr, \
157
+ f'tile_sr_h-{tile_sr_h}, tile_sr_w-{tile_sr_w}, split_size_sr-{split_size_sr} are not the same'
158
+
159
+ overlap_sr = math.ceil(overlap_size * scale_factor)
160
+
161
+ sr_pad = torch.zeros(lq.shape[0], lq.shape[1],
162
+ (tile_nums_h - 1) * (split_size_sr - overlap_sr) + split_size_sr,
163
+ (tile_nums_w - 1) * (split_size_sr - overlap_sr) + split_size_sr,
164
+ device=lq.device)
165
+
166
+ idx = 0
167
+
168
+ if scale_factor != int(scale_factor):
169
+ for h_num in range(tile_nums_h):
170
+ for w_num in range(tile_nums_w):
171
+ tile_sr_position_start_w = w_num * (split_size_sr - overlap_sr)
172
+ tile_sr_position_end_w = tile_sr_position_start_w + split_size_sr
173
+ tile_sr_position_start_h = h_num * (split_size_sr - overlap_sr)
174
+ tile_sr_position_end_h = tile_sr_position_start_h + split_size_sr
175
+ if h_num == 0 and w_num == 0:
176
+ sr_pad[:, :, tile_sr_position_start_h:tile_sr_position_end_h,
177
+ tile_sr_position_start_w:tile_sr_position_end_w] = sr_tile_list[idx]
178
+ elif h_num == 0 and w_num !=0:
179
+ if w_num != tile_nums_w - 1:
180
+ sr_pad[:, :, tile_sr_position_start_h:tile_sr_position_end_h,
181
+ tile_sr_position_start_w+crop_size:tile_sr_position_end_w] = sr_tile_list[idx][:,:,:,crop_size:]
182
+ else:
183
+ sr_pad[:, :, tile_sr_position_start_h:tile_sr_position_end_h,
184
+ tile_sr_position_start_w+crop_size:sr_pad.shape[3]] = sr_tile_list[idx][:,:,:,crop_size:sr_pad.shape[3] - tile_sr_position_start_w]
185
+ elif h_num != 0 and w_num ==0:
186
+ if h_num != tile_nums_h - 1:
187
+ sr_pad[:, :, tile_sr_position_start_h+crop_size:tile_sr_position_end_h,
188
+ tile_sr_position_start_w:tile_sr_position_end_w] = sr_tile_list[idx][:,:,crop_size:,:]
189
+ else:
190
+ sr_pad[:, :, tile_sr_position_start_h+crop_size:sr_pad.shape[2],
191
+ tile_sr_position_start_w:tile_sr_position_end_w] = sr_tile_list[idx][:,:,crop_size:sr_pad.shape[2] - tile_sr_position_start_h,:]
192
+ else:
193
+ if w_num != tile_nums_w - 1 and h_num != tile_nums_h - 1:
194
+ sr_pad[:,:,tile_sr_position_start_h+crop_size:tile_sr_position_end_h,
195
+ tile_sr_position_start_w+crop_size:tile_sr_position_end_w] = sr_tile_list[idx][:,:,crop_size:,crop_size:]
196
+ elif w_num == tile_nums_w - 1 and h_num != tile_nums_h - 1:
197
+ sr_pad[:, :, tile_sr_position_start_h:tile_sr_position_end_h,
198
+ tile_sr_position_start_w+crop_size:sr_pad.shape[3]] = sr_tile_list[idx][:,:,:,crop_size:sr_pad.shape[3] - tile_sr_position_start_w]
199
+ elif w_num != tile_nums_w - 1 and h_num == tile_nums_h - 1:
200
+ sr_pad[:, :, tile_sr_position_start_h+crop_size:sr_pad.shape[2],
201
+ tile_sr_position_start_w:tile_sr_position_end_w] = sr_tile_list[idx][:,:,crop_size:sr_pad.shape[2] - tile_sr_position_start_h,:]
202
+ elif w_num == tile_nums_w - 1 and h_num == tile_nums_h - 1:
203
+ sr_pad[:,:,tile_sr_position_start_h+crop_size:sr_pad.shape[2],
204
+ tile_sr_position_start_w+crop_size:sr_pad.shape[3]] = sr_tile_list[idx][:,:,crop_size:sr_pad.shape[2] - tile_sr_position_start_h,crop_size:sr_pad.shape[3] - tile_sr_position_start_w]
205
+ idx = idx + 1
206
+ else:
207
+ for h_num in range(tile_nums_h):
208
+ for w_num in range(tile_nums_w):
209
+ tile_sr_position_start_w = w_num * (split_size_sr - overlap_sr)
210
+ tile_sr_position_end_w = tile_sr_position_start_w + split_size_sr
211
+ tile_sr_position_start_h = h_num * (split_size_sr - overlap_sr)
212
+ tile_sr_position_end_h = tile_sr_position_start_h + split_size_sr
213
+ if h_num == 0 and w_num == 0:
214
+ sr_pad[:, :, tile_sr_position_start_h:tile_sr_position_end_h,
215
+ tile_sr_position_start_w:tile_sr_position_end_w] = sr_tile_list[idx]
216
+ elif h_num == 0 and w_num !=0:
217
+ sr_pad[:, :, tile_sr_position_start_h:tile_sr_position_end_h,
218
+ tile_sr_position_start_w+crop_size:tile_sr_position_end_w] = sr_tile_list[idx][:,:,:,crop_size:]
219
+ elif h_num != 0 and w_num ==0:
220
+ sr_pad[:, :, tile_sr_position_start_h+crop_size:tile_sr_position_end_h,
221
+ tile_sr_position_start_w:tile_sr_position_end_w] = sr_tile_list[idx][:,:,crop_size:,:]
222
+ else:
223
+ sr_pad[:,:,tile_sr_position_start_h+crop_size:tile_sr_position_end_h,
224
+ tile_sr_position_start_w+crop_size:tile_sr_position_end_w] = sr_tile_list[idx][:,:,crop_size:,crop_size:]
225
+ idx = idx + 1
226
+
227
+ print(f"sr_pad shape is {sr_pad.shape}")
228
+
229
+ # sr_final = sr_pad[:,:, 0:math.ceil(h_lq * scale_factor), 0: math.ceil(w_lq * scale_factor)]
230
+ sr_final = sr_pad
231
+
232
+ return sr_final
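# A worked example of the tiling arithmetic above (illustrative, with assumed sizes, not from the
# original file): for a 100x150 LR input with split_size=48 and overlap_size=8,
# >>> import math
# >>> math.ceil((100 - 8) / (48 - 8)), math.ceil((150 - 8) / (48 - 8))   # tile_nums_h, tile_nums_w
# (3, 4)
# >>> 3 * (48 - 8) + 8 - 100, 4 * (48 - 8) + 8 - 150                     # pad_h_lq, pad_w_lq
# (28, 18)
# At scale_factor 4 the stitched sr_pad therefore covers the padded LR, i.e. 512 x 672; cropping
# back to ceil(h_lq * scale_factor) x ceil(w_lq * scale_factor) is left to the caller, since the
# crop above is commented out.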
utils/swinir.py ADDED
@@ -0,0 +1,1243 @@
1
+ # Modified from https://github.com/JingyunLiang/SwinIR
2
+ # SwinIR: Image Restoration Using Swin Transformer, https://arxiv.org/abs/2108.10257
3
+ # Originally Written by Ze Liu, Modified by Jingyun Liang.
4
+
5
+ import collections.abc
6
+ import torchvision
7
+ import warnings
8
+ from distutils.version import LooseVersion
9
+ from itertools import repeat
10
+
11
+ import math
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.utils.checkpoint as checkpoint
15
+
16
+ # From PyTorch
17
+ def _ntuple(n):
18
+
19
+ def parse(x):
20
+ if isinstance(x, collections.abc.Iterable):
21
+ return x
22
+ return tuple(repeat(x, n))
23
+
24
+ return parse
25
+
26
+
27
+ to_1tuple = _ntuple(1)
28
+ to_2tuple = _ntuple(2)
29
+ to_3tuple = _ntuple(3)
30
+ to_4tuple = _ntuple(4)
31
+ to_ntuple = _ntuple
32
+
33
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
34
+ # From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/weight_init.py
35
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
36
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
37
+ def norm_cdf(x):
38
+ # Computes standard normal cumulative distribution function
39
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
40
+
41
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
42
+ warnings.warn(
43
+ 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. '
44
+ 'The distribution of values may be incorrect.',
45
+ stacklevel=2)
46
+
47
+ with torch.no_grad():
48
+ # Values are generated by using a truncated uniform distribution and
49
+ # then using the inverse CDF for the normal distribution.
50
+ # Get upper and lower cdf values
51
+ low = norm_cdf((a - mean) / std)
52
+ up = norm_cdf((b - mean) / std)
53
+
54
+ # Uniformly fill tensor with values from [low, up], then translate to
55
+ # [2l-1, 2u-1].
56
+ tensor.uniform_(2 * low - 1, 2 * up - 1)
57
+
58
+ # Use inverse cdf transform for normal distribution to get truncated
59
+ # standard normal
60
+ tensor.erfinv_()
61
+
62
+ # Transform to proper mean, std
63
+ tensor.mul_(std * math.sqrt(2.))
64
+ tensor.add_(mean)
65
+
66
+ # Clamp to ensure it's in the proper range
67
+ tensor.clamp_(min=a, max=b)
68
+ return tensor
69
+
70
+
71
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
72
+ r"""Fills the input Tensor with values drawn from a truncated
73
+ normal distribution.
74
+
75
+ From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/weight_init.py
76
+
77
+ The values are effectively drawn from the
78
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
79
+ with values outside :math:`[a, b]` redrawn until they are within
80
+ the bounds. The method used for generating the random values works
81
+ best when :math:`a \leq \text{mean} \leq b`.
82
+
83
+ Args:
84
+ tensor: an n-dimensional `torch.Tensor`
85
+ mean: the mean of the normal distribution
86
+ std: the standard deviation of the normal distribution
87
+ a: the minimum cutoff value
88
+ b: the maximum cutoff value
89
+
90
+ Examples:
91
+ >>> w = torch.empty(3, 5)
92
+ >>> nn.init.trunc_normal_(w)
93
+ """
94
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
95
+
96
+ def drop_path(x, drop_prob: float = 0., training: bool = False):
97
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
98
+
99
+ From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
100
+ """
101
+ if drop_prob == 0. or not training:
102
+ return x
103
+ keep_prob = 1 - drop_prob
104
+ shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
105
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
106
+ random_tensor.floor_() # binarize
107
+ output = x.div(keep_prob) * random_tensor
108
+ return output
109
+
110
+
111
+ class DropPath(nn.Module):
112
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
113
+
114
+ From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
115
+ """
116
+
117
+ def __init__(self, drop_prob=None):
118
+ super(DropPath, self).__init__()
119
+ self.drop_prob = drop_prob
120
+
121
+ def forward(self, x):
122
+ return drop_path(x, self.drop_prob, self.training)
123
+
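# A minimal sketch of DropPath (hypothetical helper, illustrative shapes): in
# eval mode it is the identity; in training it zeroes whole samples with
# probability drop_prob and rescales the survivors by 1 / (1 - drop_prob).
def _drop_path_example():
    layer = DropPath(drop_prob=0.1)
    x = torch.ones(4, 8, 16)
    layer.eval()
    assert torch.equal(layer(x), x)       # identity when not training
    layer.train()
    return layer(x).shape                 # torch.Size([4, 8, 16]); some samples may be all zeros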
124
+
125
+ class Mlp(nn.Module):
126
+
127
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
128
+ super().__init__()
129
+ out_features = out_features or in_features
130
+ hidden_features = hidden_features or in_features
131
+ self.fc1 = nn.Linear(in_features, hidden_features)
132
+ self.act = act_layer()
133
+ self.fc2 = nn.Linear(hidden_features, out_features)
134
+ self.drop = nn.Dropout(drop)
135
+
136
+ def forward(self, x):
137
+ x = self.fc1(x)
138
+ x = self.act(x)
139
+ x = self.drop(x)
140
+ x = self.fc2(x)
141
+ x = self.drop(x)
142
+ return x
143
+
144
+
145
+ def window_partition(x, window_size):
146
+ """
147
+ Args:
148
+ x: (b, h, w, c)
149
+ window_size (int): window size
150
+
151
+ Returns:
152
+ windows: (num_windows*b, window_size, window_size, c)
153
+ """
154
+ b, h, w, c = x.shape
155
+ x = x.view(b, h // window_size, window_size, w // window_size, window_size, c)
156
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, c)
157
+ return windows
158
+
159
+
160
+ def window_reverse(windows, window_size, h, w):
161
+ """
162
+ Args:
163
+ windows: (num_windows*b, window_size, window_size, c)
164
+ window_size (int): Window size
165
+ h (int): Height of image
166
+ w (int): Width of image
167
+
168
+ Returns:
169
+ x: (b, h, w, c)
170
+ """
171
+ b = int(windows.shape[0] / (h * w / window_size / window_size))
172
+ x = windows.view(b, h // window_size, w // window_size, window_size, window_size, -1)
173
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)
174
+ return x
175
+
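# A minimal round-trip sketch for window_partition / window_reverse
# (hypothetical helper; assumes h and w are multiples of window_size):
def _window_roundtrip_example():
    x = torch.randn(2, 16, 16, 32)        # (b, h, w, c)
    windows = window_partition(x, 8)      # (num_windows*b, 8, 8, c) = (8, 8, 8, 32)
    y = window_reverse(windows, 8, 16, 16)
    return torch.equal(x, y)              # True: both ops are pure reshapes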
176
+
177
+ class WindowAttention(nn.Module):
178
+ r""" Window based multi-head self attention (W-MSA) module with relative position bias.
179
+ It supports both of shifted and non-shifted window.
180
+
181
+ Args:
182
+ dim (int): Number of input channels.
183
+ window_size (tuple[int]): The height and width of the window.
184
+ num_heads (int): Number of attention heads.
185
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
186
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
187
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
188
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
189
+ """
190
+
191
+ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
192
+
193
+ super().__init__()
194
+ self.dim = dim
195
+ self.window_size = window_size # Wh, Ww
196
+ self.num_heads = num_heads
197
+ head_dim = dim // num_heads
198
+ self.scale = qk_scale or head_dim**-0.5
199
+
200
+ # define a parameter table of relative position bias
201
+ self.relative_position_bias_table = nn.Parameter(
202
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
203
+
204
+ # get pair-wise relative position index for each token inside the window
205
+ coords_h = torch.arange(self.window_size[0])
206
+ coords_w = torch.arange(self.window_size[1])
207
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
208
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
209
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
210
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
211
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
212
+ relative_coords[:, :, 1] += self.window_size[1] - 1
213
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
214
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
215
+ self.register_buffer('relative_position_index', relative_position_index)
216
+
217
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
218
+ self.attn_drop = nn.Dropout(attn_drop)
219
+ self.proj = nn.Linear(dim, dim)
220
+
221
+ self.proj_drop = nn.Dropout(proj_drop)
222
+
223
+ trunc_normal_(self.relative_position_bias_table, std=.02)
224
+ self.softmax = nn.Softmax(dim=-1)
225
+
226
+ def forward(self, x, mask=None):
227
+ """
228
+ Args:
229
+ x: input features with shape of (num_windows*b, n, c)
230
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
231
+ """
232
+ b_, n, c = x.shape
233
+ qkv = self.qkv(x).reshape(b_, n, 3, self.num_heads, c // self.num_heads).permute(2, 0, 3, 1, 4)
234
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
235
+
236
+ q = q * self.scale
237
+ attn = (q @ k.transpose(-2, -1))
238
+
239
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
240
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
241
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
242
+ attn = attn + relative_position_bias.unsqueeze(0)
243
+
244
+ if mask is not None:
245
+ nw = mask.shape[0]
246
+ attn = attn.view(b_ // nw, nw, self.num_heads, n, n) + mask.unsqueeze(1).unsqueeze(0)
247
+ attn = attn.view(-1, self.num_heads, n, n)
248
+ attn = self.softmax(attn)
249
+ else:
250
+ attn = self.softmax(attn)
251
+
252
+ attn = self.attn_drop(attn)
253
+
254
+ x = (attn @ v).transpose(1, 2).reshape(b_, n, c)
255
+ x = self.proj(x)
256
+ x = self.proj_drop(x)
257
+ return x
258
+
259
+ def extra_repr(self) -> str:
260
+ return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
261
+
262
+ def flops(self, n):
263
+ # calculate flops for 1 window with token length of n
264
+ flops = 0
265
+ # qkv = self.qkv(x)
266
+ flops += n * self.dim * 3 * self.dim
267
+ # attn = (q @ k.transpose(-2, -1))
268
+ flops += self.num_heads * n * (self.dim // self.num_heads) * n
269
+ # x = (attn @ v)
270
+ flops += self.num_heads * n * n * (self.dim // self.num_heads)
271
+ # x = self.proj(x)
272
+ flops += n * self.dim * self.dim
273
+ return flops
274
+
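# A minimal usage sketch for WindowAttention (hypothetical helper, illustrative
# dims): inputs are flattened windows of shape (num_windows*b, Wh*Ww, dim) and
# the output keeps that shape.
def _window_attention_example():
    attn = WindowAttention(dim=96, window_size=to_2tuple(8), num_heads=6)
    x = torch.randn(4, 64, 96)            # 4 windows of 8x8 tokens, 96 channels
    return attn(x).shape                  # torch.Size([4, 64, 96])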
275
+
276
+ class SwinTransformerBlock(nn.Module):
277
+ r""" Swin Transformer Block.
278
+
279
+ Args:
280
+ dim (int): Number of input channels.
281
+ input_resolution (tuple[int]): Input resolution.
282
+ num_heads (int): Number of attention heads.
283
+ window_size (int): Window size.
284
+ shift_size (int): Shift size for SW-MSA.
285
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
286
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
287
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
288
+ drop (float, optional): Dropout rate. Default: 0.0
289
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
290
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
291
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
292
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
293
+ """
294
+
295
+ def __init__(self,
296
+ dim,
297
+ input_resolution,
298
+ num_heads,
299
+ window_size=7,
300
+ shift_size=0,
301
+ mlp_ratio=4.,
302
+ qkv_bias=True,
303
+ qk_scale=None,
304
+ drop=0.,
305
+ attn_drop=0.,
306
+ drop_path=0.,
307
+ act_layer=nn.GELU,
308
+ norm_layer=nn.LayerNorm):
309
+ super().__init__()
310
+ self.dim = dim
311
+ self.input_resolution = input_resolution
312
+ self.num_heads = num_heads
313
+ self.window_size = window_size
314
+ self.shift_size = shift_size
315
+ self.mlp_ratio = mlp_ratio
316
+ if min(self.input_resolution) <= self.window_size:
317
+ # if window size is larger than input resolution, we don't partition windows
318
+ self.shift_size = 0
319
+ self.window_size = min(self.input_resolution)
320
+ assert 0 <= self.shift_size < self.window_size, 'shift_size must be in [0, window_size)'
321
+
322
+ self.norm1 = norm_layer(dim)
323
+ self.attn = WindowAttention(
324
+ dim,
325
+ window_size=to_2tuple(self.window_size),
326
+ num_heads=num_heads,
327
+ qkv_bias=qkv_bias,
328
+ qk_scale=qk_scale,
329
+ attn_drop=attn_drop,
330
+ proj_drop=drop)
331
+
332
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
333
+ self.norm2 = norm_layer(dim)
334
+ mlp_hidden_dim = int(dim * mlp_ratio)
335
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
336
+
337
+ if self.shift_size > 0:
338
+ attn_mask = self.calculate_mask(self.input_resolution)
339
+ else:
340
+ attn_mask = None
341
+
342
+ self.register_buffer('attn_mask', attn_mask)
343
+
344
+ def calculate_mask(self, x_size):
345
+ # calculate attention mask for SW-MSA
346
+ h, w = x_size
347
+ img_mask = torch.zeros((1, h, w, 1)) # 1 h w 1
348
+ h_slices = (slice(0, -self.window_size), slice(-self.window_size,
349
+ -self.shift_size), slice(-self.shift_size, None))
350
+ w_slices = (slice(0, -self.window_size), slice(-self.window_size,
351
+ -self.shift_size), slice(-self.shift_size, None))
352
+ cnt = 0
353
+ for h in h_slices:
354
+ for w in w_slices:
355
+ img_mask[:, h, w, :] = cnt
356
+ cnt += 1
357
+
358
+ mask_windows = window_partition(img_mask, self.window_size) # nw, window_size, window_size, 1
359
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
360
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
361
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
362
+
363
+ return attn_mask
364
+
365
+ def forward(self, x, x_size):
366
+ h, w = x_size
367
+ b, _, c = x.shape
368
+ # assert seq_len == h * w, "input feature has wrong size"
369
+
370
+ shortcut = x
371
+ x = self.norm1(x)
372
+ x = x.view(b, h, w, c)
373
+
374
+ # cyclic shift
375
+ if self.shift_size > 0:
376
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
377
+ else:
378
+ shifted_x = x
379
+
380
+ # partition windows
381
+ x_windows = window_partition(shifted_x, self.window_size) # nw*b, window_size, window_size, c
382
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, c) # nw*b, window_size*window_size, c
383
+
384
+ # W-MSA/SW-MSA (to be compatible with testing on images whose shapes are multiples of the window size)
385
+ if self.input_resolution == x_size:
386
+ attn_windows = self.attn(x_windows, mask=self.attn_mask) # nw*b, window_size*window_size, c
387
+ else:
388
+ attn_windows = self.attn(x_windows, mask=self.calculate_mask(x_size).to(x.device))
389
+
390
+ # merge windows
391
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, c)
392
+ shifted_x = window_reverse(attn_windows, self.window_size, h, w) # b h' w' c
393
+
394
+ # reverse cyclic shift
395
+ if self.shift_size > 0:
396
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
397
+ else:
398
+ x = shifted_x
399
+ x = x.view(b, h * w, c)
400
+
401
+ # FFN
402
+ x = shortcut + self.drop_path(x)
403
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
404
+
405
+ return x
406
+
407
+ def extra_repr(self) -> str:
408
+ return (f'dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, '
409
+ f'window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}')
410
+
411
+ def flops(self):
412
+ flops = 0
413
+ h, w = self.input_resolution
414
+ # norm1
415
+ flops += self.dim * h * w
416
+ # W-MSA/SW-MSA
417
+ nw = h * w / self.window_size / self.window_size
418
+ flops += nw * self.attn.flops(self.window_size * self.window_size)
419
+ # mlp
420
+ flops += 2 * h * w * self.dim * self.dim * self.mlp_ratio
421
+ # norm2
422
+ flops += self.dim * h * w
423
+ return flops
424
+
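# A minimal usage sketch for SwinTransformerBlock (hypothetical helper,
# illustrative dims): tokens come in as (b, h*w, c) together with x_size; the
# shifted-window mask is recomputed on the fly if x_size differs from
# input_resolution.
def _swin_block_example():
    blk = SwinTransformerBlock(dim=96, input_resolution=(64, 64), num_heads=6,
                               window_size=8, shift_size=4)
    x = torch.randn(2, 64 * 64, 96)
    return blk(x, (64, 64)).shape         # torch.Size([2, 4096, 96])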
425
+
426
+ class PatchMerging(nn.Module):
427
+ r""" Patch Merging Layer.
428
+
429
+ Args:
430
+ input_resolution (tuple[int]): Resolution of input feature.
431
+ dim (int): Number of input channels.
432
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
433
+ """
434
+
435
+ def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
436
+ super().__init__()
437
+ self.input_resolution = input_resolution
438
+ self.dim = dim
439
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
440
+ self.norm = norm_layer(4 * dim)
441
+
442
+ def forward(self, x):
443
+ """
444
+ x: b, h*w, c
445
+ """
446
+ h, w = self.input_resolution
447
+ b, seq_len, c = x.shape
448
+ assert seq_len == h * w, 'input feature has wrong size'
449
+ assert h % 2 == 0 and w % 2 == 0, f'x size ({h}*{w}) are not even.'
450
+
451
+ x = x.view(b, h, w, c)
452
+
453
+ x0 = x[:, 0::2, 0::2, :] # b h/2 w/2 c
454
+ x1 = x[:, 1::2, 0::2, :] # b h/2 w/2 c
455
+ x2 = x[:, 0::2, 1::2, :] # b h/2 w/2 c
456
+ x3 = x[:, 1::2, 1::2, :] # b h/2 w/2 c
457
+ x = torch.cat([x0, x1, x2, x3], -1) # b h/2 w/2 4*c
458
+ x = x.view(b, -1, 4 * c) # b h/2*w/2 4*c
459
+
460
+ x = self.norm(x)
461
+ x = self.reduction(x)
462
+
463
+ return x
464
+
465
+ def extra_repr(self) -> str:
466
+ return f'input_resolution={self.input_resolution}, dim={self.dim}'
467
+
468
+ def flops(self):
469
+ h, w = self.input_resolution
470
+ flops = h * w * self.dim
471
+ flops += (h // 2) * (w // 2) * 4 * self.dim * 2 * self.dim
472
+ return flops
473
+
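# A minimal usage sketch for PatchMerging (hypothetical helper, illustrative
# dims): each spatial dimension is halved and the channel count is doubled
# (the 4c concatenated neighbours are reduced to 2c by the linear layer).
def _patch_merging_example():
    merge = PatchMerging(input_resolution=(8, 8), dim=32)
    x = torch.randn(2, 8 * 8, 32)
    return merge(x).shape                 # torch.Size([2, 16, 64])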
474
+
475
+ class BasicLayer(nn.Module):
476
+ """ A basic Swin Transformer layer for one stage.
477
+
478
+ Args:
479
+ dim (int): Number of input channels.
480
+ input_resolution (tuple[int]): Input resolution.
481
+ depth (int): Number of blocks.
482
+ num_heads (int): Number of attention heads.
483
+ window_size (int): Local window size.
484
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
485
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
486
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
487
+ drop (float, optional): Dropout rate. Default: 0.0
488
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
489
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
490
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
491
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
492
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
493
+ """
494
+
495
+ def __init__(self,
496
+ dim,
497
+ input_resolution,
498
+ depth,
499
+ num_heads,
500
+ window_size,
501
+ mlp_ratio=4.,
502
+ qkv_bias=True,
503
+ qk_scale=None,
504
+ drop=0.,
505
+ attn_drop=0.,
506
+ drop_path=0.,
507
+ norm_layer=nn.LayerNorm,
508
+ downsample=None,
509
+ use_checkpoint=False):
510
+
511
+ super().__init__()
512
+ self.dim = dim
513
+ self.input_resolution = input_resolution
514
+ self.depth = depth
515
+ self.use_checkpoint = use_checkpoint
516
+
517
+ # build blocks
518
+ self.blocks = nn.ModuleList([
519
+ SwinTransformerBlock(
520
+ dim=dim,
521
+ input_resolution=input_resolution,
522
+ num_heads=num_heads,
523
+ window_size=window_size,
524
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
525
+ mlp_ratio=mlp_ratio,
526
+ qkv_bias=qkv_bias,
527
+ qk_scale=qk_scale,
528
+ drop=drop,
529
+ attn_drop=attn_drop,
530
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
531
+ norm_layer=norm_layer) for i in range(depth)
532
+ ])
533
+
534
+ # patch merging layer
535
+ if downsample is not None:
536
+ self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
537
+ else:
538
+ self.downsample = None
539
+
540
+ def forward(self, x, x_size):
541
+ for blk in self.blocks:
542
+ if self.use_checkpoint:
543
+ x = checkpoint.checkpoint(blk, x)
544
+ else:
545
+ x = blk(x, x_size)
546
+ if self.downsample is not None:
547
+ x = self.downsample(x)
548
+ return x
549
+
550
+ def extra_repr(self) -> str:
551
+ return f'dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}'
552
+
553
+ def flops(self):
554
+ flops = 0
555
+ for blk in self.blocks:
556
+ flops += blk.flops()
557
+ if self.downsample is not None:
558
+ flops += self.downsample.flops()
559
+ return flops
560
+
561
+
562
+ class RSTB(nn.Module):
563
+ """Residual Swin Transformer Block (RSTB).
564
+
565
+ Args:
566
+ dim (int): Number of input channels.
567
+ input_resolution (tuple[int]): Input resolution.
568
+ depth (int): Number of blocks.
569
+ num_heads (int): Number of attention heads.
570
+ window_size (int): Local window size.
571
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
572
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
573
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
574
+ drop (float, optional): Dropout rate. Default: 0.0
575
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
576
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
577
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
578
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
579
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
580
+ img_size: Input image size.
581
+ patch_size: Patch size.
582
+ resi_connection: The convolutional block before residual connection.
583
+ """
584
+
585
+ def __init__(self,
586
+ dim,
587
+ input_resolution,
588
+ depth,
589
+ num_heads,
590
+ window_size,
591
+ mlp_ratio=4.,
592
+ qkv_bias=True,
593
+ qk_scale=None,
594
+ drop=0.,
595
+ attn_drop=0.,
596
+ drop_path=0.,
597
+ norm_layer=nn.LayerNorm,
598
+ downsample=None,
599
+ use_checkpoint=False,
600
+ img_size=224,
601
+ patch_size=4,
602
+ resi_connection='1conv'):
603
+ super(RSTB, self).__init__()
604
+
605
+ self.dim = dim
606
+ self.input_resolution = input_resolution
607
+
608
+ self.residual_group = BasicLayer(
609
+ dim=dim,
610
+ input_resolution=input_resolution,
611
+ depth=depth,
612
+ num_heads=num_heads,
613
+ window_size=window_size,
614
+ mlp_ratio=mlp_ratio,
615
+ qkv_bias=qkv_bias,
616
+ qk_scale=qk_scale,
617
+ drop=drop,
618
+ attn_drop=attn_drop,
619
+ drop_path=drop_path,
620
+ norm_layer=norm_layer,
621
+ downsample=downsample,
622
+ use_checkpoint=use_checkpoint)
623
+
624
+ if resi_connection == '1conv':
625
+ self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
626
+ elif resi_connection == '3conv':
627
+ # to save parameters and memory
628
+ self.conv = nn.Sequential(
629
+ nn.Conv2d(dim, dim // 4, 3, 1, 1), nn.LeakyReLU(negative_slope=0.2, inplace=True),
630
+ nn.Conv2d(dim // 4, dim // 4, 1, 1, 0), nn.LeakyReLU(negative_slope=0.2, inplace=True),
631
+ nn.Conv2d(dim // 4, dim, 3, 1, 1))
632
+
633
+ self.patch_embed = PatchEmbed(
634
+ img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)
635
+
636
+ self.patch_unembed = PatchUnEmbed(
637
+ img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)
638
+
639
+ def forward(self, x, x_size):
640
+ return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size), x_size))) + x
641
+
642
+ def flops(self):
643
+ flops = 0
644
+ flops += self.residual_group.flops()
645
+ h, w = self.input_resolution
646
+ flops += h * w * self.dim * self.dim * 9
647
+ flops += self.patch_embed.flops()
648
+ flops += self.patch_unembed.flops()
649
+
650
+ return flops
651
+
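# A minimal usage sketch for RSTB (hypothetical helper, illustrative dims): a
# residual group of Swin blocks plus a conv, so the token shape is preserved.
def _rstb_example():
    rstb = RSTB(dim=60, input_resolution=(48, 48), depth=2, num_heads=6,
                window_size=8)
    x = torch.randn(1, 48 * 48, 60)
    return rstb(x, (48, 48)).shape        # torch.Size([1, 2304, 60])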
652
+
653
+ class PatchEmbed(nn.Module):
654
+ r""" Image to Patch Embedding
655
+
656
+ Args:
657
+ img_size (int): Image size. Default: 224.
658
+ patch_size (int): Patch token size. Default: 4.
659
+ in_chans (int): Number of input image channels. Default: 3.
660
+ embed_dim (int): Number of linear projection output channels. Default: 96.
661
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
662
+ """
663
+
664
+ def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
665
+ super().__init__()
666
+ img_size = to_2tuple(img_size)
667
+ patch_size = to_2tuple(patch_size)
668
+ patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
669
+ self.img_size = img_size
670
+ self.patch_size = patch_size
671
+ self.patches_resolution = patches_resolution
672
+ self.num_patches = patches_resolution[0] * patches_resolution[1]
673
+
674
+ self.in_chans = in_chans
675
+ self.embed_dim = embed_dim
676
+
677
+ if norm_layer is not None:
678
+ self.norm = norm_layer(embed_dim)
679
+ else:
680
+ self.norm = None
681
+
682
+ def forward(self, x):
683
+ x = x.flatten(2).transpose(1, 2) # b Ph*Pw c
684
+ if self.norm is not None:
685
+ x = self.norm(x)
686
+ return x
687
+
688
+ def flops(self):
689
+ flops = 0
690
+ h, w = self.img_size
691
+ if self.norm is not None:
692
+ flops += h * w * self.embed_dim
693
+ return flops
694
+
695
+
696
+ class PatchUnEmbed(nn.Module):
697
+ r""" Image to Patch Unembedding
698
+
699
+ Args:
700
+ img_size (int): Image size. Default: 224.
701
+ patch_size (int): Patch token size. Default: 4.
702
+ in_chans (int): Number of input image channels. Default: 3.
703
+ embed_dim (int): Number of linear projection output channels. Default: 96.
704
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
705
+ """
706
+
707
+ def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
708
+ super().__init__()
709
+ img_size = to_2tuple(img_size)
710
+ patch_size = to_2tuple(patch_size)
711
+ patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
712
+ self.img_size = img_size
713
+ self.patch_size = patch_size
714
+ self.patches_resolution = patches_resolution
715
+ self.num_patches = patches_resolution[0] * patches_resolution[1]
716
+
717
+ self.in_chans = in_chans
718
+ self.embed_dim = embed_dim
719
+
720
+ def forward(self, x, x_size):
721
+ x = x.transpose(1, 2).view(x.shape[0], self.embed_dim, x_size[0], x_size[1]) # b Ph*Pw c
722
+ return x
723
+
724
+ def flops(self):
725
+ flops = 0
726
+ return flops
727
+
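# A minimal round-trip sketch for PatchEmbed / PatchUnEmbed (hypothetical
# helper; with patch_size=1, as used throughout this file, they simply flatten
# (b, c, h, w) into tokens and back).
def _patch_embed_roundtrip_example():
    embed = PatchEmbed(img_size=48, patch_size=1, embed_dim=60)
    unembed = PatchUnEmbed(img_size=48, patch_size=1, embed_dim=60)
    x = torch.randn(2, 60, 48, 48)
    tokens = embed(x)                     # (2, 2304, 60)
    return torch.equal(unembed(tokens, (48, 48)), x)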
728
+
729
+ class Upsample(nn.Sequential):
730
+ """Upsample module.
731
+
732
+ Args:
733
+ scale (int): Scale factor. Supported scales: 2^n and 3.
734
+ num_feat (int): Channel number of intermediate features.
735
+ """
736
+
737
+ def __init__(self, scale, num_feat):
738
+ m = []
739
+ if (scale & (scale - 1)) == 0: # scale = 2^n
740
+ for _ in range(int(math.log(scale, 2))):
741
+ m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
742
+ m.append(nn.PixelShuffle(2))
743
+ elif scale == 3:
744
+ m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
745
+ m.append(nn.PixelShuffle(3))
746
+ else:
747
+ raise ValueError(f'scale {scale} is not supported. Supported scales: 2^n and 3.')
748
+ super(Upsample, self).__init__(*m)
749
+
750
+
751
+ class UpsampleOneStep(nn.Sequential):
752
+ """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle)
753
+ Used in lightweight SR to save parameters.
754
+
755
+ Args:
756
+ scale (int): Scale factor. Supported scales: 2^n and 3.
757
+ num_feat (int): Channel number of intermediate features.
758
+
759
+ """
760
+
761
+ def __init__(self, scale, num_feat, num_out_ch, input_resolution=None):
762
+ self.num_feat = num_feat
763
+ self.input_resolution = input_resolution
764
+ m = []
765
+ m.append(nn.Conv2d(num_feat, (scale**2) * num_out_ch, 3, 1, 1))
766
+ m.append(nn.PixelShuffle(scale))
767
+ super(UpsampleOneStep, self).__init__(*m)
768
+
769
+ def flops(self):
770
+ h, w = self.input_resolution
771
+ flops = h * w * self.num_feat * 3 * 9
772
+ return flops
773
+
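# A minimal shape sketch for the two upsamplers (hypothetical helper,
# illustrative dims): both enlarge the spatial size by `scale`; Upsample keeps
# num_feat channels, UpsampleOneStep maps directly to num_out_ch.
def _upsample_example():
    feat = torch.randn(1, 64, 12, 12)
    up = Upsample(scale=4, num_feat=64)
    one_step = UpsampleOneStep(scale=4, num_feat=64, num_out_ch=3)
    return up(feat).shape, one_step(feat).shape   # (1, 64, 48, 48), (1, 3, 48, 48)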
774
+
775
+ class SwinIR(nn.Module):
776
+ r""" SwinIR
777
+ A PyTorch impl of : `SwinIR: Image Restoration Using Swin Transformer`, based on Swin Transformer.
778
+
779
+ Args:
780
+ img_size (int | tuple(int)): Input image size. Default 64
781
+ patch_size (int | tuple(int)): Patch size. Default: 1
782
+ in_chans (int): Number of input image channels. Default: 3
783
+ embed_dim (int): Patch embedding dimension. Default: 96
784
+ depths (tuple(int)): Depth of each Swin Transformer layer.
785
+ num_heads (tuple(int)): Number of attention heads in different layers.
786
+ window_size (int): Window size. Default: 7
787
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
788
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
789
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
790
+ drop_rate (float): Dropout rate. Default: 0
791
+ attn_drop_rate (float): Attention dropout rate. Default: 0
792
+ drop_path_rate (float): Stochastic depth rate. Default: 0.1
793
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
794
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
795
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True
796
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
797
+ upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compression artifact reduction
798
+ img_range: Image range. 1. or 255.
799
+ upsampler: The reconstruction module. 'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None
800
+ resi_connection: The convolutional block before residual connection. '1conv'/'3conv'
801
+ """
802
+
803
+ def __init__(self,
804
+ img_size=64,
805
+ patch_size=1,
806
+ in_chans=3,
807
+ embed_dim=96,
808
+ depths=(6, 6, 6, 6),
809
+ num_heads=(6, 6, 6, 6),
810
+ window_size=7,
811
+ mlp_ratio=4.,
812
+ qkv_bias=True,
813
+ qk_scale=None,
814
+ drop_rate=0.,
815
+ attn_drop_rate=0.,
816
+ drop_path_rate=0.1,
817
+ norm_layer=nn.LayerNorm,
818
+ ape=False,
819
+ patch_norm=True,
820
+ use_checkpoint=False,
821
+ upscale=2,
822
+ img_range=1.,
823
+ upsampler='',
824
+ resi_connection='1conv',
825
+ **kwargs):
826
+ super(SwinIR, self).__init__()
827
+ num_in_ch = in_chans
828
+ num_out_ch = in_chans
829
+ num_feat = 64
830
+ self.img_range = img_range
831
+ if in_chans == 3:
832
+ rgb_mean = (0.4488, 0.4371, 0.4040)
833
+ self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
834
+ else:
835
+ self.mean = torch.zeros(1, 1, 1, 1)
836
+ self.upscale = upscale
837
+ self.upsampler = upsampler
838
+
839
+ # ------------------------- 1, shallow feature extraction ------------------------- #
840
+ self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
841
+
842
+ # ------------------------- 2, deep feature extraction ------------------------- #
843
+ self.num_layers = len(depths)
844
+ self.embed_dim = embed_dim
845
+ self.ape = ape
846
+ self.patch_norm = patch_norm
847
+ self.num_features = embed_dim
848
+ self.mlp_ratio = mlp_ratio
849
+
850
+ # split image into non-overlapping patches
851
+ self.patch_embed = PatchEmbed(
852
+ img_size=img_size,
853
+ patch_size=patch_size,
854
+ in_chans=embed_dim,
855
+ embed_dim=embed_dim,
856
+ norm_layer=norm_layer if self.patch_norm else None)
857
+ num_patches = self.patch_embed.num_patches
858
+ patches_resolution = self.patch_embed.patches_resolution
859
+ self.patches_resolution = patches_resolution
860
+
861
+ # merge non-overlapping patches into image
862
+ self.patch_unembed = PatchUnEmbed(
863
+ img_size=img_size,
864
+ patch_size=patch_size,
865
+ in_chans=embed_dim,
866
+ embed_dim=embed_dim,
867
+ norm_layer=norm_layer if self.patch_norm else None)
868
+
869
+ # absolute position embedding
870
+ if self.ape:
871
+ self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
872
+ trunc_normal_(self.absolute_pos_embed, std=.02)
873
+
874
+ self.pos_drop = nn.Dropout(p=drop_rate)
875
+
876
+ # stochastic depth
877
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
878
+
879
+ # build Residual Swin Transformer blocks (RSTB)
880
+ self.layers = nn.ModuleList()
881
+ for i_layer in range(self.num_layers):
882
+ layer = RSTB(
883
+ dim=embed_dim,
884
+ input_resolution=(patches_resolution[0], patches_resolution[1]),
885
+ depth=depths[i_layer],
886
+ num_heads=num_heads[i_layer],
887
+ window_size=window_size,
888
+ mlp_ratio=self.mlp_ratio,
889
+ qkv_bias=qkv_bias,
890
+ qk_scale=qk_scale,
891
+ drop=drop_rate,
892
+ attn_drop=attn_drop_rate,
893
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], # no impact on SR results
894
+ norm_layer=norm_layer,
895
+ downsample=None,
896
+ use_checkpoint=use_checkpoint,
897
+ img_size=img_size,
898
+ patch_size=patch_size,
899
+ resi_connection=resi_connection)
900
+ self.layers.append(layer)
901
+ self.norm = norm_layer(self.num_features)
902
+
903
+ # build the last conv layer in deep feature extraction
904
+ if resi_connection == '1conv':
905
+ self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
906
+ elif resi_connection == '3conv':
907
+ # to save parameters and memory
908
+ self.conv_after_body = nn.Sequential(
909
+ nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1), nn.LeakyReLU(negative_slope=0.2, inplace=True),
910
+ nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0), nn.LeakyReLU(negative_slope=0.2, inplace=True),
911
+ nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1))
912
+
913
+ # ------------------------- 3, high quality image reconstruction ------------------------- #
914
+ if self.upsampler == 'pixelshuffle':
915
+ # for classical SR
916
+ self.conv_before_upsample = nn.Sequential(
917
+ nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True))
918
+ self.upsample = Upsample(upscale, num_feat)
919
+ self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
920
+ elif self.upsampler == 'pixelshuffledirect':
921
+ # for lightweight SR (to save parameters)
922
+ self.upsample = UpsampleOneStep(upscale, embed_dim, num_out_ch,
923
+ (patches_resolution[0], patches_resolution[1]))
924
+ elif self.upsampler == 'nearest+conv':
925
+ # for real-world SR (fewer artifacts)
926
+ assert self.upscale == 4, 'only support x4 now.'
927
+ self.conv_before_upsample = nn.Sequential(
928
+ nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True))
929
+ self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
930
+ self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
931
+ self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
932
+ self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
933
+ self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
934
+ else:
935
+ # for image denoising and JPEG compression artifact reduction
936
+ self.conv_last = nn.Conv2d(embed_dim, num_out_ch, 3, 1, 1)
937
+
938
+ self.apply(self._init_weights)
939
+
940
+ def _init_weights(self, m):
941
+ if isinstance(m, nn.Linear):
942
+ trunc_normal_(m.weight, std=.02)
943
+ if isinstance(m, nn.Linear) and m.bias is not None:
944
+ nn.init.constant_(m.bias, 0)
945
+ elif isinstance(m, nn.LayerNorm):
946
+ nn.init.constant_(m.bias, 0)
947
+ nn.init.constant_(m.weight, 1.0)
948
+
949
+ @torch.jit.ignore
950
+ def no_weight_decay(self):
951
+ return {'absolute_pos_embed'}
952
+
953
+ @torch.jit.ignore
954
+ def no_weight_decay_keywords(self):
955
+ return {'relative_position_bias_table'}
956
+
957
+ def forward_features(self, x):
958
+ x_size = (x.shape[2], x.shape[3])
959
+ x = self.patch_embed(x)
960
+ if self.ape:
961
+ x = x + self.absolute_pos_embed
962
+ x = self.pos_drop(x)
963
+
964
+ for layer in self.layers:
965
+ x = layer(x, x_size)
966
+
967
+ x = self.norm(x) # b seq_len c
968
+ x = self.patch_unembed(x, x_size)
969
+
970
+ return x
971
+
972
+ def forward(self, x):
973
+ self.mean = self.mean.type_as(x)
974
+ x = (x - self.mean) * self.img_range
975
+
976
+ if self.upsampler == 'pixelshuffle':
977
+ # for classical SR
978
+ x = self.conv_first(x)
979
+ x = self.conv_after_body(self.forward_features(x)) + x
980
+ x = self.conv_before_upsample(x)
981
+ x = self.conv_last(self.upsample(x))
982
+ elif self.upsampler == 'pixelshuffledirect':
983
+ # for lightweight SR
984
+ x = self.conv_first(x)
985
+ x = self.conv_after_body(self.forward_features(x)) + x
986
+ x = self.upsample(x)
987
+ elif self.upsampler == 'nearest+conv':
988
+ # for real-world SR
989
+ x = self.conv_first(x)
990
+ x = self.conv_after_body(self.forward_features(x)) + x
991
+ x = self.conv_before_upsample(x)
992
+ x = self.lrelu(self.conv_up1(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
993
+ x = self.lrelu(self.conv_up2(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
994
+ x = self.conv_last(self.lrelu(self.conv_hr(x)))
995
+ else:
996
+ # for image denoising and JPEG compression artifact reduction
997
+ x_first = self.conv_first(x)
998
+ res = self.conv_after_body(self.forward_features(x_first)) + x_first
999
+ x = x + self.conv_last(res)
1000
+
1001
+ x = x / self.img_range + self.mean
1002
+
1003
+ return x
1004
+
1005
+ def flops(self):
1006
+ flops = 0
1007
+ h, w = self.patches_resolution
1008
+ flops += h * w * 3 * self.embed_dim * 9
1009
+ flops += self.patch_embed.flops()
1010
+ for layer in self.layers:
1011
+ flops += layer.flops()
1012
+ flops += h * w * 3 * self.embed_dim * self.embed_dim
1013
+ flops += self.upsample.flops()
1014
+ return flops
1015
+
1016
+
1017
+
1018
+ class SwinIRNOUP(nn.Module):
1019
+ def __init__(self,
1020
+ img_size=48,
1021
+ patch_size=1,
1022
+ in_chans=3,
1023
+ embed_dim=180,
1024
+ depths=(6, 6, 6, 6, 6, 6),
1025
+ num_heads=(6, 6, 6, 6, 6, 6),
1026
+ window_size=8,
1027
+ mlp_ratio=2,
1028
+ qkv_bias=True,
1029
+ qk_scale=None,
1030
+ drop_rate=0.,
1031
+ attn_drop_rate=0.,
1032
+ drop_path_rate=0.1,
1033
+ norm_layer=nn.LayerNorm,
1034
+ ape=False,
1035
+ patch_norm=True,
1036
+ use_checkpoint=False,
1037
+ upscale=4,
1038
+ img_range=1.,
1039
+ upsampler='pixelshuffle',
1040
+ resi_connection='1conv',
1041
+ **kwargs):
1042
+ super(SwinIRNOUP, self).__init__()
1043
+ num_in_ch = in_chans
1044
+ num_out_ch = in_chans
1045
+ num_feat = 64
1046
+ self.img_range = img_range
1047
+ self.upsampler = upsampler
+ self.upscale = upscale  # referenced by the 'nearest+conv' branch below
1048
+
1049
+
1050
+ # ------------------------- 1, shallow feature extraction ------------------------- #
1051
+ self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
1052
+
1053
+ # ------------------------- 2, deep feature extraction ------------------------- #
1054
+ self.num_layers = len(depths)
1055
+ self.embed_dim = embed_dim
1056
+ self.ape = ape
1057
+ self.patch_norm = patch_norm
1058
+ self.num_features = embed_dim
1059
+ self.mlp_ratio = mlp_ratio
1060
+
1061
+ # split image into non-overlapping patches
1062
+ self.patch_embed = PatchEmbed(
1063
+ img_size=img_size,
1064
+ patch_size=patch_size,
1065
+ in_chans=embed_dim,
1066
+ embed_dim=embed_dim,
1067
+ norm_layer=norm_layer if self.patch_norm else None)
1068
+ num_patches = self.patch_embed.num_patches
1069
+ patches_resolution = self.patch_embed.patches_resolution
1070
+ self.patches_resolution = patches_resolution
1071
+
1072
+ # merge non-overlapping patches into image
1073
+ self.patch_unembed = PatchUnEmbed(
1074
+ img_size=img_size,
1075
+ patch_size=patch_size,
1076
+ in_chans=embed_dim,
1077
+ embed_dim=embed_dim,
1078
+ norm_layer=norm_layer if self.patch_norm else None)
1079
+
1080
+ # absolute position embedding
1081
+ if self.ape:
1082
+ self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
1083
+ trunc_normal_(self.absolute_pos_embed, std=.02)
1084
+
1085
+ self.pos_drop = nn.Dropout(p=drop_rate)
1086
+
1087
+ # stochastic depth
1088
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
1089
+
1090
+ # build Residual Swin Transformer blocks (RSTB)
1091
+ self.layers = nn.ModuleList()
1092
+ for i_layer in range(self.num_layers):
1093
+ layer = RSTB(
1094
+ dim=embed_dim,
1095
+ input_resolution=(patches_resolution[0], patches_resolution[1]),
1096
+ depth=depths[i_layer],
1097
+ num_heads=num_heads[i_layer],
1098
+ window_size=window_size,
1099
+ mlp_ratio=self.mlp_ratio,
1100
+ qkv_bias=qkv_bias,
1101
+ qk_scale=qk_scale,
1102
+ drop=drop_rate,
1103
+ attn_drop=attn_drop_rate,
1104
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], # no impact on SR results
1105
+ norm_layer=norm_layer,
1106
+ downsample=None,
1107
+ use_checkpoint=use_checkpoint,
1108
+ img_size=img_size,
1109
+ patch_size=patch_size,
1110
+ resi_connection=resi_connection)
1111
+ self.layers.append(layer)
1112
+ self.norm = norm_layer(self.num_features)
1113
+
1114
+ # build the last conv layer in deep feature extraction
1115
+ if resi_connection == '1conv':
1116
+ self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
1117
+ elif resi_connection == '3conv':
1118
+ # to save parameters and memory
1119
+ self.conv_after_body = nn.Sequential(
1120
+ nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1), nn.LeakyReLU(negative_slope=0.2, inplace=True),
1121
+ nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0), nn.LeakyReLU(negative_slope=0.2, inplace=True),
1122
+ nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1))
1123
+
1124
+ # ------------------------- 3, high quality image reconstruction ------------------------- #
1125
+ if self.upsampler == 'pixelshuffle':
1126
+ # for classical SR
1127
+ self.conv_before_upsample = nn.Sequential(
1128
+ nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True))
1129
+
1130
+ elif self.upsampler == 'pixelshuffledirect':
1131
+ # for lightweight SR (to save parameters)
1132
+ self.upsample = UpsampleOneStep(upscale, embed_dim, num_out_ch,
1133
+ (patches_resolution[0], patches_resolution[1]))
1134
+ elif self.upsampler == 'nearest+conv':
1135
+ # for real-world SR (fewer artifacts)
1136
+ assert self.upscale == 4, 'only support x4 now.'
1137
+ self.conv_before_upsample = nn.Sequential(
1138
+ nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True))
1139
+ self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
1140
+ self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
1141
+ self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
1142
+ self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
1143
+ self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
1144
+ else:
1145
+ # for image denoising and JPEG compression artifact reduction
1146
+ self.conv_last = nn.Conv2d(embed_dim, num_out_ch, 3, 1, 1)
1147
+
1148
+ self.apply(self._init_weights)
1149
+
1150
+ def _init_weights(self, m):
1151
+ if isinstance(m, nn.Linear):
1152
+ trunc_normal_(m.weight, std=.02)
1153
+ if isinstance(m, nn.Linear) and m.bias is not None:
1154
+ nn.init.constant_(m.bias, 0)
1155
+ elif isinstance(m, nn.LayerNorm):
1156
+ nn.init.constant_(m.bias, 0)
1157
+ nn.init.constant_(m.weight, 1.0)
1158
+
1159
+ @torch.jit.ignore
1160
+ def no_weight_decay(self):
1161
+ return {'absolute_pos_embed'}
1162
+
1163
+ @torch.jit.ignore
1164
+ def no_weight_decay_keywords(self):
1165
+ return {'relative_position_bias_table'}
1166
+
1167
+ def forward_features(self, x):
1168
+ x_size = (x.shape[2], x.shape[3])
1169
+ x = self.patch_embed(x)
1170
+ if self.ape:
1171
+ x = x + self.absolute_pos_embed
1172
+ x = self.pos_drop(x)
1173
+
1174
+ for layer in self.layers:
1175
+ x = layer(x, x_size)
1176
+
1177
+ x = self.norm(x) # b seq_len c
1178
+ x = self.patch_unembed(x, x_size)
1179
+
1180
+ return x
1181
+
1182
+ def forward(self, x):
1183
+
1184
+ if self.upsampler == 'pixelshuffle':
1185
+ # for classical SR
1186
+ x = self.conv_first(x)
1187
+ x = self.conv_after_body(self.forward_features(x)) + x
1188
+ x = self.conv_before_upsample(x)
1189
+
1190
+ elif self.upsampler == 'pixelshuffledirect':
1191
+ # for lightweight SR
1192
+ x = self.conv_first(x)
1193
+ x = self.conv_after_body(self.forward_features(x)) + x
1194
+
1195
+ elif self.upsampler == 'nearest+conv':
1196
+ # for real-world SR
1197
+ x = self.conv_first(x)
1198
+ x = self.conv_after_body(self.forward_features(x)) + x
1199
+ x = self.conv_before_upsample(x)
1200
+
1201
+ else:
1202
+ # for image denoising and JPEG compression artifact reduction
1203
+ x_first = self.conv_first(x)
1204
+ res = self.conv_after_body(self.forward_features(x_first)) + x_first
1205
+ x = x + self.conv_last(res)
1206
+
1207
+ return x
1208
+
1209
+ def flops(self):
1210
+ flops = 0
1211
+ h, w = self.patches_resolution
1212
+ flops += h * w * 3 * self.embed_dim * 9
1213
+ flops += self.patch_embed.flops()
1214
+ for layer in self.layers:
1215
+ flops += layer.flops()
1216
+ flops += h * w * 3 * self.embed_dim * self.embed_dim
1217
+ flops += self.upsample.flops()
1218
+ return flops
1219
+
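# A minimal usage sketch for SwinIRNOUP (hypothetical helper, illustrative
# config): the backbone stops before pixel-shuffle upsampling, so with the
# default 'pixelshuffle' setting it returns a 64-channel feature map at the
# input resolution rather than an upscaled RGB image.
def _swinir_noup_example():
    model = SwinIRNOUP(embed_dim=60, depths=(2, 2), num_heads=(6, 6),
                       window_size=8)
    lr = torch.randn(1, 3, 48, 48)
    return model(lr).shape                # torch.Size([1, 64, 48, 48])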
1220
+
1221
+
1222
+
1223
+ if __name__ == '__main__':
1224
+ upscale = 4
1225
+ window_size = 8
1226
+ height = (1024 // upscale // window_size + 1) * window_size
1227
+ width = (720 // upscale // window_size + 1) * window_size
1228
+ model = SwinIR(
1229
+ upscale=2,
1230
+ img_size=(height, width),
1231
+ window_size=window_size,
1232
+ img_range=1.,
1233
+ depths=[6, 6, 6, 6],
1234
+ embed_dim=60,
1235
+ num_heads=[6, 6, 6, 6],
1236
+ mlp_ratio=2,
1237
+ upsampler='pixelshuffledirect')
1238
+ print(model)
1239
+ print(height, width, model.flops() / 1e9)
1240
+
1241
+ x = torch.randn((1, 3, height, width))
1242
+ x = model(x)
1243
+ print(x.shape)