Svane20 committed on
Commit 2c67a31 · 1 Parent(s): 6c0e2b4

Updated model to use PyTorch instead of ONNX

Files changed (3)
  1. app.py +36 -25
  2. model.py +139 -0
  3. requirements.txt +2 -2
app.py CHANGED
@@ -1,38 +1,47 @@
 import gradio as gr
 import torch
 from torchvision.transforms import Compose, Resize, ToTensor, Normalize
-import onnxruntime as ort
-
 import pymatting
 import numpy as np
-import os
 from PIL import Image
 from typing import Tuple
 import random
 from pathlib import Path
 
+from model import SwinMattingModel
+
 
-def _load_model(checkpoint):
-    """
-    Load the ONNX model for inference.
-
-    Args:
-        checkpoint (str): Path to the ONNX model file.
-
-    Returns:
-        session (onnxruntime.InferenceSession): The ONNX runtime session.
-        input_name (str): The name of the input tensor.
-        output_name (str): The name of the output tensor.
-    """
-    session_options = ort.SessionOptions()
-    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
-    session_options.intra_op_num_threads = min(1, os.cpu_count() - 1)
-    providers = ['CUDAExecutionProvider'] if torch.cuda.is_available() else ['CPUExecutionProvider']
-    session = ort.InferenceSession(checkpoint, providers=providers)
-    input_name = session.get_inputs()[0].name
-    output_name = session.get_outputs()[0].name
-
-    return session, input_name, output_name
+def _load_checkpoint(model, checkpoint_path):
+    # Load the checkpoint
+    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
+
+    # Check if there are any errors when loading the state dictionary
+    missing_keys, unexpected_keys = model.load_state_dict(checkpoint)
+    if missing_keys:
+        print(missing_keys)
+        raise RuntimeError("Missing keys in checkpoint.")
+
+    if unexpected_keys:
+        print(unexpected_keys)
+        raise RuntimeError("Unexpected keys in checkpoint.")
+
+
+def _load_model(checkpoint, device):
+    model = SwinMattingModel({
+        "encoder": {
+            "model_name": "microsoft/swin-small-patch4-window7-224"
+        },
+        "decoder": {
+            "use_attn": True,
+            "refine_channels": 16
+        }
+    })
+    _load_checkpoint(model, checkpoint)
+
+    model.to(device)
+    model.eval()
+
+    return model
 
 
 transforms = Compose(
@@ -44,9 +53,9 @@ transforms = Compose(
 )
 
 share_repo = False
-checkpoint_path = "swin_small_patch4_window7_224_512_v1_latest.onnx"
+checkpoint_path = "swin_small_patch4_window7_224_512_v1_latest.pt"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-session, input_name, output_name = _load_model(checkpoint_path)
+model = _load_model(checkpoint_path, device)
 
 
 def _get_foreground_estimation(image, alpha):
@@ -130,9 +139,11 @@ def _inference(image):
     Returns:
         np.ndarray: The predicted alpha mask.
     """
-    output = session.run(output_names=[output_name], input_feed={input_name: image.cpu().numpy()})[0]
+    with torch.inference_mode():
+        output = model(image)
 
     # Ensure the output is in valid range [0, 1]
+    output = output.detach().cpu().numpy()
     output = np.clip(output, a_min=0, a_max=1)
 
     return np.squeeze(output, axis=0).squeeze()
@@ -276,4 +287,4 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
     run_button.click(fn=predict, inputs=input_image, outputs=[output_mask, output_sky])
 
 # Launch the interface
-demo.launch(share=share_repo)
+demo.launch(share=share_repo, ssr_mode=False)
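
For context, this is roughly how the new PyTorch path in app.py is exercised end to end. It is a minimal sketch, not part of the commit: the Resize size and Normalize statistics are assumptions (the transforms body is not shown in this diff), and "sky.jpg" is a placeholder filename.

    import torch
    from PIL import Image
    from torchvision.transforms import Compose, Resize, ToTensor, Normalize

    # Assumed preprocessing: 512x512 resize + ImageNet normalization (not shown in the diff)
    transforms = Compose([
        Resize((512, 512)),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = _load_model("swin_small_patch4_window7_224_512_v1_latest.pt", device)  # from app.py above

    image = transforms(Image.open("sky.jpg").convert("RGB")).unsqueeze(0).to(device)
    with torch.inference_mode():
        alpha = model(image)                          # [1, 1, 512, 512] alpha matte
    alpha = alpha.squeeze().cpu().numpy().clip(0, 1)  # 512x512 mask in [0, 1]
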
model.py ADDED
@@ -0,0 +1,139 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoBackbone
+from typing import Any, Dict
+
+
+class SwinMattingModel(nn.Module):
+    def __init__(self, config: Dict[str, Any]):
+        super().__init__()
+        encoder_config = config['encoder']
+        decoder_config = config['decoder']
+
+        self.encoder = SwinEncoder(model_name=encoder_config["model_name"])
+        self.decoder = MattingDecoder(
+            use_attn=decoder_config["use_attn"],
+            refine_channels=decoder_config["refine_channels"]
+        )
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): Input image [B, 3, 512, 512], normalized as needed for Swin.
+        Returns:
+            torch.Tensor: Alpha matte [B, 1, 512, 512].
+        """
+        features = self.encoder(x)  # list of 4 feature maps
+        return self.decoder(features, x)  # decoded and refined alpha matte
+
+
+class SwinEncoder(nn.Module):
+    def __init__(self, model_name="microsoft/swin-small-patch4-window7-224"):
+        super().__init__()
+
+        self.backbone = AutoBackbone.from_pretrained(model_name, out_indices=(1, 2, 3, 4))
+
+    def forward(self, x):
+        outputs = self.backbone(pixel_values=x)
+        features = outputs.feature_maps
+        features = list(features)
+        return features
+
+
+class MattingDecoder(nn.Module):
+    def __init__(self, use_attn=False, refine_channels=16):
+        super().__init__()
+        self.use_attn = use_attn
+        self.refine_channels = refine_channels
+
+        # Bottom convolution (process 1/32 feature)
+        self.conv_bottom = nn.Conv2d(768, 768, kernel_size=3, padding=1)
+        self.bn_bottom = nn.BatchNorm2d(768)
+
+        # Upsample + fuse with skip connections
+        self.conv_up3 = nn.Conv2d(768 + 384, 384, kernel_size=3, padding=1)
+        self.bn_up3 = nn.BatchNorm2d(384)
+
+        self.conv_up2 = nn.Conv2d(384 + 192, 192, kernel_size=3, padding=1)
+        self.bn_up2 = nn.BatchNorm2d(192)
+
+        self.conv_up1 = nn.Conv2d(192 + 96, 96, kernel_size=3, padding=1)
+        self.bn_up1 = nn.BatchNorm2d(96)
+
+        self.conv_out = nn.Conv2d(96, 1, kernel_size=3, padding=1)
+
+        # Detail refinement
+        self.refine_conv1 = nn.Conv2d(4, self.refine_channels, kernel_size=3, padding=1)
+        self.bn_refine1 = nn.BatchNorm2d(self.refine_channels)
+
+        self.refine_conv2 = nn.Conv2d(self.refine_channels, self.refine_channels, kernel_size=3, padding=1)
+        self.bn_refine2 = nn.BatchNorm2d(self.refine_channels)
+
+        self.refine_conv3 = nn.Conv2d(self.refine_channels, 1, kernel_size=3, padding=1)
+
+        # Attention gates
+        if self.use_attn:
+            self.reduce_768_to_384 = nn.Conv2d(768, 384, kernel_size=1)
+            self.reduce_384_to_192 = nn.Conv2d(384, 192, kernel_size=1)
+            self.reduce_192_to_96 = nn.Conv2d(192, 96, kernel_size=1)
+
+            self.gate_16 = nn.Conv2d(384, 384, kernel_size=1)
+            self.skip_16 = nn.Conv2d(384, 384, kernel_size=1)
+
+            self.gate_8 = nn.Conv2d(192, 192, kernel_size=1)
+            self.skip_8 = nn.Conv2d(192, 192, kernel_size=1)
+
+            self.gate_4 = nn.Conv2d(96, 96, kernel_size=1)
+            self.skip_4 = nn.Conv2d(96, 96, kernel_size=1)
+
+    def forward(self, features, original_image):
+        f1, f2, f3, f4 = features  # [1/4, 1/8, 1/16, 1/32]
+
+        # Bottom (1/32)
+        x = F.relu(self.bn_bottom(self.conv_bottom(f4)))
+
+        # 1/16 stage
+        x = F.interpolate(x, scale_factor=2.0, mode='nearest')  # -> [B, 768, 32, 32]
+        if self.use_attn:
+            x_reduced = self.reduce_768_to_384(x)
+            g = self.gate_16(x_reduced)
+            skip = self.skip_16(f3)
+            att = torch.sigmoid(g + skip)
+            f3 = f3 * att
+        x = torch.cat([x, f3], dim=1)
+        x = F.relu(self.bn_up3(self.conv_up3(x)))  # -> [B, 384, 32, 32]
+
+        # 1/8 stage
+        x = F.interpolate(x, scale_factor=2.0, mode='nearest')
+        if self.use_attn:
+            x_reduced = self.reduce_384_to_192(x)
+            g = self.gate_8(x_reduced)
+            skip = self.skip_8(f2)
+            att = torch.sigmoid(g + skip)
+            f2 = f2 * att
+        x = torch.cat([x, f2], dim=1)
+        x = F.relu(self.bn_up2(self.conv_up2(x)))  # -> [B, 192, 64, 64]
+
+        # 1/4 stage
+        x = F.interpolate(x, scale_factor=2.0, mode='nearest')
+        if self.use_attn:
+            x_reduced = self.reduce_192_to_96(x)
+            g = self.gate_4(x_reduced)
+            skip = self.skip_4(f1)
+            att = torch.sigmoid(g + skip)
+            f1 = f1 * att
+        x = torch.cat([x, f1], dim=1)
+        x = F.relu(self.bn_up1(self.conv_up1(x)))  # -> [B, 96, 128, 128]
+
+        # Upsample to full resolution and predict coarse alpha
+        x = F.interpolate(x, size=original_image.shape[-2:], mode='nearest')  # -> [B, 96, 512, 512]
+        coarse_alpha = self.conv_out(x)
+
+        # Detail refinement
+        refine_input = torch.cat([coarse_alpha, original_image], dim=1)
+        r = F.relu(self.bn_refine1(self.refine_conv1(refine_input)))
+        r = F.relu(self.bn_refine2(self.refine_conv2(r)))
+        refined_alpha = self.refine_conv3(r)
+
+        return torch.sigmoid(refined_alpha)
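
A quick way to sanity-check the new module is a random-input forward pass. This smoke test is a sketch, not part of the commit; it only verifies output shape and range, using the same config that app.py passes to the model:

    import torch
    from model import SwinMattingModel

    model = SwinMattingModel({
        "encoder": {"model_name": "microsoft/swin-small-patch4-window7-224"},
        "decoder": {"use_attn": True, "refine_channels": 16},
    }).eval()

    dummy = torch.randn(1, 3, 512, 512)  # one 512x512 RGB image
    with torch.inference_mode():
        alpha = model(dummy)

    print(alpha.shape)                              # torch.Size([1, 1, 512, 512])
    print(alpha.min().item(), alpha.max().item())   # sigmoid output, stays within [0, 1]
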
requirements.txt CHANGED
@@ -1,8 +1,8 @@
 gradio
 torch
 torchvision
+transformers
 numpy
 pillow
 pymatting
-opencv-python
-onnxruntime-gpu
+opencv-python
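
For reference, the updated dependency set can be installed in one line; onnxruntime-gpu is no longer required after this change:

    pip install gradio torch torchvision transformers numpy pillow pymatting opencv-python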