yongyizang
/

MSR_UFormers

Model card Files Files and versions Community

yongyizang committed on May 27

Commit

deba883

verified ·

1 Parent(s): f0a09d7

Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

__pycache__/model.cpython-313.pyc +0 -0
checkpoints/acoustic_guitar.pth +3 -0
checkpoints/bass.pth +3 -0
checkpoints/electric_guitar.pth +3 -0
checkpoints/guitars.pth +3 -0
checkpoints/keyboards.pth +3 -0
checkpoints/orchestra.pth +3 -0
checkpoints/rhythm_section.pth +3 -0
checkpoints/synth.pth +3 -0
checkpoints/vocals.pth +3 -0
inference.py +110 -0
model.py +550 -0
requirements.txt +105 -0

__pycache__/model.cpython-313.pyc ADDED Viewed

Binary file (22 kB). View file

checkpoints/acoustic_guitar.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43c8060f061fea8d9dd42e7244004cbbbdb5672e353dfb1a8de5dcc2837ff848
+size 57419739

checkpoints/bass.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5abfb0f75f1f10d07f483acca4612494767d676a62035877b81c48d67db7d73f
+size 57419739

checkpoints/electric_guitar.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:615143058e675760c757ac2eab996c8c426d4163f0b64949f112e5cc0c4072e4
+size 57419739

checkpoints/guitars.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6fcf5090367aee602bda4ccc6cb345da127cb7139be669bdd4ad9aad5b025a0d
+size 57419739

checkpoints/keyboards.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:132a79c2d6476a00e818031e097a0555662350510aa9d0733a679d34e3acf2c5
+size 57419739

checkpoints/orchestra.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e2c68f28302f6256008185c98db7b4610a606dce973fbf7f605627b19ef7cbab
+size 57419739

checkpoints/rhythm_section.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5aa8c854981359d3564b720c338d19c614029570586e856b0576124515bf01e2
+size 57419739

checkpoints/synth.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0b54daf5f65b9eeaef7d98efcdfd9b17616f732d1127a4a32c9a0bdd11689c4a
+size 57419739

checkpoints/vocals.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1029b7c5f3fb06969f740a7583ca27a6944a8cd078b2cd5c6169dc512dd7a097
+size 57419739

inference.py ADDED Viewed

	@@ -0,0 +1,110 @@

+#!/usr/bin/env python
+import io, os, torch, numpy as np, soundfile as sf
+from huggingface_hub import snapshot_download
+from model import UFormer, UFormerConfig
+# ——————————————————————
+# 1) Setup
+# ——————————————————————
+# Hub repository that hosts this code and the per-instrument checkpoints.
+REPO_ID  = "yongyizang/MSR_UFormers"
+# Run on the GPU when one is visible to torch, otherwise on the CPU.
+device   = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# NOTE: executed at import time; the returned folder is the root under which
+# _get_model looks for "checkpoints/<name>.pth".
+local_dir= snapshot_download(REPO_ID)
+# Default hyper-parameters; every checkpoint is loaded into a model built from this.
+config   = UFormerConfig()
+# Maps checkpoint name -> loaded model so each checkpoint is read at most once.
+_model_cache = {}
+# Accepted checkpoint names; each one is resolved to "checkpoints/<name>.pth".
+VALID_CKPTS = [
+    "acoustic_guitar","bass","electric_guitar","guitars","keyboards",
+    "orchestra","rhythm_section","synth","vocals"
+]
+def _get_model(ckpt_name: str):
+    if ckpt_name not in VALID_CKPTS:
+        raise ValueError(f"Invalid checkpoint {ckpt_name!r}, choose from {VALID_CKPTS}")
+    if ckpt_name in _model_cache:
+        return _model_cache[ckpt_name]
+    path = os.path.join(local_dir, "checkpoints", f"{ckpt_name}.pth")
+    m = UFormer(config).to(device).eval()
+    sd = torch.load(path, map_location="cpu")
+    m.load_state_dict(sd)
+    _model_cache[ckpt_name] = m
+    return m
+# ——————————————————————
+# 2) Overlap-add helper
+# ——————————————————————
+def _overlap_add(model, x: np.ndarray, sr: int, chunk_s: float=5., hop_s: float=2.5):
+    C, T = x.shape
+    chunk, hop = int(sr*chunk_s), int(sr*hop_s)
+    pad = (-(T - chunk) % hop) if T>chunk else 0
+    x_pad = np.pad(x, ((0,0),(0,pad)), mode="reflect")
+    win   = np.hanning(chunk)[None,:]
+    out   = np.zeros_like(x_pad); norm = np.zeros((1,x_pad.shape[1]))
+    n_chunks = 1 + (x_pad.shape[1] - chunk)//hop
+    for i in range(n_chunks):
+        s = i*hop
+        seg = x_pad[:, s:s+chunk]
+        with torch.no_grad():
+            y = model(torch.from_numpy(seg[None]).to(device)).squeeze(0).cpu().numpy()
+        out[:, s:s+chunk] += y * win
+        norm[0, s:s+chunk] += win
+    return (out / norm)[:, :T]
+# ——————————————————————
+# 3) HF Inference entry-point
+# ——————————————————————
+def inference(input_bytes: bytes, checkpoint: str = "guitars") -> bytes:
+    """
+    audio_bytes in → restored_bytes out.
+    Pass {"inputs": <bytes>, "parameters": {"checkpoint": "<name>"}} to choose.
+    """
+    audio, sr = sf.read(io.BytesIO(input_bytes))
+    if audio.ndim==1: audio = np.stack([audio,audio],axis=1)
+    x = audio.T  # (C,T)
+    model = _get_model(checkpoint)
+    if x.shape[1] <= sr*5:
+        with torch.no_grad():
+            y = model(torch.from_numpy(x[None]).to(device)).squeeze(0).cpu().numpy()
+    else:
+        y = _overlap_add(model, x, sr)
+    buf = io.BytesIO()
+    sf.write(buf, y.T, sr, format="WAV")
+    return buf.getvalue()
+# ——————————————————————
+# 4) CLI & Gradio
+# ——————————————————————
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser("UFormer RESTORE")
+    parser.add_argument("-i","--input", type=str, help="noisy WAV")
+    parser.add_argument("-o","--output",type=str, help="restored WAV")
+    parser.add_argument("-c","--checkpoint",type=str,default="guitars",
+                        choices=VALID_CKPTS)
+    parser.add_argument("--serve",action="store_true", help="launch Gradio")
+    args = parser.parse_args()
+    if args.serve:
+        import gradio as gr
+        def _gr(path, ckpt):
+            return inference(open(path,"rb").read(), checkpoint=ckpt)
+        gr.Interface(
+            fn=_gr,
+            inputs=[
+                gr.Audio(source="upload", type="filepath"),
+                gr.Dropdown(VALID_CKPTS, label="Checkpoint")
+            ],
+            outputs=gr.Audio(type="filepath"),
+            title="🎵 Music Source Restoration Restoration",
+            description="Choose which instrument/group model to run."
+        ).launch()
+    else:
+        assert args.input and args.output
+        out = inference(open(args.input,"rb").read(),
+                        checkpoint=args.checkpoint)
+        open(args.output,"wb").write(out)
+        print(f"✅ Restored → {args.output} using {args.checkpoint}")

model.py ADDED Viewed

	@@ -0,0 +1,550 @@

+from __future__ import annotations
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+import numpy as np
+from dataclasses import dataclass
+class Fourier(nn.Module):
+    def __init__(self,
+        n_fft=2048,
+        hop_length=441,
+        return_complex=True,
+        normalized=True
+    ):
+        super(Fourier, self).__init__()
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.return_complex = return_complex
+        self.normalized = normalized
+    def stft(self, waveform):
+        """
+        Args:
+            waveform: (b, c, samples_num)
+        Returns:
+            complex_sp: (b, c, t, f)
+        """
+        B, C, T = waveform.shape
+        x = rearrange(waveform, 'b c t -> (b c) t')
+        x = torch.stft(
+            input=x,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            window=torch.hann_window(self.n_fft).to(x.device),
+            normalized=self.normalized,
+            return_complex=self.return_complex
+        )
+        # shape: (batch_size * channels_num, freq_bins, frames_num)
+        complex_sp = rearrange(x, '(b c) f t -> b c t f', b=B, c=C)
+        # shape: (batch_size, channels_num, frames_num, freq_bins)
+        return complex_sp
+    def istft(self, complex_sp):
+        """
+        Args:
+            complex_sp: (batch_size, channels_num, frames_num, freq_bins)
+        Returns:
+            waveform: (batch_size, channels_num, samples_num)
+        """
+        B, C, T, F = complex_sp.shape
+        x = rearrange(complex_sp, 'b c t f -> (b c) f t')
+        x = torch.istft(
+            input=x,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            window=torch.hann_window(self.n_fft).to(x.device),
+            normalized=self.normalized,
+        )
+        # shape: (batch_size * channels_num, samples_num)
+        x = rearrange(x, '(b c) t -> b c t', b=B, c=C)
+        # shape: (batch_size, channels_num, samples_num)
+        return x
+class Block(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.att_norm = RMSNorm(config.n_embd)
+        self.att = SelfAttention(config)
+        self.ffn_norm = RMSNorm(config.n_embd)
+        self.mlp = MLP(config)
+    def forward(
+        self,
+        x: torch.Tensor,
+        rope: torch.Tensor,
+        mask: torch.Tensor,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            x: (b, t, d)
+            rope: (t, head_dim/2)
+            mask: (1, 1, t, t)
+        Outputs:
+            x: (b, t, d)
+        """
+        x = x + self.att(self.att_norm(x), rope, mask)
+        x = x + self.mlp(self.ffn_norm(x))
+        return x
+class RMSNorm(nn.Module):
+    r"""Root Mean Square Layer Normalization.
+    Ref: https://github.com/meta-llama/llama/blob/main/llama/model.py
+    """
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.scale = nn.Parameter(torch.ones(dim))
+    def forward(self, x):
+        r"""RMSNorm.
+        Args:
+            x: (b, t, d)
+        Outputs:
+            x: (b, t, d)
+        """
+        norm_x = torch.mean(x ** 2, dim=-1, keepdim=True)
+        output = x * torch.rsqrt(norm_x + self.eps) * self.scale
+        return output
+class SelfAttention(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+    def forward(
+        self,
+        x: torch.Tensor,
+        rope: torch.Tensor,
+        mask: torch.Tensor,
+    ) -> torch.Tensor:
+        r"""Causal self attention.
+        b: batch size
+        t: time steps
+        d: latent dim
+        h: heads num
+        Args:
+            x: (b, t, d)
+            rope: (t, head_dim/2, 2)
+            mask: (1, 1, )
+        Outputs:
+            x: (b, t, d)
+        """
+        B, T, D = x.shape
+        # Calculate query, key, values
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+        # q, k, v shapes: (b, t, d)
+        k = k.view(B, T, self.n_head, D // self.n_head)
+        q = q.view(B, T, self.n_head, D // self.n_head)
+        v = v.view(B, T, self.n_head, D // self.n_head)
+        # q, k, v shapes: (b, t, h, head_dim)
+        q = apply_rope(q, rope)
+        k = apply_rope(k, rope)
+        # q, k shapes: (b, t, h, head_dim)
+        k = k.transpose(1, 2)
+        q = q.transpose(1, 2)
+        v = v.transpose(1, 2)
+        # q, k, v shapes: (b, h, t, head_dim)
+        # Efficient attention using Flash Attention CUDA kernels
+        x = F.scaled_dot_product_attention(
+            query=q,
+            key=k,
+            value=v,
+            attn_mask=mask,
+            dropout_p=0.0
+        )
+        # shape: (b, h, t, head_dim)
+        x = x.transpose(1, 2).contiguous().view(B, T, D)  # shape: (b, t, d)
+        # output projection
+        x = self.c_proj(x)  # shape: (b, t, d)
+        return x
+class MLP(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        # The hyper-parameters follow https://github.com/Lightning-AI/lit-llama/blob/main/lit_llama/model.py
+        hidden_dim = 4 * config.n_embd
+        n_hidden = int(2 * hidden_dim / 3)
+        self.c_fc1 = nn.Linear(config.n_embd, n_hidden, bias=False)
+        self.c_fc2 = nn.Linear(config.n_embd, n_hidden, bias=False)
+        self.c_proj = nn.Linear(n_hidden, config.n_embd, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        r"""Causal self attention.
+        Args:
+            x: (b, t, d)
+        Outputs:
+            x: (b, t, d)
+        """
+        x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
+        x = self.c_proj(x)
+        return x
+def build_rope(
+    seq_len: int, head_dim: int, base: int = 10000
+) -> torch.Tensor:
+    r"""Rotary Position Embedding.
+    Modified from: https://github.com/Lightning-AI/lit-llama/blob/main/lit_llama/model.py
+    Args:
+        seq_len: int, e.g., 1024
+        head_dim: head dim, e.g., 768/24
+        base: int
+    Outputs:
+        cache: (t, head_dim/2, 2)
+    """
+    theta = 1.0 / (base ** (torch.arange(0, head_dim, 2) / head_dim))
+    seq_idx = torch.arange(seq_len)
+    # Calculate the product of position index and $\theta_i$
+    idx_theta = torch.outer(seq_idx, theta).float()
+    cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
+    return cache
+def apply_rope(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
+    # truncate to support variable sizes
+    T = x.size(1)
+    rope_cache = rope_cache[:T]
+    # cast because the reference does
+    xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
+    rope_cache = rope_cache.view(1, xshaped.size(1), 1, xshaped.size(3), 2)
+    x_out2 = torch.stack(
+        [
+            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
+            xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
+        ],
+        -1,
+    )
+    x_out2 = x_out2.flatten(3)
+    return x_out2.type_as(x)
+@dataclass
+class UFormerConfig:
+    """Hyper-parameters consumed by ``UFormer``."""
+    # Sample rate; UFormer floor-divides it by hop_length to get frames per second.
+    sr: float = 44100
+    # FFT size passed to the STFT.
+    n_fft: int = 2048
+    # STFT hop length in samples.
+    hop_length: int = 441
+    # Number of (time block, frequency block) transformer pairs.
+    n_layer: int = 6
+    # Attention heads per transformer block; must divide n_embd.
+    n_head: int = 8
+    # Transformer width; also the output channels of the last encoder stage.
+    n_embd: int = 256
+class UFormer(Fourier):
+    def __init__(self, config: UFormerConfig) -> None:
+        super(UFormer, self).__init__(
+            n_fft=config.n_fft,
+            hop_length=config.hop_length,
+            return_complex=True,
+            normalized=True
+        )
+        self.ds_factor = 16  # Downsample factor
+        self.fps = config.sr // config.hop_length
+        self.audio_channels = 2
+        self.cmplx_num = 2
+        in_channels = self.audio_channels * self.cmplx_num
+        self.encoder_block1 = EncoderBlock(in_channels, 16)
+        self.encoder_block2 = EncoderBlock(16, 64)
+        self.encoder_block3 = EncoderBlock(64, 256)
+        self.encoder_block4 = EncoderBlock(256, config.n_embd)
+        self.decoder_block1 = DecoderBlock(config.n_embd, 256)
+        self.decoder_block2 = DecoderBlock(256, 64)
+        self.decoder_block3 = DecoderBlock(64, 16)
+        self.decoder_block4 = DecoderBlock(16, 16)
+        self.t_blocks = nn.ModuleList(Block(config) for _ in range(config.n_layer))
+        self.f_blocks = nn.ModuleList(Block(config) for _ in range(config.n_layer))
+        self.head_dim = config.n_embd // config.n_head
+        t_rope = build_rope(seq_len=config.n_fft // 16, head_dim=self.head_dim)
+        f_rope = build_rope(seq_len=self.fps * 20, head_dim=self.head_dim)
+        self.register_buffer(name="t_rope", tensor=t_rope)  # shape: (t, head_dim/2, 2)
+        self.register_buffer(name="f_rope", tensor=f_rope)  # shape: (t, head_dim/2, 2)
+        self.post_fc = nn.Conv2d(
+            in_channels=16,
+            out_channels=in_channels,
+            kernel_size=1,
+            padding=0,
+        )
+    def forward(self, audio):
+        """Separation model.
+        b: batch_size
+        c: channels_num
+        l: audio_samples
+        t: frames_num
+        f: freq_bins
+        Args:
+            audio: (b, c, t)
+        Outputs:
+            output: (b, c, t)
+        """
+        # Complex spectrum
+        complex_sp = self.stft(audio)  # shape: (b, c, t, f)
+        x = torch.view_as_real(complex_sp)  # shape: (b, c, t, f, 2)
+        x = rearrange(x, 'b c t f k -> b (c k) t f')  # shape: (b, d, t, f)
+        # pad stft
+        x, pad_t = self.pad_tensor(x)  # x: (b, d, t, f)
+        B = x.shape[0]
+        x1, latent1 = self.encoder_block1(x)
+        x2, latent2 = self.encoder_block2(x1)
+        x3, latent3 = self.encoder_block3(x2)
+        x, latent4 = self.encoder_block4(x3)
+        for t_block, f_block in zip(self.t_blocks, self.f_blocks):
+            x = rearrange(x, 'b d t f -> (b f) t d')
+            x = t_block(x, self.t_rope, mask=None)  # shape: (b*f, t, d)
+            x = rearrange(x, '(b f) t d -> (b t) f d', b=B)
+            x = f_block(x, self.f_rope, mask=None)  # shape: (b*t, f, d)
+            x = rearrange(x, '(b t) f d -> b d t f', b=B)  # shape: (b, d, t, f)
+        x5 = self.decoder_block1(x, latent4)
+        x6 = self.decoder_block2(x5, latent3)
+        x7 = self.decoder_block3(x6, latent2)
+        x8 = self.decoder_block4(x7, latent1)
+        x = self.post_fc(x8)
+        x = rearrange(x, 'b (c k) t f -> b c t f k', k=self.cmplx_num).contiguous()
+        x = x.to(torch.float)  # compatible with bf16
+        mask = torch.view_as_complex(x)  # shape: (b, c, t, f)
+        # Unpad mask to the original shape
+        mask = self.unpad_tensor(mask, pad_t)  # shape: (b, c, t, f)
+        # Calculate stft of separated audio
+        # sep_stft = mask * complex_sp  # shape: (b, c, t, f)
+        # ISTFT
+        output = self.istft(mask)  # shape: (b, c, l)
+        return output
+    def pad_tensor(self, x: torch.Tensor) -> tuple[torch.Tensor, int]:
+        """Pad a spectrum that can be evenly divided by downsample_ratio.
+        Args:
+            x: E.g., (b, c, t=201, f=1025)
+        Outpus:
+            output: E.g., (b, c, t=208, f=1024)
+        """
+        # Pad last frames, e.g., 201 -> 208
+        T = x.shape[2]
+        pad_t = -T % self.ds_factor
+        x = F.pad(x, pad=(0, 0, 0, pad_t))
+        # Remove last frequency bin, e.g., 1025 -> 1024
+        x = x[:, :, :, 0 : -1]
+        return x, pad_t
+    def unpad_tensor(self, x: torch.Tensor, pad_t: int) -> torch.Tensor:
+        """Unpad a spectrum to the original shape.
+        Args:
+            x: E.g., (b, c, t=208, f=1024)
+        Outpus:
+            x: E.g., (b, c, t=201, f=1025)
+        """
+        # Pad last frequency bin, e.g., 1024 -> 1025
+        x = F.pad(x, pad=(0, 1))
+        # Unpad last frames, e.g., 208 -> 201
+        x = x[:, :, 0 : -pad_t, :]
+        return x
+class ConvBlock(nn.Module):
+    def __init__(
+        self, in_channels, out_channels, kernel_size):
+        r"""Residual block."""
+        super(ConvBlock, self).__init__()
+        padding = [kernel_size[0] // 2, kernel_size[1] // 2]
+        self.bn1 = nn.BatchNorm2d(in_channels)
+        self.bn2 = nn.BatchNorm2d(out_channels)
+        self.conv1 = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            padding=padding,
+            bias=False,
+        )
+        self.conv2 = nn.Conv2d(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            padding=padding,
+            bias=False,
+        )
+        if in_channels != out_channels:
+            self.shortcut = nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(1, 1),
+                padding=(0, 0),
+            )
+            self.is_shortcut = True
+        else:
+            self.is_shortcut = False
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: (b, c_in, t, f)
+        Returns:
+            output: (b, c_out, t, f)
+        """
+        h = self.conv1(F.leaky_relu_(self.bn1(x)))
+        h = self.conv2(F.leaky_relu_(self.bn2(h)))
+        if self.is_shortcut:
+            return self.shortcut(x) + h
+        else:
+            return x + h
+class EncoderBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=(3, 3)):
+        super(EncoderBlock, self).__init__()
+        self.pool_size = 2
+        self.conv_block = ConvBlock(in_channels, out_channels, kernel_size)
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            x: (b, c_in, t, f)
+        Returns:
+            latent: (b, c_out, t, f)
+            output: (b, c_out, t/2, f/2)
+        """
+        latent = self.conv_block(x)  # shape: (b, c_out, t, f)
+        output = F.avg_pool2d(latent, kernel_size=self.pool_size)  # shape: (b, c_out, t/2, f/2)
+        return output, latent
+class DecoderBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=(3, 3)):
+        super(DecoderBlock, self).__init__()
+        stride = 2
+        self.upsample = torch.nn.ConvTranspose2d(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=stride,
+            stride=stride,
+            padding=(0, 0),
+            bias=False,
+        )
+        self.conv_block = ConvBlock(in_channels * 2, out_channels, kernel_size)
+    def forward(self, x: torch.Tensor, latent: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: (b, c_in, t/2, f/2)
+        Returns:
+            output: (b, c_out, t, f)
+        """
+        x = self.upsample(x)  # shape: (b, c_in, t, f)
+        x = torch.cat((x, latent), dim=1)  # shape: (b, 2*c_in, t, f)
+        x = self.conv_block(x)  # shape: (b, c_out, t, f)
+        return x
+if __name__ == "__main__":
+    # Example usage
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    config = UFormerConfig()
+    model = UFormer(config)
+    checkpoint_path = None
+    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
+    model.to(device)
+    audio = torch.randn(1, 2, 10*44100).to(device)  # Example audio input (batch_size=1, channels=2, samples=88200)
+    output = model(audio)
+    print(output.shape)  # Output shape

requirements.txt ADDED Viewed

	@@ -0,0 +1,105 @@

+huggingface_hub
+absl-py==2.2.2
+altair==5.5.0
+attrs==25.3.0
+audioop-lts==0.2.1
+audioread==3.0.1
+blinker==1.9.0
+cachetools==5.5.2
+certifi==2025.4.26
+cffi==1.17.1
+charset-normalizer==3.4.2
+click==8.1.8
+contourpy==1.3.2
+cycler==0.12.1
+decorator==5.2.1
+einops==0.8.1
+filelock==3.18.0
+fonttools==4.57.0
+fsspec==2025.3.2
+gitdb==4.0.12
+GitPython==3.1.44
+grpcio==1.71.0
+idna==3.10
+imageio==2.37.0
+Jinja2==3.1.6
+joblib==1.5.0
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+kiwisolver==1.4.8
+lazy_loader==0.4
+librosa==0.11.0
+llvmlite==0.44.0
+Markdown==3.8
+MarkupSafe==3.0.2
+matplotlib==3.10.1
+mpmath==1.3.0
+msgpack==1.1.0
+narwhals==1.38.0
+networkx==3.4.2
+numba==0.61.2
+numpy==2.2.5
+nvidia-cublas-cu12==12.6.4.1
+nvidia-cuda-cupti-cu12==12.6.80
+nvidia-cuda-nvrtc-cu12==12.6.77
+nvidia-cuda-runtime-cu12==12.6.77
+nvidia-cudnn-cu12==9.5.1.17
+nvidia-cufft-cu12==11.3.0.4
+nvidia-cufile-cu12==1.11.1.6
+nvidia-curand-cu12==10.3.7.77
+nvidia-cusolver-cu12==11.7.1.2
+nvidia-cusparse-cu12==12.5.4.2
+nvidia-cusparselt-cu12==0.6.3
+nvidia-nccl-cu12==2.26.2
+nvidia-nvjitlink-cu12==12.6.85
+nvidia-nvtx-cu12==12.6.77
+packaging==24.2
+pandas==2.2.3
+pedalboard==0.9.16
+pillow==11.2.1
+platformdirs==4.3.7
+pooch==1.8.2
+progressbar==2.5
+protobuf==6.30.2
+pyarrow==20.0.0
+pycparser==2.22
+pydeck==0.9.1
+pyparsing==3.2.3
+python-dateutil==2.9.0.post0
+pytz==2025.2
+PyYAML==6.0.2
+referencing==0.36.2
+requests==2.32.3
+rpds-py==0.24.0
+scikit-image==0.25.2
+scikit-learn==1.6.1
+scipy==1.15.2
+setuptools==78.1.1
+six==1.17.0
+smmap==5.0.2
+soundfile==0.13.1
+soxr==0.5.0.post1
+standard-aifc==3.13.0
+standard-chunk==3.13.0
+standard-sunau==3.13.0
+streamlit==1.45.0
+sympy==1.14.0
+tenacity==9.1.2
+tensorboard==2.19.0
+tensorboard-data-server==0.7.2
+threadpoolctl==3.6.0
+tifffile==2025.3.30
+toml==0.10.2
+torch==2.7.0
+torchaudio==2.7.0
+torchlibrosa==0.1.0
+tornado==6.4.2
+tqdm==4.67.1
+triton==3.3.0
+typing_extensions==4.13.2
+tzdata==2025.2
+urllib3==2.4.0
+voicefixer==0.1.3
+watchdog==6.0.0
+Werkzeug==3.1.3
+wheel==0.45.1