DarthReca committed
Commit 6cd35b4 · verified · 1 Parent(s): 1ead1df

Upload 4 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ spherical_armonics.py filter=lfs diff=lfs merge=lfs -text
location_encoder.py ADDED
@@ -0,0 +1,158 @@
+ # Copyright (c) Microsoft Corporation.
+
+ import math
+
+ import torch
+ from einops import rearrange
+ from torch import nn
+ from torch.nn import functional as F
+
+ from .positional_encoding import SphericalHarmonics
+
+
+ class LocationEncoder(nn.Module):
+     def __init__(
+         self,
+         dim_hidden: int,
+         num_layers: int,
+         dim_out: int,
+         legendre_polys: int = 10,
+     ):
+         super().__init__()
+         self.posenc = SphericalHarmonics(legendre_polys=legendre_polys)
+         self.nnet = SirenNet(
+             dim_in=self.posenc.embedding_dim,
+             dim_hidden=dim_hidden,
+             num_layers=num_layers,
+             dim_out=dim_out,
+         )
+
+     def forward(self, x):
+         x = self.posenc(x)
+         return self.nnet(x)
+
+
+ class SirenNet(nn.Module):
+     """Sinusoidal Representation Network (SIREN)"""
+
+     def __init__(
+         self,
+         dim_in,
+         dim_hidden,
+         dim_out,
+         num_layers,
+         w0=1.0,
+         w0_initial=30.0,
+         use_bias=True,
+         final_activation=None,
+         degreeinput=False,
+         dropout=True,
+     ):
+         super().__init__()
+         self.num_layers = num_layers
+         self.dim_hidden = dim_hidden
+         self.degreeinput = degreeinput
+
+         self.layers = nn.ModuleList([])
+         for ind in range(num_layers):
+             is_first = ind == 0
+             layer_w0 = w0_initial if is_first else w0
+             layer_dim_in = dim_in if is_first else dim_hidden
+
+             self.layers.append(
+                 Siren(
+                     dim_in=layer_dim_in,
+                     dim_out=dim_hidden,
+                     w0=layer_w0,
+                     use_bias=use_bias,
+                     is_first=is_first,
+                     dropout=dropout,
+                 )
+             )
+
+         final_activation = (
+             nn.Identity() if not exists(final_activation) else final_activation
+         )
+         self.last_layer = Siren(
+             dim_in=dim_hidden,
+             dim_out=dim_out,
+             w0=w0,
+             use_bias=use_bias,
+             activation=final_activation,
+             dropout=False,
+         )
+
+     def forward(self, x, mods=None):
+         # normalize degree inputs into a -pi to pi range
+         if self.degreeinput:
+             x = torch.deg2rad(x) - torch.pi
+
+         mods = cast_tuple(mods, self.num_layers)
+
+         for layer, mod in zip(self.layers, mods):
+             x = layer(x)
+
+             if exists(mod):
+                 x *= rearrange(mod, "d -> () d")
+
+         return self.last_layer(x)
+
+
+ class Sine(nn.Module):
+     def __init__(self, w0=1.0):
+         super().__init__()
+         self.w0 = w0
+
+     def forward(self, x):
+         return torch.sin(self.w0 * x)
+
+
+ class Siren(nn.Module):
+     def __init__(
+         self,
+         dim_in,
+         dim_out,
+         w0=1.0,
+         c=6.0,
+         is_first=False,
+         use_bias=True,
+         activation=None,
+         dropout=False,
+     ):
+         super().__init__()
+         self.dim_in = dim_in
+         self.is_first = is_first
+         self.dim_out = dim_out
+         self.dropout = dropout
+
+         weight = torch.zeros(dim_out, dim_in)
+         bias = torch.zeros(dim_out) if use_bias else None
+         self.init_(weight, bias, c=c, w0=w0)
+
+         self.weight = nn.Parameter(weight)
+         self.bias = nn.Parameter(bias) if use_bias else None
+         self.activation = Sine(w0) if activation is None else activation
+
+     def init_(self, weight, bias, c, w0):
+         dim = self.dim_in
+
+         w_std = (1 / dim) if self.is_first else (math.sqrt(c / dim) / w0)
+         weight.uniform_(-w_std, w_std)
+
+         if exists(bias):
+             bias.uniform_(-w_std, w_std)
+
+     def forward(self, x):
+         out = F.linear(x, self.weight, self.bias)
+         if self.dropout:
+             out = F.dropout(out, training=self.training)
+         out = self.activation(out)
+         return out
+
+
+ def exists(val):
+     return val is not None
+
+
+ def cast_tuple(val, repeat=1):
+     return val if isinstance(val, tuple) else ((val,) * repeat)
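
For reference, a minimal usage sketch of the encoder above (not part of the commit). It assumes the uploaded files are importable as a package, so the relative import of positional_encoding resolves; the package name in the import is hypothetical.

import torch

# Hypothetical package path; adjust to wherever these modules live.
from closp_model.location_encoder import LocationEncoder

# dim_hidden=512, num_layers=2, dim_out=256, legendre_polys=10 mirrors the
# hardcoded instantiation in modeling_closp.py below; eval() disables the
# dropout inside the SIREN layers.
encoder = LocationEncoder(dim_hidden=512, num_layers=2, dim_out=256, legendre_polys=10).eval()

# Input is a (batch, 2) tensor of (longitude, latitude) in degrees.
lonlat = torch.tensor([[9.19, 45.46], [-74.01, 40.71]])
embeddings = encoder(lonlat)
print(embeddings.shape)  # torch.Size([2, 256]); the posenc feeds 10*10=100 features into the SIREN
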
modeling_closp.py ADDED
@@ -0,0 +1,202 @@
+ from dataclasses import dataclass
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from timm import create_model
+ from transformers import (
+     AutoConfig,
+     AutoModel,
+     AutoTokenizer,
+     PretrainedConfig,
+     PreTrainedModel,
+ )
+ from transformers.utils import ModelOutput
+
+ from .location_encoder import LocationEncoder
+
+
+ class CLOSPConfig(PretrainedConfig):
+     """
+     Configuration class for CLOSPModel.
+
+     This class stores the configuration of a CLOSPModel, which is used to instantiate the model
+     according to the specified parameters.
+     """
+
+     model_type = "closp"
+
+     def __init__(
+         self,
+         # Vision model parameters
+         vision_model_key: str = "vit-s",
+         s1_embedding_dim: int = 384,
+         s2_embedding_dim: int = 384,
+         s1_head_dim: int = 0,
+         s2_head_dim: int = 0,
+         # Text model parameters
+         text_model_name_or_path: str = "distilbert-base-uncased",
+         # Location encoder parameters (optional)
+         use_location_encoder: bool = True,
+         location_embedding_dim: int = 512,
+         # General model parameters
+         projection_dim: int = 768,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.vision_model_key = vision_model_key
+         self.s1_embedding_dim = s1_embedding_dim
+         self.s2_embedding_dim = s2_embedding_dim
+         self.text_model_name_or_path = text_model_name_or_path
+         self.use_location_encoder = use_location_encoder
+         self.location_embedding_dim = location_embedding_dim
+         self.projection_dim = projection_dim
+         self.s1_head_dim = s1_head_dim
+         self.s2_head_dim = s2_head_dim
+
+
+ # --- Structured Model Output ---
+ @dataclass
+ class CLOSPOutput(ModelOutput):
+     """
+     Base class for CLOSP model's outputs.
+     """
+
+     loss: torch.FloatTensor = None
+     logits_per_image: torch.FloatTensor = None
+     logits_per_text: torch.FloatTensor = None
+     logits_per_loc_img: torch.FloatTensor = None
+     logits_per_img_loc: torch.FloatTensor = None
+     image_embeds: torch.FloatTensor = None
+     text_embeds: torch.FloatTensor = None
+     location_embeds: torch.FloatTensor = None
+
+
+ class CLOSPModel(PreTrainedModel):
+     config_class = CLOSPConfig
+
+     def __init__(self, config: CLOSPConfig):
+         super().__init__(config)
+         # --- Vision Encoders ---
+         self.s1_encoder = create_model(
+             config.vision_model_key,
+             in_chans=2,
+             num_classes=config.s1_head_dim,
+             pretrained=False,
+         )
+         self.s2_encoder = create_model(
+             config.vision_model_key,
+             in_chans=13,
+             num_classes=config.s2_head_dim,
+             pretrained=False,
+         )
+         self.s1_projection = nn.Linear(config.s1_embedding_dim, config.projection_dim)
+         self.s2_projection = nn.Linear(config.s2_embedding_dim, config.projection_dim)
+
+         # --- Text Encoder ---
+         self.text_model = AutoModel.from_config(
+             AutoConfig.from_pretrained(config.text_model_name_or_path)
+         )
+         self.tokenizer = AutoTokenizer.from_pretrained(config.text_model_name_or_path)
+
+         # --- Location Encoder ---
+         if config.use_location_encoder:
+             self.location_encoder = LocationEncoder(512, 2, 256, 10)
+             self.location_projection = nn.Linear(
+                 config.location_embedding_dim, config.projection_dim
+             )
+
+     def tokenize_text(self, text: str):
+         """Tokenizes input text using the model's tokenizer."""
+         return self.tokenizer(
+             text,
+             padding="max_length",
+             truncation=True,
+             max_length=self.tokenizer.model_max_length,
+             return_tensors="pt",
+         )
+
+     def get_image_features(self, image: torch.Tensor) -> torch.Tensor:
+         """Encodes an image tensor into features."""
+         image = image.float()
+         if image.shape[1] == 2:  # Sentinel-1
+             image_features = self.s1_projection(self.s1_encoder(image))
+         else:  # Sentinel-2
+             image_features = self.s2_projection(self.s2_encoder(image))
+
+         return F.normalize(image_features, p=2, dim=-1)
+
+     def get_text_features(
+         self, input_ids: torch.Tensor, attention_mask: torch.Tensor
+     ) -> torch.Tensor:
+         """Encodes text tokens into features."""
+         text_outputs = self.text_model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             output_hidden_states=True,
+         )
+         text_features = text_outputs.last_hidden_state[:, 0, :]
+         return F.normalize(text_features, p=2, dim=-1)
+
+     def get_location_features(self, coords: torch.Tensor) -> torch.Tensor:
+         """Encodes coordinates into features."""
+         if not self.config.use_location_encoder:
+             raise ValueError(
+                 "Location encoder is not enabled for this model. Set `use_location_encoder=True` in config."
+             )
+         location_features = self.location_encoder(coords)
+         location_features = self.location_projection(location_features)
+         return F.normalize(location_features, p=2, dim=-1)
+
+     def forward(
+         self,
+         image: torch.Tensor,
+         input_ids: torch.Tensor,
+         attention_mask: torch.Tensor,
+         coords: torch.Tensor = None,
+         return_loss: bool = False,
+     ) -> CLOSPOutput:
+         image_embeds = self.get_image_features(image)
+         text_embeds = self.get_text_features(input_ids, attention_mask)
+
+         # Cosine similarity as logits
+         logits_per_image = image_embeds @ text_embeds.T
+         logits_per_text = logits_per_image.T
+
+         # --- Optional Location Logic ---
+         location_embeds = None
+         logits_per_loc_img = None
+         logits_per_img_loc = None
+
+         if self.config.use_location_encoder:
+             if coords is None:
+                 raise ValueError(
+                     "Coordinates must be provided when use_location_encoder is True."
+                 )
+             location_embeds = self.get_location_features(coords)
+             logits_per_loc_img = location_embeds @ image_embeds.T
+             logits_per_img_loc = image_embeds @ location_embeds.T
+
+         # --- Optional Loss Calculation ---
+         loss = None
+         if return_loss:
+             outputs = [
+                 logits_per_image,
+                 logits_per_text,
+                 logits_per_loc_img,
+                 logits_per_img_loc,
+             ]
+             ground_truth = torch.arange(len(input_ids)).to(self.device)
+             loss = [F.cross_entropy(o, ground_truth) for o in outputs if o is not None]
+             loss = sum(loss) / len(loss)
+
+         return CLOSPOutput(
+             loss=loss,
+             logits_per_image=logits_per_image,
+             logits_per_text=logits_per_text,
+             logits_per_loc_img=logits_per_loc_img,
+             logits_per_img_loc=logits_per_img_loc,
+             image_embeds=image_embeds,
+             text_embeds=text_embeds,
+             location_embeds=location_embeds,
+         )
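
A hedged inference sketch for the model above (not part of the commit). Two assumptions are worth flagging: the default vision_model_key "vit-s" must name an architecture registered with timm, so a standard timm key is substituted here purely for illustration; and the hardcoded LocationEncoder(512, 2, 256, 10) produces 256-dimensional features, so location_embedding_dim is set to 256 here to make the location projection's input size line up.

import torch

# Assumptions: "vit_small_patch16_224" stands in for the default "vit-s" key,
# and the distilbert-base-uncased tokenizer is downloadable.
config = CLOSPConfig(
    vision_model_key="vit_small_patch16_224",
    location_embedding_dim=256,  # matches the hardcoded LocationEncoder dim_out
)
model = CLOSPModel(config).eval()

tokens = model.tokenize_text(["flooded farmland along a river"])
s2_image = torch.randn(1, 13, 224, 224)  # Sentinel-2 input: 13 channels
coords = torch.tensor([[9.19, 45.46]])   # (lon, lat) in degrees

with torch.no_grad():
    out = model(
        image=s2_image,
        input_ids=tokens["input_ids"],
        attention_mask=tokens["attention_mask"],
        coords=coords,
        return_loss=True,
    )
print(out.logits_per_image, out.loss)

With return_loss=True, the forward pass averages the image-text, text-image, location-image, and image-location cross-entropy terms against in-batch identity targets, as in CLIP-style contrastive training.
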
positional_encoding.py ADDED
@@ -0,0 +1,110 @@
+ # Copyright (c) Microsoft Corporation.
+
+ import math
+
+ import torch
+ from torch import nn
+
+ from .spherical_armonics import SH as SH_analytic
+
+
+ class SphericalHarmonics(nn.Module):
+     """
+     Spherical harmonics location encoder
+     """
+
+     def __init__(self, legendre_polys: int = 10, harmonics_calculation="analytic"):
+         """
+         legendre_polys: determines the number of Legendre polynomials;
+             more polynomials lead to more fine-grained resolutions
+         harmonics_calculation ("analytic" or "closed-form"):
+             "analytic" uses pre-computed equations. This is exact, but works only up to degree 50;
+             "closed-form" uses a single equation but is computationally slower (especially for high degrees)
+         """
+         super(SphericalHarmonics, self).__init__()
+         self.L, self.M = int(legendre_polys), int(legendre_polys)
+         self.embedding_dim = self.L * self.M
+
+         if harmonics_calculation == "closed-form":
+             self.SH = SH_closed_form
+         elif harmonics_calculation == "analytic":
+             self.SH = SH_analytic
+
+     def forward(self, lonlat):
+         lon, lat = lonlat[:, 0], lonlat[:, 1]
+
+         # convert degrees to radians
+         phi = torch.deg2rad(lon + 180)
+         theta = torch.deg2rad(lat + 90)
+         """
+         greater_than_50 = (lon > 50).any() or (lat > 50).any()
+         if greater_than_50:
+             SH = SH_closed_form
+         else:
+             SH = SH_analytic
+         """
+         SH = self.SH
+
+         Y = []
+         for l in range(self.L):
+             for m in range(-l, l + 1):
+                 y = SH(m, l, phi, theta)
+                 if isinstance(y, float):
+                     y = y * torch.ones_like(phi)
+                 if y.isnan().any():
+                     print(m, l, y)
+                 Y.append(y)
+
+         return torch.stack(Y, dim=-1)
+
+
+ ####################### Spherical Harmonics utilities ########################
+ # Code copied from https://github.com/BachiLi/redner/blob/master/pyredner/utils.py
+ # Code adapted from "Spherical Harmonic Lighting: The Gritty Details", Robin Green
+ # http://silviojemma.com/public/papers/lighting/spherical-harmonic-lighting.pdf
+ def associated_legendre_polynomial(l, m, x):
+     pmm = torch.ones_like(x)
+     if m > 0:
+         somx2 = torch.sqrt((1 - x) * (1 + x))
+         fact = 1.0
+         for i in range(1, m + 1):
+             pmm = pmm * (-fact) * somx2
+             fact += 2.0
+     if l == m:
+         return pmm
+     pmmp1 = x * (2.0 * m + 1.0) * pmm
+     if l == m + 1:
+         return pmmp1
+     pll = torch.zeros_like(x)
+     for ll in range(m + 2, l + 1):
+         pll = ((2.0 * ll - 1.0) * x * pmmp1 - (ll + m - 1.0) * pmm) / (ll - m)
+         pmm = pmmp1
+         pmmp1 = pll
+     return pll
+
+
+ def SH_renormalization(l, m):
+     return math.sqrt(
+         (2.0 * l + 1.0) * math.factorial(l - m) / (4 * math.pi * math.factorial(l + m))
+     )
+
+
+ def SH_closed_form(m, l, phi, theta):
+     if m == 0:
+         return SH_renormalization(l, m) * associated_legendre_polynomial(
+             l, m, torch.cos(theta)
+         )
+     elif m > 0:
+         return (
+             math.sqrt(2.0)
+             * SH_renormalization(l, m)
+             * torch.cos(m * phi)
+             * associated_legendre_polynomial(l, m, torch.cos(theta))
+         )
+     else:
+         return (
+             math.sqrt(2.0)
+             * SH_renormalization(l, -m)
+             * torch.sin(-m * phi)
+             * associated_legendre_polynomial(l, -m, torch.cos(theta))
+         )
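
A small sanity-check sketch for the closed-form branch above (not part of the commit). It uses the fact that the first harmonic Y_0^0 is the constant 1/sqrt(4*pi) regardless of angle, and checks the embedding shape; it assumes SphericalHarmonics and SH_closed_form are in scope, e.g. imported from this module.

import math

import torch

phi = torch.tensor([0.5, 1.5])
theta = torch.tensor([1.0, 2.0])

# Y_0^0 is the constant 1/sqrt(4*pi), independent of the input angles.
y00 = SH_closed_form(0, 0, phi, theta)
assert torch.allclose(y00, torch.full_like(phi, 1 / math.sqrt(4 * math.pi)))

# The forward pass stacks one value per (l, m) pair, giving L**2 features.
sh = SphericalHarmonics(legendre_polys=10, harmonics_calculation="closed-form")
lonlat = torch.tensor([[9.19, 45.46]])  # (lon, lat) in degrees
print(sh(lonlat).shape)  # torch.Size([1, 100]) == legendre_polys ** 2
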
spherical_armonics.py ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1fc4e9b49abb4e81411376fc6d09b1281aa8ed96cef64b7aa95cc4aeeccb97a4
+ size 10994723