Upload model

Browse files

Files changed (3) hide show

config.json +4 -0
configuration_llip.py +12 -0
modeling_llip.py +364 -0

config.json CHANGED Viewed

@@ -2,6 +2,10 @@
   "architectures": [
     "LlipModel"
   ],
   "init_logit_bias": -10,
   "initializer_factor": 1.0,
   "logit_scale_init_value": 2.6592,

   "architectures": [
     "LlipModel"
   ],
+  "auto_map": {
+    "AutoConfig": "configuration_llip.LlipConfig",
+    "AutoModel": "modeling_llip.LlipModel"
+  },
   "init_logit_bias": -10,
   "initializer_factor": 1.0,
   "logit_scale_init_value": 2.6592,

configuration_llip.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from transformers import CLIPConfig
+class LlipConfig(CLIPConfig):
+    model_type = "llip"
+    def __init__(self, use_norm=True, ncls=64, num_heads=8, temp=1.0, **kwargs):
+        super().__init__(**kwargs)
+        self.use_norm = use_norm
+        self.num_heads = num_heads
+        self.temp = temp
+        # TODO: Get the vision_config parameters

modeling_llip.py ADDED Viewed

	@@ -0,0 +1,364 @@

+"""LLIP model: a CLIP variant whose vision tower carries several class tokens.
+LlipVisionEmbeddings prepends `ncls` class-token slots to the patch sequence and
+LlipVisionTransformer returns those `ncls` tokens as its pooled output.
+LlipModel projects the visual tokens to keys/values and the pooled text feature to a query.
+LlipPred mixes the visual tokens with multi-head cross-attention, giving one image
+embedding per (image, text) pair, which is scored against the projected text embedding
+with a pairwise sigmoid loss.
+"""
+from dataclasses import dataclass
+from typing import Optional
+import torch
+import torch.nn as nn
+from transformers import (
+    CLIPModel,
+    PretrainedConfig,
+)
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+from transformers.models.clip.modeling_clip import CLIPVisionTransformer
+from transformers.utils import ModelOutput
+from configuration_llip import LlipConfig
+@dataclass
+class LlipOutput(ModelOutput):
+    loss: Optional[float] = None
+    K: Optional[torch.tensor] = None
+    V: Optional[torch.tensor] = None
+    Q: Optional[torch.tensor] = None
+    image_embeds: Optional[torch.tensor] = None
+    text_embeds: Optional[torch.tensor] = None
+    logit_scale: Optional[torch.tensor] = None
+    logit_bias: Optional[torch.tensor] = None
+class LlipPred(torch.nn.Module):
+    def __init__(self, embed_dim):
+        super().__init__()
+        scale_out = embed_dim**-0.5
+        self.out_proj = nn.Parameter(scale_out * torch.randn(embed_dim, embed_dim))
+    def cross_attention(self, K, Q, V, weight_scale, out_proj):
+        attn = (torch.einsum("vhnd,thd->vthn", K, Q) / weight_scale).softmax(-1)
+        zv = torch.einsum("vthn,vhnd->vthd", attn, V).reshape(
+            K.shape[0], Q.shape[0], -1
+        )
+        zv = zv @ out_proj
+        return zv
+    def forward(self, K, Q, V, weight_scale):
+        out = self.cross_attention(K, Q, V, weight_scale, self.out_proj)
+        return out
+def torch_int(x):
+    """
+    Casts an input to a torch int64 tensor if we are in a tracing context, otherwise to a Python int.
+    """
+    import torch
+    return (
+        x.to(torch.int64)
+        if torch.jit.is_tracing() and isinstance(x, torch.Tensor)
+        else int(x)
+    )
+class LlipVisionTransformer(CLIPVisionTransformer):
+    def __init__(self, config):
+        super().__init__(config)
+        self.embeddings = LlipVisionEmbeddings(config)
+    def forward(
+        self,
+        pixel_values=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        interpolate_pos_encoding=False,
+    ):
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+        hidden_states = self.embeddings(
+            pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
+        )
+        hidden_states = self.pre_layrnorm(hidden_states)
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        last_hidden_state = encoder_outputs.last_hidden_state
+        pooled_output = last_hidden_state[:, : self.config.ncls, :]
+        pooled_output = self.post_layernorm(pooled_output)
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+class LlipVisionEmbeddings(torch.nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.ncls = config.ncls
+        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + self.ncls
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
+    def interpolate_pos_encoding(
+        self, embeddings: torch.Tensor, height: int, width: int
+    ) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+        num_patches = embeddings.shape[1] - 1
+        position_embedding = self.position_embedding.weight.unsqueeze(0)
+        num_positions = position_embedding.shape[1] - 1
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if (
+            not torch.jit.is_tracing()
+            and num_patches == num_positions
+            and height == width
+        ):
+            return self.position_embedding(self.position_ids)
+        class_pos_embed = position_embedding[:, :1]
+        patch_pos_embed = position_embedding[:, 1:]
+        dim = embeddings.shape[-1]
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(
+            1, sqrt_num_positions, sqrt_num_positions, dim
+        )
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+        self.class_embedding = nn.Parameter(1, self.ncls, torch.randn(self.embed_dim))
+    def forward(
+        self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False
+    ) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        if not interpolate_pos_encoding and (
+            height != self.image_size or width != self.image_size
+        ):
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
+            )
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(dtype=target_dtype)
+        )  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        class_embeds = self.class_embedding.expand(batch_size, self.ncls, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(
+                embeddings, height, width
+            )
+        else:
+            embeddings = embeddings + self.position_embedding(self.position_ids)
+        return embeddings
+class LlipModel(CLIPModel):
+    config_class = LlipConfig
+    def __init__(self, *args, **kwargs):
+        # we use dlc_embed_l and dlc_embed_v instead of num_embeds_ada_norm_zero
+        # we still need to set num_embeds_ada_norm_zero since there's a check in DiT code
+        # but it will be overridden in our code with DLCEmbedding
+        super().__init__(*args, **kwargs)
+        self.visual_projection = None
+        # self.config.vision_config is broken.
+        self.vision_model = LlipVisionTransformer(self.config.vision_config)
+        ncls = self.config.vision_config.ncls
+        embed_dim = self.config.projection_dim
+        self.num_heads = self.config.num_heads
+        scale_visual = self.config.vision_config.hidden_size**-0.5
+        if self.config.vision_config.pass_all_tokens:
+            num_proj = self.vision_model.embeddings.positional_embedding.weight.size(0)
+        else:
+            num_proj = ncls
+        self.v_proj = nn.Parameter(
+            scale_visual
+            * torch.randn(num_proj, self.config.vision_config.hidden_size, embed_dim)
+        )
+        self.k_proj = nn.Parameter(
+            scale_visual
+            * torch.randn(num_proj, self.config.vision_config.hidden_size, embed_dim)
+        )
+        scale_text = self.config.text_config.hidden_size**-0.5
+        self.q_proj = nn.Parameter(
+            scale_text * torch.randn(self.config.text_config.hidden_size, embed_dim)
+        )
+        self.logit_bias = -10
+        if self.config.use_norm:
+            self.K_norm = nn.LayerNorm(embed_dim)
+            self.Q_norm = nn.LayerNorm(embed_dim)
+            self.V_norm = nn.LayerNorm(embed_dim)
+        else:
+            self.K_norm = nn.Identity()
+            self.Q_norm = nn.Identity()
+            self.V_norm = nn.Identity()
+        self.pred = LlipPred(embed_dim)
+    def get_image_features(self, image):
+        """
+        Returns K, V
+        """
+        h = self.vision_model(image).pooler_output
+        K = h.transpose(0, 1) @ self.k_proj
+        V = h.transpose(0, 1) @ self.v_proj
+        N, B, C = K.shape
+        K = self.K_norm(K)
+        V = self.V_norm(V)
+        K = K.reshape(N, B, self.num_heads, C // self.num_heads).permute(
+            1, 2, 0, 3
+        )  # [B, num_heads, N, D]
+        V = V.reshape(N, B, self.num_heads, C // self.num_heads).permute(1, 2, 0, 3)
+        return K, V
+    def get_text_features(self, text):
+        """
+        Returns Q, zt
+        """
+        # h = self.token_embedding(text)  # [batch_size, n_ctx, d_model]
+        # h = h + self.positional_embedding
+        # h = h.permute(1, 0, 2)  # NLD -> LND
+        # h = self.text_model(h, attn_mask=self.attn_mask).last_hidden_state
+        # h = h.permute(1, 0, 2)  # LND -> NLD
+        # h = self.ln_final(h)
+        # # x.shape = [batch_size, n_ctx, transformer.width]
+        # # take features from the eot embedding (eot_token is the highest number in each sequence)
+        # h = h[torch.arange(h.shape[0]), text.argmax(dim=-1)]
+        h = self.text_model(text).pooler_output
+        Q = h @ self.q_proj
+        B, C = Q.shape
+        Q = self.Q_norm(Q)
+        Q = Q.reshape(B, self.num_heads, C // self.num_heads)
+        zt = self.text_projection(h)
+        return Q, zt
+    def forward(
+        self,
+        input_ids,
+        pixel_values,
+        clamp_logit_scale_to=None,
+        compute_image_embeds=False,
+        compute_loss=False,
+        return_dict=False,
+    ):
+        """
+        Returns (K, V), (Q, zt), logit_scale, logit_bias
+        """
+        K, V = self.get_image_features(pixel_values)
+        Q, zt = self.get_text_features(input_ids)
+        if clamp_logit_scale_to is not None:
+            with torch.no_grad():
+                self.logit_scale.data.clamp_(0, clamp_logit_scale_to)
+        loss = None
+        image_embeds = None
+        if compute_image_embeds:
+            image_embeds = self.pred(K, Q, V, self.config.temp)
+        if compute_loss:
+            assert compute_image_embeds
+            normalized_image_embeds = torch.nn.functional.normalize(
+                image_embeds, dim=-1
+            )
+            normalized_text_embeds = torch.nn.functional.normalize(zt, dim=-1)
+            logits = self.logit_scale.exp() * (
+                normalized_text_embeds[None] * normalized_image_embeds
+            )
+            logits += self.logit_bias
+            labels = -torch.ones(
+                (len(logits), len(logits)), device=logits.device, dtype=logits.dtype
+            )
+            labels = (
+                2 * torch.eye(len(logits), device=logits.device, dtype=logits.dtype)
+                + labels
+            )
+            loss = -torch.nn.functional.logsigmoid(labels * logits).sum() / len(
+                image_embeds
+            )
+        return LlipOutput(
+            loss=loss,
+            K=K,
+            V=V,
+            Q=Q,
+            text_embeds=zt,
+            image_embeds=image_embeds,
+            logit_scale=self.logit_scale.exp(),
+            logit_bias=self.logit_bias,
+        )