NyxKrage committed
Commit 237f50f · verified · 1 Parent(s): a4fbc49

make xformers an optional dependency


This adapts the LlamaMLP module from the Llama modeling code in transformers to split the fused w12 weight during the forward pass, and uses it as the feed-forward layer when xformers is not available on the system.

This enables the model to be used on macOS, for example.
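
For context, here is a minimal sketch of the computation the fallback performs in place of xformers' fused SwiGLU kernel (the sizes below are made up for illustration): the fused w12 projection is chunked into its two halves during the forward pass and recombined as w3(SiLU(w1·x) * (w2·x)).

import torch
import torch.nn as nn
import torch.nn.functional as F

hidden_size, intermediate_size = 16, 48  # illustrative sizes only

# One fused projection holds both SwiGLU input weights, mirroring the w12 layout.
w12 = nn.Linear(hidden_size, 2 * intermediate_size, bias=False)
w3 = nn.Linear(intermediate_size, hidden_size, bias=False)

x = torch.randn(2, 4, hidden_size)

# Split the fused projection's output into the two halves, then apply SwiGLU.
x1, x2 = w12(x).chunk(2, dim=-1)
out = w3(F.silu(x1) * x2)
print(out.shape)  # torch.Size([2, 4, 16])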

Files changed (1)
  1. model.py +24 -2
model.py CHANGED
@@ -9,7 +9,11 @@ from torch.nn.functional import scaled_dot_product_attention
 from typing import Optional
 import numpy as np
 
-from xformers.ops import SwiGLU
+try:
+    from xformers.ops import SwiGLU
+    XFORMERS_AVAILABLE = True
+except ImportError:
+    XFORMERS_AVAILABLE = False
 
 try:
     from flash_attn.flash_attn_interface import flash_attn_varlen_func
@@ -100,6 +104,21 @@ class NeoBERTConfig(PretrainedConfig):
         self.max_length = max_length
         self.kwargs = kwargs
 
+# Adapted from transformers.models.llama.modeling_llama.LlamaMLP
+class NeobertMLP(nn.Module):
+    def __init__(self, hidden_size, intermediate_size, bias=False):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.w12 = nn.Linear(self.hidden_size, 2 * self.intermediate_size, bias=bias)
+        self.w3 = nn.Linear(self.intermediate_size, self.hidden_size, bias=bias)
+        self.act_fn = nn.SiLU()
+
+    def forward(self, x):
+        w1, w2 = self.w12(x).chunk(2, dim=-1)
+        w3 = self.w3(self.act_fn(w1) * w2)
+        return w3
+
 
 class EncoderBlock(nn.Module):
     """Transformer encoder block."""
@@ -117,7 +136,10 @@ class EncoderBlock(nn.Module):
         multiple_of = 8
         intermediate_size = int(2 * config.intermediate_size / 3)
         intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of)
-        self.ffn = SwiGLU(config.hidden_size, intermediate_size, config.hidden_size, bias=False)
+        if XFORMERS_AVAILABLE:
+            self.ffn = SwiGLU(config.hidden_size, intermediate_size, config.hidden_size, bias=False)
+        else:
+            self.ffn = NeobertMLP(config.hidden_size, intermediate_size, bias=False)
 
         # Layer norms
         self.attention_norm = nn.RMSNorm(config.hidden_size, config.norm_eps)
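
As a quick sanity check of the non-xformers path, the fallback can be instantiated directly with the same sizing math EncoderBlock uses. This is only a sketch: it assumes model.py is importable (with torch and transformers installed), and the 768/3072 sizes are example values, not taken from any particular config.

import torch
from model import NeobertMLP  # assumes model.py is on the import path

hidden_size, configured_intermediate = 768, 3072  # example values

# Same rounding as EncoderBlock.__init__: 2/3 of the configured size, rounded up to a multiple of 8.
multiple_of = 8
intermediate_size = int(2 * configured_intermediate / 3)
intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of)

ffn = NeobertMLP(hidden_size, intermediate_size, bias=False)
x = torch.randn(2, 128, hidden_size)
print(ffn(x).shape)  # torch.Size([2, 128, 768])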