# --------------------------------------------------------
# Copyright (c) 2025 NVIDIA
# Licensed under customized NSCLv1 [see LICENSE.md for details]
# --------------------------------------------------------

import warnings
from typing import Any, List, Optional, Tuple, Union
import math
import torch.utils.checkpoint
import transformers
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import (AutoModel, GenerationConfig, LlamaTokenizer)
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import ModelOutput, logging
from peft import LoraConfig, get_peft_model
from .conversation import get_conv_template
from .modeling_siglip import SiglipVisionModel

logger = logging.get_logger(__name__)
from .configuration_eagle_chat import Eagle2ChatConfig
from .llama_bidirectional_model import LlamaBidirectionalModel


def version_cmp(v1, v2, op='eq'):
    """Compare two version strings with a named comparison operator.

    Args:
        v1: Left-hand version string, parsed with ``packaging.version.parse``.
        v2: Right-hand version string, parsed the same way.
        op: Name of a function in the ``operator`` module (for example
            ``'eq'``, ``'ge'``, ``'lt'``).

    Returns:
        The result of ``operator.<op>(parse(v1), parse(v2))``.

    Raises:
        AttributeError: If ``op`` is not an attribute of ``operator``.
    """
    import operator

    from packaging import version

    op_func = getattr(operator, op)
    return op_func(version.parse(v1), version.parse(v2))


class Eagle2ChatModel(PreTrainedModel):
    """Vision-language wrapper: vision tower -> ``mlp1`` projector -> language model.

    Image features produced by ``vision_model`` are pixel-shuffled, projected
    by ``mlp1`` to the language model's hidden size, and written into the token
    embedding sequence at positions whose id equals
    ``config.img_context_token_id``.
    """

    config_class = Eagle2ChatConfig
    main_input_name = 'pixel_values'
    _no_split_modules = ['LlamaDecoderLayer']

    def __init__(self, config: Eagle2ChatConfig, vision_model=None, language_model=None):
        """Build (or adopt) the vision tower, language model and projector.

        Args:
            config: Model configuration; ``vision_config`` and ``llm_config``
                are read from it.
            vision_model: Optional pre-built vision tower. When ``None`` a
                ``SiglipVisionModel`` is created from ``config.vision_config``.
            language_model: Optional pre-built language model. When ``None`` a
                ``LlamaBidirectionalModel`` is created from ``config.llm_config``.

        Raises:
            NotImplementedError: If a sub-model has to be created and its
                configured type is not one of the supported ones above.
        """
        super().__init__(config)

        image_size = config.force_image_size or config.vision_config.image_size
        if hasattr(config.vision_config, 'grid_size'):
            grid_size = config.vision_config.grid_size
            self.patch_size = 14
            self.num_image_token = int((grid_size * config.downsample_ratio) ** 2)
        else:
            patch_size = config.vision_config.patch_size
            self.patch_size = patch_size
            self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))

        self.select_layer = config.select_layer
        self.template = config.template
        self.downsample_ratio = config.downsample_ratio

        logger.info(f'num_image_token: {self.num_image_token}')

        if vision_model is not None:
            self.vision_model = vision_model
        elif config.vision_config.model_type == 'siglip_vision_model':
            self.vision_model = SiglipVisionModel(config.vision_config)
        else:
            raise NotImplementedError

        if language_model is not None:
            self.language_model = language_model
        elif config.llm_config.architectures[0] == 'LlamaBidirectionalModel':
            config.llm_config._attn_implementation = "flash_attention_2"
            self.language_model = LlamaBidirectionalModel(config.llm_config)
        else:
            raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')

        vit_hidden_size = config.vision_config.hidden_size
        if vit_hidden_size == 'lazy_calculation':
            # a hack for Mixture of Backbones
            vit_hidden_size = self.vision_model.hidden_size
            print("The lazy calculated hidden_size: {} .. ".format(vit_hidden_size))
        llm_hidden_size = config.llm_config.hidden_size
        self.moe_version_type = getattr(config.vision_config, 'moe_version_type', None)

        # pixel_shuffle folds spatial positions into channels, so the projector
        # input is wider than the vision hidden size by (1 / ratio) ** 2.
        projector_in_features = vit_hidden_size * int(1 / self.downsample_ratio) ** 2
        self.mlp1 = nn.Sequential(
            nn.LayerNorm(projector_in_features),
            nn.Linear(projector_in_features, llm_hidden_size),
            nn.GELU(),
            nn.Linear(llm_hidden_size, llm_hidden_size)
        )

        # NOTE(review): forward() reads config.img_context_token_id, not this
        # attribute; presumably this one is set by code outside this view.
        self.img_context_token_id = None
        self.conv_template = get_conv_template(self.template)
        self.system_message = self.conv_template.system_message

        if config.use_backbone_lora:
            self.wrap_backbone_lora(r=config.use_backbone_lora, lora_alpha=2 * config.use_backbone_lora)

        if config.use_llm_lora:
            self.wrap_llm_lora(r=config.use_llm_lora, lora_alpha=2 * config.use_llm_lora)

    def forward(
            self,
            pixel_values: torch.FloatTensor = None,
            input_ids: torch.LongTensor = None,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            image_flags: Optional[torch.LongTensor] = None,
            past_key_values: Optional[List[torch.FloatTensor]] = None,
            labels: Optional[torch.LongTensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            num_patches_list: Optional[List[torch.Tensor]] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Embed the tokens, splice in image features, and run the language model.

        Args:
            pixel_values: Image input for the vision tower. When ``None`` the
                token embeddings are passed to the language model unchanged.
            input_ids: Token ids; positions equal to
                ``config.img_context_token_id`` receive image features.
            attention_mask: Forwarded to the language model.
            position_ids: Forwarded to the language model.
            image_flags: Per-image mask; only features whose flag equals 1 are
                kept. Defaults to all ones. If a list is given, no filtering
                is applied.
            past_key_values: Forwarded to the language model.
            labels: Optional target ids. Used only when the language model
                output exposes ``logits``; a shifted cross-entropy loss is
                then computed.
            use_cache: Forwarded to the language model.
            output_attentions: Forwarded to the language model.
            output_hidden_states: Forwarded to the language model.
            return_dict: Return a ``CausalLMOutputWithPast`` instead of a
                tuple. Defaults to ``config.use_return_dict``.
            num_patches_list: Accepted for interface compatibility; not used
                by this method.

        Returns:
            A ``CausalLMOutputWithPast``, or a tuple when ``return_dict`` is
            falsy. ``logits`` and ``loss`` are ``None`` when the language model
            output has no ``logits`` attribute.

        Raises:
            NotImplementedError: If ``moe_version_type`` is ``'seq_concat'`` or
                ``'feat_concat'`` and ``pixel_values`` is not a dict, or if
                ``extract_feature`` rejects the configured MoE variant.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_embeds = self.language_model.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            if image_flags is None:
                # No flags supplied: keep the features of every image.
                image_flags = torch.ones(pixel_values.shape[0])
            flags_are_list = isinstance(image_flags, list)
            if not flags_are_list:
                image_flags = image_flags.squeeze(-1)
            if self.moe_version_type in ['seq_concat', 'feat_concat'] and not isinstance(pixel_values, dict):
                raise NotImplementedError
            vit_embeds = self.extract_feature(pixel_values).to(device=input_embeds.device)

            if not flags_are_list:
                # Second squeeze kept from the original flow so flags with two
                # trailing singleton dimensions are still reduced.
                image_flags = image_flags.squeeze(-1)
                vit_embeds = vit_embeds[image_flags == 1]

            batch_size, seq_len, hidden_size = input_embeds.shape
            input_embeds = input_embeds.reshape(batch_size * seq_len, hidden_size)

            input_ids = input_ids.reshape(batch_size * seq_len)
            selected = (input_ids == self.config.img_context_token_id)
            try:
                # "* 0.0 +" overwrites the placeholder embeddings while keeping
                # them in the expression that produces the new values.
                input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, hidden_size)
            except Exception as e:
                # Best-effort fallback when the number of image features does
                # not match the number of placeholder tokens: use only the
                # first n_token features.
                vit_embeds = vit_embeds.reshape(-1, hidden_size)
                logger.warning(
                    f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, '
                    f'vit_embeds.shape={vit_embeds.shape}'
                )
                n_token = selected.sum()
                input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token]

            input_embeds = input_embeds.reshape(batch_size, seq_len, hidden_size)

        outputs = self.language_model(
            inputs_embeds=input_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        logits = None
        loss = None

        if hasattr(outputs, 'logits'):
            logits = outputs.logits
            if labels is not None:
                # Shift so that tokens < n predict n
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                # Flatten the tokens
                loss_fct = CrossEntropyLoss()
                shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
                shift_labels = shift_labels.view(-1)
                # Enable model parallelism
                shift_labels = shift_labels.to(shift_logits.device)
                loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def pixel_shuffle(self, x, scale_factor=0.5):
        """Trade spatial resolution for channel depth on a 4-D tensor.

        Args:
            x: Tensor unpacked as ``(n, w, h, c)``.
            scale_factor: Multiplier applied to both spatial dimensions;
                channels are divided by ``scale_factor ** 2``.

        Returns:
            A contiguous tensor whose two middle dimensions are scaled by
            ``scale_factor`` and whose last dimension is
            ``c / scale_factor ** 2``.
        """
        n, w, h, c = x.size()
        # N, W, H, C --> N, W, H * scale, C // scale
        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
        x = x.permute(0, 2, 1, 3).contiguous()
        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
                   int(c / (scale_factor * scale_factor)))
        x = x.permute(0, 2, 1, 3).contiguous()
        return x

    def extract_feature(self, pixel_values):
        """Run the vision tower and project its features for the language model.

        Args:
            pixel_values: Input passed to ``self.vision_model`` as
                ``pixel_values``.

        Returns:
            The output of ``self.mlp1`` applied to the pixel-shuffled vision
            features, reshaped to ``(batch, tokens, channels)``.

        Raises:
            NotImplementedError: If ``self.moe_version_type`` is not ``None``;
                mixture-of-encoders variants are not supported here.
        """
        if self.select_layer == -1:
            vit_embeds = self.vision_model(
                pixel_values=pixel_values,
                output_hidden_states=False,
                return_dict=True)
            # if there is vit_embeds.last_hidden_state, use it.
            if hasattr(vit_embeds, 'last_hidden_state'):
                vit_embeds = vit_embeds.last_hidden_state
        else:
            vit_embeds = self.vision_model(
                pixel_values=pixel_values,
                output_hidden_states=True,
                return_dict=True).hidden_states[self.select_layer]

        if not isinstance(self.vision_model, SiglipVisionModel):
            # Drop the first token for non-SigLIP towers
            # (presumably a class token - TODO confirm).
            vit_embeds = vit_embeds[:, 1:, :]  # torch.Size([B, 1024, 1024])

        if self.moe_version_type is not None:
            raise NotImplementedError

        # The token axis is treated as a square h x w grid.
        h = w = int(vit_embeds.shape[1] ** 0.5)
        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
        # e.g. torch.Size([B, 1024, 1024]) -> torch.Size([B, 16, 16, 4096])
        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
        # e.g. torch.Size([B, 16, 16, 4096]) -> torch.Size([B, 256, 4096])
        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
        vit_embeds = self.mlp1(vit_embeds)

        return vit_embeds

    def get_input_embeddings(self):
        """Return the language model's input embedding module."""
        return self.language_model.get_input_embeddings()

    def get_output_embeddings(self):
        """Return the language model's output embedding module."""
        return self.language_model.get_output_embeddings()