o ;Qg? @stddlZddlmZddlmZmZmZddlmZzddl m Z WnYGdddej Z Gddde ZdS) N)CLIPImageProcessorCLIPVisionConfigCLIPVisionModel) rank0_print)forwardcseZdZdfdd ZdddZddZd d Zed d Zed dZ eddZ eddZ eddZ eddZ eddZeddZZS)CLIPVisionTowerFcstd|_||_|j|_t|dd|_|s%td|| dSt|ddr5td| dSt |drId|j vrItd | dSt |j|_dS) NFmm_vision_select_featurepatchzLoading vision tower: unfreeze_mm_vision_towerzYThe checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.mm_tunable_partsmm_vision_towerzfThe checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.)super__init__ is_loadedvision_tower_namemm_vision_select_layer select_layergetattrselect_featurer load_modelhasattrr rfrom_pretrainedcfg_onlyself vision_towerargs delay_load __class__M/mnt/sfs-common/jkyang/EgoGPT/egogpt/model/multimodal_encoder/clip_encoder.pyrs*      zCLIPVisionTower.__init__NcCsP|jr td|jdSt|j|_tj|j|d|_|j dd|_dS)N:{} is already loaded, `load_model` called again, skipping. device_mapFT) rrformatrrrimage_processorrrrequires_grad_rr$r r r!r+s  zCLIPVisionTower.load_modelcs|j}|jdvr.tjd}tjfddt||jtj|Ddd}|dd}n$|jd vrLgd }tjfd d|Ddd}|d d}nj|j}|d krb|ddddf}|S|dkrj|}|Std|)N)Zslicefour_patchZslicefour_cls_patchcg|]}j|qSr  hidden_states.0iimage_forward_outsr r! Dsz2CLIPVisionTower.feature_select..)dimZ slicefour_)Zslice_m25811_f6_patchZslice_m25811_f6_cls_patch)iicr*r r+r-r0r r!r2UsZslice_m25811_f6_r  cls_patchzUnexpected select feature: ) rlenr,torchcatrangerreplace ValueError)rr1Zselect_feature_typeZselect_every_k_layerimage_featuresZ select_layersr r0r!feature_select>s8    zCLIPVisionTower.feature_selectcCst|tur.g}|D]!}|j|j|j|jdddd}|||j}||q |S|j|j|j|jddd}|||j}|S)NdevicedtyperToutput_hidden_states) typelistrtorDrE unsqueezerBappend)rimagesrAimageZimage_forward_out image_featurer1r r r!rcs  zCLIPVisionTower.forwardcCstjd|j|j|jdS)Nr9rC)r<zeros hidden_sizerDrErr r r! dummy_featurevszCLIPVisionTower.dummy_featurecC|jjSN)rrErRr r r!rEzzCLIPVisionTower.dtypecCrTrU)rrDrRr r r!rD~rVzCLIPVisionTower.devicecCs|jr|jjS|jSrU)rrconfigrrRr r r!rWszCLIPVisionTower.configcCs0|jj}d|jvr |d9}d|jvr|d9}|S)NZ slicefourr)Zslice_m25811_f6)rWrQr)rZ _hidden_sizer r r!rQs   zCLIPVisionTower.hidden_sizecCs|jj|jjSrU)rW image_size patch_sizerRr r r!num_patches_per_sidesz$CLIPVisionTower.num_patches_per_sidecCs*|jj|jjd}d|jvr|d7}|S)Nr:r9)rWrYrZr)rZ _num_patchesr r r! num_patchess zCLIPVisionTower.num_patchescCrTrU)rWrYrRr r r!rYrVzCLIPVisionTower.image_sizeFrU)__name__ __module__ __qualname__rrrBrpropertyrSrErDrWrQr[r]rY __classcell__r r rr!r s* %       rcsDeZdZd fdd ZdddZddZd d Zed d ZZ S)CLIPVisionTowerS2Fcst|dd|_ttt|jd|_|j|jd|_|jd|_t ||||r4t|ddrJ|j|j j d<|j|j j d <|j j d <dSdS) N s2_scalesz 336,672,1008,rr3r F shortest_edgeheightwidth)rrerImapintsplitsort s2_split_size s2_image_sizer rr&size crop_sizerrr r!rs   zCLIPVisionTowerS2.__init__NcCsx|jr td|jdSt|j|_tj|j|d|_|j d|j |jj d<|j |jj d<|jj d<d|_dS)Nr"r#FrgrhriT) rrr%rrrr&rrr'rorprqr(r r r!rs(  zCLIPVisionTowerS2.load_modelcCs2|j|j|j|jddd}|||j}|S)NrCTrF)rrJrDrErB)rrMr1rAr r r!forward_features z!CLIPVisionTowerS2.forward_featurecCsbt|tur#g}|D]}t|j|d|j|jdd}||q |St|j||j|jdd}|S)NrT)Z img_sizesZmax_split_sizeZ split_forward)rHrImultiscale_forwardrrrKrernrL)rrMrArNrOr r r!rs(   zCLIPVisionTowerS2.forwardcCs|jjt|jSrU)rWrQr;rerRr r r!rQszCLIPVisionTowerS2.hidden_sizer^rU) r_r`rarrrrrrbrQrcr r rr!rds rd)r<torch.nnnn transformersrrr egogpt.utilsrZ s2wrapperrrsModulerrdr r r r!s