import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from timm.models.registry import register_model

from torchvision import transforms
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from timm.data import create_transform
from timm.data.transforms import _pil_interp


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class FocalModulation(nn.Module):
    def __init__(self, dim, focal_window, focal_level, focal_factor=2, bias=True, proj_drop=0., use_postln=False):
        super().__init__()

        self.dim = dim
        self.focal_window = focal_window
        self.focal_level = focal_level
        self.focal_factor = focal_factor
        self.use_postln = use_postln

        # projection to query, context, and gates
        self.f = nn.Linear(dim, 2 * dim + (self.focal_level + 1), bias=bias)
        self.h = nn.Conv2d(dim, dim, kernel_size=1, stride=1, bias=bias)

        self.act = nn.GELU()
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.focal_layers = nn.ModuleList()

        self.kernel_sizes = []
        for k in range(self.focal_level):
            kernel_size = self.focal_factor * k + self.focal_window
            self.focal_layers.append(
                nn.Sequential(
                    nn.Conv2d(dim, dim, kernel_size=kernel_size, stride=1,
                              groups=dim, padding=kernel_size // 2, bias=False),
                    nn.GELU(),
                )
            )
            self.kernel_sizes.append(kernel_size)
        if self.use_postln:
            self.ln = nn.LayerNorm(dim)

    def forward(self, x):
        """
        Args:
            x: input features with shape of (B, H, W, C)
        """
        C = x.shape[-1]

        # pre linear projection
        x = self.f(x).permute(0, 3, 1, 2).contiguous()
        q, ctx, self.gates = torch.split(x, (C, C, self.focal_level + 1), 1)

        # context aggregation over focal levels
        ctx_all = 0
        for l in range(self.focal_level):
            ctx = self.focal_layers[l](ctx)
            ctx_all = ctx_all + ctx * self.gates[:, l:l + 1]
        ctx_global = self.act(ctx.mean(2, keepdim=True).mean(3, keepdim=True))
        ctx_all = ctx_all + ctx_global * self.gates[:, self.focal_level:]

        # focal modulation
        self.modulator = self.h(ctx_all)
        x_out = q * self.modulator
        x_out = x_out.permute(0, 2, 3, 1).contiguous()
        if self.use_postln:
            x_out = self.ln(x_out)

        # post linear projection
        x_out = self.proj(x_out)
        x_out = self.proj_drop(x_out)
        return x_out

    def extra_repr(self) -> str:
        return f'dim={self.dim}'

    def flops(self, N):
        # calculate flops for an input with N tokens
        flops = 0
        # self.f
        flops += N * self.dim * (self.dim * 2 + (self.focal_level + 1))
        # focal depth-wise convolutions
        for k in range(self.focal_level):
            flops += N * (self.kernel_sizes[k] ** 2 + 1) * self.dim
        # global gating
        flops += N * 1 * self.dim
        # self.h
        flops += N * self.dim * (self.dim + 1)
        # self.proj
        flops += N * self.dim * self.dim
        return flops


class FocalNetBlock(nn.Module):
    r""" Focal Modulation Network Block.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        focal_level (int): Number of focal levels.
        focal_window (int): Focal window size at first focal level.
        use_layerscale (bool): Whether to use layerscale. Default: False
        layerscale_value (float): Initial layerscale value. Default: 1e-4
        use_postln (bool): Whether to use layernorm after modulation. Default: False
    """

    def __init__(self, dim, input_resolution, mlp_ratio=4., drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 focal_level=1, focal_window=3,
                 use_layerscale=False, layerscale_value=1e-4, use_postln=False):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.mlp_ratio = mlp_ratio

        self.focal_window = focal_window
        self.focal_level = focal_level

        self.norm1 = norm_layer(dim)
        self.modulation = FocalModulation(
            dim, proj_drop=drop, focal_window=focal_window,
            focal_level=self.focal_level, use_postln=use_postln)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        self.gamma_1 = 1.0
        self.gamma_2 = 1.0
        if use_layerscale:
            self.gamma_1 = nn.Parameter(layerscale_value * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(layerscale_value * torch.ones((dim)), requires_grad=True)

        self.H = None
        self.W = None

    def forward(self, x):
        H, W = self.H, self.W
        B, L, C = x.shape
        shortcut = x

        # focal modulation
        x = self.norm1(x)
        x = x.view(B, H, W, C)
        x = self.modulation(x).view(B, H * W, C)

        # FFN
        x = shortcut + self.drop_path(self.gamma_1 * x)
        x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))

        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, mlp_ratio={self.mlp_ratio}"

    def flops(self):
        flops = 0
        H, W = self.input_resolution
        # norm1
        flops += self.dim * H * W
        # focal modulation
        flops += self.modulation.flops(H * W)
        # mlp
        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
        # norm2
        flops += self.dim * H * W
        return flops


class BasicLayer(nn.Module):
    """ A basic focal modulation layer for one stage.

    Args:
        dim (int): Number of input channels.
        out_dim (int): Number of output channels after the (optional) downsample layer.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
        focal_level (int): Number of focal levels.
        focal_window (int): Focal window size at first focal level.
        use_conv_embed (bool): Whether the downsample layer uses convolutional embedding. Default: False
        use_layerscale (bool): Whether to use layerscale. Default: False
        layerscale_value (float): Initial layerscale value. Default: 1e-4
        use_postln (bool): Whether to use layernorm after modulation. Default: False
    """

    def __init__(self, dim, out_dim, input_resolution, depth,
                 mlp_ratio=4., drop=0., drop_path=0., norm_layer=nn.LayerNorm,
                 downsample=None, use_checkpoint=False,
                 focal_level=1, focal_window=1, use_conv_embed=False,
                 use_layerscale=False, layerscale_value=1e-4, use_postln=False):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks
        self.blocks = nn.ModuleList([
            FocalNetBlock(
                dim=dim,
                input_resolution=input_resolution,
                mlp_ratio=mlp_ratio,
                drop=drop,
                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                norm_layer=norm_layer,
                focal_level=focal_level,
                focal_window=focal_window,
                use_layerscale=use_layerscale,
                layerscale_value=layerscale_value,
                use_postln=use_postln,
            )
            for i in range(depth)])

        if downsample is not None:
            self.downsample = downsample(
                img_size=input_resolution,
                patch_size=2,
                in_chans=dim,
                embed_dim=out_dim,
                use_conv_embed=use_conv_embed,
                norm_layer=norm_layer,
                is_stem=False,
            )
        else:
            self.downsample = None

    def forward(self, x, H, W):
        for blk in self.blocks:
            blk.H, blk.W = H, W
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)

        if self.downsample is not None:
            x = x.transpose(1, 2).reshape(x.shape[0], -1, H, W)
            x, Ho, Wo = self.downsample(x)
        else:
            Ho, Wo = H, W
        return x, Ho, Wo

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"

    def flops(self):
        flops = 0
        for blk in self.blocks:
            flops += blk.flops()
        if self.downsample is not None:
            flops += self.downsample.flops()
        return flops


class PatchEmbed(nn.Module):
    r""" Image to Patch Embedding

    Args:
        img_size (tuple[int]): Image size. Default: (224, 224).
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        use_conv_embed (bool): Whether to use an overlapping convolutional embedding. Default: False
        norm_layer (nn.Module, optional): Normalization layer. Default: None
        is_stem (bool): Whether this is the stem embedding (only used with use_conv_embed). Default: False
    """

    def __init__(self, img_size=(224, 224), patch_size=4, in_chans=3, embed_dim=96,
                 use_conv_embed=False, norm_layer=None, is_stem=False):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
        self.img_size = img_size
        self.patch_size = patch_size
        self.patches_resolution = patches_resolution
        self.num_patches = patches_resolution[0] * patches_resolution[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        if use_conv_embed:
            # for conv embedding, treat the stem and non-stem stages differently
            if is_stem:
                kernel_size = 7
                padding = 2
                stride = 4
            else:
                kernel_size = 3
                padding = 1
                stride = 2
            self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
        else:
            self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        B, C, H, W = x.shape

        x = self.proj(x)
        H, W = x.shape[2:]
        x = x.flatten(2).transpose(1, 2)  # B Ph*Pw C
        if self.norm is not None:
            x = self.norm(x)
        return x, H, W

    def flops(self):
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops


class FocalNet(nn.Module):
    r""" Focal Modulation Networks (FocalNets)

    Args:
        img_size (int | tuple(int)): Input image size. Default: 224
        patch_size (int | tuple(int)): Patch size. Default: 4
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        embed_dim (int): Patch embedding dimension. Default: 96
        depths (tuple(int)): Depth of each FocalNet stage.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        drop_rate (float): Dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.1
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
        focal_levels (list): How many focal levels at each stage. Note that this excludes the
            finest-grain level. Default: [2, 2, 2, 2]
        focal_windows (list): The focal window size at each stage.
            Default: [3, 3, 3, 3]
        use_conv_embed (bool): Whether to use convolutional embedding. We noted that using
            convolutional embedding usually improves the performance, but we do not use it by
            default. Default: False
        use_layerscale (bool): Whether to use layerscale proposed in CaiT. Default: False
        layerscale_value (float): Value for layer scale. Default: 1e-4
        use_postln (bool): Whether to use layernorm after modulation (it helps stabilize
            training of large models). Default: False
    """

    def __init__(self,
                 img_size=224,
                 patch_size=4,
                 in_chans=3,
                 num_classes=1000,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 mlp_ratio=4.,
                 drop_rate=0.,
                 drop_path_rate=0.1,
                 norm_layer=nn.LayerNorm,
                 patch_norm=True,
                 use_checkpoint=False,
                 focal_levels=[2, 2, 2, 2],
                 focal_windows=[3, 3, 3, 3],
                 use_conv_embed=False,
                 use_layerscale=False,
                 layerscale_value=1e-4,
                 use_postln=False,
                 **kwargs):
        super().__init__()

        self.num_layers = len(depths)
        embed_dim = [embed_dim * (2 ** i) for i in range(self.num_layers)]

        self.num_classes = num_classes
        self.embed_dim = embed_dim
        self.patch_norm = patch_norm
        self.num_features = embed_dim[-1]
        self.mlp_ratio = mlp_ratio

        # split image into patches using either non-overlapping or overlapping (conv) embedding
        self.patch_embed = PatchEmbed(
            img_size=to_2tuple(img_size),
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim[0],
            use_conv_embed=use_conv_embed,
            norm_layer=norm_layer if self.patch_norm else None,
            is_stem=True)

        num_patches = self.patch_embed.num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution
        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=embed_dim[i_layer],
                out_dim=embed_dim[i_layer + 1] if (i_layer < self.num_layers - 1) else None,
                input_resolution=(patches_resolution[0] // (2 ** i_layer),
                                  patches_resolution[1] // (2 ** i_layer)),
                depth=depths[i_layer],
                mlp_ratio=self.mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchEmbed if (i_layer < self.num_layers - 1) else None,
                focal_level=focal_levels[i_layer],
                focal_window=focal_windows[i_layer],
                use_conv_embed=use_conv_embed,
                use_checkpoint=use_checkpoint,
                use_layerscale=use_layerscale,
                layerscale_value=layerscale_value,
                use_postln=use_postln,
            )
            self.layers.append(layer)

        self.norm = norm_layer(self.num_features)
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
        self.dim_out = self.num_features

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {''}

    @torch.jit.ignore
    def no_weight_decay_keywords(self):
        return {''}

    def forward_features(self, x):
        x, H, W = self.patch_embed(x)
        x = self.pos_drop(x)

        for layer in self.layers:
            x, H, W = layer(x, H, W)
        x = self.norm(x)  # B L C
        x = self.avgpool(x.transpose(1, 2))  # B C 1
        x = torch.flatten(x, 1)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        x = self.head(x)
        return x

    def flops(self):
        flops = 0
        flops += self.patch_embed.flops()
        for i, layer in enumerate(self.layers):
            flops += layer.flops()
        flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
        flops += self.num_features * self.num_classes
        return flops


def build_transforms(img_size, center_crop=False):
    t = []
    if center_crop:
        size = int((256 / 224) * img_size)
        t.append(transforms.Resize(size, interpolation=_pil_interp('bicubic')))
        t.append(transforms.CenterCrop(img_size))
    else:
        t.append(transforms.Resize(img_size, interpolation=_pil_interp('bicubic')))
    t.append(transforms.ToTensor())
    t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD))
    return transforms.Compose(t)


def build_transforms4display(img_size, center_crop=False):
    t = []
    if center_crop:
        size = int((256 / 224) * img_size)
        t.append(transforms.Resize(size, interpolation=_pil_interp('bicubic')))
        t.append(transforms.CenterCrop(img_size))
    else:
        t.append(transforms.Resize(img_size, interpolation=_pil_interp('bicubic')))
    t.append(transforms.ToTensor())
    return transforms.Compose(t)


model_urls = {
    "focalnet_tiny_srf": "",
    "focalnet_small_srf": "",
    "focalnet_base_srf": "",
    "focalnet_tiny_lrf": "",
    "focalnet_small_lrf": "",
    "focalnet_base_lrf": "",
    # the iso factories below look up these keys, so they must exist as well
    "focalnet_tiny_iso_16": "",
    "focalnet_small_iso_16": "",
    "focalnet_base_iso_16": "",
}


@register_model
def focalnet_tiny_srf(pretrained=False, **kwargs):
    model = FocalNet(depths=[2, 2, 6, 2], embed_dim=96, **kwargs)
    if pretrained:
        url = model_urls['focalnet_tiny_srf']
        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
        model.load_state_dict(checkpoint["model"])
    return model


@register_model
def focalnet_small_srf(pretrained=False, **kwargs):
    model = FocalNet(depths=[2, 2, 18, 2], embed_dim=96, **kwargs)
    if pretrained:
        url = model_urls['focalnet_small_srf']
        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
        model.load_state_dict(checkpoint["model"])
    return model


@register_model
def focalnet_base_srf(pretrained=False, **kwargs):
    model = FocalNet(depths=[2, 2, 18, 2], embed_dim=128, **kwargs)
    if pretrained:
        url = model_urls['focalnet_base_srf']
        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
        model.load_state_dict(checkpoint["model"])
    return model


@register_model
def focalnet_tiny_lrf(pretrained=False, **kwargs):
    model = FocalNet(depths=[2, 2, 6, 2], embed_dim=96, focal_levels=[3, 3, 3, 3], **kwargs)
    if pretrained:
        url = model_urls['focalnet_tiny_lrf']
        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
        model.load_state_dict(checkpoint["model"])
    return model


@register_model
def focalnet_small_lrf(pretrained=False, **kwargs):
    model = FocalNet(depths=[2, 2, 18, 2], embed_dim=96, focal_levels=[3, 3, 3, 3], **kwargs)
    if pretrained:
        url = model_urls['focalnet_small_lrf']
        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
        model.load_state_dict(checkpoint["model"])
    return model


@register_model
def focalnet_base_lrf(pretrained=False, **kwargs):
    model = FocalNet(depths=[2, 2, 18, 2], embed_dim=128, focal_levels=[3, 3, 3, 3], **kwargs)
    if pretrained:
        url = model_urls['focalnet_base_lrf']
        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
        model.load_state_dict(checkpoint["model"])
    return model


@register_model
def focalnet_tiny_iso_16(pretrained=False, **kwargs):
    model = FocalNet(depths=[12], patch_size=16, embed_dim=192, focal_levels=[3], focal_windows=[3], **kwargs)
    if pretrained:
        url = model_urls['focalnet_tiny_iso_16']
        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
        model.load_state_dict(checkpoint["model"])
    return model


@register_model
def focalnet_small_iso_16(pretrained=False, **kwargs):
    model = FocalNet(depths=[12], patch_size=16, embed_dim=384, focal_levels=[3], focal_windows=[3], **kwargs)
    if pretrained:
        url = model_urls['focalnet_small_iso_16']
        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
        model.load_state_dict(checkpoint["model"])
    return model


@register_model
def focalnet_base_iso_16(pretrained=False, **kwargs):
    model = FocalNet(depths=[12], patch_size=16, embed_dim=768, focal_levels=[3], focal_windows=[3],
                     use_layerscale=True, use_postln=True, **kwargs)
    if pretrained:
        url = model_urls['focalnet_base_iso_16']
        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
        model.load_state_dict(checkpoint["model"])
    return model


if __name__ == '__main__':
    img_size = 224
    x = torch.rand(16, 3, img_size, img_size).cuda()
    model = FocalNet(depths=[2, 2, 6, 2], embed_dim=96, focal_levels=[3, 3, 3, 3]).cuda()

    flops = model.flops()
    print(f"number of GFLOPs: {flops / 1e9}")

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"number of params: {n_parameters}")
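# A minimal usage sketch. Because the factories above are decorated with timm's
# @register_model, the same variants can be built through timm's standard
# create_model entry point (this assumes an installed timm version compatible
# with the registry import used in this file, and pretrained=True only works if
# the corresponding entry in model_urls is filled in). Kept commented out so
# importing this module stays side-effect free:
#
#   from timm import create_model
#   model = create_model('focalnet_tiny_srf', pretrained=False).eval()
#   with torch.no_grad():
#       logits = model(torch.rand(1, 3, 224, 224))  # -> shape (1, 1000)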