import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_partition(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows


def window_reverse(windows, window_size, H, W):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x


class WindowAttention(nn.Module):
    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def extra_repr(self) -> str:
        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'

    def flops(self, N):
        # calculate flops for one window with token length of N
        flops = 0
        # qkv = self.qkv(x)
        flops += N * self.dim * 3 * self.dim
        # attn = (q @ k.transpose(-2, -1))
        flops += self.num_heads * N * (self.dim // self.num_heads) * N
        # x = (attn @ v)
        flops += self.num_heads * N * N * (self.dim // self.num_heads)
        # x = self.proj(x)
        flops += N * self.dim * self.dim
        return flops


class SwinTransformerBlock(nn.Module):
    r""" Swin Transformer Block.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        if min(self.input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(self.input_resolution)
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        if self.shift_size > 0:
            # calculate attention mask for SW-MSA
            H, W = self.input_resolution
            img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
            h_slices = (slice(0, -self.window_size),
                        slice(-self.window_size, -self.shift_size),
                        slice(-self.shift_size, None))
            w_slices = (slice(0, -self.window_size),
                        slice(-self.window_size, -self.shift_size),
                        slice(-self.shift_size, None))
            cnt = 0
            for h in h_slices:
                for w in w_slices:
                    img_mask[:, h, w, :] = cnt
                    cnt += 1

            mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None

        self.register_buffer("attn_mask", attn_mask)

    def forward(self, x):
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_x = x

        # partition windows
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x
        x = x.view(B, H * W, C)

        # FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"

    def flops(self):
        flops = 0
        H, W = self.input_resolution
        # norm1
        flops += self.dim * H * W
        # W-MSA/SW-MSA
        nW = H * W / self.window_size / self.window_size
        flops += nW * self.attn.flops(self.window_size * self.window_size)
        # mlp
        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
        # norm2
        flops += self.dim * H * W
        return flops


class PatchMerging(nn.Module):
    r""" Patch Merging Layer.

    Args:
        input_resolution (tuple[int]): Resolution of input feature.
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x):
        """
        x: B, H*W, C
        """
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"
        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."

        x = x.view(B, H, W, C)

        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C

        x = self.norm(x)
        x = self.reduction(x)

        return x

    def extra_repr(self) -> str:
        return f"input_resolution={self.input_resolution}, dim={self.dim}"

    def flops(self):
        H, W = self.input_resolution
        flops = H * W * self.dim
        flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
        return flops


class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks; even blocks use W-MSA, odd blocks use SW-MSA
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
                                 num_heads=num_heads, window_size=window_size,
                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
                                 mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
                                 drop=drop, attn_drop=attn_drop,
                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                                 norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x):
        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)
        if self.downsample is not None:
            x = self.downsample(x)
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"

    def flops(self):
        flops = 0
        for blk in self.blocks:
            flops += blk.flops()
        if self.downsample is not None:
            flops += self.downsample.flops()
        return flops


class PatchEmbed(nn.Module):
    r""" Image to Patch Embedding

    Args:
        img_size (int): Image size. Default: 224.
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
        self.img_size = img_size
        self.patch_size = patch_size
        self.patches_resolution = patches_resolution
        self.num_patches = patches_resolution[0] * patches_resolution[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x).flatten(2).transpose(1, 2)  # B Ph*Pw C
        if self.norm is not None:
            x = self.norm(x)
        return x

    def flops(self):
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops


class SwinTransformer(nn.Module):
    r""" Swin Transformer
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
          https://arxiv.org/pdf/2103.14030

    Args:
        img_size (int | tuple(int)): Input image size. Default 224
        patch_size (int | tuple(int)): Patch size. Default: 4
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        embed_dim (int): Patch embedding dimension. Default: 96
        depths (tuple(int)): Depth of each Swin Transformer layer.
        num_heads (tuple(int)): Number of attention heads in different layers.
        window_size (int): Window size. Default: 7
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
        drop_rate (float): Dropout rate. Default: 0
        attn_drop_rate (float): Attention dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.1
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
    """

    def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
                 embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
                 window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
                 norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
                 use_checkpoint=False, **kwargs):
        super().__init__()

        self.num_classes = num_classes
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
        self.mlp_ratio = mlp_ratio

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)
        num_patches = self.patch_embed.num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        # absolute position embedding
        if self.ape:
            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
                               input_resolution=(patches_resolution[0] // (2 ** i_layer),
                                                 patches_resolution[1] // (2 ** i_layer)),
                               depth=depths[i_layer],
                               num_heads=num_heads[i_layer],
                               window_size=window_size,
                               mlp_ratio=self.mlp_ratio,
                               qkv_bias=qkv_bias, qk_scale=qk_scale,
                               drop=drop_rate, attn_drop=attn_drop_rate,
                               drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                               norm_layer=norm_layer,
                               downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                               use_checkpoint=use_checkpoint)
            self.layers.append(layer)

        self.norm = norm_layer(self.num_features)
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
        self.dim_out = self.num_features

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'absolute_pos_embed'}

    @torch.jit.ignore
    def no_weight_decay_keywords(self):
        return {'relative_position_bias_table'}

    def forward_features(self, x):
        x = self.patch_embed(x)
        if self.ape:
            x = x + self.absolute_pos_embed
        x = self.pos_drop(x)

        for layer in self.layers:
            x = layer(x)

        x = self.norm(x)  # B L C
        x = self.avgpool(x.transpose(1, 2))  # B C 1
        x = torch.flatten(x, 1)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        x = self.head(x)
        return x

    def flops(self):
        flops = 0
        flops += self.patch_embed.flops()
        for i, layer in enumerate(self.layers):
            flops += layer.flops()
        flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
        flops += self.num_features * self.num_classes
        return flops
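

# Minimal usage sketch (not part of the original module): builds an encoder with the
# constructor's default-sized hyperparameters and runs a dummy forward pass. The batch
# size, num_classes=0 (identity head, pooled-feature output), and the printed shapes
# are illustrative assumptions, not a configuration mandated by this file.
if __name__ == "__main__":
    model = SwinTransformer(
        img_size=224, patch_size=4, in_chans=3, num_classes=0,
        embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7)
    dummy = torch.randn(2, 3, 224, 224)   # batch of 2 RGB images at the expected resolution
    feats = model(dummy)                  # pooled features of shape (2, model.dim_out)
    print(feats.shape, model.dim_out)     # expected: torch.Size([2, 768]) 768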