o g i@sdZddlmZddlmZddlmZmZddlZddl m Z ddl m m Z ddlmZmZddlmZmZdd lmZmZmZmZmZdd lmZgd Zd0d dZeddeddeddeddeddd ZGddde jZ Gddde jZ!Gddde jZ"Gddde jZ#Gddde jZ$Gdd d e jZ%d!d"Z&d1d$d%Z'ed2d&d'Z(ed2d(d)Z)ed2d*d+Z*ed2d,d-Z+ed2d.d/Z,dS)3z CoaT architecture. Paper: Co-Scale Conv-Attentional Image Transformers - https://arxiv.org/abs/2104.06399 Official CoaT code at: https://github.com/mlpc-ucsd/CoaT Modified from timm/models/vision_transformer.py )deepcopy)partial)TupleListNIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)build_model_with_cfgoverlay_external_default_cfg) PatchEmbedMlpDropPath to_2tuple trunc_normal_)register_model) coat_tiny coat_minicoat_lite_tinycoat_lite_minicoat_lite_smallc Ks |ddddddttddd |S) N)rg?bicubicTzpatch_embed1.projhead) url num_classes input_size pool_sizecrop_pct interpolationfixed_input_sizemeanstd first_conv classifierr)rkwargsr))/home/terry/ogs_model/timm/models/coat.py _cfg_coat!sr+zlhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_tiny-473c2a20.pth)rzlhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_mini-2c6baf49.pthzqhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_tiny-461b07a7.pthzqhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_mini-d7842000.pthzrhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_small-fea1d5a1.pthcs6eZdZdZfddZdeeeffddZZS) ConvRelPosEncz+ Convolutional relative position encoding. c stt|tr||i}||_n t|tr||_ntt|_ g|_ | D]5\}}d}||d|dd}tj ||||f||f||f|d}|j ||j |q*fdd|j D|_dS)aj Initialization. Ch: Channels per head. h: Number of heads. window: Window size(s) in convolutional relative positional encoding. It can have two forms: 1. An integer of window size, which assigns all attention heads with the same window s size in ConvRelPosEnc. 2. A dict mapping window size to #attention head splits ( e.g. {window size 1: #attention head split 1, window size 2: #attention head split 2}) It will apply different window size to the attention head splits. r ) kernel_sizepaddingdilationgroupscsg|]}|qSr)r).0xChr)r* gz*ConvRelPosEnc.__init__..N)super__init__ isinstanceintwindowdict ValueErrornn ModuleList conv_list head_splitsitemsConv2dappendchannel_splits) selfr6hr= cur_windowcur_head_splitr0 padding_sizecur_conv __class__r5r*r:As*    zConvRelPosEnc.__init__sizecCs|j\}}}}|\}} |d|| ksJ|ddddddddf} |ddddddddf} | dd||||| } tj| |jdd} g} t|jD] \}}| || |qStj | dd}|||||| dd}| |}t |d}|S)Nr dim)rrr rrr) shape transposereshapetorchsplitrG enumeraterBrFcatFpad)rHqvrPBrINr6HWq_imgv_img v_img_listconv_v_img_listiconv conv_v_imgEV_hatr)r)r*forwardis   zConvRelPosEnc.forward __name__ __module__ __qualname____doc__r:rr<rl __classcell__r)r)rNr*r,?s (r,cs8eZdZdZd fdd Zdeeeffd d ZZS) FactorAtt_ConvRelPosEnczK Factorized attention with convolutional relative position encoding class. FNcsht||_||}|d|_tj||d|d|_t||_t|||_ t||_ ||_ dS)Ngr)bias) r9r: num_headsscaler@LinearqkvDropout attn_dropproj proj_dropcrpe)rHrTrwqkv_biasr|r~ shared_crpehead_dimrNr)r*r:s     z FactorAtt_ConvRelPosEnc.__init__rPc Cs|j\}}}||||d|j||jddddd}|d|d|d}}} |jdd} | dd| } || } |j|| |d } |j| | }|dd|||}| |}| |}|S) Nrr-rr rSrQrRrP) rUrzrWrwpermutesoftmaxrVrrxr}r~) rHr4rPr`raCrzr^kr_ k_softmax factor_attrr)r)r*rls .   zFactorAtt_ConvRelPosEnc.forward)rtFruruNrmr)r)rNr*rssrscs8eZdZdZdfdd ZdeeeffddZZS) ConvPosEnczz Convolutional Position Encoding. Note: This module is similar to the conditional position encoding in CPVT. rcs.tt|tj|||d|d|d|_dS)Nr r-)r1)r9rr:r@rEr})rHrTrrNr)r*r:s zConvPosEnc.__init__rPc Cs|j\}}}|\}}|d||ksJ|ddddf|ddddf}} | dd||||} || | }|ddd}tj||fdd}|S)Nr r-rS)rUrVviewr}flattenrXr[) rHr4rPr`rarrbrc cls_token img_tokensfeatr)r)r*rls *zConvPosEnc.forward)rrmr)r)rNr*rsrc sNeZdZdZdddddejejddf fdd Zdee e ffd d Z Z S) SerialBlockz Serial block class. Note: In this implementation, each serial block only contains a conv-attention and a FFN (MLP) module. @FruNc svt| |_| ||_t|||||| d|_|dkr t|nt|_ | ||_ t ||} t || ||d|_ dS)Nrwrr|r~rru in_featureshidden_features act_layerdrop)r9r:cpenorm1rsfactoratt_crperr@Identity drop_pathnorm2r<r mlp) rHrTrw mlp_ratiorrr|rr norm_layer shared_cpermlp_hidden_dimrNr)r*r:s     zSerialBlock.__init__rPcCsV|||}||}|||}|||}||}||}|||}|SN)rrrrrr)rHr4rPcurr)r)r*rls     zSerialBlock.forward) rnrorprqr@GELU LayerNormr:rr<rlrrr)r)rNr*rs rcseZdZdZgddddejejdffdd Zdede e e ffd d Z dede e e ffd d Z d ede e e ffddZ dee e e ffddZZS) ParallelBlockz Parallel block class. FruNc s\t| |d|_| |d|_| |d|_t|d||||| dd|_t|d||||| dd|_t|d||||| dd|_|dkrOt |nt |_ | |d|_ | |d|_| |d|_|d|dkrz|dks}JJ|d|dkr|dksJJt|d|d} t|d| ||d|_|_|_dS)Nr r-rrrur)r9r:norm12norm13norm14rsfactoratt_crpe2factoratt_crpe3factoratt_crpe4rr@rrnorm22norm23norm24r<r mlp2mlp3mlp4) rHdimsrw mlp_ratiosrrr|rrr shared_crpesrrNr)r*r:s4 (( zParallelBlock.__init__factorrPcCs|j|||dS)z Feature map up-sampling.  scale_factorrP interpolaterHr4rrPr)r)r*upsample szParallelBlock.upsamplecCs|j|d||dS)z Feature map down-sampling. ?rrrr)r)r* downsampleszParallelBlock.downsamplerc Cs|j\}}}|\}}|d||ksJ|ddddddf} |ddddddf} | dd||||} tj| |dddd} | ||ddd} tj| | fdd} | S) z Feature map interpolation. r Nr-Fbilinear)rrecompute_scale_factormode align_cornersrQrS)rUrVrWr\rrXr[) rHr4rrPr`rarrbrcrroutr)r)r*rs  zParallelBlock.interpolatesizescCst|\}}}} ||} ||} ||} |j| |d} |j| |d} |j| | d} |j| d|d} |j| d| d}|j| d| d}|j| d|d}|j| d|d}|j| d|d}| | |} | ||} | ||} ||| }||| }||| }| |} | |} | |} | | } | | } || } ||| }||| }||| }||||fS)Nrg@)rrPr)rrrrrrrrrrrrrrr)rHx1x2x3x4r_S2S3S4cur2cur3cur4 upsample3_2 upsample4_3 upsample4_2 downsample2_3 downsample3_4 downsample2_4r)r)r*rl$s:              zParallelBlock.forward)rnrorprqr@rrr:floatrr<rrrrrlrrr)r)rNr*rs !"rcseZdZdZddddddddddd d d eejd d d d d ffdd ZddZe j j ddZ ddZ d!ddZddZddZddZdd ZZS)"CoaTz CoaT class. rrr)rrrrrTrugư>)epsFNc s,t|p dddd}|_|__d_|_t|}t|||dt j d_ tdd|Dddd t j d_ td d|Ddd dt j d_ td d|Ddddt j d_t td d d_t td d d _t td d d_t td d d_tddd _td dd _tddd _tddd _td|d _td |d _td|d _td|d _| dksJt f ddt!|dD_"t f ddt!|d D_#t f ddt!|dD_$t f ddt!|dD_%|_&j&dkrt f ddt!|D_'nd_'jsj'durd _(d_)nd_(_)d_*j&dkrd dkrˆdksJJtj j+dd d d_,|dkrt -j|nt ._/n|dkrt -j|nt ._/t0jddt0jddt0jddt0jdd1j2dS)Nr-r)rrQr)img_size patch_sizein_chans embed_dimrcSg|]}|dqS)rr)r2r)r)r*r7\r8z!CoaT.__init__..r cSr)rtr)r2r)r)r*r7_r8cSr)rr)r2r)r)r*r7br8)rTr)r6rIr=ruc4g|]}tddjjd qS)r rTrwrrrr|rrrr)rcpe1crpe1r3r attn_drop_ratedpr drop_rate embed_dimsrrrwrrHr)r*r7|cr)r r)rcpe2crpe2rrr)r*r7rcr)r-r)rcpe3crpe3rrr)r*r7rcr)rr)rcpe4crpe4rrr)r*r7rcs6g|]}tjjjjfd qS)) rrwrrrr|rrr)rrrrrrrr)r*r7s) in_channels out_channelsr.{Gz?r%)3r9r:return_interm_layers out_featuresr num_featuresrrr r@r patch_embed1 patch_embed2 patch_embed3 patch_embed4 ParameterrXzeros cls_token1 cls_token2 cls_token3 cls_token4rrrrrr,rrrrrArangeserial_blocks1serial_blocks2serial_blocks3serial_blocks4parallel_depthparallel_blocksrnorm3norm4Conv1d aggregateryrrrapply _init_weights)rHrrrrr serial_depthsr rwrrrrdrop_path_raterrr crpe_windowr(rNrr*r:Is                     ,$"z CoaT.__init__cCst|tjr&t|jddt|tjr"|jdur$tj|jddSdSdSt|tjr>tj|jdtj|jddSdS)Nrrrr) r;r@ryrweightrvinit constant_r)rHmr)r)r*rs  zCoaT._init_weightscCshdS)N>rrrrr)rHr)r)r*no_weight_decayszCoaT.no_weight_decaycCs|jSr)rrr)r)r*get_classifierszCoaT.get_classifierrcCs0||_|dkrt|j||_dSt|_dS)Nr)rr@ryrrr)rHr global_poolr)r)r*reset_classifiers*zCoaT.reset_classifiercCs*||jddd}tj||fdd}|S)z Insert CLS token. rrQr rS)expandrUrXr[)rHr4r cls_tokensr)r)r* insert_clsszCoaT.insert_clscCs|ddddddfS)z Remove CLS token. Nr r)rHr4r)r)r* remove_clsszCoaT.remove_clsc CsP|jd}||}|jj\}}|||j}|jD] }||||fd}q||}||||ddddd }| |}|j j\} } |||j }|j D] }||| | fd}qO||} | || | ddddd } | | } |j j\} }|| |j} |jD] }|| | |fd} q|| }||| |ddddd }||}|jj\}}|||j}|jD] }||||fd}q||}||||ddddd }|jdur"tjs|jri}d|jvr||d<d|jvr| |d<d |jvr||d <d |jvr||d <|S||}|dddf}|S|jD]6}||| | f|| | |f||||f}} }|||| |||f| | f| |f||fgd \}}} }q%tjs|jri}d|jvr||}||||ddddd }||d<d|jvr||} | || | ddddd } | |d<d |jvr|| }||| |ddddd }||d <d |jvr||}||||ddddd }||d <|S||}|| } ||}|ddddf}| ddddf}|ddddf}tj|||fdd }| |j!dd }|S) NrrrQrr r-x1_noclsx2_noclsx3_noclsx4_nocls)rrS)"rUr grid_sizer rrr"rWr contiguousrrrrrrrrr r rXjit is_scriptingrrr rrrrr r[rsqueeze)rHx0r`rH1W1blkr#rH2W2r$rH3W3r%rH4W4r&feat_outx4_clsx2_clsx3_cls merged_clsr)r)r*forward_featuress                            46               zCoaT.forward_featurescCs(|jr||S||}||}|Sr)rr;rr!r)r)r*rlGs    z CoaT.forwardr)rnrorprqrr@rr:rrXr)ignorerrrr r"r;rlrrr)r)rNr*rGs"    _rcCsVi}|D]"\}}|ds#|jdur|ds#|jdur$|dr$q|||<q|S)Nrrr )rD startswithrr ) state_dictmodelout_dictrr_r)r)r*checkpoint_filter_fnRs  rBFcKs6|ddr tdtt||ft|td|}|S)N features_onlyztd dgdgdddgdd|}td d |i|}|S) Nr)rMrMrMr-r-r-r-rtrrrrrrrr rwrrrJr))rr>rKrJr( model_cfgr@r)r)r*rjrc KrL) Nr)rMrVrVrNrOrtrPrQrrJr))rrRrSr)r)r*rsrUrc KrL) Nr)@@rNrrtrtrtrrrQrrJr))rrRrSr)r)r*r|rUrc KrL) NrrWrXrZirNrrtr[rQrrJr))rrRrSr)r)r*rrUrc KrL) Nrr\)rrrOrrrtr[rQrrJr))rrRrSr)r)r*rrUrr<)FN)F)-rqcopyr functoolsrtypingrrrXtorch.nnr@torch.nn.functional functionalr\ timm.datarrhelpersr r layersr r rrrregistryr__all__r+rHModuler,rsrrrrrBrKrrrrrr)r)r)r*sb     @*#`