U b%c i@sdZddlmZddlmZddlmZmZddlZddl m Z ddl m m Z ddlmZmZddlmZmZdd lmZmZmZmZmZdd lmZd d d ddgZd0ddZeddeddeddeddedddZGddde jZ Gddde jZ!Gddde jZ"Gd d!d!e jZ#Gd"d#d#e jZ$Gd$d%d%e jZ%d&d'Z&d1d)d*Z'ed2d+d Z(ed3d,d Z)ed4d-d Z*ed5d.dZ+ed6d/dZ,dS)7z CoaT architecture. Paper: Co-Scale Conv-Attentional Image Transformers - https://arxiv.org/abs/2104.06399 Official CoaT code at: https://github.com/mlpc-ucsd/CoaT Modified from timm/models/vision_transformer.py )deepcopy)partial)TupleListNIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)build_model_with_cfgoverlay_external_default_cfg) PatchEmbedMlpDropPath to_2tuple trunc_normal_)register_model coat_tiny coat_minicoat_lite_tinycoat_lite_minicoat_lite_smallc Ks |ddddddttddd |S) N)rg?bicubicTzpatch_embed1.projhead) url num_classes input_size pool_sizecrop_pct interpolationfixed_input_sizemeanstd first_conv classifierr)rkwargsr)=/home/chou/Projects/FGVC/FGVC-PIM-master2/timm/models/coat.py _cfg_coat!sr+zlhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_tiny-473c2a20.pth)rzlhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_mini-2c6baf49.pthzqhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_tiny-461b07a7.pthzqhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_mini-d7842000.pthzrhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_small-fea1d5a1.pth)rrrrrcs6eZdZdZfddZeeefdddZZS) ConvRelPosEncz+ Convolutional relative position encoding. c stt|tr$||i}||_nt|tr6||_ntt|_ g|_ | D]j\}}d}||d|dd}tj ||||f||f||f|d}|j ||j |qTfdd|j D|_dS)aj Initialization. Ch: Channels per head. h: Number of heads. window: Window size(s) in convolutional relative positional encoding. It can have two forms: 1. An integer of window size, which assigns all attention heads with the same window s size in ConvRelPosEnc. 2. A dict mapping window size to #attention head splits ( e.g. {window size 1: #attention head split 1, window size 2: #attention head split 2}) It will apply different window size to the attention head splits. r ) kernel_sizepaddingdilationgroupscsg|] }|qSr)r).0xChr)r* gsz*ConvRelPosEnc.__init__..N)super__init__ isinstanceintwindowdict ValueErrornn ModuleList conv_listZ head_splitsitemsConv2dappendchannel_splits) selfr6hr<Z cur_windowZcur_head_splitr0 padding_sizeZcur_conv __class__r5r*r9As*     zConvRelPosEnc.__init__sizecCs|j\}}}}|\}} |d|| ks*t|ddddddddf} |ddddddddf} | dd||||| } tj| |jdd} g} t|jD]\}}| || |qtj | dd}|||||| dd}| |}t |d}|S)Nr dim)rrr rrr) shapeAssertionError transposereshapetorchsplitrE enumeraterArDcatFpad)rFqvrLBrGNr6HWZq_imgZv_imgZ v_img_listZconv_v_img_listiconvZ conv_v_imgZEV_hatr)r)r*forwardis   zConvRelPosEnc.forward __name__ __module__ __qualname____doc__r9rr;rc __classcell__r)r)rIr*r,?s (r,cs8eZdZdZd fdd Zeeefdd d ZZS) FactorAtt_ConvRelPosEnczK Factorized attention with convolutional relative position encoding class. FNcsht||_||}|d|_tj||d|d|_t||_t|||_ t||_ ||_ dS)Ngr)bias) r8r9 num_headsscaler?LinearqkvDropout attn_dropproj proj_dropcrpe)rFrPrnqkv_biasrsru shared_crpehead_dimrIr)r*r9s    z FactorAtt_ConvRelPosEnc.__init__rKc Cs|j\}}}||||d|j||jddddd}|d|d|d}}} |jdd} | dd| } || } |j|| |d } |j| | }|dd|||}| |}| |}|S) Nrr-rr rOrMrNrK) rQrqrTrnpermutesoftmaxrSrvrortru) rFr4rLr]r^Crqr[kr\Z k_softmaxZ factor_attrvr)r)r*rcs .   zFactorAtt_ConvRelPosEnc.forward)rkFrlrlNrdr)r)rIr*rjsrjcs8eZdZdZdfdd ZeeefdddZZS) ConvPosEnczz Convolutional Position Encoding. Note: This module is similar to the conditional position encoding in CPVT. rcs.tt|tj|||d|d|d|_dS)Nr r-)r1)r8rr9r?rCrt)rFrPr~rIr)r*r9szConvPosEnc.__init__rKc Cs|j\}}}|\}}|d||ks(t|ddddf|ddddf}} | dd||||} || | }|ddd}tj||fdd}|S)Nr r-rO)rQrRrSviewrtflattenrUrX) rFr4rLr]r^r}r_r` cls_token img_tokensfeatr)r)r*rcs *zConvPosEnc.forward)rrdr)r)rIr*rsrc sNeZdZdZdddddejejddf fdd Zee e fdd d Z Z S) SerialBlockz Serial block class. Note: In this implementation, each serial block only contains a conv-attention and a FFN (MLP) module. @FrlNc svt| |_| ||_t|||||| d|_|dkr@t|nt|_ | ||_ t ||} t || ||d|_ dS)Nrnrwrsrurxrl in_featureshidden_features act_layerdrop)r8r9cpenorm1rjfactoratt_crperr?Identity drop_pathnorm2r;r mlp) rFrPrn mlp_ratiorwrrsrr norm_layer shared_cperxmlp_hidden_dimrIr)r*r9s    zSerialBlock.__init__rKcCsV|||}||}|||}|||}||}||}|||}|SN)rrrrrr)rFr4rLcurr)r)r*rcs     zSerialBlock.forward) rerfrgrhr?GELU LayerNormr9rr;rcrir)r)rIr*rsrcseZdZdZgddddejejdffdd Zee e e fddd Z ee e e fdd d Z ee e e fd d dZ ee e e fdddZZS) ParallelBlockz Parallel block class. FrlNc s\t| |d|_| |d|_| |d|_t|d||||| dd|_t|d||||| dd|_t|d||||| dd|_|dkrt |nt |_ | |d|_ | |d|_| |d|_|d|dkr|dksnt|d|dkr|dks"ntt|d|d} t|d| ||d|_|_|_dS)Nr r-rrrlr)r8r9norm12norm13norm14rjfactoratt_crpe2factoratt_crpe3factoratt_crpe4rr?rrnorm22norm23norm24rRr;r mlp2mlp3mlp4) rFdimsrn mlp_ratiosrwrrsrrr shared_crpesrrIr)r*r9sR &*zParallelBlock.__init__factorrLcCs|j|||dS)z Feature map up-sampling.  scale_factorrL interpolaterFr4rrLr)r)r*upsample szParallelBlock.upsamplecCs|j|d||dS)z Feature map down-sampling. ?rrrr)r)r* downsampleszParallelBlock.downsamplerc Cs|j\}}}|\}}|d||ks(t|ddddddf} |ddddddf} | dd||||} tj| |dddd} | ||ddd} tj| | fdd} | S) z Feature map interpolation. r Nr-Fbilinear)rrecompute_scale_factormode align_cornersrMrO)rQrRrSrTrYrrUrX) rFr4rrLr]r^r}r_r`rroutr)r)r*rs  zParallelBlock.interpolatesizescCst|\}}}} ||} ||} ||} |j| |d} |j| |d} |j| | d} |j| d|d} |j| d| d}|j| d| d}|j| d|d}|j| d|d}|j| d|d}| | |} | ||} | ||} ||| }||| }||| }| |} | |} | |} | | } | | } || } ||| }||| }||| }||||fS)NrKg@rr)rrrrrrrrrrrrrrr)rFx1x2x3x4r_ZS2ZS3ZS4Zcur2Zcur3Zcur4Z upsample3_2Z upsample4_3Z upsample4_2Z downsample2_3Z downsample3_4Z downsample2_4r)r)r*rc$s:             zParallelBlock.forward)rerfrgrhr?rrr9floatrr;rrrrrcrir)r)rIr*rs!rcseZdZdZddddddddddd d d eejd d d d d ffdd ZddZe j j ddZ ddZ d!ddZddZddZddZdd ZZS)"CoaTz CoaT class. rrr)rrrrrTrlgư>)epsFNc s,t|pdddd}|_|__d_|_t|}t|||dt j d_ tdd|Dddd t j d_ td d|Ddd dt j d_ td d|Ddddt j d_t td d d_t td d d _t td d d_t td d d_tddd _td dd _tddd _tddd _td|d _td |d _td|d _td|d _| dkst t !f ddt"|dD_#t !f ddt"|d D_$t !f ddt"|dD_%t !f ddt"|dD_&|_'j'dkrt !f ddt"|D_(nd_(jsj(dk r:d _)d_*n d_)_*d_+j'dkrˆd dkrdksnt tj j,dd d d_-|dkrt .j|nt /_0n"|dkrt .j|nt /_0t1jddt1jddt1jddt1jdd2j3dS)Nr-r)rrMr)img_size patch_sizein_chans embed_dimrcSsg|] }|dqS)rzr)r2r)r)r*r7\sz!CoaT.__init__..r cSsg|] }|dqS)rkr)r2r)r)r*r7_scSsg|] }|dqS)rr)r2r)r)r*r7bs)rPr~)r6rGr<rlcs4g|],}tddjjd qS)r rPrnrrwrrsrrrrx)rcpe1crpe1r3r attn_drop_ratedpr drop_rate embed_dimsrrrnrwrFr)r*r7|scs4g|],}tddjjd qS)r r)rcpe2crpe2rrr)r*r7scs4g|],}tddjjd qS)r-r)rcpe3crpe3rrr)r*r7scs4g|],}tddjjd qS)rr)rcpe4crpe4rrr)r*r7scs6g|].}tjjjjfd qS)) rrnrrwrrsrrr)rrrrrrrr)r*r7s) in_channels out_channelsr.{Gz?r%)4r8r9return_interm_layers out_featuresr num_featuresrrr r?r patch_embed1 patch_embed2 patch_embed3 patch_embed4 ParameterrUzeros cls_token1 cls_token2 cls_token3 cls_token4rrrrrr,rrrrrRr@rangeserial_blocks1serial_blocks2serial_blocks3serial_blocks4parallel_depthparallel_blocksrnorm3norm4Conv1d aggregaterprrrapply _init_weights)rFrrrrr serial_depthsrrnrrwrrdrop_path_raterrrZ crpe_windowr(rIrr*r9Is              *$"z CoaT.__init__cCsrt|tjrBt|jddt|tjrn|jdk rntj|jdn,t|tjrntj|jdtj|jddS)Nrrrr) r:r?rprweightrminit constant_r)rFmr)r)r*rs  zCoaT._init_weightscCs ddddhS)Nrrrrr)rFr)r)r*no_weight_decayszCoaT.no_weight_decaycCs|jSr)rrr)r)r*get_classifierszCoaT.get_classifierrcCs*||_|dkrt|j|nt|_dS)Nr)rr?rprrr)rFr global_poolr)r)r*reset_classifierszCoaT.reset_classifiercCs*||jddd}tj||fdd}|S)z Insert CLS token. rrMr rO)expandrQrUrX)rFr4r cls_tokensr)r)r* insert_clsszCoaT.insert_clscCs|ddddddfS)z Remove CLS token. Nr r)rFr4r)r)r* remove_clsszCoaT.remove_clsc Cs\|jd}||}|jj\}}|||j}|jD]}||||fd}q4||}||||ddddd }| |}|j j\} } |||j }|j D]}||| | fd}q||} | || | ddddd } | | } |j j\} }|| |j} |jD]}|| | |fd} q|| }||| |ddddd }||}|jj\}}|||j}|jD]}||||fd}qt||}||||ddddd }|jdkrLtjs.|jr.i}d|jkr||d<d|jkr| |d<d |jkr||d <d |jkr*||d <|S||}|dddf}|S|jD]l}||| | f|| | |f||||f}} }|||| |||f| | f| |f||fgd \}}} }qRtjs|jri}d|jkr||}||||ddddd }||d<d|jkrT||} | || | ddddd } | |d<d |jkr|| }||| |ddddd }||d <d |jkr||}||||ddddd }||d <|S||}|| } ||}|ddddf}| ddddf}|ddddf}tj|||fdd }| |j!dd }|SdS) NrrKrMrr r-x1_noclsx2_noclsx3_noclsx4_noclsrrO)"rQr grid_sizerrrrrTr{ contiguousrrrrrrrrrrrUjit is_scriptingrrrrrrrrrXrsqueeze)rFx0r]rZH1W1blkr rZH2W2r rZH3ZW3r rZH4ZW4r Zfeat_outZx4_clsZx2_clsZx3_clsZ merged_clsr)r)r*forward_featuress                            46               zCoaT.forward_featurescCs,|jr||S||}||}|SdSr)rrrrr)r)r*rcGs    z CoaT.forward)r)rerfrgrhrr?rr9rrUrignorerrrrrrrcrir)r)rIr*rGs:    _rcCsVi}|D]D\}}|ds |jdkr2|ds |jdkrH|drHq |||<q |S)Nrrr)rB startswithrr) state_dictmodelout_dictr~r\r)r)r*checkpoint_filter_fnRs  rFcKs6|ddrtdtt||ft|td|}|S)N features_onlyzsj     @*#`