U b@sddlmZddlmZmZddlZddlZddlZddl Z ddl m m Z ddl m Z ddlmZmZddlmZeeZGdd d e jZGd d d e jZGd d d e jZGddde jZeddZdS)) OrderedDict)TupleUnionN)nn)DropPath trunc_normal_)register_lang_encodercs&eZdZdfdd ZddZZS) LayerNorm-q=cs<tt|tt||_tt||_ ||_ dS)zWConstruct a layernorm module in the TF style (epsilon inside the square root). N) superr __init__r Parametertorchonesweightzerosbiasvariance_epsilon)self hidden_sizeeps __class__2/datadrive/UniCL/model/text_encoder/transformer.pyr szLayerNorm.__init__cCsb|j}|}|jddd}||djddd}||t||j}|j|||j S)NT)keepdim) dtypefloatmeanpowrsqrtrrtor)rxZpdtypeusrrrforwards zLayerNorm.forward)r )__name__ __module__ __qualname__r r( __classcell__rrrrr sr c@seZdZejdddZdS) QuickGELU)r%cCs|td|S)NgZd;?)rsigmoid)rr%rrrr($szQuickGELU.forwardN)r)r*r+rTensorr(rrrrr-#sr-csXeZdZd eeejedfdd Zd ejejdddZd ejejdd d Z Z S)ResidualAttentionBlockN)d_modeln_head attn_mask drop_pathc stt|||_t||_ttdt ||dfdt fdt |d|fg|_ t||_ ||_ |dkr~t|nt|_dS)NZc_fcgeluZc_projr1)r r rMultiheadAttentionattnr ln_1 SequentialrLinearr-mlpln_2r4rIdentityr5)rr2r3r4r5rrrr )s    zResidualAttentionBlock.__init__)r%key_padding_maskcCs@|jdk r|jj|j|jdnd|_|j||||d|jddS)N)rdeviceF)r@ need_weightsr4r)r4r$rrAr9rr%r@rrr attention;sz ResidualAttentionBlock.attentioncCs<|||j|||d}|||||}|S)N)r@)r5rDr:r=r>rCrrrr(GszResidualAttentionBlock.forward)Nr1)N)N) r)r*r+intrr/r r rDr(r,rrrrr0(s r0c sveZdZdeeeeeeedfdd ZeddZdd Z d d Z d gdfd dZ e j jddZdddZZS) Transformerr1T)context_length vocab_sizewidthlayersheadsr5 autogressivecstt||_||_tt|j|_ |_ ||_ ||_ |rR| ndddtd||Dtfddt|D|_t|_t|j ddt|jjdd||jdS)NcSsg|] }|qSr)item).0r%rrr csz(Transformer.__init__..rcsg|]}t|qSr)r0)rNir4dprrKrIrrrOes{Gz?std)r r r Embeddingtoken_embeddingrGrremptypositional_embeddingrIrJrLbuild_attention_masklinspace ModuleListrange resblocksr ln_finalrrapply _init_weights)rrGrHrIrJrKr5rLrrQrr Ns(   zTransformer.__init__cCs|jS)N)rIrrrrdim_outrszTransformer.dim_outcCs,t|j|j}|td|d|S)Nz-infr)rrXrGfill_r triu_)rmaskrrrrZvs z Transformer.build_attention_maskcCsvt|tjtjfrPtdt|jdd|jdk rrtdtj |jdn"t|tj tj frrtj |jddS)Nz/=> init weight of Linear/Conv2d from trunc normrSrTz&=> init bias of Linear/Conv2d to zerosr) isinstancerr<Conv2dloggerinforrrinit constant_r BatchNorm2d)rmrrrra~s   zTransformer._init_weightsc stj|rtj|dd}td||fdd|D}i}|D]L\}}| dd|kpx|ddk}|rT|rtd |d ||||<qT|j |d d dS) Ncpu) map_locationz=> loading pretrained model cs"i|]\}}|kr||qSr)keys)rNkv model_dictrr s z/Transformer.load_pretrained...r*z=> init z from F)strict) ospathisfilerloadloggingrj state_dictitemssplitload_state_dict) r pretrainedpretrained_layersverbosepretrained_dictneed_init_state_dictrsrt need_initrrurload_pretraineds"    zTransformer.load_pretrainedcCsddhS)NrYrWrrbrrrno_weight_decayszTransformer.no_weight_decayNcCsj|js|dknd}||}||j}|ddd}|jD]}|||}q:|ddd}||}d|iS)Nrrrlast_hidden_state)rLrWrYpermuter^r_)r input_idsattention_maskr@r%blockrrrr(s     zTransformer.forward)r1T)N)r)r*r+rEr boolr propertyrcrZrarrjitignorerr(r,rrrrrFMs&$   rFc KsBt|d|j|d|d|d|ddd}|dr>||S) NCONTEXT_LENGTHWIDTHLAYERSHEADS AUTOGRESSIVET)rGrHrIrJrKrLLOAD_PRETRAINED)rFrHgetr)config_encoder tokenizerrkwargs transformerrrr lang_encoders  r) collectionsrtypingrrrr{numpynprtorch.nn.functionalr functionalFtimm.models.layersrrregistryr getLoggerr)riModuler r-r0rFrrrrrs    %g