U ˜€dì/ã@sjddlmZddlZddlmZddlmmZGdd„deƒZGdd„dej ƒZ Gdd „d ej ƒZ dS) é)Ú BertModeléNcs|eZdZddddgddddgdf‡fdd„ Zd d „Zd d „Zd d„Zdd„Zdd„Zdd„Z dd„Z dd„Z dd„Z ‡Z S)ÚMultiModalBertééé é rçc stƒ |¡||_||_||_‡fdd„tt|ƒƒDƒ}t ¡|_ t ¡|_ t ¡|_ tdt|ƒƒD]¤}||}t d|ddd|||d} |j   | ¡t tjddddt ¡tjddddt ¡¡} tj | dj¡tj | dj¡|j   | ¡|j   t d¡¡qddS) Ncsg|]}ˆd|‘qS)é©)Ú.0Úi©Ú embed_dimr ú3/home/yxchng/Downloads/elia/bert/multimodal_bert.pyÚ sz+MultiModalBert.__init__..ri)Ú num_headsÚdropoutF)Úbiasr )ÚsuperÚ__init__Úpwam_idxÚnum_heads_fusionÚ fusion_dropÚrangeÚlenÚnnÚ ModuleListÚpwamsÚ res_gatesÚnormsÚPWAMÚappendÚ SequentialÚLinearÚReLUÚTanhÚinitÚzeros_ÚweightÚ LayerNorm) ÚselfÚconfigrrrrZ pwam_dimsr ÚdimÚfusionZres_gate©Ú __class__rrr s:    ú ü zMultiModalBert.__init__cCsB| ¡}tj|tj|jd}| |||j¡}|j||d}||fS)N)ÚdtypeÚdevice)Ú input_idsÚtoken_type_ids)ÚsizeÚtorchÚzerosÚlongr2Zget_extended_attention_maskÚ embeddings)r+r3Úattention_maskÚ input_shaper4Úextended_attention_maskZembedding_outputr r rÚ forward_stem,sÿzMultiModalBert.forward_stemcCs<td|jdƒD]"}|jj|}|||ƒ}|d}q|dS©Nr©rrÚencoderÚlayer©r+Z hidden_statesr:r Z layer_moduleZ layer_outputsr r rÚforward_stage18s þ zMultiModalBert.forward_stage1cCsBt|jd|jdƒD]"}|jj|}|||ƒ}|d}q|dS)Nrrr?rBr r rÚforward_stage2Cs þ zMultiModalBert.forward_stage2cCsBt|jd|jdƒD]"}|jj|}|||ƒ}|d}q|dS)Nrr rr?rBr r rÚforward_stage3Ns þ zMultiModalBert.forward_stage3cCsBt|jd|jdƒD]"}|jj|}|||ƒ}|d}q|dS)Nr rrr?rBr r rÚforward_stage4Ys þ zMultiModalBert.forward_stage4cCs:|jd|||ƒ}||jd|ƒ|}|jd|ƒ|fSr>©rrr ©r+ÚxÚlÚl_maskZ l_residualr r rÚ forward_pwam1dszMultiModalBert.forward_pwam1cCs:|jd|||ƒ}||jd|ƒ|}|jd|ƒ|fS)NrrGrHr r rÚ forward_pwam2iszMultiModalBert.forward_pwam2cCs:|jd|||ƒ}||jd|ƒ|}|jd|ƒ|fS)Nr rGrHr r rÚ forward_pwam3nszMultiModalBert.forward_pwam3cCs:|jd|||ƒ}||jd|ƒ|}|jd|ƒ|fS)NrrGrHr r rÚ forward_pwam4sszMultiModalBert.forward_pwam4)Ú__name__Ú __module__Ú __qualname__rr=rCrDrErFrLrMrNrOÚ __classcell__r r r/rr s$"     rcs&eZdZd‡fdd„ Zdd„Z‡ZS)r!rr csptt|ƒ ¡t t ||¡t ¡t |¡¡|_t ||||||d|_ t t  ||dd¡t ¡t |¡¡|_ dS)N)Ú out_channelsrr) rr!rrr#r$ÚGELUÚDropoutÚ vis_projectÚSpatialImageLanguageAttentionÚimage_lang_attÚConv1dÚ project_mm)r+r-Ú v_in_channelsÚ l_in_channelsÚ key_channelsÚvalue_channelsrrr/r rrys þûþz PWAM.__init__cCsX| |¡}| |||¡}| ddd¡}t | ddd¡|¡}| |¡}| ddd¡}|S)Nrr r)rWrYÚpermuter6Úmulr[)r+rIrJrKÚvisÚlangÚmmr r rÚforward’s  z PWAM.forward)rr ©rPrQrRrrerSr r r/rr!xsr!cs&eZdZd‡fdd„ Zdd„Z‡ZS)rXNrcsÜtt|ƒ ¡||_||_||_||_||_||_|dkrB|j|_t   t j |j|jddd¡|_ t   t j |j|jdddt   |j¡¡|_t   t j |j|jdddt   |j¡¡|_t   t j |j|jdddt   |j¡¡|_dS)Nr)Ú kernel_sizeÚstride)rrXrr\r]rTr^r_rrr#rZÚf_queryÚInstanceNorm1dÚf_keyÚf_valueÚW)r+r\r]r^r_rTrr/r rr¯s0ÿ þ  þ þz&SpatialImageLanguageAttention.__init__c Cs|| d¡}| d¡| d¡}}| ddd¡}| ddd¡}|}| |¡}||}| ddd¡}| |¡}| |¡}| d¡} | ||j|j|j|¡}| ||j|j|j|¡}| || |j|j|j¡ dddd¡}|  d¡}t   ||¡} |jd| } | d|d} t j | dd} t   | | dddd¡¡} |  dddd¡ ¡ || |j¡} |  ddd¡} | | ¡} |  ddd¡} | S) Nrrr réÿÿÿÿgà¿gˆÃ@)r-)Úsqueezer5r`rirkrlÚreshaperr^Ú unsqueezer6ÚmatmulÚFÚsoftmaxÚ contiguousr_rm) r+rIrJrKÚBÚHWÚqueryÚkeyÚvalueZn_lZsim_mapÚoutr r rreØs2     &    z%SpatialImageLanguageAttention.forward)Nrrfr r r/rrX®s)rX) Z modeling_bertrr6Útorch.nnrÚtorch.nn.functionalÚ functionalrsrÚModuler!rXr r r rÚs  o6