o ý"ŒeÙ?ã@s°ddlZddlZddlmZddlmZddlZddlZe e ¡Z Gdd„dej ƒZ ej jdd„ƒZGdd „d ej ƒZGd d „d ej ƒZGd d „d ej ƒZGdd„dej ƒZdS)éN)Únn)Ú functionalcs&eZdZd‡fdd„ Zdd„Z‡ZS)Ú LayerNormçñh㈵øä>cs>tƒ ¡||_||_t t |¡¡|_t t  |¡¡|_ dS©N) ÚsuperÚ__init__ÚchannelsÚepsrÚ ParameterÚtorchÚonesÚgammaÚzerosÚbeta)Úselfr r ©Ú __class__©ú"/workspace/OpenVoice/attentions.pyr s zLayerNorm.__init__cCs4| dd¡}t ||jf|j|j|j¡}| dd¡S)Nééÿÿÿÿ)Ú transposeÚFÚ layer_normr rrr )rÚxrrrÚforwards  zLayerNorm.forward)r©Ú__name__Ú __module__Ú __qualname__rrÚ __classcell__rrrrr srcCs\|d}||}t |dd…d|…dd…f¡}t |dd…|d…dd…f¡}||}|S)Nr)r ÚtanhÚsigmoid)Úinput_aÚinput_bÚ n_channelsZn_channels_intZin_actZt_actZs_actÚactsrrrÚfused_add_tanh_sigmoid_multiplys   r(cs0eZdZ    d ‡fdd„ Zd dd „Z‡ZS) ÚEncoderrçéTc s0tƒ ¡||_||_||_||_||_||_||_|j|_ d| vrJ| d|_ |j dkrJt   |j |j¡|_ d| vr=| dnd|_ |j |jksJJdƒ‚t  |¡|_t  ¡|_t  ¡|_t  ¡|_t  ¡|_t|jƒD],} |j t|||||d¡|j t|ƒ¡|j t|||||d¡|j t|ƒ¡qidS)NÚ gin_channelsrÚcond_layer_idxéz+cond_layer_idx should be less than n_layers)Ú p_dropoutÚ window_size©r/)rrÚhidden_channelsÚfilter_channelsÚn_headsÚn_layersÚ kernel_sizer/r0r-r,rÚLinearÚspk_emb_linearÚDropoutÚdropÚ ModuleListÚ attn_layersÚ norm_layers_1Ú ffn_layersÚ norm_layers_2ÚrangeÚappendÚMultiHeadAttentionrÚFFN) rr2r3r4r5r6r/r0ÚisflowÚkwargsÚirrrr&sZ   ÿÿ     ûÿ ûÿ ìzEncoder.__init__NcCsÒ| d¡| d¡}||}t|jƒD]O}||jkr5|dur5| | dd¡¡}| dd¡}||}||}|j||||ƒ}| |¡}|j|||ƒ}|j |||ƒ}| |¡}|j |||ƒ}q||}|S)Nr.rr) Ú unsqueezer@r5r-r8rr<r:r=r>r?)rrÚx_maskÚgÚ attn_maskrFÚyrrrrhs    zEncoder.forward)rr*r+Trrrrrrr)%s÷Br)cs.eZdZ    d ‡fdd„ Zdd„Z‡ZS) ÚDecoderrr*FTc stƒ ¡||_||_||_||_||_||_||_||_ t   |¡|_ t   ¡|_t   ¡|_t   ¡|_t   ¡|_t   ¡|_t   ¡|_t|jƒD]B} |j t||||||d¡|j t|ƒ¡|j t||||d¡|j t|ƒ¡|j t|||||dd¡|j t|ƒ¡qFdS)N)r/Ú proximal_biasÚ proximal_initr1T)r/Úcausal)rrr2r3r4r5r6r/rMrNrr9r:r;Úself_attn_layersÚ norm_layers_0Úencdec_attn_layersr=r>r?r@rArBrrC) rr2r3r4r5r6r/rMrNrErFrrrr}s\        úÿ ÿÿúÿ äzDecoder.__init__c CsÞt | d¡¡j|j|jd}| d¡| d¡}||}t|jƒD]F}|j ||||ƒ}|  |¡}|j |||ƒ}|j ||||ƒ}|  |¡}|j |||ƒ}|j|||ƒ}|  |¡}|j|||ƒ}q"||}|S)z< x: decoder input h: encoder output r.©ÚdeviceÚdtyper)ÚcommonsÚsubsequent_maskÚsizeÚtorTrUrGr@r5rPr:rQrRr=r>r?) rrrHÚhZh_maskZself_attn_maskZencdec_attn_maskrFrKrrrr¸s"ÿ   zDecoder.forward)rr*FTrrrrrrL|s÷;rLcsneZdZ      d‡fdd„ Zddd„Zdd d „Zd d „Zd d„Zdd„Zdd„Z dd„Z dd„Z ‡Z S)rBr*NTFc sœtƒ ¡||dks J‚||_||_||_||_||_||_||_||_ | |_ d|_ |||_ t  ||d¡|_t  ||d¡|_t  ||d¡|_t  ||d¡|_t  |¡|_|dur‰|r^dn|} |j d} t  t | |dd|j ¡| ¡|_t  t | |dd|j ¡| ¡|_t j |jj¡t j |jj¡t j |jj¡| rÌt ¡|jj |jj¡|jj |jj¡WdƒdS1sÅwYdSdS)Nrrgà¿r.) rrr Ú out_channelsr4r/r0Ú heads_shareÚ block_lengthrMrNÚattnÚ k_channelsrÚConv1dÚconv_qÚconv_kÚconv_vÚconv_or9r:r r ÚrandnÚ emb_rel_kÚ emb_rel_vÚinitÚxavier_uniform_ÚweightÚno_gradÚcopy_Úbias) rr r[r4r/r0r\r]rMrNZ n_heads_relZ rel_stddevrrrrÓsP     ÿÿÿÿ "þÿzMultiHeadAttention.__init__cCsD| |¡}| |¡}| |¡}|j||||d\}|_| |¡}|S)N)Úmask)rarbrcÚ attentionr^rd)rrÚcrJÚqÚkÚvrrrrs    zMultiHeadAttention.forwardcCsîg| ¡¢| d¡‘R\}}}}| ||j|j|¡ dd¡}| ||j|j|¡ dd¡}| ||j|j|¡ dd¡}t |t |j¡| dd¡¡} |j durs||ksWJdƒ‚|  |j |¡} |  |t |j¡| ¡} |  | ¡} | | } |jrŒ||ks~Jdƒ‚| | |¡j| j| jd} |dur»|  |dkd ¡} |jdur»||ks¥Jd ƒ‚t | ¡ |j ¡ |j¡} |  | dkd ¡} tj| dd }| |¡}t ||¡}|j duræ| |¡}|  |j|¡}|| ||¡}| dd¡ ¡ |||¡}||fS) Nr.ééþÿÿÿrz8Relative attention is only available for self-attention.z3Proximal bias is only available for self-attention.rSrgˆÃÀz5Local attention is only available for self-attention.)Údim)rXÚviewr4r_rr ÚmatmulÚmathÚsqrtr0Ú_get_relative_embeddingsrfÚ_matmul_with_relative_keysÚ'_relative_position_to_absolute_positionrMÚ_attention_bias_proximalrYrTrUÚ masked_fillr]Ú ones_likeÚtriuÚtrilrÚsoftmaxr:Ú'_absolute_position_to_relative_positionrgÚ_matmul_with_relative_valuesÚ contiguous)rÚqueryÚkeyÚvaluernÚbÚdZt_sZt_tÚscoresZkey_relative_embeddingsZ rel_logitsZ scores_localÚ block_maskZp_attnÚoutputZrelative_weightsZvalue_relative_embeddingsrrrrosZ    ÿÿ  ÿ  ÿ ý    ÿÿÿzMultiHeadAttention.attentioncCst || d¡¡}|S)zU x: [b, h, l, m] y: [h or 1, m, d] ret: [b, h, l, d] r)r rxrG©rrrKÚretrrrr…Esz/MultiHeadAttention._matmul_with_relative_valuescCst || d¡ dd¡¡}|S)zU x: [b, h, l, d] y: [h or 1, m, d] ret: [b, h, l, m] rrur)r rxrGrrrrrr|Nsz-MultiHeadAttention._matmul_with_relative_keysc Csd|jdt||jddƒ}t|jd|dƒ}|d|d}|dkr:t |t ddg||gddgg¡¡}n|}|dd…||…f}|S)Nr.rr)r0ÚmaxrÚpadrVÚconvert_pad_shape)rZrelative_embeddingsÚlengthÚ pad_lengthZslice_start_positionZslice_end_positionZpadded_relative_embeddingsZused_relative_embeddingsrrrr{Wsþÿz+MultiHeadAttention._get_relative_embeddingsc Cs¼| ¡\}}}}t |t ddgddgddgddgg¡¡}| |||d|g¡}t |t ddgddgd|dgg¡¡}| |||dd|dg¡dd…dd…d|…|dd…f}|S)z? x: [b, h, l, 2*l-1] ret: [b, h, l, l] rrr.N©rXrr’rVr“rw©rrÚbatchÚheadsr”Ú_Zx_flatZx_finalrrrr}is* ÿÿz:MultiHeadAttention._relative_position_to_absolute_positionc Cs¸| ¡\}}}}t |t ddgddgddgd|dgg¡¡}| |||d||dg¡}t |t ddgddg|dgg¡¡}| |||d|g¡dd…dd…dd…dd…f}|S)z? x: [b, h, l, l] ret: [b, h, l, 2*l-1] rrr.Nr–r—rrrr„~s&ÿ $2z:MultiHeadAttention._absolute_position_to_relative_positionc CsJtj|tjd}t |d¡t |d¡}t t t t |¡¡ d¡d¡S)zÄBias for self-attention to encourage attention to close positions. Args: length: an integer scalar. Returns: a Tensor with shape [1, 1, length, length] )rUrr)r ÚarangeÚfloat32rGÚlog1pÚabs)rr”ÚrÚdiffrrrr~Žs"z+MultiHeadAttention._attention_bias_proximal)r*NTNFFr) rrr rrror…r|r{r}r„r~r!rrrrrBÒs ö 5 3  rBcs<eZdZ   d ‡fdd„ Zdd„Zdd „Zd d „Z‡ZS) rCr*NFcsztƒ ¡||_||_||_||_||_||_||_|r!|j |_ n|j |_ t   |||¡|_t   |||¡|_t  |¡|_dSr)rrÚ in_channelsr[r3r6r/Ú activationrOÚ_causal_paddingÚpaddingÚ _same_paddingrr`Úconv_1Úconv_2r9r:)rr¡r[r3r6r/r¢rOrrrr›s  z FFN.__init__cCsb| | ||¡¡}|jdkr|t d|¡}nt |¡}| |¡}| | ||¡¡}||S)NÚgelug¬Zd;û?)r¦r¤r¢r r#Úrelur:r§)rrrHrrrr·s   z FFN.forwardcCsH|jdkr|S|jd}d}ddgddg||gg}t |t |¡¡}|S)Nrr©r6rr’rVr“©rrÚpad_lÚpad_rr¤rrrr£Ás  zFFN._causal_paddingcCsR|jdkr|S|jdd}|jd}ddgddg||gg}t |t |¡¡}|S)Nrr.rrªr«rrrr¥Ês  zFFN._same_padding)r*NF)rrr rrr£r¥r!rrrrrCšsø  rC)ryr rÚtorch.nnrrrVÚloggingÚ getLoggerrÚloggerÚModulerÚjitÚscriptr(r)rLrBrCrrrrÚs     WVI