o !cgFLã@sxdZddlmZddlmZddlZddlZddlmZddlmm Z ddl Z ddl m Z ddlmZGdd „d ejƒZGd d „d ejƒZGd d „d ejƒZd/d0dd„Zd1dd„ZGdd„dejƒZGdd„dejƒZGdd„dejƒZGdd „d ejƒZGd!d"„d"ejƒZGd#d$„d$ejƒZGd%d&„d&ƒZGd'd(„d(ƒZGd)d*„d*ejƒZGd+d,„d,ejƒZGd-d.„d.ejƒZ dS)2z\ ein notation: b - batch n - sequence nt - text sequence nw - raw wave length d - dimension é)Ú annotations)ÚOptionalN)Únn)Ú rearrange)Úapply_rotary_pos_embcs8eZdZ         d ‡fd d „ Zd d „Z‡ZS)ÚMelSpecééédéÀ]FéNTc sJtƒ ¡||_tjj||||||| ||d |_|jdt  d¡dddS)N) Ú sample_rateÚn_fftÚ win_lengthÚ hop_lengthÚn_melsÚpowerÚcenterÚ normalizedÚnormÚdummyrF)Ú persistent) ÚsuperÚ__init__Ún_mel_channelsÚ torchaudioÚ transformsÚMelSpectrogramÚmel_stftÚregister_bufferÚtorchÚtensor) ÚselfZ filter_lengthrrrÚtarget_sample_rateÚ normalizerrr©Ú __class__©ú/home/user/app/model/modules.pyrs ÷ zMelSpec.__init__cCsbt|jƒdkr t|dƒ}t|jƒdksJ‚|jj|jkr"| |j¡| |¡}|jdd ¡}|S)Nézb 1 nw -> b nwégñh㈵øä>)Úmin) ÚlenÚshaperrÚdeviceÚtorÚclampÚlog)r"ÚinpÚmelr'r'r(Úforward7s   zMelSpec.forward) rr rr r Fr NT©Ú__name__Ú __module__Ú __qualname__rr4Ú __classcell__r'r'r%r(rsörcó&eZdZ‡fdd„Zddd„Z‡ZS)ÚSinusPositionEmbeddingcstƒ ¡||_dS©N)rrÚdim©r"r=r%r'r(rHs  zSinusPositionEmbedding.__init__éècCsv|j}|jd}t d¡|d}t tj||d ¡| ¡}|| d¡| d¡}tj |  ¡|  ¡fdd}|S)Nr*i'r ©r.réÿÿÿÿ©r=) r.r=Úmathr1r ÚexpÚarangeÚfloatÚ unsqueezeÚcatÚsinÚcos)r"ÚxÚscaler.Úhalf_dimÚembr'r'r(r4Ls zSinusPositionEmbedding.forward)r?r5r'r'r%r(r;Gs r;cs*eZdZd ‡fdd„ Zd dd d „Z‡ZS)ÚConvPositionEmbeddingééc s`tƒ ¡|ddks J‚t tj|||||ddt ¡tj|||||ddt ¡¡|_dS)Nr*r)ÚgroupsÚpadding)rrrÚ SequentialÚConv1dÚMishÚconv1d)r"r=Ú kernel_sizerRr%r'r(rYs  üzConvPositionEmbedding.__init__NrKúfloat['b n d']Úmaskúbool['b n'] | NonecCsV|dur|d}| |d¡}t|dƒ}| |¡}t|dƒ}|dur)| |d¡}|S)N).Nçzb n d -> b d nzb d n -> b n d)Ú masked_fillrrW)r"rKrZÚoutr'r'r(r4cs   zConvPositionEmbedding.forward)rPrQr<)rKrYrZr[r5r'r'r%r(rOXs rOçˆÃ@çð?r=ÚintÚendÚthetarFcCs„||||d9}d|t d|d¡d|d… ¡|}tj||jd}t ||¡ ¡}t |¡}t |¡}tj||gddS)Nr*r`rr@rArB)r rErFr.ÚouterrJrIrH)r=rbrcZtheta_rescale_factorÚfreqsÚtZ freqs_cosZ freqs_sinr'r'r(Úprecompute_freqs_cists*  rgcCs`|tj|tjd}| d¡tj||jtjd d¡| d¡ ¡}t ||k||d¡}|S)N)Údtyper )r.rhr)r Ú ones_likeÚfloat32rGrEr.ÚlongÚwhere)ÚstartÚlengthÚmax_posrLÚposr'r'r(Úget_pos_embed_indicessÿþrqcó$eZdZ‡fdd„Zdd„Z‡ZS)ÚGRNcs:tƒ ¡t t dd|¡¡|_t t dd|¡¡|_dS)Nr )rrrÚ Parameterr ÚzerosÚgammaÚbetar>r%r'r(rs z GRN.__init__cCs@tj|dddd}||jdddd}|j|||j|S)Nr*r T)Úpr=ÚkeepdimrA)r=ryçíµ ÷ư>)r rÚmeanrvrw)r"rKZGxZNxr'r'r(r4”sz GRN.forwardr5r'r'r%r(rsŽs rscs,eZdZ d d‡fdd„ Zdd d „Z‡ZS)ÚConvNeXtV2Blockr r=raÚintermediate_dimÚdilationcsrtƒ ¡|dd}tj||d|||d|_tj|dd|_t ||¡|_t  ¡|_ t |ƒ|_ t ||¡|_ dS)Nér*é)rXrSrRr~rz)Úeps)rrrrUÚdwconvÚ LayerNormrÚLinearÚpwconv1ÚGELUÚactrsÚgrnÚpwconv2)r"r=r}r~rSr%r'r(ržs    zConvNeXtV2Block.__init__rKú torch.TensorÚreturncCs`|}| dd¡}| |¡}| dd¡}| |¡}| |¡}| |¡}| |¡}| |¡}||S)Nr r*)Ú transposer‚rr…r‡rˆr‰)r"rKÚresidualr'r'r(r4­s        zConvNeXtV2Block.forward)r )r=rar}rar~ra)rKrŠr‹rŠr5r'r'r%r(r|sür|cr:)ÚAdaLayerNormZerocó<tƒ ¡t ¡|_t ||d¡|_tj|ddd|_dS)NrFrz©Úelementwise_affiner© rrrÚSiLUÚsilur„Úlinearrƒrr>r%r'r(r¾ó  zAdaLayerNormZero.__init__Nc Csh| | |¡¡}tj|ddd\}}}}}}| |¡d|dd…df|dd…df}|||||fS)Nrr rB©r•r”r Úchunkr) r"rKrNZ shift_msaZ scale_msaÚgate_msaÚ shift_mlpÚ scale_mlpÚgate_mlpr'r'r(r4Æs.zAdaLayerNormZero.forwardr<r5r'r'r%r(r޽s rŽcrr)ÚAdaLayerNormZero_Finalcr)Nr*Frzrr’r>r%r'r(rÒr–zAdaLayerNormZero_Final.__init__cCsb| | |¡¡}tj|ddd\}}| |¡d|dd…ddd…f|dd…ddd…f}|S)Nr*r rBr—)r"rKrNrLÚshiftr'r'r(r4Ús:zAdaLayerNormZero_Final.forwardr5r'r'r%r(rÑs rcs(eZdZd d ‡fdd„ Zd d „Z‡ZS) Ú FeedForwardNér\ÚnoneÚ approximateÚstrc shtƒ ¡t||ƒ}|dur|n|}tj|d}t t ||¡|¡}t |t |¡t ||¡¡|_dS)N)r¢) rrrarr†rTr„ÚDropoutÚff) r"r=Zdim_outÚmultÚdropoutr¢Ú inner_dimÚ activationÚ project_inr%r'r(rås    þ  ýzFeedForward.__init__cCs | |¡Sr<)r¥)r"rKr'r'r(r4õs zFeedForward.forward)Nr r\r¡)r¢r£r5r'r'r%r(rŸäsrŸcs>eZdZ     dd‡fdd„ Z    dddd„Z‡ZS)Ú Attentionéé@r\NÚ processorú"JointAttnProcessor | AttnProcessorr=raÚheadsÚdim_headr§rFÚ context_dimú Optional[int]cstƒ ¡ttdƒstdƒ‚||_||_||_|||_||_ ||_ ||_ t   ||j¡|_t   ||j¡|_t   ||j¡|_|j dur_t   ||j¡|_t   ||j¡|_|j dur_t   ||j¡|_t  g¡|_|j t   |j|¡¡|j t  |¡¡|j dur‹|j st   |j|¡|_dSdSdS)NÚscaled_dot_product_attentionzHAttention equires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.)rrÚhasattrÚFÚ ImportErrorr®r=r°r¨r§r²Úcontext_pre_onlyrr„Úto_qÚto_kÚto_vÚto_k_cÚto_v_cÚto_q_cÚ ModuleListÚto_outÚappendr¤Úto_out_c)r"r®r=r°r±r§r²r¸r%r'r(rýs0     ÿzAttention.__init__rKrYÚcrZr[r‹rŠcCs0|dur|j||||||dS|j||||dS)N)rÃrZÚropeÚc_rope)rZrÄ)r®)r"rKrÃrZrÄrÅr'r'r(r4'szAttention.forward)r¬r­r\NN) r®r¯r=rar°rar±rar§rFr²r³©NNNN)rKrYrÃrYrZr[r‹rŠr5r'r'r%r(r«üsø-úr«c@s$eZdZdd„Z  ddd d „ZdS)Ú AttnProcessorcCódSr<r'©r"r'r'r(r8ózAttnProcessor.__init__NÚattnr«rKrYrZr[r‹útorch.FloatTensorcCs†|jd}| |¡}| |¡}| |¡}|dur6|\} } | dur&| | dfnd\} } t|| | ƒ}t|| | ƒ}|jd} | |j}| |d|j|¡ dd¡}| |d|j|¡ dd¡}| |d|j|¡ dd¡}|dur‚|}t|dƒ}|  ||j|jd|jd¡}nd}t j ||||d d d }| dd¡  |d|j|¡}|  |j¡}|jd|ƒ}|jd|ƒ}|durÁt|d ƒ}| |d ¡}|S) Nrçð¿©r`r`rAr r*úb n -> b 1 1 néþÿÿÿr\F©Ú attn_maskÚ dropout_pÚ is_causalú b n -> b n 1)r-r¹rºr»rr°ÚviewrŒrÚexpandr¶r´Úreshaper/rhrÀr])r"rËrKrZrÄÚ batch_sizeÚqueryÚkeyÚvaluereÚ xpos_scaleÚ q_xpos_scaleÚ k_xpos_scaler¨Úhead_dimrÒr'r'r(Ú__call__;s8            zAttnProcessor.__call__©NN)rËr«rKrYrZr[r‹rÌ©r6r7r8rrár'r'r'r(rÇ7s ûrÇc@s(eZdZdd„Z    dddd„ZdS)ÚJointAttnProcessorcCrÈr<r'rÉr'r'r(rurÊzJointAttnProcessor.__init__NrËr«rKrYrÃúfloat['b nt d']rZr[r‹rÌcCs†|}|jd}| |¡} | |¡} | |¡} | |¡} | |¡} | |¡}|durG|\}}|dur7||dfnd\}}t| ||ƒ} t| ||ƒ} |duri|\}}|durY||dfnd\}}t| ||ƒ} t| ||ƒ} tj | | gdd} tj | | gdd} tj | |gdd} | jd}||j }|   |d|j |¡  dd¡} |   |d|j |¡  dd¡} |   |d|j |¡  dd¡} |durÛt j|d|jdfdd }t|d ƒ}| ||j | jd | jd ¡}nd}t j| | | |d d d}|  dd¡ |d|j |¡}| | j¡}|dd…d|jd…f|dd…|jdd…f}}|jd|ƒ}|jd|ƒ}|js.| |¡}|dur?t|dƒ}| |d ¡}||fS)NrrÍrÎr rBrAr*T)rÜrÏrÐr\FrÑrÕ)r-r¹rºr»r¾r¼r½rr rHr°rÖrŒr¶Úpadrr×r´rØr/rhrÀr¸rÂr])r"rËrKrÃrZrÄrÅrrÙrÚrÛrÜZc_queryZc_keyZc_valuererÝrÞrßr¨ràrÒr'r'r(ráxsZ                þ   zJointAttnProcessor.__call__rÆ) rËr«rKrYrÃrårZr[r‹rÌrãr'r'r'r(rätsùräcs(eZdZd‡fdd„ Zd dd„Z‡ZS) ÚDiTBlockr çš™™™™™¹?csRtƒ ¡t|ƒ|_ttƒ||||d|_tj|ddd|_ t |||dd|_ dS)N)r®r=r°r±r§FrzrÚtanh©r=r¦r§r¢) rrrŽÚ attn_normr«rÇrËrrƒÚff_normrŸr¥)r"r=r°r±Úff_multr§r%r'r(rËs  ûzDiTBlock.__init__Nc Csˆ|j||d\}}}}} |j|||d} || d¡| }| |¡d|dd…df|dd…df}| |¡} ||  d¡| }|S)N©rN)rKrZrÄr )rërËrGrìr¥) r"rKrfrZrÄrr™ršr›rœÚ attn_outputZ ff_outputr'r'r(r4Ús. zDiTBlock.forward)r rèrâr5r'r'r%r(rçÉsrçcs,eZdZdZd ‡fdd„ Zd dd „Z‡ZS) Ú MMDiTBlocka  modified from diffusers/src/diffusers/models/attention.py notes. _c: context related. text, cond, etc. (left part in sd3 fig2.b) _x: noised input related. (right part) context_pre_only: last layer only do prenorm + modulation cuz no more ffn r rèFc s¨tƒ ¡||_|rt|ƒnt|ƒ|_t|ƒ|_ttƒ||||||d|_ |s:t j |ddd|_ t |||dd|_nd|_ d|_t j |ddd|_t |||dd|_dS)N)r®r=r°r±r§r²r¸Frzrrérê)rrr¸rrŽÚ attn_norm_cÚ attn_norm_xr«rärËrrƒÚ ff_norm_crŸÚff_cÚ ff_norm_xÚff_x)r"r=r°r±rír§r¸r%r'r(r÷s(  ù zMMDiTBlock.__init__NcCs(|jr | ||¡}n |j||d\}}} } } |j||d\} } }}}|j| ||||d\}}|jr4d}n.|| d¡|}| |¡d| dd…df| dd…df}| |¡}||  d¡|}||  d¡|}| |¡d|dd…df|dd…df} | | ¡}|| d¡|}||fS)Nrî)rKrÃrZrÄrÅr ) r¸rñròrËrGrórôrõrö)r"rKrÃrfrZrÄrÅZnorm_cZ c_gate_msaZ c_shift_mlpZ c_scale_mlpZ c_gate_mlpZnorm_xZ x_gate_msaZ x_shift_mlpZ x_scale_mlpZ x_gate_mlpZ x_attn_outputZ c_attn_outputZ c_ff_outputZ x_ff_outputr'r'r(r4s . . zMMDiTBlock.forward)r rèF)NNN)r6r7r8Ú__doc__rr4r9r'r'r%r(rðís rðcs(eZdZd‡fdd„ Zd dd„Z‡ZS) ÚTimestepEmbeddingr cs<tƒ ¡t|ƒ|_t t ||¡t ¡t ||¡¡|_dSr<) rrr;Ú time_embedrrTr„r“Útime_mlp)r"r=Zfreq_embed_dimr%r'r(r3s     ýzTimestepEmbedding.__init__Útimestepú float['b']cCs| |¡}| |¡}|Sr<)rùrú)r"rûZ time_hiddenÚtimer'r'r(r4<s  zTimestepEmbedding.forward)r )rûrür5r'r'r%r(rø2s rø)r_r`)r=rarbrarcrF)r`)!r÷Ú __future__rÚtypingrrCr rÚtorch.nn.functionalÚ functionalr¶rÚeinopsrZx_transformers.x_transformersrÚModulerr;rOrgrqrsr|rŽrrŸr«rÇrärçrðrør'r'r'r(Ús4    .    ;=U$E