o ´0Àg£Oã@s†dZddlmZddlmZddlZddlZddlmZddlZddlmm Z ddl Z ddl m Z Gdd„dejƒZGd d „d ejƒZGd d „d ejƒZGd d„dejƒZd0d1dd„Zd2dd„ZGdd„dejƒZGdd„dejƒZGdd„dejƒZGd d!„d!ejƒZGd"d#„d#ejƒZGd$d%„d%ejƒZGd&d'„d'ƒZGd(d)„d)ƒZGd*d+„d+ejƒZGd,d-„d-ejƒZGd.d/„d/ejƒZdS)3z\ ein notation: b - batch n - sequence nt - text sequence nw - raw wave length d - dimension é)Ú annotations)ÚOptionalN)Únn)Úapply_rotary_pos_embcs(eZdZdZ‡fdd„Zdd„Z‡ZS)Ú FiLMLayerze Feature-wise Linear Modulation (FiLM) layer Reference: https://arxiv.org/abs/1709.07871 cs,tt|ƒ ¡||_t ||dd¡|_dS)Néé)ÚsuperrÚ__init__Ú in_channelsrÚConv1dÚfilm)Úselfr Z cond_channels©Ú __class__©úN/home/node59_tmpdata3/hkchen/DiffRhythm/diffrhythm/diffrhythm/model/modules.pyr szFiLMLayer.__init__cCsDtj| | d¡¡ddd\}}| dd¡}| dd¡}|||S)Nrr)ÚchunksÚdim)ÚtorchÚchunkr Ú unsqueezeÚ transpose)rÚxÚcÚgammaÚbetarrrÚforward#s    zFiLMLayer.forward©Ú__name__Ú __module__Ú __qualname__Ú__doc__r rÚ __classcell__rrrrrs rcs8eZdZ         d ‡fd d „ Zd d „Z‡ZS)ÚMelSpecééédéÀ]FrNTc sJtƒ ¡||_tjj||||||| ||d |_|jdt  d¡dddS)N) Ú sample_rateÚn_fftÚ win_lengthÚ hop_lengthÚn_melsÚpowerÚcenterÚ normalizedÚnormÚdummyrF)Ú persistent) r r Ún_mel_channelsÚ torchaudioÚ transformsÚMelSpectrogramÚmel_stftÚregister_bufferrÚtensor) rZ filter_lengthr,r+r4Útarget_sample_rateÚ normalizer.r1r/rrrr .s ÷ zMelSpec.__init__cCsbt|jƒdkr | d¡}t|jƒdksJ‚|jj|jkr"| |j¡| |¡}|jdd ¡}|S)Nérrgñh㈵øä>)Úmin) ÚlenÚshapeÚsqueezer2ÚdeviceÚtor8ÚclampÚlog)rÚinpÚmelrrrrKs   zMelSpec.forward) r%r&r%r'r(FrNT©rr r!r rr#rrrrr$-sör$có&eZdZ‡fdd„Zddd„Z‡ZS)ÚSinusPositionEmbeddingcstƒ ¡||_dS©N)r r r©rrrrrr ]s  zSinusPositionEmbedding.__init__éècCsv|j}|jd}t d¡|d}t tj||d ¡| ¡}|| d¡| d¡}tj |  ¡|  ¡fdd}|S)Nri'r©rBréÿÿÿÿ©r) rBrÚmathrErÚexpÚarangeÚfloatrÚcatÚsinÚcos)rrÚscalerBZhalf_dimÚembrrrras zSinusPositionEmbedding.forward)rMrHrrrrrJ\s rJcs*eZdZd ‡fdd„ Zd dd d „Z‡ZS)ÚConvPositionEmbeddingééc s`tƒ ¡|ddks J‚t tj|||||ddt ¡tj|||||ddt ¡¡|_dS)Nrr)ÚgroupsÚpadding)r r rÚ Sequentialr ÚMishÚconv1d)rrÚ kernel_sizer]rrrr os  üzConvPositionEmbedding.__init__Nrúfloat['b n d']Úmaskúbool['b n'] | NonecCs^|dur|d}| |d¡}| ddd¡}| |¡}| ddd¡}|dur-| |d¡}|S)N).Nçrrr)Ú masked_fillÚpermutera)rrrdÚoutrrrrys zConvPositionEmbedding.forward)r[r\rK)rrcrdrerHrrrrrZns rZçˆÃ@çð?rÚintÚendÚthetarTcCs„||||d9}d|t d|d¡d|d… ¡|}tj||jd}t ||¡ ¡}t |¡}t |¡}tj||gddS)NrrkrrNrOrP)rrSrTrBÚouterrWrVrU)rrmrnZtheta_rescale_factorÚfreqsÚtZ freqs_cosZ freqs_sinrrrÚprecompute_freqs_cis‹s*  rrcCs`|tj|tjd}| d¡tj||jtjd d¡| d¡ ¡}t ||k||d¡}|S)N)Údtyper)rBrsr)rÚ ones_likeÚfloat32rrSrBÚlongÚwhere)ÚstartÚlengthÚmax_posrXÚposrrrÚget_pos_embed_indices™s&ÿÿr|có$eZdZ‡fdd„Zdd„Z‡ZS)ÚGRNcs:tƒ ¡t t dd|¡¡|_t t dd|¡¡|_dS)Nr)r r rÚ ParameterrÚzerosrrrLrrrr ©s z GRN.__init__cCs@tj|dddd}||jdddd}|j|||j|S)NrrT)ÚprÚkeepdimrO)rr‚çíµ ÷ư>)rr1Úmeanrr)rrZGxÚNxrrrr®sz GRN.forwardrHrrrrr~¨s r~cs,eZdZ d d‡fdd„ Zdd d „Z‡ZS)ÚConvNeXtV2BlockrrrlÚintermediate_dimÚdilationcsrtƒ ¡|dd}tj||d|||d|_tj|dd|_t ||¡|_t  ¡|_ t |ƒ|_ t ||¡|_ dS)Néré)rbr^r]rˆrƒ)Úeps)r r rr ÚdwconvÚ LayerNormr1ÚLinearÚpwconv1ÚGELUÚactr~ÚgrnÚpwconv2)rrr‡rˆr^rrrr ¹s   ÿ  zConvNeXtV2Block.__init__rú torch.TensorÚreturncCs`|}| dd¡}| |¡}| dd¡}| |¡}| |¡}| |¡}| |¡}| |¡}||S)Nrr)rrŒr1rr‘r’r“)rrÚresidualrrrrÊs        zConvNeXtV2Block.forward)r)rrlr‡rlrˆrl)rr”r•r”rHrrrrr†¸sür†crI)ÚAdaLayerNormZerocs<tƒ ¡t ¡|_t ||d¡|_tj|ddd|_dS)Nr‰Frƒ©Úelementwise_affiner‹© r r rÚSiLUÚsilurŽÚlinearrr1rLrrrr Üó  zAdaLayerNormZero.__init__Nc Csh| | |¡¡}tj|ddd\}}}}}}| |¡d|dd…df|dd…df}|||||fS)Nr‰rrP©rrœrrr1) rrrYZ shift_msaZ scale_msaÚgate_msaÚ shift_mlpÚ scale_mlpÚgate_mlprrrräs.zAdaLayerNormZero.forwardrKrHrrrrr—Ûs r—cr})ÚAdaLayerNormZero_Finalcs<tƒ ¡t ¡|_t ||d¡|_tj|ddd|_dS)NrFrƒr˜rš)rrZcond_dimrrrr ñržzAdaLayerNormZero_Final.__init__cCsb| | |¡¡}tj|ddd\}}| |¡d|dd…ddd…f|dd…ddd…f}|S)NrrrPrŸ)rrrYrXÚshiftrrrrùs:zAdaLayerNormZero_Final.forwardrHrrrrr¤ðs r¤cs(eZdZd d ‡fdd„ Zd d „Z‡ZS) Ú FeedForwardNérfÚnoneÚ approximateÚstrc shtƒ ¡t||ƒ}|dur|n|}tj|d}t t ||¡|¡}t |t |¡t ||¡¡|_dS)N)r©) r r rlrrr_rŽÚDropoutÚff) rrZdim_outÚmultÚdropoutr©Ú inner_dimÚ activationZ project_inrrrr s   "zFeedForward.__init__cCs | |¡SrK)r¬)rrrrrrs zFeedForward.forward)Nr§rfr¨)r©rªrHrrrrr¦s r¦cs>eZdZ     dd‡fdd„ Z    dddd„Z‡ZS)Ú Attentionéé@rfNÚ processorú"JointAttnProcessor | AttnProcessorrrlÚheadsÚdim_headr®rTÚ context_dimú Optional[int]cstƒ ¡ttdƒstdƒ‚||_||_||_|||_||_ ||_ ||_ t   ||j¡|_t   ||j¡|_t   ||j¡|_|j dur_t   ||j¡|_t   ||j¡|_|j dur_t   ||j¡|_t  g¡|_|j t   |j|¡¡|j t  |¡¡|j dur‹|j st   |j|¡|_dSdSdS)NÚscaled_dot_product_attentionzHAttention equires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.)r r ÚhasattrÚFÚ ImportErrorr´rr¶r¯r®r¸Úcontext_pre_onlyrrŽÚto_qÚto_kÚto_vÚto_k_cÚto_v_cÚto_q_cÚ ModuleListÚto_outÚappendr«Úto_out_c)rr´rr¶r·r®r¸r¾rrrr s0     ÿzAttention.__init__rrcrrdrer•r”cCs0|dur|j||||||dS|j||||dS)N)rrdÚropeÚc_rope)rdrÉ)r´)rrrrdrÉrÊrrrrBszAttention.forward)r²r³rfNN) r´rµrrlr¶rlr·rlr®rTr¸r¹©NNNN)rrcrrcrdrer•r”rHrrrrr±sø-úr±c@s$eZdZdd„Z  ddd d „ZdS)Ú AttnProcessorcCódSrKr©rrrrr TózAttnProcessor.__init__NÚattnr±rrcrdrer•útorch.FloatTensorcCsŒ|jd}| |¡}| |¡}| |¡}|dur6|\} } | dur&| | dfnd\} } t|| | ƒ}t|| | ƒ}|jd} | |j}| |d|j|¡ dd¡}| |d|j|¡ dd¡}| |d|j|¡ dd¡}|dur…|}| d¡ d¡}|  ||j|jd|jd¡}nd}t j ||||dd d }| dd¡  |d|j|¡}|  |j¡}|jd|ƒ}|jd|ƒ}|durÄ| d¡}| |d¡}|S) Nrçð¿©rkrkrOrréþÿÿÿrfF©Ú attn_maskÚ dropout_pÚ is_causal)r@r¿rÀrÁrr¶ÚviewrrÚexpandr¼rºÚreshaperCrsrÆrg)rrÐrrdrÉÚ batch_sizeÚqueryÚkeyÚvaluerpÚ xpos_scaleÚ q_xpos_scaleÚ k_xpos_scaler¯Úhead_dimrÖrrrÚ__call__Ws8           zAttnProcessor.__call__)NN)rÐr±rrcrdrer•rÑ©rr r!r rärrrrrÌSs ûrÌc@s(eZdZdd„Z    dddd„ZdS)ÚJointAttnProcessorcCrÍrKrrÎrrrr ‘rÏzJointAttnProcessor.__init__NrÐr±rrcrúfloat['b nt d']rdrer•rÑcCsŒ|}|jd}| |¡} | |¡} | |¡} | |¡} | |¡} | |¡}|durG|\}}|dur7||dfnd\}}t| ||ƒ} t| ||ƒ} |duri|\}}|durY||dfnd\}}t| ||ƒ} t| ||ƒ} tj | | gdd} tj | | gdd} tj | |gdd} | jd}||j }|   |d|j |¡  dd¡} |   |d|j |¡  dd¡} |   |d|j |¡  dd¡} |durÞt j|d|jdfdd }| d¡ d¡}| ||j | jd | jd ¡}nd}t j| | | |d d d }|  dd¡ |d|j |¡}| | j¡}|dd…d|jd…f|dd…|jdd…f}}|jd|ƒ}|jd|ƒ}|js1| |¡}|durB| d¡}| |d ¡}||fS)NrrÒrÓrrPrOrT)rßrÔrfFrÕ)r@r¿rÀrÁrÄrÂrÃrrrUr¶rÙrr¼ÚpadrrÚrºrÛrCrsrÆr¾rÈrg)rrÐrrrdrÉrÊr–rÜrÝrÞrßZc_queryZc_keyZc_valuerpràrárâr¯rãrÖrrrrä”sZ               þ   zJointAttnProcessor.__call__rË) rÐr±rrcrrçrdrer•rÑrårrrrræsùræcs(eZdZd ‡fdd„ Zd dd„Z‡ZS) ÚDiTBlockr§çš™™™™™¹?Fcsjtƒ ¡t|ƒ|_ttƒ||||d|_tj|ddd|_ t |||dd|_ ||_ |r3t |ƒ|_dSdS)N)r´rr¶r·r®Frƒr˜Útanh©rr­r®r©)r r r—Ú attn_normr±rÌrÐrrÚff_normr¦r¬Úuse_style_promptr¤Ú prompt_norm)rrr¶r·Úff_multr®rïrrrr çs  ûþzDiTBlock.__init__Nc Cs¢|dur |jr | ||¡}|j||d\}}}} } |j|||d} || d¡| }| |¡d| dd…df|dd…df}| |¡} ||  d¡| }|S)N©rY)rrdrÉr)rïrðrírÐrrîr¬) rrrqrrdrÉr1r r¡r¢r£Ú attn_outputZ ff_outputrrrrûs . zDiTBlock.forward©r§rêF©NNNrHrrrrréæsrécs,eZdZdZd ‡fdd„ Zd dd „Z‡ZS) Ú MMDiTBlocka  modified from diffusers/src/diffusers/models/attention.py notes. _c: context related. text, cond, etc. (left part in sd3 fig2.b) _x: noised input related. (right part) context_pre_only: last layer only do prenorm + modulation cuz no more ffn r§rêFc s¨tƒ ¡||_|rt|ƒnt|ƒ|_t|ƒ|_ttƒ||||||d|_ |s:t j |ddd|_ t |||dd|_nd|_ d|_t j |ddd|_t |||dd|_dS)N)r´rr¶r·r®r¸r¾Frƒr˜rërì)r r r¾r¤r—Ú attn_norm_cÚ attn_norm_xr±rærÐrrÚ ff_norm_cr¦Úff_cÚ ff_norm_xÚff_x)rrr¶r·rñr®r¾rrrr s(  ù zMMDiTBlock.__init__NcCs(|jr | ||¡}n |j||d\}}} } } |j||d\} } }}}|j| ||||d\}}|jr4d}n.|| d¡|}| |¡d| dd…df| dd…df}| |¡}||  d¡|}||  d¡|}| |¡d|dd…df|dd…df} | | ¡}|| d¡|}||fS)Nrò)rrrdrÉrÊr) r¾r÷rørÐrrùrúrûrü)rrrrqrdrÉrÊZnorm_cZ c_gate_msaZ c_shift_mlpZ c_scale_mlpZ c_gate_mlpÚnorm_xZ x_gate_msaZ x_shift_mlpZ x_scale_mlpZ x_gate_mlpZ x_attn_outputZ c_attn_outputZ c_ff_outputZ x_ff_outputrrrr7s . . zMMDiTBlock.forwardrôrõrrrrrrös röcs(eZdZd‡fdd„ Zd dd„Z‡ZS) ÚTimestepEmbeddingr&cs<tƒ ¡t|ƒ|_t t ||¡t ¡t ||¡¡|_dSrK) r r rJÚ time_embedrr_rŽr›Útime_mlp)rrZfreq_embed_dimrrrr Zs  (zTimestepEmbedding.__init__Útimestepú float['b']cCs$| |¡}| |j¡}| |¡}|SrK)rÿrCrsr)rrZ time_hiddenÚtimerrrr_s   zTimestepEmbedding.forward)r&)rrrHrrrrrþYsrþ)rjrk)rrlrmrlrnrT)rk) r"Ú __future__rÚtypingrrQrrÚtorch.nn.functionalÚ functionalr¼r5Zx_transformers.x_transformersrÚModulerr$rJrZrrr|r~r†r—r¤r¦r±rÌrærérörþrrrrÚs6   /  #<=V-F