o g+@sdZddlmZddlmZddlmZddlZddlmZddlZddlmm Z ddl m Z ddl mZdd lmZdd lmZmZmZmZmZmZdddZGdddejZdS)z\ ein notation: b - batch n - sequence nt - text sequence nw - raw wave length d - dimension ) annotations)Callable)randomN)nn) pad_sequence)odeint)MelSpec)defaultexistslist_str_to_idxlist_str_to_tensor lens_to_maskmask_from_frac_lengthsseq_lenint['b']startendcCs^|}tj||d}|dddf|dddfk}|dddf|dddfk}||@S)Ndevice)torcharangelong)rrrr max_seq_lenseqZ start_maskZend_maskrJ/home/node59_tmpdata3/hkchen/DiffRhythm/diffrhythm/diffrhythm/model/cfm.py"custom_mask_from_start_end_indices s   rcseZdZdeddeddddddd d d d f d4fdd ZeddZed d d d ddd d dd d d dd d ddd d5d-d.Z d6d7d2d3Z Z S)8CFMeuler)methodg?)min_stepg333333?g?g?N)gffffff?g?F transformer nn.Module odeint_kwargsdictodeint_optionsfrac_lengths_masktuple[float, float]vocab_char_mapdict[str:int] | Noneuse_style_promptboolc st| |_| |_||_||_||_||_td|jd|jd|j||_ |j } | |_ ||_ ||_ ||_ | |_| |_dS)Nzaudio drop prob -> z; style_drop_prob -> z; lrc_drop_prob: )super__init__r' num_channelsaudio_drop_probcond_drop_probstyle_drop_prob lrc_drop_probprintr"dimsigmar$r&r)r+)selfr"r6r$r&r0r1r2r3r/r'r)r+r5 __class__rrr.(s   z CFM.__init__cCst|jS)N)next parametersr)r7rrrr_sz CFM.device g@iri) style_promptstyle_prompt_lensnegative_style_promptlenssteps cfg_strengthsway_sampling_coefseed max_durationvocoder no_ref_audioduplicate_testt_inter edit_mask start_timelatent_pred_start_framelatent_pred_end_framecondfloat['b n d'] | float['b nw']textint['b nt'] | list[str]durationint | int['b']r@int['b'] | NonerD int | NonerF0Callable[[float['b d n']], float['b nw']] | Nonec$ sRtjtjkr|}|jd|kr&|ddd|ddf}|jdkrA |}| ddd}|jdj ksAJg|jdd|j R\}}}t |s`tj|f||tjd}ttrt jrttj|nt|jd|ksJt rdkjdd}t|}|dur||@}t|g|j }|}t|g|j }t||||j |d}|d}t|t||t|trtj|f||tjd}|j| d}|} |rt j!|dd|| d|fd d }|dkrt|nd|r t|}f d d }g}|D]}t | r)t"| |#tj$|j j jdqt%|dd d}d}|rZ|}d||||}t|d|}tj&|d|j jd} | dur}| | t'tj(d| d| } t)||| fij*}!|!d}"|"}#t||#|}#t | r|# ddd}#| |#}#|#|!fS)Nr)rdtype)r5)rr)maxr)valuec s\j||dddd }dkr|Sj||dddd }|||S)NF) xrNrPtimemaskdrop_audio_cond drop_text drop_promptr=r>rKgh㈵>T)r")tr]predZ null_pred rBr_r?r7rKZ step_condr=r>rPrrfnszCFM.sample..fnT) padding_value batch_first)+evalr:r;rZrfloat16halfshapendimmel_specpermuter/rr fullr isinstancelistr)r tor sumr tensorr unsqueezewhere zeros_likeintclampamaxFpad manual_seedappendrandnrlinspacecospirr$)$r7rNrPrRr=r>r?r@rArBrCrDrErFrGrHrIrJrKrLrMbatchZ cond_seq_lenrZ text_lensZ cond_maskZfixed_span_maskZ test_condrfy0ZdurZt_startrcZ trajectoryZsampledoutrrersamplecs|             " $ z CFM.sampleinpnoise_scheduler str | Nonec  Csjg|jdd|j|j|jR\} } } } } t|s%tj| f| | d}t|| d}tj| f|jd j |j }t ||}t|rE|}|}t |}tjdd| f|jd}tjj|}|dd}d||||}||}t|dt||}t|jk}t|jk}t|jk}|j|||||||||||d }tj||d d }||}|||fS) NrXr)lengthrrW)meanstdsizerrY).N) r]rNrPr^r`rarbr=r> grad_ckptrKnone) reduction)rlrZrr6r rrpr zerosfloatuniform_r'r randn_likenormalr functionalsigmoidrvrwrxrr0r3r2r"r|mse_lossr)r7rrPr=r>r@rrrKrrrZrt_σ1r_Z frac_lengthsZrand_span_maskx1x0r^rctφflowrNr`rarbrdlossrrrforwards40       z CFM.forward) r"r#r$r%r&r%r'r(r)r*r+r,) rNrOrPrQrRrSr@rTrDrUrFrV)NNNNFN)rrOrPrQr@rTrr) __name__ __module__ __qualname__r%r.propertyrrno_gradrr __classcell__rrr8rr's\7 r)rrrrrr)__doc__ __future__rtypingrrrrZtorch.nn.functionalrr|Ztorch.nn.utils.rnnrZ torchdiffeqrZdiffrhythm.model.modulesrZdiffrhythm.model.utilsr r r r r rrModulerrrrrs