o uyg5@spddlZddlZddlmZmZddlZddlmZddlmZ ddl m Z ddl m Z GdddejjZdS)N)DictOptional) functional) DictConfig) make_pad_maskcseZdZddddddddddd ddd ed d d ddddddgddddddddddddddddf d ed!ed"ed#ed$ed%ed&ed'ejj d(ejj d)ejj d*e d+e ffd,d- Z d.e d/ej d0e eeejffd1d2Zed3d4ZZS)5MaskedDiffWithXveciPmeli2TNgư>eulercosineg?gffffff?l1)Z sigma_minsolverZ t_schedulerZtraining_cfg_rateZinference_cfg_rateZ reg_loss_typeg@ gelu)channelsdropoutZattention_head_dimn_blocksZnum_mid_blocks num_headsZact_fn) in_channels out_channelZ spk_emb_dimZn_spksZ cfm_paramsZdecoder_paramsi"Vri@)n_fftZnum_mels sampling_rateZhop_sizewin_sizefminfmax input_size output_size spk_embed_dim output_type vocab_sizeinput_frame_rateonly_mask_lossencoderlength_regulatordecoder decoder_conf mel_feat_confc st||_||_| |_| |_||_||_||_t d|jt |||_ tj |||_||_tj |j||_| |_| |_||_dS)Nzinput frame rate=)super__init__r$r%r.r/r(r'r)logginginfonn Embeddinginput_embeddingtorchLinearspk_embed_affine_layerr+ encoder_projr-r,r*) selfr$r%r&r'r(r)r*r+r,r-r.r/ __class__A/home/splend1dchan/Desktop/BreezyVoice-dev/cosyvoice/flow/flow.pyr1s  zMaskedDiffWithXvec.__init__batchdevicereturncCs|d|}|d|}|d|}|d|}|d|}tj|dd}||}t|d|}|tj |d d |}| ||\} } | | } | | |\} } tj |j|jd } t|D]#\} } td kruqjtd td | }|| d|f| | d|f<qj| dd} t|| }tj|jdd| jddddjdd}|jj|dd|d| dd|| d\}}d|iS)N speech_tokenspeech_token_len speech_featspeech_feat_len embeddingr dimrminrAg?g333333?nearest)sizemode)condloss)toF normalizer9rfloat unsqueezer6r7clampr+r:r,zerosshaperA enumeraterandomrandintint transpose interpolatesqueezer-Z compute_loss contiguous)r;r@rAtoken token_lenfeatfeat_lenrGmaskh h_lengthscondsijindexrS_r>r>r?forward7s:    * zMaskedDiffWithXvec.forwardcCs~|jddks Jtj|dd}||}tj||gdd||}}t|d |}| tj |dd|}| ||\} } | | } |ddd} || | \} } tjd| |jg|jd } |jddkrt|D]\} }|| | | d|f<qv| dd } t|  | }|j| dd |d|| d d }|jddkr|dddd|jddf}|S) Nrr rHrJrKr rrrMrN )murhspksrRZ n_timesteps)r[rUrVr9r7concatrrWrXrTr6rYr+r:r_r,rZmaxitemr%rAr\r`r-rc)r;rdre prompt_tokenprompt_token_len prompt_featprompt_feat_lenrGrhrirjrgrkrlrmrfr>r>r? inferencecs4      zMaskedDiffWithXvec.inference)__name__ __module__ __qualname__rr_strboolr7r4Modulerr1dictrArTensorrpinference_moder{ __classcell__r>r>r<r?rs^6     ,r)r2r]typingrrr7torch.nnr4rrUZ omegaconfrcosyvoice.utils.maskrrrr>r>r>r?s