U lxd#@sddlZddlZddlZddlZddlZddlZddlmZddl Z ddl m Z m Z m Z ddl m Z mZGdddej jjZGdddZdS)N)spectrogram_torchspec_to_mel_torchr)load_wav_to_torchload_filepaths_and_textc@sBeZdZdZdeedddZddZd d Zd d Zd dZ dS)TextAudioSpeakerLoaderz 1) loads audio, speaker_id, text pairs 2) normalizes text and converts them to sequences of integers 3) computes spectrograms from audio files. FT) all_in_memvol_augcst|_|_|jj_|jj_|jj_|jj_|jj_|jj_|j j _ |j j _ |j _|jj_|j jo||_tdtj|_jrfddjD_dS)Nicsg|]}|dqS)r) get_audio).0pselfD:\so-vits-svc\data_utils.py -sz3TextAudioSpeakerLoader.__init__..)r audiopathshparamsdata max_wav_value sampling_rate filter_length hop_length win_lengthtrainZuse_srZ max_speclenZspec_lenspkspk_mapmodelZ vol_embeddingvol_embrrandomseedshufflercache)r rrrrrr r__init__s$            zTextAudioSpeakerLoader.__init__cCsR|dd}t|\}}||jkr4td||j||j}|d}|dd}tj |rlt |}n4t ||j |j|j|jdd}t |d}t |||dd }t |j|g}tj |d d d \}} t tj|td }t tj| td } t |d} t| d|jd} |jrN|d} t | } t | } nd} t| d|d} t | d|ddkst!| d|d|j|ft |jd| |jd|jkst!|ddd| f| ddd| f|d| | d| f\}} }} |ddd| |jf}| dkr@| d| } | ||||| | fS)N\/z {} SR doesn't match target {} SRrz.wavz.spec.ptFcenterz.f0.npyT) allow_pickle)dtypez.soft.ptz.vol.npy)"replacerr ValueErrorformatr unsqueezeospathexiststorchloadrrrrsqueezesavesplit LongTensorrnp FloatTensorarrayfloatutilsrepeat_expand_2dshaper from_numpyminsizeabsAssertionError)r filenameaudior audio_normZ spec_filenamespecrf0uvcZ volume_pathvolumelminrrrr /sP           <(D  z TextAudioSpeakerLoader.get_audioc CsVtddgr|jr|dkrttt|d}tdt d|} t d| } |d| }|d| }t ||j j j|j j j|j j j|j j jddd}|jdd krDtd|jdd } | d } |dd| | f|dd| | f|| | || | f\}}}}|dd| |j| |jf}|dkrD|| | }|||||||fS) NTFgh㈵>r,r* r%ri i)rchoicerr=r4maxrDrBr:log10uniformrrrrrrrr@randint) r rLrJrIrHrrKrMZmax_ampZ max_shiftZlog10_vol_shiftstartendrrr random_slice]s.   D   z#TextAudioSpeakerLoader.random_slicecCs4|jr|j|j|S|j||j|dSdS)Nr)rrWr!r r)r indexrrr __getitem__xsz"TextAudioSpeakerLoader.__getitem__cCs t|jSN)lenrr rrr__len__~szTextAudioSpeakerLoader.__len__N)FT) __name__ __module__ __qualname____doc__boolr"r rWrYr\rrrrrs .rc@seZdZddZdS)TextAudioCollatec CsPdd|D}tjtdd|Dddd\}}tdd|D}tdd|D}tt|}tt||ddjd|}tt||}tt||dd jd|} tt|d |} tt|d } tt||} tt||} || || | | tt|D]}|||}|d}|||ddd| d f<| d ||<|d }|||d| df<|d }|| |ddd| d f<|d }|| |ddd| d f<|d | |df<|d }|| |d| df<|d}|dkr4|| |d| df<nd} q2||| | | || | fS)NcSsg|]}|dk r|qSrZr)r brrrrsz-TextAudioCollate.__call__..cSsg|]}|djdqSrr,)r@r xrrrrsrT)dim descendingcSsg|]}|ddqSrdrCrerrrrscSsg|]}|ddqS)r+r,rirerrrrsr,r+) r4sortr9rQr[r;r@zero_rangerC)r batch input_lengthsZids_sorted_decreasingZ max_c_lenZ max_wav_lenlengthsZc_paddedZ f0_paddedZ spec_paddedZ wav_paddedZspkidsZ uv_paddedZ volume_paddedirowrLrJrIwavrKrMrrr__call__sR     zTextAudioCollate.__call__N)r]r^r_rwrrrrrbsrb)timer1rnumpyr:r4torch.utils.datamodules.commonscommonsr>modules.mel_processingrrrrrDatasetrrbrrrrs o