o ii¯g—%ã@sìddlmZddlZddlZddlZddlZddlmZddl m m Z ddl Z ddl Z ddlZddlZddlZzddlZdZWney\edƒddlmZddlmZdZYnwddlmZmZmZmZmZm Z Gd d „d ƒZ!dS) é)ÚpartialN)ÚCallableTz5failed to import ttsfrd, use WeTextProcessing instead)Ú NormalizerF)Úcontains_chineseÚ replace_blankÚreplace_corner_markÚremove_bracketÚspell_out_numberÚsplit_paragraphc@s„eZdZ   d!dededededed ed efd d „Zd d„Zdd„Zdd„Z dd„Z d"dd„Z dd„Z dd„Z dd„Zdd„Zd S)#ÚCosyVoiceFrontEndÚFÚallÚ get_tokenizerÚfeat_extractorÚcampplus_modelÚspeech_tokenizer_modelÚspk2infoÚinstructÚallowed_specialc Csv|ƒ|_||_t tj ¡rdnd¡|_t ¡}tjj |_ d|_ tj ||dgd|_ tj ||tj ¡r5dndgd|_tj |¡rJtj||jd|_||_||_t ¡|_t|_|jr®t ¡|_tj tj t¡¡} tj d¡swt  !d  "¡¡tj d ¡s…t  !d  "¡¡n t#d | ƒ|j $d  %| ¡¡dusšJdƒ‚|j &d¡|j 'd¡|j (d¡dSt)ddd|_*t+ƒ|_,dS)NÚcudaÚcpuéÚCPUExecutionProvider)Ú sess_optionsÚ providersÚCUDAExecutionProvider)Ú map_locationz resource.zipzRwget https://huggingface.co/FunAudioLLM/CosyVoice-ttsfrd/resolve/main/resource.zipÚresourcezunzip resource.zipÚrootz{}/../../resourceTz$failed to initialize ttsfrd resourceZpinyinF)Z remove_erhuaZ full_to_half)-Ú tokenizerrÚtorchÚdevicerÚ is_availableÚ onnxruntimeÚSessionOptionsZGraphOptimizationLevelZORT_ENABLE_ALLZgraph_optimization_levelZintra_op_num_threadsÚInferenceSessionÚcampplus_sessionÚspeech_tokenizer_sessionÚosÚpathÚexistsÚloadrrrÚinflectÚengineÚinflect_parserÚ use_ttsfrdÚttsfrdZTtsFrontendEngineÚfrdÚdirnameÚabspathÚ__file__Ú subprocessÚrunÚsplitÚprintÚ initializeÚformatZ set_lang_typeZenable_pinyin_mixZset_breakmodel_indexÚ ZhNormalizerÚ zh_tn_modelÚ EnNormalizerÚ en_tn_model) ÚselfrrrrrrrÚoptionÚROOT_DIR©rBúD/home/splend1dchan/Desktop/BreezyVoice-dev/cosyvoice/cli/frontend.pyÚ__init__'s: "         zCosyVoiceFrontEnd.__init__cCsT|jj||jd}tj|gtjd |j¡}tj|jdgtjd |j¡}||fS)N©r©Údtyper) rÚencoderr ÚtensorÚint32Útor!Úshape)r?ÚtextZ text_tokenZtext_token_lenrBrBrCÚ_extract_text_tokenVs z%CosyVoiceFrontEnd._extract_text_tokenc Csªtj|dd}|j d|j ¡dj| ¡ ¡ ¡|j ¡djt j |j dgt j di¡d  ¡ ¡}tj|gtj d |j¡}tj|j dgtj d |j¡}||fS)Né€)Ún_melsrrérF)ÚwhisperZlog_mel_spectrogramr'r6Ú get_inputsÚnameÚdetachrÚnumpyÚnpÚarrayrLrJÚflattenÚtolistr rIrKr!)r?ÚspeechÚfeatÚ speech_tokenÚspeech_token_lenrBrBrCÚ_extract_speech_token\s$$ÿÿÿ z'CosyVoiceFrontEnd._extract_speech_tokencCsvtj|dddd}||jddd}|j d|j ¡dj|jdd ¡  ¡i¡d  ¡  ¡}t   |g¡ |j¡}|S)NéPré€>)Ú num_mel_binsÚditherÚsample_frequencyT)ÚdimÚkeepdim©re)ÚkaldiÚfbankÚmeanr&r6rSrTÚ unsqueezerrVrYrZr rIrKr!)r?r[r\Ú embeddingrBrBrCÚ_extract_spk_embeddingdsý:z(CosyVoiceFrontEnd._extract_spk_embeddingcCsV| |¡jdd dd¡ |j¡}|jdd}tj|jdgtj d |j¡}||fS)NrrgrrF) rÚsqueezeÚ transposerKr!rkr rIrLrJ)r?r[Ú speech_featÚspeech_feat_lenrBrBrCÚ_extract_speech_featns"  z&CosyVoiceFrontEnd._extract_speech_featTc Cs| ¡}t|ƒrV|jr|j |d¡}n|j |¡}| dd¡}t|ƒ}t |ƒ}| dd¡}| dd¡}t |ƒ}t   dd |¡}d d „t |t|jj|jd d dddddDƒ}n.|jra|j |d¡}n|j |¡}t||jƒ}dd „t |t|jj|jd ddddddDƒ}|durŠ|S|S)NÚinputÚ r Ú.uã€z - u,u[,,]+$u。cSóg|]}|‘qSrBrB©Ú.0ÚirBrBrCÚ ‚óz4CosyVoiceFrontEnd.text_normalize..rEÚzhr`é<éF)Z token_max_nZ token_min_nZ merge_lenZ comma_splitcSrvrBrBrwrBrBrCrz‹r{Úen)Ústriprr/r1Zget_frd_extra_infor<Ú normalizeÚreplacerrrÚreÚsubr rrrHrr>r r.)r?rMr7ÚtextsrBrBrCÚtext_normalizets6     þ   þz CosyVoiceFrontEnd.text_normalizecCs.| |¡\}}|j|d}||||dœ}|S)Nrl)rMÚtext_lenÚ llm_embeddingÚflow_embedding)rNr)r?Útts_textÚspk_idÚtts_text_tokenÚtts_text_token_lenrlÚ model_inputrBrBrCÚ frontend_sft’szCosyVoiceFrontEnd.frontend_sftc Csx| |¡\}}| |¡\}}tjjddd|ƒ}| |¡\} } | |¡\} } | |¡} ||||| | | | | | | | dœ }|S)Nrai"V)Ú orig_freqÚnew_freq) rMr‡Ú prompt_textÚprompt_text_lenÚllm_prompt_speech_tokenÚllm_prompt_speech_token_lenZflow_prompt_speech_tokenZflow_prompt_speech_token_lenZprompt_speech_featZprompt_speech_feat_lenrˆr‰)rNÚ torchaudioÚ transformsÚResamplerrr_rm)r?rŠr’Úprompt_speech_16krŒrZprompt_text_tokenZprompt_text_token_lenZprompt_speech_22050rprqr]r^rlrŽrBrBrCÚfrontend_zero_shot˜s ûz$CosyVoiceFrontEnd.frontend_zero_shotcCs*| |d|¡}|d=|d=|d=|d=|S)Nr r’r“r”r•)rš)r?rŠr™rŽrBrBrCÚfrontend_cross_lingual§s z(CosyVoiceFrontEnd.frontend_cross_lingualcCs8| ||¡}|d=| |d¡\}}||d<||d<|S)Nrˆz r’r“)rrN)r?rŠr‹Ú instruct_textrŽZinstruct_text_tokenZinstruct_text_token_lenrBrBrCÚfrontend_instruct°s z#CosyVoiceFrontEnd.frontend_instructN)r Fr )T)Ú__name__Ú __module__Ú __qualname__rÚstrÚboolrDrNr_rmrrr†rršr›rrBrBrBrCr %s8ùÿþýüûú ù/  r )"Ú functoolsrr#r rVrWrRÚtypingrZtorchaudio.compliance.kaldiÚ compliancerhr–r(rƒr,r5r0r/Ú ImportErrorr8Ztn.chinese.normalizerrr;Ztn.english.normalizerr=Zcosyvoice.utils.frontend_utilsrrrrr r r rBrBrBrCÚs.     ü