U /f#@sddlmZmZmZddlZddlmZddlmmZddl m Z m Z ddl m Z ddlmZddl mZGdd d ejjZdS) )DictOptionalUnionN)nn) pad_sequenceunpad_sequence) IGNORE_ID)LabelSmoothingLoss) th_accuracycseZdZdeeeeeejjejjeeed fdd Z ej ej ddd Z d d Z e ejeeeej fd d dZdej eeeefeedddZedej ej ej ej ej ej ej eeeeej d ddZZS) TransformerLMT) text_encoder_input_sizellm_input_sizellm_output_sizetext_token_sizespeech_token_size text_encoderllmlength_normalized_loss lsm_weight spk_embed_dimc st||_||_tj|||_||_t |j ||_ d|_ d|_ tjd||_||_t ||d|_t|dt| |d|_tj|||_tj | ||_dS)Nr)size padding_idx smoothingZnormalize_length)super__init__rrtorchr Embeddingtext_embeddingrLinear output_sizetext_encoder_affine_layersos_eostask_id llm_embeddingr llm_decoderr r criterion_cespeech_embeddingspk_embed_affine_layer) selfrrrrrrrrrr __class__B/proj/MR_dataset/benson/CosyVoice-main-aug-19/cosyvoice/llm/llm.pyrs, zTransformerLM.__init__)text text_lengthscCs8|j||ddd\}}|dd}||}||fS)Nr)Zdecoding_chunk_sizeZnum_decoding_left_chunks)rsqueezesumr$)r,r1r2Z encoder_outZ encoder_maskZencoder_out_lensr/r/r0encodeBs zTransformerLM.encodec svt|ddt|ddfddttD}tjdd|Dtjd} t|dtd}|| fS)NT) batch_firstc s@g|]8}tjjdd||jdd|gddqS)rdim)rconcatr4.0i embedding sos_eos_emb speech_token task_id_emb text_tokenr/r0 Osz4TransformerLM.pad_unpad_sequence..cSsg|]}|dqS)r)rr;r/r/r0rDPs)dtyper7 padding_value) rcpurangelenrtensorint32rr) r,r@r?rCtext_token_lenrBrAspeech_token_lenlm_input lm_input_lenr/r>r0pad_unpad_sequenceLs "z TransformerLM.pad_unpad_sequence)batchdevicereturnc s^|d|}|d||d||d||d|}fddt|dD}t|d td |}|}|\}tj|d d } |}| d }j j j d d d }j j jd d d }||||\}} || |\} } | } | |} t| d jd |td}| |dS)z Args: text: (B, L, D) text_lengths: (B,) audio: (B, T, N) or (B, T) audio_lengths: (B,) rCrMrArNr?c sDg|]<}ttgd||d|fjgqS)rN)rrKrtolistrr;r,rArNrMr/r0rDgsz)TransformerLM.forward..rTrFrr8r3)Z ignore_label)lossacc)torIrrrr!r6F normalizer+ unsqueezer'weightr%reshaper&r*rQrr(r)r viewr)r,rRrSrCr?Z lm_targetr@rBrOrPZ lm_outputZlm_output_masklogitsrWrXr/rVr0forwardTs* "      zTransformerLM.forwardr)weighted_scoressampling beam_size ignore_eoscCsB|jdd|\}}|j|dd}||}|r>|j|krq>q|S)Nr3r8T) replacement)softmaxtopk multinomialr)r,rbrcrdreprobindicestop_idsr/r/r0 sampling_idss zTransformerLM.sampling_idsr) r1text_len prompt_textprompt_text_lenprompt_speech_tokenprompt_speech_token_lenr?rdrcmax_token_text_ratiomin_token_text_ratiorTc Cs8|j} tj||gdd}||7}||}|||\}}|jddkrntj|dd}||}|j dd}nt dd|j  | }|j j|jddd} |j j|jddd}|dkr||}nt dd|j  | }tj| ||||gdd}t||| }t||| }g}d}tj d|jdtj d|jd}}t|D]}|jj|dd||ttjd|jd|jdf|jd tjd\}}}||dddfjdd}|j|jdd| |||krdnd d }||jkrq$| |||!d7}|jj|ddd}qHtj"|gtj#| d S) Nrr8rr3)rrrr)rS)offsetZrequired_cache_size att_cache cnn_cacheZatt_maskTF)re)rErS)$rSrr:r!r6shaperZr[r+r\zerosrrYr'r]r%r^r&r*intrIrZ forward_chunktrilonesboolr( log_softmaxrmr4itemrappendrrKint64)r,r1rprqrrrsrtr?rdrcrurvrSr@rBZprompt_speech_token_embrOmin_lenmax_lenZ out_tokensrwrxryr=Zy_predZlogprlr/r/r0 inferencesB   " . ,  zTransformerLM.inference)Tr r )TrT)rrnror)__name__ __module__ __qualname__r|rrModulerfloatrTensorr6rQdictrSrstrrrarrminference_moder __classcell__r/r/r-r0r sf +   3   r )typingrrrrrZtorch.nn.functional functionalrZZtorch.nn.utils.rnnrrZcosyvoice.utils.commonrZ*cosyvoice.transformer.label_smoothing_lossr r rr r/r/r/r0s