import torch
import torch.nn as nn
from einops import pack, rearrange, repeat

from matcha.models.components.decoder import (
    SinusoidalPosEmb,
    Block1D,
    ResnetBlock1D,
    Downsample1D,
    TimestepEmbedding,
    Upsample1D,
)
from matcha.models.components.transformer import BasicTransformerBlock


class ConditionalDecoder(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        channels=(256, 256),
        dropout=0.05,
        attention_head_dim=64,
        n_blocks=1,
        num_mid_blocks=2,
        num_heads=4,
        act_fn="snake",
    ):
        """
        This decoder requires an input with the same shape as the target. So, if your text content
        is shorter or longer than the outputs, please re-sample it before feeding it to the decoder.
        """
        super().__init__()
        channels = tuple(channels)
        self.in_channels = in_channels
        self.out_channels = out_channels

        self.time_embeddings = SinusoidalPosEmb(in_channels)
        time_embed_dim = channels[0] * 4
        self.time_mlp = TimestepEmbedding(
            in_channels=in_channels,
            time_embed_dim=time_embed_dim,
            act_fn="silu",
        )
        self.down_blocks = nn.ModuleList([])
        self.mid_blocks = nn.ModuleList([])
        self.up_blocks = nn.ModuleList([])

        # Downsampling path: ResNet block + transformer blocks + downsample per stage.
        output_channel = in_channels
        for i in range(len(channels)):
            input_channel = output_channel
            output_channel = channels[i]
            is_last = i == len(channels) - 1
            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
            transformer_blocks = nn.ModuleList(
                [
                    BasicTransformerBlock(
                        dim=output_channel,
                        num_attention_heads=num_heads,
                        attention_head_dim=attention_head_dim,
                        dropout=dropout,
                        activation_fn=act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )
            downsample = (
                Downsample1D(output_channel)
                if not is_last
                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
            )
            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))

        # Middle blocks operate at the lowest temporal resolution.
        for _ in range(num_mid_blocks):
            input_channel = channels[-1]
            out_channels = channels[-1]
            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
            transformer_blocks = nn.ModuleList(
                [
                    BasicTransformerBlock(
                        dim=output_channel,
                        num_attention_heads=num_heads,
                        attention_head_dim=attention_head_dim,
                        dropout=dropout,
                        activation_fn=act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )
            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))

        # Upsampling path mirrors the downsampling path; input channels are doubled
        # because skip connections are concatenated along the channel axis.
        channels = channels[::-1] + (channels[0],)
        for i in range(len(channels) - 1):
            input_channel = channels[i] * 2
            output_channel = channels[i + 1]
            is_last = i == len(channels) - 2
            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
            transformer_blocks = nn.ModuleList(
                [
                    BasicTransformerBlock(
                        dim=output_channel,
                        num_attention_heads=num_heads,
                        attention_head_dim=attention_head_dim,
                        dropout=dropout,
                        activation_fn=act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )
            upsample = (
                Upsample1D(output_channel, use_conv_transpose=True)
                if not is_last
                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
            )
            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))

        self.final_block = Block1D(channels[-1], channels[-1])
        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
        self.initialize_weights()

    def initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.GroupNorm):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x, mask, mu, t, spks=None, cond=None):
        """Forward pass of the UNet1DConditional model.

        Args:
            x (torch.Tensor): shape (batch_size, in_channels, time)
            mask (_type_): shape (batch_size, 1, time)
            t (_type_): shape (batch_size)
            spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
            cond (_type_, optional): placeholder for future use. Defaults to None.

        Raises:
            ValueError: _description_
            ValueError: _description_

        Returns:
            _type_: _description_
        """
        t = self.time_embeddings(t).to(t.dtype)
        t = self.time_mlp(t)

        # Concatenate the noisy sample with the conditioning features along channels.
        x = pack([x, mu], "b * t")[0]

        if spks is not None:
            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
            x = pack([x, spks], "b * t")[0]
        if cond is not None:
            x = pack([x, cond], "b * t")[0]

        hiddens = []
        masks = [mask]
        for resnet, transformer_blocks, downsample in self.down_blocks:
            mask_down = masks[-1]
            x = resnet(x, mask_down, t)
            x = rearrange(x, "b c t -> b t c").contiguous()
            attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=attn_mask,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t").contiguous()
            hiddens.append(x)  # save hidden states for skip connections
            x = downsample(x * mask_down)
            masks.append(mask_down[:, :, ::2])
        masks = masks[:-1]
        mask_mid = masks[-1]

        for resnet, transformer_blocks in self.mid_blocks:
            x = resnet(x, mask_mid, t)
            x = rearrange(x, "b c t -> b t c").contiguous()
            attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=attn_mask,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t").contiguous()

        for resnet, transformer_blocks, upsample in self.up_blocks:
            mask_up = masks.pop()
            skip = hiddens.pop()
            x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
            x = resnet(x, mask_up, t)
            x = rearrange(x, "b c t -> b t c").contiguous()
            attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=attn_mask,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t").contiguous()
            x = upsample(x * mask_up)
        x = self.final_block(x, mask_up)
        output = self.final_proj(x * mask_up)
        return output * mask
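

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (editor addition, not part of the original file).
# It assumes the `matcha-tts` package is importable and uses illustrative
# sizes only; `feat_dim`, `spk_dim`, and `frames` are hypothetical values.
# Note that `in_channels` must equal the channel count of the packed input
# (x + mu + spks), since forward() concatenates them along the channel axis.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    batch, frames, feat_dim, spk_dim = 2, 100, 80, 80
    decoder = ConditionalDecoder(in_channels=feat_dim * 2 + spk_dim, out_channels=feat_dim)
    x = torch.randn(batch, feat_dim, frames)   # noisy sample, (B, C, T)
    mu = torch.randn(batch, feat_dim, frames)  # encoder output, (B, C, T)
    mask = torch.ones(batch, 1, frames)        # all frames valid
    t = torch.rand(batch)                      # timestep in [0, 1)
    spks = torch.randn(batch, spk_dim)         # speaker embedding, (B, C)
    out = decoder(x, mask, mu, t, spks=spks)
    print(out.shape)  # expected: torch.Size([2, 80, 100])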