o g@sdZddlmZddlZddlmZddlZddlmmZddlm Z ddl m Z ddl m Z ddlmZdd lmZmZmZmZmZmZmZGd d d ejZGd d d ejZGdddejZdS)z\ ein notation: b - batch n - sequence nt - text sequence nw - raw wave length d - dimension ) annotationsN)nn)RotaryEmbeddingLlamaDecoderLayer) LlamaConfig) checkpoint)TimestepEmbeddingConvNeXtV2BlockConvPositionEmbeddingDiTBlockAdaLayerNormZero_Finalprecompute_freqs_cisget_pos_embed_indicescs*eZdZd fdd Zd d dd ZZS) TextEmbeddingrcsvtt|d|_|dkr6d|_d|_|jdt|jddtj fdd t |D|_ dSd|_dS) NrTi freqs_cisF) persistentcsg|] }tqS)r ).0_ conv_multtext_dimrJ/home/node59_tmpdata3/hkchen/DiffRhythm/diffrhythm/diffrhythm/model/dit.py .sz*TextEmbedding.__init__..) super__init__r Embedding text_embedextra_modelingprecompute_max_posregister_bufferr Sequentialrange text_blocks)selftext_num_embedsr conv_layersr __class__rrr%s  zTextEmbedding.__init__Ftext int['b nt']c Csv|jd|jd}}|rt|}||}|jr9tj|ftjd}t|||jd}|j |}||}| |}|S)Nrr)dtype)max_pos) shapetorch zeros_liker r!zeroslongrr"rr&) r'r,seq_len drop_textbatchtext_lenZ batch_startpos_idxZtext_pos_embedrrrforward3s    zTextEmbedding.forward)rrF)r,r-__name__ __module__ __qualname__rr: __classcell__rrr*rr$srcs(eZdZfddZd d dd ZZS) InputEmbeddingcs8tt|d||d||_t|d|_dS)Nrdim)rrrLinearprojr conv_pos_embed)r'mel_dimrout_dimcond_dimr*rrrPs zInputEmbedding.__init__Fxfloat['b n d']condr cCsr|rt|}|dd|jdd}|dd|jdd}|tj|||||fdd}|||}|S)NrrB)r1r2 unsqueezerepeatr0rEcatrF)r'rJrLr style_embZtime_embdrop_audio_condrrrr:Us zInputEmbedding.forwardr;)rJrKrLrKr rKr<rrr*rrAOs rAc sNeZdZdddddddddd d d fd d Z    ddddZZS)DiT@g?dNrF) depthheadsdim_headdropoutff_multrGr(rr)long_skip_connectionuse_style_promptc stdt|_t|_| dur|} t|| | d|_t|| d|_|_ ||_ t |ddd_ t fddt|D|_| rUt jd d d nd|_t fd dt|d D|_|jD]} | D]}|qsqmt|_t ||_dS) Ni)r))rIsilu) hidden_sizeintermediate_size hidden_actsdpacsg|]}t|dqS)) layer_idxrri) llama_configrrrsz DiT.__init__..rF)biascs$g|]}tttqSr)rr$rDSiLUrf)rIrCrrrs )rrr time_embedstart_time_embedrr rA input_embedrCrYr_attn_implementationr ModuleListr%transformer_blocksrDr^text_fusion_linears parametersdetachzero_r norm_outproj_out)r'rCrYrZr[r\r]rGr(rr)r^r_layerpr*)rIrCrhrres6        z DiT.__init__rJrKrLr,r-timefloat['b'] | float['']maskbool['b n'] | Nonec CsT|jd|jd} }|jdkr|| }||}|| }||}|j|||d}|r2t|}|}|j||||||d}|j durF|}tj |jd|j d}| d|jdd}t |jD])\}}| sq|||d^}}n t|||dd^}}||jd kr||j||}qb|j dur| tj||fd d }|||}||}|S) Nrr)r6)rR)device) position_idsF)r~ use_reentrantrrMrB)r0ndimrOrkrlr r1r2rmr^aranger}rN enumeraterprrYrqrPrurv)r'rJrLr,ryrRr6 drop_prompt style_promptstyle_prompt_lensr{ grad_ckpt start_timer7r5ts_tcr rQresidualZpos_idsrgblockroutputrrrr:s6          z DiT.forward)FNNNFN) rJrKrLrKr,r-ryrzr{r|r<rrr*rrSds(DrS)__doc__ __future__rr1rZtorch.nn.functional functionalFx_transformers.x_transformersrZ(transformers.models.llama.modeling_llamarZtransformers.models.llamartorch.utils.checkpointrdiffrhythm.model.modulesr r r r r rrModulerrArSrrrrs      $+