import math

import torch
import torch.nn as nn
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.models.modeling_utils import ModelMixin

from .attention import flash_attention

__all__ = ['WanModel']


def sinusoidal_embedding_1d(dim, position):
    # preprocess
    assert dim % 2 == 0
    half = dim // 2
    position = position.type(torch.float64)

    # calculation
    sinusoid = torch.outer(
        position, torch.pow(10000, -torch.arange(half).to(position).div(half)))
    x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
    return x


@torch.amp.autocast('cuda', enabled=False)
def rope_params(max_seq_len, dim, theta=10000):
    assert dim % 2 == 0
    freqs = torch.outer(
        torch.arange(max_seq_len),
        1.0 / torch.pow(theta,
                        torch.arange(0, dim, 2).to(torch.float64).div(dim)))
    freqs = torch.polar(torch.ones_like(freqs), freqs)
    return freqs


@torch.amp.autocast('cuda', enabled=False)
def rope_apply(x, grid_sizes, freqs):
    n, c = x.size(2), x.size(3) // 2

    # split freqs into per-axis (temporal, height, width) tables
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)

    # loop over samples
    output = []
    for i, (f, h, w) in enumerate(grid_sizes.tolist()):
        seq_len = f * h * w

        # precompute multipliers
        x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float64).reshape(
            seq_len, n, -1, 2))
        freqs_i = torch.cat([
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
        ],
                            dim=-1).reshape(seq_len, 1, -1)

        # apply rotary embedding
        x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
        x_i = torch.cat([x_i, x[i, seq_len:]])

        # append to collection
        output.append(x_i)
    return torch.stack(output).float()
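
# Editor's note: the helper below is an illustrative sketch added for clarity and is
# not part of the original module. It shows one plausible way `rope_params` and
# `rope_apply` fit together for a single video of (F, H, W) patch tokens; the head
# count, head dimension and grid size are arbitrary assumptions, and the per-axis
# channel split mirrors the frequency layout built in `WanModel.__init__` below.
def _rope_usage_sketch():
    num_heads, head_dim = 8, 64  # assumed values; head_dim must be even
    freqs = torch.cat([
        rope_params(1024, head_dim - 4 * (head_dim // 6)),  # temporal channels
        rope_params(1024, 2 * (head_dim // 6)),  # height channels
        rope_params(1024, 2 * (head_dim // 6))  # width channels
    ],
                      dim=1)
    grid_sizes = torch.tensor([[4, 6, 6]], dtype=torch.long)  # (F, H, W) patches
    seq_len = int(grid_sizes[0].prod())
    x = torch.randn(1, seq_len, num_heads, head_dim)
    out = rope_apply(x, grid_sizes, freqs)
    assert out.shape == x.shape
    return out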
class WanRMSNorm(nn.Module):

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        r"""
        Args:
            x(Tensor): Shape [B, L, C]
        """
        return self._norm(x.float()).type_as(x) * self.weight

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)


class WanLayerNorm(nn.LayerNorm):

    def __init__(self, dim, eps=1e-6, elementwise_affine=False):
        super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)

    def forward(self, x):
        return super().forward(x).type_as(x)


class WanSelfAttention(nn.Module):

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=(-1, -1),
                 qk_norm=True,
                 eps=1e-6):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.window_size = window_size
        self.qk_norm = qk_norm
        self.eps = eps

        # layers
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)
        self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
        self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()

    def forward(self, x, seq_lens, grid_sizes, freqs):
        r"""
        Args:
            x(Tensor): Shape [B, L, num_heads, C / num_heads]
            seq_lens(Tensor): Shape [B]
            grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
        """
        b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim

        # query, key, value function
        def qkv_fn(x):
            q = self.norm_q(self.q(x)).view(b, s, n, d)
            k = self.norm_k(self.k(x)).view(b, s, n, d)
            v = self.v(x).view(b, s, n, d)
            return q, k, v

        q, k, v = qkv_fn(x)

        x = flash_attention(
            q=rope_apply(q, grid_sizes, freqs),
            k=rope_apply(k, grid_sizes, freqs),
            v=v,
            k_lens=seq_lens,
            window_size=self.window_size)

        # output
        x = x.flatten(2)
        x = self.o(x)
        return x


class WanCrossAttention(WanSelfAttention):

    def forward(self, x, context, context_lens):
        r"""
        Args:
            x(Tensor): Shape [B, L1, C]
            context(Tensor): Shape [B, L2, C]
            context_lens(Tensor): Shape [B]
        """
        b, n, d = x.size(0), self.num_heads, self.head_dim

        # compute query, key, value
        q = self.norm_q(self.q(x)).view(b, -1, n, d)
        k = self.norm_k(self.k(context)).view(b, -1, n, d)
        v = self.v(context).view(b, -1, n, d)

        # compute attention
        x = flash_attention(q, k, v, k_lens=context_lens)

        # output
        x = x.flatten(2)
        x = self.o(x)
        return x


class WanAttentionBlock(nn.Module):

    def __init__(self,
                 dim,
                 ffn_dim,
                 num_heads,
                 window_size=(-1, -1),
                 qk_norm=True,
                 cross_attn_norm=False,
                 eps=1e-6):
        super().__init__()
        self.dim = dim
        self.ffn_dim = ffn_dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.qk_norm = qk_norm
        self.cross_attn_norm = cross_attn_norm
        self.eps = eps

        # layers
        self.norm1 = WanLayerNorm(dim, eps)
        self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm,
                                          eps)
        self.norm3 = WanLayerNorm(
            dim, eps,
            elementwise_affine=True) if cross_attn_norm else nn.Identity()
        self.cross_attn = WanCrossAttention(dim, num_heads, (-1, -1), qk_norm,
                                            eps)
        self.norm2 = WanLayerNorm(dim, eps)
        self.ffn = nn.Sequential(
            nn.Linear(dim, ffn_dim), nn.GELU(approximate='tanh'),
            nn.Linear(ffn_dim, dim))

        # modulation
        self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)

    def forward(self, x, e, seq_lens, grid_sizes, freqs, context, context_lens):
        r"""
        Args:
            x(Tensor): Shape [B, L, C]
            e(Tensor): Shape [B, L1, 6, C]
            seq_lens(Tensor): Shape [B], length of each sequence in batch
            grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
        """
        assert e.dtype == torch.float32
        with torch.amp.autocast('cuda', dtype=torch.float32):
            e = (self.modulation.unsqueeze(0) + e).chunk(6, dim=2)
        assert e[0].dtype == torch.float32

        # self-attention
        y = self.self_attn(
            self.norm1(x).float() * (1 + e[1].squeeze(2)) + e[0].squeeze(2),
            seq_lens, grid_sizes, freqs)
        with torch.amp.autocast('cuda', dtype=torch.float32):
            x = x + y * e[2].squeeze(2)

        # cross-attention & ffn
        def cross_attn_ffn(x, context, context_lens, e):
            x = x + self.cross_attn(self.norm3(x), context, context_lens)
            y = self.ffn(
                self.norm2(x).float() * (1 + e[4].squeeze(2)) + e[3].squeeze(2))
            with torch.amp.autocast('cuda', dtype=torch.float32):
                x = x + y * e[5].squeeze(2)
            return x

        x = cross_attn_ffn(x, context, context_lens, e)
        return x


class Head(nn.Module):

    def __init__(self, dim, out_dim, patch_size, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.out_dim = out_dim
        self.patch_size = patch_size
        self.eps = eps

        # layers
        out_dim = math.prod(patch_size) * out_dim
        self.norm = WanLayerNorm(dim, eps)
        self.head = nn.Linear(dim, out_dim)

        # modulation
        self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)

    def forward(self, x, e):
        r"""
        Args:
            x(Tensor): Shape [B, L1, C]
            e(Tensor): Shape [B, L1, C]
        """
        assert e.dtype == torch.float32
        with torch.amp.autocast('cuda', dtype=torch.float32):
            e = (self.modulation.unsqueeze(0) + e.unsqueeze(2)).chunk(2, dim=2)
            x = self.head(self.norm(x) * (1 + e[1].squeeze(2)) + e[0].squeeze(2))
        return x


class WanModel(ModelMixin, ConfigMixin):
    r"""
    Wan diffusion backbone supporting both text-to-video and image-to-video.
    """

    ignore_for_config = [
        'patch_size', 'cross_attn_norm', 'qk_norm', 'text_dim', 'window_size'
    ]
    _no_split_modules = ['WanAttentionBlock']

    @register_to_config
    def __init__(self,
                 model_type='t2v',
                 patch_size=(1, 2, 2),
                 text_len=512,
                 in_dim=16,
                 dim=2048,
                 ffn_dim=8192,
                 freq_dim=256,
                 text_dim=4096,
                 out_dim=16,
                 num_heads=16,
                 num_layers=32,
                 window_size=(-1, -1),
                 qk_norm=True,
                 cross_attn_norm=True,
                 eps=1e-6):
        r"""
        Initialize the diffusion model backbone.

        Args:
            model_type (`str`, *optional*, defaults to 't2v'):
                Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video)
            patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
                3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
            text_len (`int`, *optional*, defaults to 512):
                Fixed length for text embeddings
            in_dim (`int`, *optional*, defaults to 16):
                Input video channels (C_in)
            dim (`int`, *optional*, defaults to 2048):
                Hidden dimension of the transformer
            ffn_dim (`int`, *optional*, defaults to 8192):
                Intermediate dimension in feed-forward network
            freq_dim (`int`, *optional*, defaults to 256):
                Dimension for sinusoidal time embeddings
            text_dim (`int`, *optional*, defaults to 4096):
                Input dimension for text embeddings
            out_dim (`int`, *optional*, defaults to 16):
                Output video channels (C_out)
            num_heads (`int`, *optional*, defaults to 16):
                Number of attention heads
            num_layers (`int`, *optional*, defaults to 32):
                Number of transformer blocks
            window_size (`tuple`, *optional*, defaults to (-1, -1)):
                Window size for local attention (-1 indicates global attention)
            qk_norm (`bool`, *optional*, defaults to True):
                Enable query/key normalization
            cross_attn_norm (`bool`, *optional*, defaults to False):
                Enable cross-attention normalization
            eps (`float`, *optional*, defaults to 1e-6):
                Epsilon value for normalization layers
        """

        super().__init__()

        assert model_type in ['t2v', 'i2v', 'ti2v']
        self.model_type = model_type

        self.patch_size = patch_size
        self.text_len = text_len
        self.in_dim = in_dim
        self.dim = dim
        self.ffn_dim = ffn_dim
        self.freq_dim = freq_dim
        self.text_dim = text_dim
        self.out_dim = out_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.window_size = window_size
        self.qk_norm = qk_norm
        self.cross_attn_norm = cross_attn_norm
        self.eps = eps

        # embeddings
        self.patch_embedding = nn.Conv3d(
            in_dim, dim, kernel_size=patch_size, stride=patch_size)
        self.text_embedding = nn.Sequential(
            nn.Linear(text_dim, dim), nn.GELU(approximate='tanh'),
            nn.Linear(dim, dim))

        self.time_embedding = nn.Sequential(
            nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
        self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))

        # blocks
        self.blocks = nn.ModuleList([
            WanAttentionBlock(dim, ffn_dim, num_heads, window_size, qk_norm,
                              cross_attn_norm, eps) for _ in range(num_layers)
        ])

        # head
        self.head = Head(dim, out_dim, patch_size, eps)

        # buffers (don't use register_buffer otherwise dtype will be changed in to())
        assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
        d = dim // num_heads
        self.freqs = torch.cat([
            rope_params(1024, d - 4 * (d // 6)),
            rope_params(1024, 2 * (d // 6)),
            rope_params(1024, 2 * (d // 6))
        ],
                               dim=1)

        # initialize weights
        self.init_weights()

    def forward(self, x, t, context, seq_len, y=None):
        r"""
        Forward pass through the diffusion model

        Args:
            x (List[Tensor]):
                List of input video tensors, each with shape [C_in, F, H, W]
            t (Tensor):
                Diffusion timesteps tensor of shape [B]
            context (List[Tensor]):
                List of text embeddings each with shape [L, C]
            seq_len (`int`):
                Maximum sequence length for positional encoding
            y (List[Tensor], *optional*):
                Conditional video inputs for image-to-video mode, same shape as x

        Returns:
            List[Tensor]:
                List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
        """
        if self.model_type == 'i2v':
            assert y is not None
        # params
        device = self.patch_embedding.weight.device
        if self.freqs.device != device:
            self.freqs = self.freqs.to(device)

        if y is not None:
            x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]

        # embeddings
        x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
        grid_sizes = torch.stack(
            [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
        x = [u.flatten(2).transpose(1, 2) for u in x]
        seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
        assert seq_lens.max() <= seq_len
        x = torch.cat([
            torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
                      dim=1) for u in x
        ])

        # time embeddings
        if t.dim() == 1:
            t = t.expand(t.size(0), seq_len)
        with torch.amp.autocast('cuda', dtype=torch.float32):
            bt = t.size(0)
            t = t.flatten()
            e = self.time_embedding(
                sinusoidal_embedding_1d(self.freq_dim,
                                        t).unflatten(0, (bt, seq_len)).float())
            e0 = self.time_projection(e).unflatten(2, (6, self.dim))
            assert e.dtype == torch.float32 and e0.dtype == torch.float32

        # context
        context_lens = None
        context = self.text_embedding(
            torch.stack([
                torch.cat(
                    [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
                for u in context
            ]))

        # arguments
        kwargs = dict(
            e=e0,
            seq_lens=seq_lens,
            grid_sizes=grid_sizes,
            freqs=self.freqs,
            context=context,
            context_lens=context_lens)

        for block in self.blocks:
            x = block(x, **kwargs)

        # head
        x = self.head(x, e)

        # unpatchify
        x = self.unpatchify(x, grid_sizes)
        return [u.float() for u in x]
    def unpatchify(self, x, grid_sizes):
        r"""
        Reconstruct video tensors from patch embeddings.

        Args:
            x (List[Tensor]):
                List of patchified features, each with shape [L, C_out * prod(patch_size)]
            grid_sizes (Tensor):
                Original spatial-temporal grid dimensions before patching,
                    shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)

        Returns:
            List[Tensor]:
                Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
        """

        c = self.out_dim
        out = []
        for u, v in zip(x, grid_sizes.tolist()):
            u = u[:math.prod(v)].view(*v, *self.patch_size, c)
            u = torch.einsum('fhwpqrc->cfphqwr', u)
            u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
            out.append(u)
        return out

    def init_weights(self):
        r"""
        Initialize model parameters using Xavier initialization.
        """

        # basic init
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

        # init embeddings
        nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
        for m in self.text_embedding.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=.02)
        for m in self.time_embedding.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=.02)

        # init output layer
        nn.init.zeros_(self.head.head.weight)
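
# Editor's note: the block below is an illustrative usage sketch added for clarity;
# it is not part of the original file. Input shapes follow the `WanModel.forward`
# docstring (each latent video is [C_in, F, H, W], each text embedding is
# [L, text_dim]); the toy hyper-parameters are arbitrary assumptions, and actually
# executing the forward pass additionally requires a CUDA device plus the
# flash-attention backend wrapped by `wan.modules.attention.flash_attention`.
if __name__ == '__main__':
    model = WanModel(
        model_type='t2v',
        patch_size=(1, 2, 2),
        text_len=512,
        in_dim=16,
        dim=128,  # assumed toy width, divisible by num_heads
        ffn_dim=256,
        freq_dim=64,
        text_dim=4096,
        out_dim=16,
        num_heads=8,
        num_layers=2)

    if torch.cuda.is_available():
        model = model.cuda()
        # one latent video [16, 5, 16, 16]; (1, 2, 2) patching gives a
        # (5, 8, 8) token grid, i.e. 320 tokens
        videos = [torch.randn(16, 5, 16, 16, device='cuda')]
        timestep = torch.tensor([500.], device='cuda')
        text = [torch.randn(77, 4096, device='cuda')]
        denoised = model(videos, t=timestep, context=text, seq_len=320)
        print(denoised[0].shape)  # expected: torch.Size([16, 5, 16, 16])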