import gc
import logging
import math
import os
import random
import sched
import sys
import types
from contextlib import contextmanager
from functools import partial

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.cuda.amp as amp
import torch.distributed as dist
import torch.nn.functional
from PIL import Image
from regex import P
from torch.fft import fftn, ifftn
from tqdm import tqdm
from transformers import ViTFeatureExtractor, ViTModel

from wan.utils.utils import cache_video

from .distributed.fsdp import shard_model
from .modules.model import WanModel
from .modules.t5 import T5EncoderModel
from .modules.vae import WanVAE
from .utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
                               get_sampling_sigmas, retrieve_timesteps)
from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler


class WanT2V:

    def __init__(
        self,
        config,
        checkpoint_dir,
        device_id=0,
        rank=0,
        t5_fsdp=False,
        dit_fsdp=False,
        use_usp=False,
        t5_cpu=False,
    ):
        r"""
        Initializes the Wan text-to-video generation model components.

        Args:
            config (EasyDict):
                Object containing model parameters initialized from config.py
            checkpoint_dir (`str`):
                Path to directory containing model checkpoints
            device_id (`int`, *optional*, defaults to 0):
                Id of target GPU device
            rank (`int`, *optional*, defaults to 0):
                Process rank for distributed training
            t5_fsdp (`bool`, *optional*, defaults to False):
                Enable FSDP sharding for T5 model
            dit_fsdp (`bool`, *optional*, defaults to False):
                Enable FSDP sharding for DiT model
            use_usp (`bool`, *optional*, defaults to False):
                Enable distribution strategy of USP.
            t5_cpu (`bool`, *optional*, defaults to False):
                Whether to place T5 model on CPU. Only works without t5_fsdp.
        """
        self.device = torch.device(f"cuda:{device_id}")
        self.config = config
        self.rank = rank
        self.t5_cpu = t5_cpu

        self.num_train_timesteps = config.num_train_timesteps
        self.param_dtype = config.param_dtype

        shard_fn = partial(shard_model, device_id=device_id)
        self.text_encoder = T5EncoderModel(
            text_len=config.text_len,
            dtype=config.t5_dtype,
            device=torch.device('cpu'),
            checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
            tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
            shard_fn=shard_fn if t5_fsdp else None)

        self.vae_stride = config.vae_stride
        self.patch_size = config.patch_size
        self.vae = WanVAE(
            vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
            device=self.device)

        logging.info(f"Creating WanModel from {checkpoint_dir}")
        self.model = WanModel.from_pretrained(checkpoint_dir)
        self.model.eval().requires_grad_(False)

        if use_usp:
            from xfuser.core.distributed import \
                get_sequence_parallel_world_size

            from .distributed.xdit_context_parallel import (usp_attn_forward,
                                                            usp_dit_forward)
            for block in self.model.blocks:
                block.self_attn.forward = types.MethodType(
                    usp_attn_forward, block.self_attn)
            self.model.forward = types.MethodType(usp_dit_forward, self.model)
            self.sp_size = get_sequence_parallel_world_size()
        else:
            self.sp_size = 1

        if dist.is_initialized():
            dist.barrier()
        if dit_fsdp:
            self.model = shard_fn(self.model)
        else:
            self.model.to(self.device)

        self.sample_neg_prompt = config.sample_neg_prompt
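
    # Every sampler in this class uses the same classifier-free guidance rule:
    # the DiT runs once with the prompt context and once with the negative
    # prompt context, and the two velocity predictions are blended as
    #
    #     v = v_uncond + guide_scale * (v_cond - v_uncond)
    #
    # guide_scale = 1.0 reduces to the conditional prediction alone; larger
    # values trade diversity for prompt adherence.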

    def generate(self,
                 input_prompt,
                 size=(1280, 720),
                 frame_num=81,
                 shift=5.0,
                 sample_solver='unipc',
                 sampling_steps=40,
                 guide_scale=5.0,
                 n_prompt="",
                 seed=-1,
                 offload_model=True):
        r"""
        Generates video frames from text prompt using diffusion process.

        Args:
            input_prompt (`str`):
                Text prompt for content generation
            size (tuple[`int`], *optional*, defaults to (1280,720)):
                Controls video resolution, (width, height).
            frame_num (`int`, *optional*, defaults to 81):
                How many frames to sample from a video. The number should be 4n+1
            shift (`float`, *optional*, defaults to 5.0):
                Noise schedule shift parameter. Affects temporal dynamics
            sample_solver (`str`, *optional*, defaults to 'unipc'):
                Solver used to sample the video.
            sampling_steps (`int`, *optional*, defaults to 40):
                Number of diffusion sampling steps. Higher values improve quality but slow generation
            guide_scale (`float`, *optional*, defaults to 5.0):
                Classifier-free guidance scale. Controls prompt adherence vs. creativity
            n_prompt (`str`, *optional*, defaults to ""):
                Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
            seed (`int`, *optional*, defaults to -1):
                Random seed for noise generation. If -1, use random seed.
            offload_model (`bool`, *optional*, defaults to True):
                If True, offloads models to CPU during generation to save VRAM

        Returns:
            torch.Tensor:
                Generated video frames tensor. Dimensions: (C, N, H, W) where:
                - C: Color channels (3 for RGB)
                - N: Number of frames (81)
                - H: Frame height (from size)
                - W: Frame width (from size)
        """
        # preprocess
        F = frame_num
        target_shape = (self.vae.model.z_dim, (F - 1) // self.vae_stride[0] + 1,
                        size[1] // self.vae_stride[1],
                        size[0] // self.vae_stride[2])

        seq_len = math.ceil((target_shape[2] * target_shape[3]) /
                            (self.patch_size[1] * self.patch_size[2]) *
                            target_shape[1] / self.sp_size) * self.sp_size

        if n_prompt == "":
            n_prompt = self.sample_neg_prompt
        seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
        seed_g = torch.Generator(device=self.device)
        seed_g.manual_seed(seed)

        if not self.t5_cpu:
            self.text_encoder.model.to(self.device)
            context = self.text_encoder([input_prompt], self.device)
            context_null = self.text_encoder([n_prompt], self.device)
            if offload_model:
                self.text_encoder.model.cpu()
        else:
            context = self.text_encoder([input_prompt], torch.device('cpu'))
            context_null = self.text_encoder([n_prompt], torch.device('cpu'))
            context = [t.to(self.device) for t in context]
            context_null = [t.to(self.device) for t in context_null]

        noise = [
            torch.randn(
                target_shape[0],
                target_shape[1],
                target_shape[2],
                target_shape[3],
                dtype=torch.float32,
                device=self.device,
                generator=seed_g)
        ]

        @contextmanager
        def noop_no_sync():
            yield

        no_sync = getattr(self.model, 'no_sync', noop_no_sync)

        # evaluation mode
        with amp.autocast(dtype=self.param_dtype), torch.no_grad(), no_sync():

            if sample_solver == 'unipc':
                sample_scheduler = FlowUniPCMultistepScheduler(
                    num_train_timesteps=self.num_train_timesteps,
                    shift=1,
                    use_dynamic_shifting=False)
                sample_scheduler.set_timesteps(
                    sampling_steps, device=self.device, shift=shift)
                timesteps = sample_scheduler.timesteps
            elif sample_solver == 'dpm++':
                sample_scheduler = FlowDPMSolverMultistepScheduler(
                    num_train_timesteps=self.num_train_timesteps,
                    shift=1,
                    use_dynamic_shifting=False)
                sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
                timesteps, _ = retrieve_timesteps(
                    sample_scheduler,
                    device=self.device,
                    sigmas=sampling_sigmas)
            else:
                raise NotImplementedError("Unsupported solver.")

            # sample videos
            latents = noise

            arg_c = {'context': context, 'seq_len': seq_len}
            arg_null = {'context': context_null, 'seq_len': seq_len}

            for _, t in enumerate(tqdm(timesteps)):
                latent_model_input = latents
                timestep = torch.stack([t])

                self.model.to(self.device)
                noise_pred_cond = self.model(
                    latent_model_input, t=timestep, **arg_c)[0]
                noise_pred_uncond = self.model(
                    latent_model_input, t=timestep, **arg_null)[0]

                noise_pred = noise_pred_uncond + guide_scale * (
                    noise_pred_cond - noise_pred_uncond)

                temp_x0 = sample_scheduler.step(
                    noise_pred.unsqueeze(0),
                    t,
                    latents[0].unsqueeze(0),
                    return_dict=False,
                    generator=seed_g)[0]
                latents = [temp_x0.squeeze(0)]

            x0 = latents
            if offload_model:
                self.model.cpu()
                torch.cuda.empty_cache()
            if self.rank == 0:
                videos = self.vae.decode(x0)

        del noise, latents
        del sample_scheduler
        if offload_model:
            gc.collect()
            torch.cuda.synchronize()
        if dist.is_initialized():
            dist.barrier()

        return videos[0] if self.rank == 0 else None

    def reconstruct(self,
                    input_prompt,
                    res_latents,
                    size=(1280, 720),
                    frame_num=81,
                    shift=5.0,
                    sample_solver='dpm++',
                    sampling_steps=40,
                    guide_scale=5.0,
                    n_prompt="",
                    seed=-1,
                    offload_model=True):
        r"""
        Runs the same denoising loop as `generate`, but seeds it with the
        given `res_latents` (e.g. latents produced by `inversion`) instead of
        fresh Gaussian noise. All other arguments match `generate`.

        Returns:
            torch.Tensor:
                Reconstructed video frames tensor with dimensions (C, N, H, W).
        """
        F = frame_num
        target_shape = (self.vae.model.z_dim, (F - 1) // self.vae_stride[0] + 1,
                        size[1] // self.vae_stride[1],
                        size[0] // self.vae_stride[2])

        seq_len = math.ceil((target_shape[2] * target_shape[3]) /
                            (self.patch_size[1] * self.patch_size[2]) *
                            target_shape[1] / self.sp_size) * self.sp_size

        if n_prompt == "":
            n_prompt = self.sample_neg_prompt
        seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
        seed_g = torch.Generator(device=self.device)
        seed_g.manual_seed(seed)

        if not self.t5_cpu:
            self.text_encoder.model.to(self.device)
            context = self.text_encoder([input_prompt], self.device)
            context_null = self.text_encoder([n_prompt], self.device)
            if offload_model:
                self.text_encoder.model.cpu()
        else:
            context = self.text_encoder([input_prompt], torch.device('cpu'))
            context_null = self.text_encoder([n_prompt], torch.device('cpu'))
            context = [t.to(self.device) for t in context]
            context_null = [t.to(self.device) for t in context_null]

        # start from the provided latents rather than sampling noise
        latents = [res_latents.to(self.device)]

        @contextmanager
        def noop_no_sync():
            yield

        no_sync = getattr(self.model, 'no_sync', noop_no_sync)

        with amp.autocast(dtype=self.param_dtype), torch.no_grad(), no_sync():

            if sample_solver == 'unipc':
                sample_scheduler = FlowUniPCMultistepScheduler(
                    num_train_timesteps=self.num_train_timesteps,
                    shift=1,
                    use_dynamic_shifting=False)
                sample_scheduler.set_timesteps(
                    sampling_steps, device=self.device, shift=shift)
                timesteps = sample_scheduler.timesteps
            elif sample_solver == 'dpm++':
                sample_scheduler = FlowDPMSolverMultistepScheduler(
                    num_train_timesteps=self.num_train_timesteps,
                    shift=1,
                    use_dynamic_shifting=False)
                sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
                timesteps, _ = retrieve_timesteps(
                    sample_scheduler,
                    device=self.device,
                    sigmas=sampling_sigmas)
            else:
                raise NotImplementedError("Unsupported solver.")

            arg_c = {'context': context, 'seq_len': seq_len}
            arg_null = {'context': context_null, 'seq_len': seq_len}

            for _, t in enumerate(tqdm(timesteps)):
                latent_model_input = latents
                timestep = torch.stack([t])

                self.model.to(self.device)
                noise_pred_cond = self.model(
                    latent_model_input, t=timestep, **arg_c)[0]
                noise_pred_uncond = self.model(
                    latent_model_input, t=timestep, **arg_null)[0]

                noise_pred = noise_pred_uncond + guide_scale * (
                    noise_pred_cond - noise_pred_uncond)

                temp_x0 = sample_scheduler.step(
                    noise_pred.unsqueeze(0),
                    t,
                    latents[0].unsqueeze(0),
                    return_dict=False,
                    generator=seed_g)[0]
                latents = [temp_x0.squeeze(0)]

            x0 = latents
            if offload_model:
                self.model.cpu()
                torch.cuda.empty_cache()
            if self.rank == 0:
                videos = self.vae.decode(x0)

        del latents
        del sample_scheduler
        if offload_model:
            gc.collect()
            torch.cuda.synchronize()
        if dist.is_initialized():
            dist.barrier()

        return videos[0] if self.rank == 0 else None

    def inversion(self,
                  input_prompt,
                  video_path,
                  n_prompt="",
                  sample_solver='unipc',
                  num_steps=50,
                  guide_scale=1.0,
                  size=(1280, 720),
                  seed=-1,
                  shift=5.0):
        r"""
        Perform Inversion on a video to obtain latent representations.

        Args:
            video_path (str): Path to the video file.
            num_steps (int, *optional*, defaults to 50): Number of inversion steps.
            size (tuple[`int`], *optional*, defaults to (1280,720)):
                Target resolution for resizing frames.

        Returns:
            torch.Tensor: The latent representation after DDIM Inversion.
        """
        # argument order and defaults reconstructed from the compiled module
        latents = self.load_video_frames(video_path, size)
        inverted_latents = self._inversion_midpoint(
            latents,
            input_prompt,
            n_prompt=n_prompt,
            sample_solver=sample_solver,
            num_steps=num_steps,
            guide_scale=guide_scale,
            size=size,
            seed=seed,
            shift=shift)
        return inverted_latents

    def load_video_frames(self, video_path, size=(1280, 720)):
        r"""
        Load video frames from the given path and preprocess them.

        Args:
            video_path (str): Path to the video file.
            size (tuple[`int`], *optional*, defaults to (1280,720)):
                Target resolution for resizing frames.

        Returns:
            torch.Tensor: VAE latent representation of the loaded video.
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError(f"Cannot open video file: {video_path}")

        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, size, interpolation=cv2.INTER_AREA)
            frame = torch.from_numpy(frame).float().permute(2, 0, 1)  # (C, H, W)
            frame = frame / 127.5 - 1.0  # scale pixel values to [-1, 1]
            frames.append(frame)
        cap.release()

        if not frames:
            raise ValueError(f"No frames found in video: {video_path}")

        # (N, C, H, W) -> (C, N, H, W), the layout the VAE expects
        frames_tensor = torch.stack(frames, dim=0).permute(1, 0, 2,
                                                           3).to(self.device)
        # encode to latent space; the upstream WanVAE takes a list of videos
        latents = self.vae.encode([frames_tensor])[0]
        return latents
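
    # `_inversion_midpoint` below integrates the flow-matching ODE
    # dx/dsigma = v(x, t) from clean latents (sigma = 0) towards noise
    # (sigma = 1) with the second-order midpoint rule:
    #
    #     k1    = v(x_t, t)
    #     x_mid = x_t + (dt / 2) * k1
    #     k2    = v(x_mid, t + dt / 2)
    #     x_t  <- x_t + dt * k2
    #
    # versus the plain first-order Euler update x_t <- x_t + dt * k1 used by
    # `_inversion`, at the cost of two (guided) model evaluations per step.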

    def _inversion_midpoint(self,
                            latents,
                            input_prompt,
                            n_prompt="",
                            sample_solver='unipc',
                            num_steps=50,
                            guide_scale=1.0,
                            size=(1280, 720),
                            seed=-1,
                            shift=5.0):
        r"""
        Internal method to perform Inversion.

        Args:
            latents (torch.Tensor): Latent representations of video frames.
            num_steps (int): Number of inversion steps.

        Returns:
            torch.Tensor: The latent representation after inversion.
        """
        target_shape = (self.vae.model.z_dim, latents.shape[1],
                        size[1] // self.vae_stride[1],
                        size[0] // self.vae_stride[2])
        seq_len = math.ceil((target_shape[2] * target_shape[3]) /
                            (self.patch_size[1] * self.patch_size[2]) *
                            target_shape[1] / self.sp_size) * self.sp_size

        if n_prompt == "":
            n_prompt = self.sample_neg_prompt
        seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
        seed_g = torch.Generator(device=self.device)
        seed_g.manual_seed(seed)

        if sample_solver == 'unipc':
            sample_scheduler = FlowUniPCMultistepScheduler(
                num_train_timesteps=self.num_train_timesteps,
                shift=1,
                use_dynamic_shifting=False)
            sample_scheduler.set_timesteps(
                num_steps, device=self.device, shift=shift)
        elif sample_solver == 'dpm++':
            sample_scheduler = FlowDPMSolverMultistepScheduler(
                num_train_timesteps=self.num_train_timesteps,
                shift=1,
                use_dynamic_shifting=False)
            sampling_sigmas = get_sampling_sigmas(num_steps, shift)
            retrieve_timesteps(
                sample_scheduler, device=self.device, sigmas=sampling_sigmas)
        else:
            raise NotImplementedError("Unsupported solver.")

        # walk the schedule backwards: from clean latents towards noise
        timesteps = sample_scheduler.timesteps.flip(0)
        sigmas = sample_scheduler.sigmas
        print("timesteps: ", timesteps)
        print("sigmas: ", sigmas)

        x_t = latents.clone()

        sample_prompt = input_prompt
        uncond_prompt = n_prompt
        if not self.t5_cpu:
            self.text_encoder.model.to(self.device)
            context = self.text_encoder([sample_prompt], self.device)
            context_null = self.text_encoder([uncond_prompt], self.device)
        else:
            context = [
                t.to(self.device)
                for t in self.text_encoder([sample_prompt], torch.device('cpu'))
            ]
            context_null = [
                t.to(self.device)
                for t in self.text_encoder([uncond_prompt], torch.device('cpu'))
            ]

        args = {'context': context, 'seq_len': seq_len}
        args_null = {'context': context_null, 'seq_len': seq_len}

        self.model.to(self.device)
        with amp.autocast(dtype=self.param_dtype), torch.no_grad():
            for i in tqdm(range(len(timesteps) - 1)):
                t = timesteps[i]
                idx = sample_scheduler.index_for_timestep(t)
                # step from the current sigma towards the next (noisier) one;
                # scheduler sigmas are stored in sampling (noise -> clean)
                # order, so the noisier neighbour sits at idx - 1
                dt = sigmas[idx - 1] - sigmas[idx]

                timestep_curr = torch.tensor([t], device=self.device)
                model_output_k1 = self.model(
                    [x_t], t=timestep_curr, **args)[0]
                noise_pred_uncond_k1 = self.model(
                    [x_t], t=timestep_curr, **args_null)[0]
                k1 = noise_pred_uncond_k1 + guide_scale * (
                    model_output_k1 - noise_pred_uncond_k1)

                x_mid = x_t.to(torch.float32) + 0.5 * dt * k1.to(torch.float32)
                t_mid = (sigmas[idx] + 0.5 * dt) * self.num_train_timesteps
                timestep_mid = torch.tensor([t_mid], device=self.device)

                model_output_k2 = self.model(
                    [x_mid], t=timestep_mid, **args)[0]
                noise_pred_uncond_k2 = self.model(
                    [x_mid], t=timestep_mid, **args_null)[0]
                k2 = noise_pred_uncond_k2 + guide_scale * (
                    model_output_k2 - noise_pred_uncond_k2)

                # the compiled module also forms the plain Euler step,
                # apparently kept for debugging; reconstructed the same way
                first_order = x_t.to(torch.float32) + dt * k1.to(torch.float32)
                x_t = x_t.to(torch.float32) + dt * k2.to(torch.float32)

        return x_t

    def _inversion(self, latents, num_steps=50, shift=5.0, guide_scale=1.0):
        r"""
        First-order (Euler) inversion of the flow-matching ODE; the midpoint
        variant above is the second-order counterpart.
        """
        sample_scheduler = FlowUniPCMultistepScheduler(
            num_train_timesteps=self.num_train_timesteps,
            shift=1,
            use_dynamic_shifting=False)
        sample_scheduler.set_timesteps(
            num_steps, device=self.device, shift=shift)
        timesteps_inversion = sample_scheduler.timesteps.flip(0)
        sigmas = sample_scheduler.sigmas
        latents = latents.clone()

        # the compiled module hard-codes this debug prompt here
        input_prompt = ('Two anthropomorphic cats in comfy boxing gear and '
                        'bright gloves fight intensely on a spotlighted stage.')
        n_prompt = self.sample_neg_prompt

        seq_len = math.ceil((latents.shape[2] * latents.shape[3]) /
                            (self.patch_size[1] * self.patch_size[2]) *
                            latents.shape[1] / self.sp_size) * self.sp_size

        if not self.t5_cpu:
            self.text_encoder.model.to(self.device)
            context = self.text_encoder([input_prompt], self.device)
            context_null = self.text_encoder([n_prompt], self.device)
        else:
            context = [
                t.to(self.device)
                for t in self.text_encoder([input_prompt], torch.device('cpu'))
            ]
            context_null = [
                t.to(self.device)
                for t in self.text_encoder([n_prompt], torch.device('cpu'))
            ]

        args = {'context': context, 'seq_len': seq_len}
        args_null = {'context': context_null, 'seq_len': seq_len}

        self.model.to(self.device)
        with amp.autocast(dtype=self.param_dtype), torch.no_grad():
            for i in tqdm(range(len(timesteps_inversion) - 1), desc='Inversion'):
                t_curr = timesteps_inversion[i]
                t_next = timesteps_inversion[i + 1]
                index_curr = sample_scheduler.index_for_timestep(t_curr)
                index_next = sample_scheduler.index_for_timestep(t_next)
                sigma_curr = sigmas[index_curr]
                sigma_next = sigmas[index_next]

                timestep = torch.tensor([t_curr], device=self.device)
                noise_pred_cond = self.model([latents], t=timestep, **args)[0]
                noise_pred_uncond = self.model(
                    [latents], t=timestep, **args_null)[0]
                noise_pred = noise_pred_uncond + guide_scale * (
                    noise_pred_cond - noise_pred_uncond)

                # Euler step of dx/dsigma = v towards the noisier sigma
                latents = latents + (sigma_next - sigma_curr) * noise_pred
                print(latents.mean(), latents.std())

        return latents

    def ddim_inversion(self,
                       latents,
                       input_prompt="",
                       num_steps=50,
                       shift=5.0):
        r"""
        DDIM-style inversion: at each step the clean-sample and noise
        estimates are recombined at the next (higher) noise level. Exact
        update arithmetic is reconstructed from the compiled module.
        """
        sample_scheduler = FlowUniPCMultistepScheduler(
            num_train_timesteps=self.num_train_timesteps,
            shift=1,
            use_dynamic_shifting=False)
        sample_scheduler.set_timesteps(
            num_steps, device=self.device, shift=shift)
        timesteps_inversion = sample_scheduler.timesteps.flip(0)
        sigmas = sample_scheduler.sigmas
        latents = latents.clone()

        seq_len = math.ceil((latents.shape[2] * latents.shape[3]) /
                            (self.patch_size[1] * self.patch_size[2]) *
                            latents.shape[1] / self.sp_size) * self.sp_size

        if not self.t5_cpu:
            self.text_encoder.model.to(self.device)
            context = self.text_encoder([input_prompt], self.device)
        else:
            context = [
                t.to(self.device)
                for t in self.text_encoder([input_prompt], torch.device('cpu'))
            ]
        args = {'context': context, 'seq_len': seq_len}

        self.model.to(self.device)
        with amp.autocast(dtype=self.param_dtype), torch.no_grad():
            for i in tqdm(
                    range(len(timesteps_inversion) - 1), desc='DDIM Inversion'):
                t_curr = timesteps_inversion[i]
                t_next = timesteps_inversion[i + 1]
                sigma_curr = sigmas[sample_scheduler.index_for_timestep(t_curr)]
                sigma_next = sigmas[sample_scheduler.index_for_timestep(t_next)]

                timestep = torch.tensor([t_curr], device=self.device)
                model_output = self.model([latents], t=timestep, **args)[0]

                # rectified flow: x_t = (1 - sigma) * x0 + sigma * eps, with
                # the model predicting v = eps - x0
                x0_pred = latents - sigma_curr * model_output
                epsilon_theta = latents + (1 - sigma_curr) * model_output
                # re-noise the clean estimate at the next (higher) sigma
                latents = (1 - sigma_next) * x0_pred + sigma_next * epsilon_theta
                print(latents.mean(), latents.std())

        return latents
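
    # `edit` follows the FlowEdit recipe: instead of inverting the source
    # video, it maintains an edited latent `zt_edit` and, at each noise level
    # t inside the [nmin_step, nmax_step] band, couples the source and target
    # trajectories through shared noise:
    #
    #     zt_src   = (1 - t) * x_src + t * noise
    #     zt_tar   = zt_edit + (zt_src - x_src)
    #     V_delta  = v_tar(zt_tar, t) - v_src(zt_src, t)   (both CFG-guided)
    #     zt_edit <- zt_edit + (t_prev - t) * V_delta
    #
    # so only the *difference* between the target and source velocity fields
    # is integrated, preserving content that both prompts explain equally well.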

    def edit(self,
             target_prompt,
             size=(1280, 720),
             frame_num=81,
             shift=5.0,
             sample_solver='unipc',
             sampling_steps=40,
             guide_scale=5.0,
             n_prompt="",
             seed=-1,
             offload_model=True,
             source_video_path=None,
             source_prompt=None,
             nmax_step=33,
             nmin_step=0,
             navg=1,
             visualize=True,
             visualize_steps=8,
             visualize_frame_idx=None,
             output_dir='flowedit_viz'):
        r"""
        FlowEdit-style editing: transforms the video at `source_video_path`
        (described by `source_prompt`) towards `target_prompt` without a full
        inversion, optionally dumping per-step diagnostic plots to
        `output_dir`. The numeric defaults of `nmax_step`, `nmin_step`,
        `navg` and `visualize_steps` are reconstructed and may differ from
        the original.
        """
        W, H = size
        target_shape = (self.vae.model.z_dim,
                        (frame_num - 1) // self.vae_stride[0] + 1,
                        size[1] // self.vae_stride[1],
                        size[0] // self.vae_stride[2])
        seq_len = math.ceil((target_shape[2] * target_shape[3]) /
                            (self.patch_size[1] * self.patch_size[2]) *
                            target_shape[1] / self.sp_size) * self.sp_size

        if n_prompt == "":
            n_prompt = self.sample_neg_prompt
        seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
        seed_g = torch.Generator(device=self.device)
        seed_g.manual_seed(seed)

        viz_image_filenames = []
        visualization_data_to_process = []
        steps_to_capture = set()
        global_min_mag, global_max_mag = float('inf'), float('-inf')

        if visualize:
            os.makedirs(output_dir, exist_ok=True)
            logging.info(f"Saving FlowEdit visualizations to {output_dir}")
            total_flowedit_steps = nmax_step - nmin_step
            if total_flowedit_steps > 0 and visualize_steps > 0:
                indices = np.linspace(
                    0,
                    total_flowedit_steps - 1,
                    num=min(visualize_steps, total_flowedit_steps),
                    dtype=int)
                steps_to_capture = set(indices.tolist())
                logging.info(
                    f"Capturing {len(steps_to_capture)} steps for "
                    f"visualization: {sorted(list(steps_to_capture))}")
            else:
                logging.warning(
                    "Nothing to visualize: empty FlowEdit step range.")

        if visualize_frame_idx is None:
            visualize_frame_idx = 0
        latent_frames = (frame_num - 1) // self.vae_stride[0] + 1
        visualize_latent_frame_idx = max(
            0, min(visualize_frame_idx, latent_frames - 1))

        # encode the source video and the three prompt contexts
        x_src = self.load_video_frames(source_video_path, size)
        if not self.t5_cpu:
            self.text_encoder.model.to(self.device)
            context_src = self.text_encoder([source_prompt], self.device)
            context_tar = self.text_encoder([target_prompt], self.device)
            context_null = self.text_encoder([n_prompt], self.device)
            if offload_model:
                self.text_encoder.model.cpu()
        else:
            context_src = [
                t.to(self.device)
                for t in self.text_encoder([source_prompt], torch.device('cpu'))
            ]
            context_tar = [
                t.to(self.device)
                for t in self.text_encoder([target_prompt], torch.device('cpu'))
            ]
            context_null = [
                t.to(self.device)
                for t in self.text_encoder([n_prompt], torch.device('cpu'))
            ]

        zt_edit = x_src.clone()

        if sample_solver == 'unipc':
            sample_scheduler = FlowUniPCMultistepScheduler(
                num_train_timesteps=self.num_train_timesteps,
                shift=1,
                use_dynamic_shifting=False,
                solver_order=2)
            sample_scheduler.set_timesteps(
                sampling_steps, device=self.device, shift=shift)
            timesteps = sample_scheduler.timesteps
        elif sample_solver == 'dpm++':
            sample_scheduler = FlowDPMSolverMultistepScheduler(
                num_train_timesteps=self.num_train_timesteps,
                shift=1,
                use_dynamic_shifting=False)
            sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
            timesteps, _ = retrieve_timesteps(
                sample_scheduler, device=self.device, sigmas=sampling_sigmas)
        else:
            raise NotImplementedError("Unsupported solver.")

        arg_src = {'context': context_src, 'seq_len': seq_len}
        arg_tar = {'context': context_tar, 'seq_len': seq_len}
        arg_null = {'context': context_null, 'seq_len': seq_len}

        self.model.to(self.device)
        with amp.autocast(dtype=self.param_dtype), torch.no_grad():
            for i, t in enumerate(tqdm(timesteps)):
                # only the [nmin_step, nmax_step] band of noise levels is edited
                if len(timesteps) - i > nmax_step:
                    continue
                if len(timesteps) - i <= nmin_step:
                    continue
                relative_index = i - (len(timesteps) - nmax_step)
                should_visualize_this_step = (
                    visualize and relative_index in steps_to_capture)

                timestep = torch.tensor([t], device=self.device)
                t_i = t.item() / self.num_train_timesteps
                t_im1 = (timesteps[i + 1].item() / self.num_train_timesteps
                         if t != timesteps[-1] else 0.0)

                V_delta = torch.zeros_like(zt_edit)
                for _ in range(navg):
                    fwd_noise = torch.randn(
                        x_src.shape, device=self.device, generator=seed_g)

                    # couple source and target trajectories via shared noise
                    zt_src = (1 - t_i) * x_src + t_i * fwd_noise
                    zt_tar = zt_edit + (zt_src - x_src)

                    noise_pred_src = self.model(
                        [zt_src], t=timestep, **arg_src)[0]
                    noise_pred_tar = self.model(
                        [zt_tar], t=timestep, **arg_tar)[0]
                    noise_pred_uncond_src = self.model(
                        [zt_src], t=timestep, **arg_null)[0]
                    noise_pred_uncond_tar = self.model(
                        [zt_tar], t=timestep, **arg_null)[0]

                    noise_pred_src_guided = noise_pred_uncond_src + \
                        guide_scale * (noise_pred_src - noise_pred_uncond_src)
                    noise_pred_tar_guided = noise_pred_uncond_tar + \
                        guide_scale * (noise_pred_tar - noise_pred_uncond_tar)

                    V_delta = V_delta + (
                        noise_pred_tar_guided - noise_pred_src_guided) / navg

                if should_visualize_this_step:
                    try:
                        # per-frame velocity magnitudes over the channel axis
                        mag_src = torch.norm(noise_pred_src_guided, dim=0)
                        mag_tar = torch.norm(noise_pred_tar_guided, dim=0)
                        mag_delta = torch.norm(V_delta, dim=0)

                        latent_frame_gpu = zt_edit[:, visualize_latent_frame_idx, :, :]
                        mag_src_frame_gpu = mag_src[visualize_latent_frame_idx]
                        mag_tar_frame_gpu = mag_tar[visualize_latent_frame_idx]
                        mag_delta_frame_gpu = mag_delta[visualize_latent_frame_idx]

                        visualization_data_to_process.append({
                            'relative_index': relative_index,
                            't': t.item(),
                            'latent_frame_cpu': latent_frame_gpu.detach().cpu().clone(),
                            'mag_src_frame_cpu': mag_src_frame_gpu.detach().cpu().clone(),
                            'mag_tar_frame_cpu': mag_tar_frame_gpu.detach().cpu().clone(),
                            'mag_delta_frame_cpu': mag_delta_frame_gpu.detach().cpu().clone(),
                        })
                        logging.debug(
                            f"Stored visualization data for step relative index {relative_index}")
                    except Exception as e:
                        logging.error(
                            f"Error storing visualization data at relative index {relative_index}: {e}",
                            exc_info=True)

                zt_edit = zt_edit.to(torch.float32) + \
                    (t_im1 - t_i) * V_delta.to(torch.float32)

        if visualize and visualization_data_to_process:
            logging.info(
                f"Processing {len(visualization_data_to_process)} stored visualization steps...")
            for data in visualization_data_to_process:
                min_val = min(data['mag_src_frame_cpu'].min().item(),
                              data['mag_tar_frame_cpu'].min().item(),
                              data['mag_delta_frame_cpu'].min().item())
                max_val = max(data['mag_src_frame_cpu'].max().item(),
                              data['mag_tar_frame_cpu'].max().item(),
                              data['mag_delta_frame_cpu'].max().item())
                global_min_mag = min(global_min_mag, min_val)
                global_max_mag = max(global_max_mag, max_val)
            if global_min_mag == float('inf') or global_max_mag == float('-inf'):
                logging.warning(
                    "Could not determine global magnitude range. Using automatic scaling per image.")
                _vmin, _vmax = None, None
            else:
                _vmin, _vmax = global_min_mag, global_max_mag
                logging.info(
                    f"Global magnitude range for color scaling: [{_vmin:.4f}, {_vmax:.4f}]")

            for data in tqdm(
                    visualization_data_to_process,
                    desc='Generating Visualization Plots'):
                try:
                    relative_index = data['relative_index']
                    t_val = data['t']

                    # decode the stored latent frame back to pixels
                    latent_frame = data['latent_frame_cpu'].unsqueeze(1).to(
                        self.device, torch.float32)
                    with torch.no_grad(), amp.autocast(
                            dtype=self.param_dtype,
                            enabled=self.param_dtype == torch.float16):
                        decoded_frame_gpu = self.vae.decode([latent_frame])[0]
                    if offload_model:
                        torch.cuda.empty_cache()
                    decoded_frame_np = (
                        (decoded_frame_gpu[:, 0].clamp(-1, 1) + 1) / 2 *
                        255).to(torch.uint8).cpu().numpy().transpose(1, 2, 0)

                    def _resize(mag):
                        # upsample latent-resolution magnitude maps to pixels
                        return torch.nn.functional.interpolate(
                            mag.unsqueeze(0).unsqueeze(0),
                            size=(H, W),
                            mode='bilinear',
                            align_corners=False).squeeze().numpy()

                    mag_src_resized = _resize(data['mag_src_frame_cpu'].float())
                    mag_tar_resized = _resize(data['mag_tar_frame_cpu'].float())
                    mag_delta_resized = _resize(
                        data['mag_delta_frame_cpu'].float())

                    # panel layout and figure size reconstructed
                    fig, axes = plt.subplots(1, 4, figsize=(24, 6))
                    fig.suptitle(
                        f"FlowEdit Viz @ Step Index {relative_index} (t ≈ {t_val:.2f})",
                        fontsize=16)
                    axes[0].imshow(decoded_frame_np)
                    axes[0].set_title('Decoded Frame')
                    axes[0].axis('off')
                    im1 = axes[1].imshow(
                        mag_src_resized, cmap='viridis', vmin=_vmin, vmax=_vmax)
                    axes[1].set_title('Src Mag')
                    axes[1].axis('off')
                    fig.colorbar(im1, ax=axes[1], fraction=0.046, pad=0.04)
                    im2 = axes[2].imshow(
                        mag_tar_resized, cmap='viridis', vmin=_vmin, vmax=_vmax)
                    axes[2].set_title('Tar Mag')
                    axes[2].axis('off')
                    fig.colorbar(im2, ax=axes[2], fraction=0.046, pad=0.04)
                    im3 = axes[3].imshow(
                        mag_delta_resized, cmap='viridis', vmin=_vmin, vmax=_vmax)
                    axes[3].set_title('V_delta Mag')
                    axes[3].axis('off')
                    fig.colorbar(im3, ax=axes[3], fraction=0.046, pad=0.04)
                    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

                    output_filename = os.path.join(
                        output_dir,
                        f"flowedit_step_{relative_index:04d}_t_{t_val:.0f}.png")
                    plt.savefig(output_filename)
                    plt.close(fig)
                    viz_image_filenames.append(output_filename)
                except Exception as e:
                    logging.error(
                        f"Error generating visualization plot for relative index {data.get('relative_index')}: {e}",
                        exc_info=True)
            logging.info(
                f"Saved {len(viz_image_filenames)} visualization images to {output_dir}")

        # decode the edited latents, mirroring `generate`
        x0 = [zt_edit.to(self.param_dtype)]
        if offload_model:
            self.model.cpu()
            torch.cuda.empty_cache()
        videos = self.vae.decode(x0) if self.rank == 0 else None

        del sample_scheduler
        if offload_model:
            gc.collect()
            torch.cuda.synchronize()
        if dist.is_initialized():
            dist.barrier()

        return videos[0] if self.rank == 0 else None
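

# A minimal usage sketch, not part of the original module: it assumes the
# Wan2.1 repository layout, where `wan.configs.WAN_CONFIGS` maps model names
# to EasyDict configs and `cache_video` (imported above) writes a
# (C, N, H, W) tensor in [-1, 1] to disk. The checkpoint directory is a
# placeholder path.
if __name__ == "__main__":
    from wan.configs import WAN_CONFIGS

    cfg = WAN_CONFIGS['t2v-14B']
    wan_t2v = WanT2V(config=cfg, checkpoint_dir='./Wan2.1-T2V-14B', device_id=0)
    video = wan_t2v.generate(
        'Two anthropomorphic cats in comfy boxing gear and bright gloves '
        'fight intensely on a spotlighted stage.',
        size=(1280, 720),
        frame_num=81,
        sampling_steps=40,
        seed=42)
    cache_video(
        video[None],
        save_file='t2v_out.mp4',
        fps=16,
        nrow=1,
        normalize=True,
        value_range=(-1, 1))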