import gc
import logging
import math
import os
import random
import sys
import types
from contextlib import contextmanager
from functools import partial

import torch
import torch.cuda.amp as amp
import torch.distributed as dist
from tqdm import tqdm

from .distributed.fsdp import shard_model
from .distributed.sequence_parallel import sp_attn_forward, sp_dit_forward
from .distributed.util import get_world_size
from .modules.model import WanModel
from .modules.t5 import T5EncoderModel
from .modules.vae2_1 import Wan2_1_VAE
from .utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
                               get_sampling_sigmas, retrieve_timesteps)
from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler


class WanT2V:

    def __init__(
        self,
        config,
        checkpoint_dir,
        device_id=0,
        rank=0,
        t5_fsdp=False,
        dit_fsdp=False,
        use_sp=False,
        t5_cpu=False,
        init_on_cpu=True,
        convert_model_dtype=False,
    ):
        r"""
        Initializes the Wan text-to-video generation model components.

        Args:
            config (EasyDict):
                Object containing model parameters initialized from config.py
            checkpoint_dir (`str`):
                Path to directory containing model checkpoints
            device_id (`int`, *optional*, defaults to 0):
                Id of target GPU device
            rank (`int`, *optional*, defaults to 0):
                Process rank for distributed training
            t5_fsdp (`bool`, *optional*, defaults to False):
                Enable FSDP sharding for the T5 model
            dit_fsdp (`bool`, *optional*, defaults to False):
                Enable FSDP sharding for the DiT model
            use_sp (`bool`, *optional*, defaults to False):
                Enable the sequence-parallel distribution strategy.
            t5_cpu (`bool`, *optional*, defaults to False):
                Whether to place the T5 model on CPU. Only works without t5_fsdp.
            init_on_cpu (`bool`, *optional*, defaults to True):
                Enable initializing the transformer model on CPU.
                Only works without FSDP or sequence parallel.
            convert_model_dtype (`bool`, *optional*, defaults to False):
                Convert DiT model parameters to `config.param_dtype`.
                Only works without FSDP.
        """
        self.device = torch.device(f"cuda:{device_id}")
        self.config = config
        self.rank = rank
        self.t5_cpu = t5_cpu
        self.init_on_cpu = init_on_cpu
        self.num_train_timesteps = config.num_train_timesteps
        self.boundary = config.boundary
        self.param_dtype = config.param_dtype

        # FSDP and sequence parallel require the models to start on GPU.
        if t5_fsdp or dit_fsdp or use_sp:
            self.init_on_cpu = False

        shard_fn = partial(shard_model, device_id=device_id)
        self.text_encoder = T5EncoderModel(
            text_len=config.text_len,
            dtype=config.t5_dtype,
            device=torch.device('cpu'),
            checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
            tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
            shard_fn=shard_fn if t5_fsdp else None)

        self.vae_stride = config.vae_stride
        self.patch_size = config.patch_size
        self.vae = Wan2_1_VAE(
            vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
            device=self.device)

        logging.info(f"Creating WanModel from {checkpoint_dir}")
        # Wan2.2 uses two DiT experts: one for the high-noise phase of
        # denoising and one for the low-noise phase.
        self.low_noise_model = WanModel.from_pretrained(
            checkpoint_dir, subfolder=config.low_noise_checkpoint)
        self.low_noise_model = self._configure_model(
            model=self.low_noise_model,
            use_sp=use_sp,
            dit_fsdp=dit_fsdp,
            shard_fn=shard_fn,
            convert_model_dtype=convert_model_dtype)

        self.high_noise_model = WanModel.from_pretrained(
            checkpoint_dir, subfolder=config.high_noise_checkpoint)
        self.high_noise_model = self._configure_model(
            model=self.high_noise_model,
            use_sp=use_sp,
            dit_fsdp=dit_fsdp,
            shard_fn=shard_fn,
            convert_model_dtype=convert_model_dtype)

        if use_sp:
            self.sp_size = get_world_size()
        else:
            self.sp_size = 1

        self.sample_neg_prompt = config.sample_neg_prompt

    def _configure_model(self, model, use_sp, dit_fsdp, shard_fn,
                         convert_model_dtype):
        r"""
        Configures a model object. This includes setting evaluation modes,
        applying the distributed parallel strategy, and handling device placement.

        Args:
            model (torch.nn.Module):
                The model instance to configure.
            use_sp (`bool`):
                Enable the sequence-parallel distribution strategy.
            dit_fsdp (`bool`):
                Enable FSDP sharding for the DiT model.
            shard_fn (callable):
                The function to apply FSDP sharding.
            convert_model_dtype (`bool`):
                Convert DiT model parameters to `config.param_dtype`.
                Only works without FSDP.

        Returns:
            torch.nn.Module:
                The configured model.
        """
        model.eval().requires_grad_(False)

        if use_sp:
            for block in model.blocks:
                block.self_attn.forward = types.MethodType(
                    sp_attn_forward, block.self_attn)
            model.forward = types.MethodType(sp_dit_forward, model)

        if dist.is_initialized():
            dist.barrier()

        if dit_fsdp:
            model = shard_fn(model)
        else:
            if convert_model_dtype:
                model.to(self.param_dtype)
            if not self.init_on_cpu:
                model.to(self.device)

        return model
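
    # Expert routing: `config.boundary` is a fraction of
    # `num_train_timesteps`. During `generate`, timesteps at or above
    # `boundary * num_train_timesteps` (the noisy early phase) are denoised
    # by `high_noise_model`, the rest (the cleaner late phase) by
    # `low_noise_model`. The helper below picks the right expert and, when
    # offloading is enabled, swaps the other one to CPU so that only one
    # expert occupies GPU memory at a time.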
    def _prepare_model_for_timestep(self, t, boundary, offload_model):
        r"""
        Prepares and returns the required model for the current timestep.

        Args:
            t (torch.Tensor):
                Current timestep.
            boundary (`int`):
                The timestep threshold. If `t` is at or above this value,
                the `high_noise_model` is considered the required model.
            offload_model (`bool`):
                A flag controlling the offloading behavior.

        Returns:
            torch.nn.Module:
                The active model on the target device for the current timestep.
        """
        if t.item() >= boundary:
            required_model_name = 'high_noise_model'
            offload_model_name = 'low_noise_model'
        else:
            required_model_name = 'low_noise_model'
            offload_model_name = 'high_noise_model'
        if offload_model or self.init_on_cpu:
            if next(getattr(
                    self,
                    offload_model_name).parameters()).device.type == 'cuda':
                getattr(self, offload_model_name).to('cpu')
            if next(getattr(
                    self,
                    required_model_name).parameters()).device.type == 'cpu':
                getattr(self, required_model_name).to(self.device)
        return getattr(self, required_model_name)

    def generate(self,
                 input_prompt,
                 size=(1280, 720),
                 frame_num=81,
                 shift=5.0,
                 sample_solver='unipc',
                 sampling_steps=50,
                 guide_scale=5.0,
                 n_prompt="",
                 seed=-1,
                 offload_model=True):
        r"""
        Generates video frames from a text prompt using a diffusion process.

        Args:
            input_prompt (`str`):
                Text prompt for content generation
            size (`tuple[int]`, *optional*, defaults to (1280,720)):
                Controls video resolution, (width, height).
            frame_num (`int`, *optional*, defaults to 81):
                How many frames to sample from a video. The number should be 4n+1
            shift (`float`, *optional*, defaults to 5.0):
                Noise schedule shift parameter. Affects temporal dynamics
            sample_solver (`str`, *optional*, defaults to 'unipc'):
                Solver used to sample the video.
            sampling_steps (`int`, *optional*, defaults to 50):
                Number of diffusion sampling steps. Higher values improve quality but slow generation
            guide_scale (`float` or tuple[`float`], *optional*, defaults to 5.0):
                Classifier-free guidance scale. Controls prompt adherence vs. creativity.
                If a tuple, the first guide_scale is used for the low-noise model
                and the second for the high-noise model.
            n_prompt (`str`, *optional*, defaults to ""):
                Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
            seed (`int`, *optional*, defaults to -1):
                Random seed for noise generation. If -1, use a random seed.
            offload_model (`bool`, *optional*, defaults to True):
                If True, offloads models to CPU during generation to save VRAM

        Returns:
            torch.Tensor:
                Generated video frames tensor. Dimensions: (C, N, H, W) where:
                - C: Color channels (3 for RGB)
                - N: Number of frames (81)
                - H: Frame height (from size)
                - W: Frame width (from size)
        """
        # preprocess
        guide_scale = (guide_scale, guide_scale) if isinstance(
            guide_scale, float) else guide_scale
        F = frame_num
        target_shape = (self.vae.model.z_dim,
                        (F - 1) // self.vae_stride[0] + 1,
                        size[1] // self.vae_stride[1],
                        size[0] // self.vae_stride[2])

        seq_len = math.ceil((target_shape[2] * target_shape[3]) /
                            (self.patch_size[1] * self.patch_size[2]) *
                            target_shape[1] / self.sp_size) * self.sp_size

        if n_prompt == "":
            n_prompt = self.sample_neg_prompt
        seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
        seed_g = torch.Generator(device=self.device)
        seed_g.manual_seed(seed)

        if not self.t5_cpu:
            self.text_encoder.model.to(self.device)
            context = self.text_encoder([input_prompt], self.device)
            context_null = self.text_encoder([n_prompt], self.device)
            if offload_model:
                self.text_encoder.model.cpu()
        else:
            context = self.text_encoder([input_prompt], torch.device('cpu'))
            context_null = self.text_encoder([n_prompt], torch.device('cpu'))
            context = [t.to(self.device) for t in context]
            context_null = [t.to(self.device) for t in context_null]

        noise = [
            torch.randn(
                target_shape[0],
                target_shape[1],
                target_shape[2],
                target_shape[3],
                dtype=torch.float32,
                device=self.device,
                generator=seed_g)
        ]

        @contextmanager
        def noop_no_sync():
            yield

        no_sync_low_noise = getattr(self.low_noise_model, 'no_sync',
                                    noop_no_sync)
        no_sync_high_noise = getattr(self.high_noise_model, 'no_sync',
                                     noop_no_sync)

        # evaluation mode
        with amp.autocast(dtype=self.param_dtype), torch.no_grad(
        ), no_sync_low_noise(), no_sync_high_noise():
            boundary = self.boundary * self.num_train_timesteps

            if sample_solver == 'unipc':
                sample_scheduler = FlowUniPCMultistepScheduler(
                    num_train_timesteps=self.num_train_timesteps,
                    shift=1,
                    use_dynamic_shifting=False)
                sample_scheduler.set_timesteps(
                    sampling_steps, device=self.device, shift=shift)
                timesteps = sample_scheduler.timesteps
            elif sample_solver == 'dpm++':
                sample_scheduler = FlowDPMSolverMultistepScheduler(
                    num_train_timesteps=self.num_train_timesteps,
                    shift=1,
                    use_dynamic_shifting=False)
                sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
                timesteps, _ = retrieve_timesteps(
                    sample_scheduler,
                    device=self.device,
                    sigmas=sampling_sigmas)
            else:
                raise NotImplementedError("Unsupported solver.")

            # sample videos
            latents = noise

            arg_c = {'context': context, 'seq_len': seq_len}
            arg_null = {'context': context_null, 'seq_len': seq_len}

            for _, t in enumerate(tqdm(timesteps)):
                latent_model_input = latents
                timestep = torch.stack([t])

                # Pick the expert for this timestep, plus its CFG scale.
                model = self._prepare_model_for_timestep(
                    t, boundary, offload_model)
                sample_guide_scale = guide_scale[1] if t.item(
                ) >= boundary else guide_scale[0]

                noise_pred_cond = model(
                    latent_model_input, t=timestep, **arg_c)[0]
                noise_pred_uncond = model(
                    latent_model_input, t=timestep, **arg_null)[0]

                noise_pred = noise_pred_uncond + sample_guide_scale * (
                    noise_pred_cond - noise_pred_uncond)

                temp_x0 = sample_scheduler.step(
                    noise_pred.unsqueeze(0),
                    t,
                    latents[0].unsqueeze(0),
                    return_dict=False,
                    generator=seed_g)[0]
                latents = [temp_x0.squeeze(0)]

            x0 = latents
            if offload_model:
                self.low_noise_model.cpu()
                self.high_noise_model.cpu()
                torch.cuda.empty_cache()
            if self.rank == 0:
                videos = self.vae.decode(x0)

        del noise, latents
        del sample_scheduler
        if offload_model:
            gc.collect()
            torch.cuda.synchronize()
        if dist.is_initialized():
            dist.barrier()

        return videos[0] if self.rank == 0 else None
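

# A minimal usage sketch, assuming a Wan2.2 T2V checkpoint directory and a
# matching config object; the `WAN_CONFIGS` registry, checkpoint path, and
# guidance scales below are illustrative, not part of this module.
if __name__ == "__main__":
    from wan.configs import WAN_CONFIGS  # assumed config registry

    t2v = WanT2V(
        config=WAN_CONFIGS['t2v-A14B'],
        checkpoint_dir='./Wan2.2-T2V-A14B',
        device_id=0,
        rank=0,
        t5_cpu=True,  # keep T5 on CPU to save VRAM
        convert_model_dtype=True,
    )
    video = t2v.generate(
        'A cat surfing a wave at sunset',
        size=(1280, 720),
        frame_num=81,  # must be 4n + 1
        guide_scale=(3.0, 4.0),  # (low-noise, high-noise) CFG scales
        seed=42,
        offload_model=True,
    )
    # On rank 0, `video` is a (C, N, H, W) tensor; other ranks receive None.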