g3ddlZddlZddlZddlZddlZddlZddlZddlmZddl m Z ddl Z ddl Z ddlmcmZddlmZddlmcmZddlmZddlmZddlmZddlmZdd lm Z dd l!m"Z"dd l#m$Z$m%Z%m&Z&dd l'm(Z(Gd dZ)y)N)contextmanager)partial)tqdm) shard_model) CLIPModel)WanModel)T5EncoderModel)WanVAE)FlowDPMSolverMultistepSchedulerget_sampling_sigmasretrieve_timesteps)FlowUniPCMultistepSchedulerc<eZdZ ddZ ddZy)WanI2Vc tjd||_||_||_||_||_|j |_|j|_tt|} t|j|jtjdtjj||j tjj||j"|r| nd|_|j&|_|j(|_t+tjj||j,|j|_t1|j2|jtjj||j4tjj||j6|_t;j<d|t?j@||_!|jBjEjGd |s|s|rd } |rd d l$m%} d d l&m'} m(} |jBjRD]1}tUjV| |jX|jX_-3tUjV| |jB|jB_-| |_.nd |_.t_j`rt_jb|r| |jB|_!n'| s%|jBje|j|jf|_3y)au Initializes the image-to-video generation model components. Args: config (EasyDict): Object containing model parameters initialized from config.py checkpoint_dir (`str`): Path to directory containing model checkpoints device_id (`int`, *optional*, defaults to 0): Id of target GPU device rank (`int`, *optional*, defaults to 0): Process rank for distributed training t5_fsdp (`bool`, *optional*, defaults to False): Enable FSDP sharding for T5 model dit_fsdp (`bool`, *optional*, defaults to False): Enable FSDP sharding for DiT model use_usp (`bool`, *optional*, defaults to False): Enable distribution strategy of USP. t5_cpu (`bool`, *optional*, defaults to False): Whether to place T5 model on CPU. Only works without t5_fsdp. init_on_cpu (`bool`, *optional*, defaults to True): Enable initializing Transformer Model on CPU. Only works without FSDP or USP. zcuda:) device_idcpuN)text_lendtypedevicecheckpoint_pathtokenizer_pathshard_fn)vae_pthr)rrrrzCreating WanModel from Fr) get_sequence_parallel_world_sizer)usp_attn_forwardusp_dit_forward)4torchrconfigrankuse_uspt5_cpunum_train_timesteps param_dtyperrr rt5_dtypeospathjoin t5_checkpoint t5_tokenizer text_encoder vae_stride patch_sizer vae_checkpointvaer clip_dtypeclip_checkpointclip_tokenizercliplogginginfor from_pretrainedmodelevalrequires_grad_xfuser.core.distributedr!distributed.xdit_context_parallelrrblockstypes MethodType self_attnforwardsp_sizedistis_initializedbarriertosample_neg_prompt)selfr checkpoint_dirrr!t5_fsdpdit_fsdpr"r# init_on_cpurrrrblocks /root/Wan2.1/wan/image2video.py__init__zWanI2V.__init__sjFllU9+#67     #)#=#= !--;)<*__//<<&GGLL9M9MN77<<8K8KL!(Xd  !++ ++GGLL1F1FG;; ##;;GGLL)/)?)?A77<<8M8MN P   .~.>?@--n=  ((/ h'K  1 M** 7*/*:*:$eoo+7' 7"'!1!1/4::!NDJJ ;=DLDL    LLN !$**-DJ dkk*!'!9!9c ztj|jdjdj |j }|} |j dd\} }| |z }ttj||z|jdz|jdz|jdz}ttj||z |jdz|jdz|jdz}||jdz} ||jdz}| dz |jdzdz|z|z|jd|jdzz}ttj||jz |jz}| dk\r| n#t!j"dt$j&} t)j*|j }|j-| t)j.dd||t(j0||j }t)j2dd |||j }d|ddddf<t)j4t)j6|ddddfd d |ddddfgd }|j9d|j dd zd ||}|j;ddd}| dk(r |j<} |j>s|j@jBj |j |jA|g|j }|jA| g|j }| r|j@jBjEn|jA|gt)j d}|jA| gt)j d}|Dcgc]}|j |j }}|Dcgc]}|j |j }}|jFjBj |j |jFjI|dddddddfg}| r$|jFjBjE|jJjMt)j4t(jNjPjS|djE| |fdj;ddt)jTdd| |gd j |j gd}t)j4||g}tVd}tY|jBd|}t[j\|j^5t)j`5|5|dk(rCtc|jddd}|jg||j ||jh}nP|dk(r@tk|jddd}tm||}to||j |\}} n tqd|}!|dg|||gd}"||||gd}#| rt(jrju|jBj |j twty|D]\} }|!j |j g}$|g}%t)jz|%j |j }%|jB|$fd|%i|"dj | rt)j dn |j }&| rt(jrju|jB|$fd|%i|#dj | rt)j dn |j }'| rt(jrju|'||&|'z zz}(|!j | rt)j dn |j }!|j}|(jd||!jdd| d})|)jd}!|!j |j g}*~$~%| r8|jBjEt(jrju|jdk(r|jJj*}+ddddddddd~~!~| r2tjt(jrjtjrtj|jdk(r+dSdScc}wcc}w#1swYxYw#1swYxYw#1swYxYw)!a Generates video frames from input image and text prompt using diffusion process. Args: input_prompt (`str`): Text prompt for content generation. img (PIL.Image.Image): Input image tensor. Shape: [3, H, W] max_area (`int`, *optional*, defaults to 720*1280): Maximum pixel area for latent space calculation. Controls video resolution scaling frame_num (`int`, *optional*, defaults to 81): How many frames to sample from a video. The number should be 4n+1 shift (`float`, *optional*, defaults to 5.0): Noise schedule shift parameter. Affects temporal dynamics [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0. sample_solver (`str`, *optional*, defaults to 'unipc'): Solver used to sample the video. sampling_steps (`int`, *optional*, defaults to 40): Number of diffusion sampling steps. Higher values improve quality but slow generation guide_scale (`float`, *optional*, defaults 5.0): Classifier-free guidance scale. Controls prompt adherence vs. creativity n_prompt (`str`, *optional*, defaults to ""): Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt` seed (`int`, *optional*, defaults to -1): Random seed for noise generation. If -1, use random seed offload_model (`bool`, *optional*, defaults to True): If True, offloads models to CPU during generation to save VRAM Returns: torch.Tensor: Generated video frames tensor. Dimensions: (C, N H, W) where: - C: Color channels (3 for RGB) - N: Number of frames (81) - H: Frame height (from max_area) - W: Frame width from max_area) g?rNr)r)r generatorrQ)repeatsdim)rYrbicubic)sizemodePc3Kdyw)NrarPrN noop_no_syncz%WanI2V.generate..noop_no_syncs sno_sync)runipcF)r$shiftuse_dynamic_shifting)rrezdpm++)rsigmaszUnsupported solver.)contextclip_feaseq_lenyt) return_dictrU)ITF to_tensorsub_div_rFrshaperoundnpsqrtr-r.intmathceilrBrandomrandintsysmaxsizer Generator manual_seedrandnfloat32onesconcatrepeat_interleaveview transposerGr#r,r8rr4visualr0encodenn functional interpolatezerosrgetattrampautocastr%no_gradrr$ set_timesteps timestepsr r rNotImplementedErrorcuda empty_cache enumeraterstackstep unsqueezesqueezer!decodegccollect synchronizerCrDrE),rH input_promptimgmax_area frame_numre sample_solversampling_steps guide_scalen_promptseed offload_modelFhw aspect_ratiolat_hlat_w max_seq_lenseed_gnoisemskrh context_nullrl clip_contextrkrbrcsample_schedulerrsampling_sigmas_latentarg_carg_nulllatent_model_inputtimestepnoise_pred_condnoise_pred_uncond noise_predtemp_x0x0videoss, rNgeneratezWanI2V.generates`ll3$$S)..s366t{{C yy}11u  GGH|+ ,0B B OOA  !%!3 45 GGH|+ ,0B B OOA  !%!3 45 DOOA& & DOOA& &A$//!"44q8EAEI OOA !3 35 $))K$,,$>?@4<<O qytfnnQ &D 44     --;; jjBuT[[AAqrE ll  # #C1Q3K BC12J  !"hhq#))A,!+Qu=mmAq!!$ r>--H{{    # # & &t{{ 3'' DG,,hZEL!!''++-'' U8KLG,,hZe9LML29:QqttDKK(:G:7CD!ADD-DLD 4;;'yy''Qa]);(<=  IIOO   ! HHOO LL##//IMMO1a&y0BBK)1C Ar1a(    !#4;;     LL#q "    $**i>\\ 0 0 1R -5==?R -GIR -'#>(,(@(@).$0 !.."4;;e/E,66 ')#B(,(@(@).$0 #6ne"L1$;;* , 1 **?@@F$AJ<(&S E((&S H &&( JJMM$++ &!$y/2 11&,ii &<%="3 ;;x033DKK@",$**&#=*2#=6;#==>#@@B/< U+$++AO !JJ**,$.DJJ&%@*2%@6>%@@A%CCE2/< U+$++DO"!JJ**,.#&77299  +8ELL'dkkK+//((+$$Q' %$ 0& '( ) !+ii ,-&? 1B   &&(yyA~,eR -R -R -h 6   JJL JJ " " $    LLN IINvay44m;D2R -R -R -R -R -R -sI"f"f/f1f% L6ff% f1f" f%%f. *f11f:N)rrFFFFT) irV@rd(rrZT)__name__ __module__ __qualname__rOrrarPrNrrsC  `:J%& " #Z5rPr)*rr5rwr'ryr{r> contextlibr functoolsrnumpyrtrtorch.cuda.amprrtorch.distributed distributedrC!torchvision.transforms.functional transformsrrnrdistributed.fsdpr modules.clipr modules.modelr modules.t5r modules.vaer utils.fm_solversr r rutils.fm_solvers_unipcrrrarPrNrsb  %  ..)##&HH?~5~5rP