Update radial_attn/models/wan/sparse_transformer.py
radial_attn/models/wan/sparse_transformer.py
CHANGED
@@ -367,176 +367,176 @@ class WanPipeline_Sparse(WanPipeline):

         return WanPipelineOutput(frames=video)

-    [previous version of this block; not legible in the rendered diff]
+class WanImageToVideoPipeline_Sparse(WanImageToVideoPipeline):
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        num_videos_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        last_image: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "np",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        self.check_inputs(
+            prompt,
+            negative_prompt,
+            image,
+            height,
+            width,
+            prompt_embeds,
+            negative_prompt_embeds,
+            image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+        num_frames = max(num_frames, 1)
+
+        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
+        self._current_timestep = None
+        self._interrupt = False
+        device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
+        transformer_dtype = self.transformer.dtype
+        prompt_embeds = prompt_embeds.to(transformer_dtype)
+        if negative_prompt_embeds is not None:
+            negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
+        if image_embeds is None:
+            if last_image is None:
+                image_embeds = self.encode_image(image, device)
+            else:
+                image_embeds = self.encode_image([image, last_image], device)
+            image_embeds = image_embeds.repeat(batch_size, 1, 1)
+            image_embeds = image_embeds.to(transformer_dtype)
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        num_channels_latents = self.vae.config.z_dim
+        image = self.video_processor.preprocess(image, height=height, width=width).to(device, dtype=torch.float32)
+        if last_image is not None:
+            last_image = self.video_processor.preprocess(last_image, height=height, width=width).to(
+                device, dtype=torch.float32
+            )
+        latents, condition = self.prepare_latents(
+            image,
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            num_frames,
+            torch.float32,
+            device,
+            generator,
+            latents,
+            last_image,
+        )
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+                self._current_timestep = t
+                latent_model_input = torch.cat([latents, condition], dim=1).to(transformer_dtype)
+                timestep = t.expand(latents.shape[0])
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    encoder_hidden_states_image=image_embeds,
+                    attention_kwargs=attention_kwargs,
+                    return_dict=False,
+                    numeral_timestep=i,  # <--- MODIFICATION
+                )[0]
+                if self.do_classifier_free_guidance:
+                    noise_uncond = self.transformer(
+                        hidden_states=latent_model_input,
+                        timestep=timestep,
+                        encoder_hidden_states=negative_prompt_embeds,
+                        encoder_hidden_states_image=image_embeds,
+                        attention_kwargs=attention_kwargs,
+                        return_dict=False,
+                        numeral_timestep=i,  # <--- MODIFICATION
+                    )[0]
+                    noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        self._current_timestep = None
+        if not output_type == "latent":
+            latents = latents.to(self.vae.dtype)
+            latents_mean = (
+                torch.tensor(self.vae.config.latents_mean)
+                .view(1, self.vae.config.z_dim, 1, 1, 1)
+                .to(latents.device, latents.dtype)
+            )
+            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+                latents.device, latents.dtype
+            )
+            latents = latents / latents_std + latents_mean
+            video = self.vae.decode(latents, return_dict=False)[0]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+        else:
+            video = latents
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (video,)
+        return WanPipelineOutput(frames=video)

 def replace_sparse_forward():
     WanTransformerBlock.forward = WanTransformerBlock_Sparse.forward
     WanTransformer3DModel.forward = WanTransformer3DModel_Sparse.forward
     WanPipeline.__call__ = WanPipeline_Sparse.__call__
-    WanImageToVideoPipeline.__call__ =
+    WanImageToVideoPipeline.__call__ = WanImageToVideoPipeline_Sparse.__call__
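
The class added above mirrors the stock diffusers WanImageToVideoPipeline.__call__ almost line for line; the functional change is that the denoising-step index is forwarded to the patched transformer as numeral_timestep (marked "# <--- MODIFICATION"), presumably so the sparse attention path can vary its behaviour across steps, and replace_sparse_forward() now monkey-patches the image-to-video pipeline as well. Below is a minimal usage sketch, not part of this commit. It assumes the module is importable as radial_attn.models.wan.sparse_transformer (the file path of this diff), that a diffusers-format Wan 2.1 I2V checkpoint such as Wan-AI/Wan2.1-I2V-14B-480P-Diffusers is used, and that no further sparse-attention setup is required; the checkpoint id, file names, and dtype choices are illustrative assumptions.

    # Sketch: enable the sparse forwards added in this file, then run image-to-video as usual.
    import torch
    from diffusers import WanImageToVideoPipeline
    from diffusers.utils import export_to_video, load_image

    from radial_attn.models.wan.sparse_transformer import replace_sparse_forward

    # Monkey-patch the Wan transformer blocks and both Wan pipelines with the _Sparse variants,
    # including the new WanImageToVideoPipeline_Sparse.__call__ from this commit.
    replace_sparse_forward()

    # Hypothetical checkpoint id; any diffusers-format Wan 2.1 I2V checkpoint should fit here.
    pipe = WanImageToVideoPipeline.from_pretrained(
        "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers", torch_dtype=torch.bfloat16
    )
    pipe.to("cuda")

    image = load_image("first_frame.png")  # hypothetical conditioning frame
    output = pipe(
        image=image,
        prompt="a ship sailing through a storm at sunset",
        height=480,
        width=832,
        num_frames=81,  # (num_frames - 1) should be divisible by the VAE temporal scale factor
        num_inference_steps=50,
        guidance_scale=5.0,
    )
    export_to_video(output.frames[0], "output.mp4", fps=16)

Depending on how the rest of this Space configures radial attention (for example, mask or sparsity options passed through attention_kwargs), additional setup may be needed before the call; the sketch only exercises the patching entry point touched by this commit.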