rahul7star committed on
Commit b08ca35 · verified · 1 parent: 52f499a

Create app_t2v.py

Files changed (1): app_t2v.py (+229 -0)
app_t2v.py ADDED

import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# import subprocess
# subprocess.run('pip install flash-attn==2.7.4.post1 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

# wan2.2-main/gradio_ti2v.py
import gradio as gr
import torch
from huggingface_hub import snapshot_download
from PIL import Image
import random
import numpy as np
import spaces

import wan
from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
from wan.utils.utils import cache_video

import gc

# --- 1. Global Setup and Model Loading ---

print("Starting Gradio App for Wan 2.2 TI2V-5B...")

# Download model snapshots from Hugging Face Hub
repo_id = "Wan-AI/Wan2.2-TI2V-5B"
print(f"Downloading/loading checkpoints for {repo_id}...")
ckpt_dir = snapshot_download(repo_id, local_dir_use_symlinks=False)
print(f"Using checkpoints from {ckpt_dir}")

# Load the model configuration
TASK_NAME = 'ti2v-5B'
cfg = WAN_CONFIGS[TASK_NAME]
FIXED_FPS = 24
MIN_FRAMES_MODEL = 8
MAX_FRAMES_MODEL = 121
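# Note: 121 frames at 24 fps caps clips at roughly 5 seconds (121 / 24 ≈ 5.04 s),
# and MIN_FRAMES_MODEL = 8 bounds the shortest request at about 0.3 s.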

# Instantiate the pipeline in the global scope
print("Initializing WanTI2V pipeline...")
device = "cuda" if torch.cuda.is_available() else "cpu"
device_id = 0 if torch.cuda.is_available() else -1
pipeline = wan.WanTI2V(
    config=cfg,
    checkpoint_dir=ckpt_dir,
    device_id=device_id,
    rank=0,
    t5_fsdp=False,
    dit_fsdp=False,
    use_sp=False,
    t5_cpu=False,
    init_on_cpu=False,
    convert_model_dtype=True,
)
print("Pipeline initialized and ready.")

# --- Helper Functions ---
def select_best_size_for_image(image, available_sizes):
    """Select the size option with aspect ratio closest to the input image."""
    if image is None:
        return available_sizes[0]  # Return first option if no image

    img_width, img_height = image.size
    img_aspect_ratio = img_height / img_width

    best_size = available_sizes[0]
    best_diff = float('inf')

    for size_str in available_sizes:
        # Parse size string like "704*1280" (height*width)
        height, width = map(int, size_str.split('*'))
        size_aspect_ratio = height / width
        diff = abs(img_aspect_ratio - size_aspect_ratio)

        if diff < best_diff:
            best_diff = diff
            best_size = size_str

    return best_size
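# Example: a 720x1280 portrait upload has aspect ratio 1280/720 ≈ 1.78, closer to
# "1280*704" (1280/704 ≈ 1.82) than to "704*1280" (704/1280 = 0.55), so the
# portrait-oriented size is selected.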

def handle_image_upload(image):
    """Handle image upload and return the best matching size."""
    if image is None:
        return gr.update()

    pil_image = Image.fromarray(image).convert("RGB")
    available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
    best_size = select_best_size_for_image(pil_image, available_sizes)

    return gr.update(value=best_size)

def get_duration(
        prompt,
        size,
        duration_seconds,
        sampling_steps,
        guide_scale,
        shift,
        seed,
        progress):
    """Calculate dynamic GPU duration based on parameters."""
    if sampling_steps > 35 and duration_seconds >= 2:
        return 120
    elif sampling_steps < 35 or duration_seconds < 2:
        return 105
    else:
        return 90
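# Passed to @spaces.GPU below: ZeroGPU invokes this callable with the same
# arguments as the wrapped function and reserves that many seconds of GPU time,
# so heavier jobs (more steps, longer clips) request a longer slot up front.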

# --- 2. Gradio Inference Function ---
@spaces.GPU(duration=get_duration)
def generate_video(
        prompt,
        size,
        duration_seconds,
        sampling_steps,
        guide_scale,
        shift,
        seed,
        progress=gr.Progress(track_tqdm=True)
):
    """The main function to generate video, called by the Gradio interface."""
    if seed == -1:
        seed = random.randint(0, sys.maxsize)

    # input_image = None
    # if image is not None:
    #     input_image = Image.fromarray(image).convert("RGB")
    #     # Resize image to match selected size
    #     target_height, target_width = map(int, size.split('*'))
    #     input_image = input_image.resize((target_width, target_height))

    # Calculate number of frames based on duration
    num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
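    # e.g. the default 2.0 s request becomes round(2.0 * 24) = 48 frames,
    # clipped into the model's supported [8, 121] range.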

    video_tensor = pipeline.generate(
        input_prompt=prompt,
        img=None,  # Pass None for T2V; pass an Image for I2V
        size=SIZE_CONFIGS[size],
        max_area=MAX_AREA_CONFIGS[size],
        frame_num=num_frames,  # Use calculated frames instead of cfg.frame_num
        shift=shift,
        sample_solver='unipc',
        sampling_steps=int(sampling_steps),
        guide_scale=guide_scale,
        seed=seed,
        offload_model=True
    )
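    # offload_model=True trades speed for memory: idle submodules are moved off
    # the GPU during sampling, lowering peak VRAM on a shared ZeroGPU slot.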

    # Save the video to a temporary file
    video_path = cache_video(
        tensor=video_tensor[None],  # Add a batch dimension
        save_file=None,  # cache_video will create a temp file
        fps=cfg.sample_fps,
        normalize=True,
        value_range=(-1, 1)
    )
    del video_tensor
    gc.collect()
    return video_path


# --- 3. Gradio Interface ---
css = ".gradio-container {max-width: 1100px !important; margin: 0 auto} #output_video {height: 500px;} #input_image {height: 500px;}"
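# The #output_video and #input_image selectors target the elem_id values of the
# components below (input_image belongs to the currently disabled image input).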

with gr.Blocks(css=css, theme=gr.themes.Soft(), delete_cache=(60, 900)) as demo:
    gr.Markdown("# Wan 2.2 TI2V 5B")
    gr.Markdown("Generate high-quality videos with the **Wan 2.2 5B Text-Image-to-Video model**. [[model]](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B) · [[paper]](https://arxiv.org/abs/2503.20314)")

    with gr.Row():
        with gr.Column(scale=2):
            # image_input = gr.Image(type="numpy", label="Input Image (Optional)", elem_id="input_image")
            prompt_input = gr.Textbox(label="Prompt", value="A beautiful waterfall in a lush jungle, cinematic.", lines=3)
            duration_input = gr.Slider(
                minimum=round(MIN_FRAMES_MODEL/FIXED_FPS, 1),
                maximum=round(MAX_FRAMES_MODEL/FIXED_FPS, 1),
                step=0.1,
                value=2.0,
                label="Duration (seconds)",
                info=f"Clamped to the model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS} fps."
            )
            size_input = gr.Dropdown(label="Output Resolution", choices=list(SUPPORTED_SIZES[TASK_NAME]), value="704*1280")
        with gr.Column(scale=2):
            video_output = gr.Video(label="Generated Video", elem_id="output_video")

    with gr.Accordion("Advanced Settings", open=False):
        steps_input = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=38, step=1)
        scale_input = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, value=cfg.sample_guide_scale, step=0.1)
        shift_input = gr.Slider(label="Sample Shift", minimum=1.0, maximum=20.0, value=cfg.sample_shift, step=0.1)
        seed_input = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)

    run_button = gr.Button("Generate Video", variant="primary")

    # Add image upload handler
    # image_input.upload(
    #     fn=handle_image_upload,
    #     inputs=[image_input],
    #     outputs=[size_input]
    # )

    # image_input.clear(
    #     fn=handle_image_upload,
    #     inputs=[image_input],
    #     outputs=[size_input]
    # )

    # example_image_path = os.path.join(os.path.dirname(__file__), "examples/i2v_input.JPG")
    # Text-only examples (the image-conditioned example is disabled along with image_input)
    gr.Examples(
        examples=[
            ["A cinematic shot of a boat sailing on a calm sea at sunset.", "1280*704", 2.0],
            ["Drone footage flying over a futuristic city with flying cars.", "1280*704", 2.0],
        ],
        inputs=[prompt_input, size_input, duration_input],
        outputs=video_output,
        fn=generate_video,
        cache_examples=False,
    )

    run_button.click(
        fn=generate_video,
        inputs=[prompt_input, size_input, duration_input, steps_input, scale_input, shift_input, seed_input],
        outputs=video_output
    )

if __name__ == "__main__":
    demo.launch()
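
# Running `python app_t2v.py` serves the UI locally; on a Hugging Face Space the
# same guard fires when this file is configured as the Space's app entry point.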