Update app.py

#2
by linoyts - opened
Files changed (1)
  app.py +54 -37
app.py CHANGED
@@ -1,23 +1,20 @@
  import torch
+ from typing import List
  from diffusers import AutoencoderKLWan, WanVACEPipeline, UniPCMultistepScheduler
  from diffusers.utils import export_to_video
- from transformers import CLIPVisionModel
  import gradio as gr
  import tempfile
  import spaces
  from huggingface_hub import hf_hub_download
  import numpy as np
- import PIL.Image
+ from PIL import Image
  import random
 
-
-
  model_id = "Wan-AI/Wan2.1-VACE-14B-diffusers"
  vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
  pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")
 
  pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=2.0)
- pipe.to("cuda")
 
  pipe.load_lora_weights(
      "vrgamedevgirl84/Wan14BT2VFusioniX",
@@ -80,7 +77,7 @@ def handle_gallery_upload_for_dims_wan(gallery_images, current_h_val, current_w_
          return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
      try:
          # Use the first image to calculate dimensions
-         first_image = gallery_images[0]
+         first_image = gallery_images[0][0]
          new_h, new_w = _calculate_new_dimensions_wan(
              first_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
              SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
@@ -96,17 +93,17 @@ def update_prompt_from_mode(mode):
      return MODE_PROMPTS.get(mode, "")
 
 
- def prepare_video_and_mask_Ref2V( height: int, width: int, num_frames: int):
+ def prepare_video_and_mask_Ref2V(height: int, width: int, num_frames: int):
      frames = []
      # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
      # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
      # match the original code.
-     frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames))
-     mask_white = PIL.Image.new("L", (width, height), 255)
+     frames.extend([Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames))
+     mask_white = Image.new("L", (width, height), 255)
      mask = [mask_white] * (num_frames)
      return frames, mask
 
- def prepare_video_and_mask_FLF2V(first_img: PIL.Image.Image, last_img: PIL.Image.Image, height: int, width: int, num_frames: int):
+ def prepare_video_and_mask_FLF2V(first_img: Image.Image, last_img: Image.Image, height: int, width: int, num_frames: int):
      first_img = first_img.resize((width, height))
      last_img = last_img.resize((width, height))
      frames = []
@@ -114,26 +111,26 @@ def prepare_video_and_mask_FLF2V(first_img: PIL.Image.Image, last_img: PIL.Image
      # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
      # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
      # match the original code.
-     frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 2))
+     frames.extend([Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 2))
      frames.append(last_img)
-     mask_black = PIL.Image.new("L", (width, height), 0)
-     mask_white = PIL.Image.new("L", (width, height), 255)
+     mask_black = Image.new("L", (width, height), 0)
+     mask_white = Image.new("L", (width, height), 255)
      mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
      return frames, mask
 
- def prepare_video_and_mask_Random2V(images: List[PIL.Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
+ def prepare_video_and_mask_Random2V(images: List[Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
      images = [img.resize((width, height)) for img in images]
      # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
      # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
      # match the original code.
-     frames = [PIL.Image.new("RGB", (width, height), (128, 128, 128))] * num_frames
+     frames = [Image.new("RGB", (width, height), (128, 128, 128))] * num_frames
 
-     mask_black = PIL.Image.new("L", (width, height), 0)
-     mask_white = PIL.Image.new("L", (width, height), 255)
+     mask_black = Image.new("L", (width, height), 0)
+     mask_white = Image.new("L", (width, height), 255)
      mask = [mask_white] * num_frames
 
      for img, idx in zip(images, frame_indices):
-         assert idx < num_frames
+         assert idx < num_frames, f"Frame index {idx} exceeds num_frames {num_frames}"
          frames[idx] = img
          mask[idx] = mask_black
 
@@ -179,11 +176,13 @@ def generate_video(gallery_images, mode, prompt, height, width,
      """
      if gallery_images is None or len(gallery_images) == 0:
          raise gr.Error("Please upload at least one image to the gallery.")
+     else:
+         gallery_images = [img[0] for img in gallery_images]
 
      if mode == "FLF2V" and len(gallery_images) >= 2:
          gallery_images = gallery_images[:2]
      elif mode == "FLF2V" and len(gallery_images) < 2:
-         raise gr.Error("only one image was supplied, but 2 are needed for FLF2V")
+         raise gr.Error("FLF2V mode requires at least 2 images, but only {} were supplied.".format(len(gallery_images)))
 
      target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
      target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
@@ -192,20 +191,29 @@ def generate_video(gallery_images, mode, prompt, height, width,
 
      current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
 
-
      # Process images based on the selected mode
      if mode == "FLF2V":
-         frames, mask = prepare_video_and_mask_FLF2V(first_img=gallery_images[0], last_img=gallery_images[1], height=target_h, width=target_w, num_frames=num_frames)
-         reference_images=None
+         frames, mask = prepare_video_and_mask_FLF2V(
+             first_img=gallery_images[0],
+             last_img=gallery_images[1],
+             height=target_h,
+             width=target_w,
+             num_frames=num_frames
+         )
+         reference_images = None
      elif mode == "Ref2V":
          frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
-         reference_images =gallery_images
-     else: # mode == "":
-         frames, mask = prepare_video_and_mask_Random2V(images=gallery_images, frame_indices=[0,15,40], height=target_h, width=target_w, num_frames=num_frames)
-         reference_images=None
-
-         # resized_image = input_image.resize((target_w, target_h))
-
+         reference_images = gallery_images
+     else: # mode == "Random2V"
+
+         frames, mask = prepare_video_and_mask_Random2V(
+             images=gallery_images,
+             frame_indices=[0,20,40], # todo - generalize
+             height=target_h,
+             width=target_w,
+             num_frames=num_frames
+         )
+         reference_images = None
 
      with torch.inference_mode():
          output_frames_list = pipe(
@@ -228,8 +236,8 @@ def generate_video(gallery_images, mode, prompt, height, width,
      return video_path, current_seed
 
  with gr.Blocks() as demo:
-     gr.Markdown("# Fast 4 steps Wan 2.1 I2V (14B) with CausVid LoRA - Multi-Image Gallery")
-     gr.Markdown("[CausVid](https://github.com/tianweiy/CausVid) is a distilled version of Wan 2.1 to run faster in just 4-8 steps, [extracted as LoRA by Kijai](https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors) and is compatible with 🧨 diffusers")
+     gr.Markdown("# Wan 2.1 VACE (14B) with Phantom & Detail Enhancer LoRAs - Multi-Image Gallery")
+     gr.Markdown("Using [Wan2.1-VACE-14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers) with Phantom FusionX and Detail Enhancer LoRAs for advanced video generation with multiple conditioning modes.")
 
      with gr.Row():
          with gr.Column():
@@ -251,11 +259,18 @@ with gr.Blocks() as demo:
                  choices=["Ref2V", "FLF2V", "Random2V"],
                  value="Ref2V",
                  label="Processing Mode",
-                 info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random Image to Video"
+                 info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random frames to Video"
              )
 
              prompt_input = gr.Textbox(label="Prompt", value=MODE_PROMPTS["Ref2V"])
-             duration_seconds_input = gr.Slider(minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1), maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1), step=0.1, value=2, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
+             duration_seconds_input = gr.Slider(
+                 minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1),
+                 maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1),
+                 step=0.1,
+                 value=2,
+                 label="Duration (seconds)",
+                 info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
+             )
 
          with gr.Accordion("Advanced Settings", open=False):
              negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
@@ -271,12 +286,14 @@ with gr.Blocks() as demo:
 
          with gr.Column():
              video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
-             with gr.Accordion("Mode Information", open=True):
+             with gr.Accordion("Mode Information", open=False):
                  gr.Markdown("""
                  **Processing Modes:**
-                 - **Ref2V**: Uses the first image as reference for video generation
-                 - **FLF2V**: Blends first and last images for interpolation (requires at least 2 images)
-                 - **Random2V**: Randomly selects one image from the gallery for generation
+                 - **Ref2V**: Uses uploaded images as style references for video generation. All frames are generated based on the reference images.
+                 - **FLF2V**: First-Last Frame mode - uses first and last images as keyframes and generates the frames in between (requires exactly 2 images)
+                 - **Random2V**: Places uploaded images at specific frames in the video and generates the rest. Images are distributed evenly across the video duration.
+
+                 **Note**: VACE pipeline supports advanced conditioning with masks and reference images for more control over generation.
                  """)
 
      # Update prompt when mode changes
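
The Random2V branch above still hardcodes `frame_indices=[0,20,40]` and carries a `# todo - generalize` note, while the new Mode Information text says images are distributed evenly across the video duration. A minimal sketch of one way to compute such indices; this is illustrative only, not part of the diff, and the helper name `evenly_spaced_frame_indices` is hypothetical:

```python
from typing import List


def evenly_spaced_frame_indices(num_images: int, num_frames: int) -> List[int]:
    """Return num_images frame indices spread evenly over [0, num_frames - 1]."""
    if num_images <= 0 or num_frames <= 0:
        return []
    if num_images == 1:
        return [0]
    step = (num_frames - 1) / (num_images - 1)
    return [round(i * step) for i in range(num_images)]


print(evenly_spaced_frame_indices(3, 41))  # -> [0, 20, 40], matching the current hardcoded values
print(evenly_spaced_frame_indices(4, 81))  # -> [0, 27, 53, 80]
```

Passing the result as `frame_indices` to `prepare_video_and_mask_Random2V` would keep the existing assert (`idx < num_frames`) satisfied for any gallery size.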
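
The `pipe(...)` call itself is truncated in the hunks above. For context, here is a hedged sketch of how the prepared `frames`, `mask`, and `reference_images` would typically feed the `WanVACEPipeline` call inside `generate_video`; the keyword names follow the variables prepared above and common diffusers usage, and the concrete values (steps, guidance scale, fps) are placeholders, not the Space's actual settings:

```python
# Illustrative sketch only; argument values are placeholders, not the Space's real settings.
with torch.inference_mode():
    output_frames_list = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        video=frames,                       # grey placeholder frames plus any keyframes
        mask=mask,                          # white = generate this frame, black = keep it as given
        reference_images=reference_images,  # only set for Ref2V in this app
        height=target_h,
        width=target_w,
        num_frames=num_frames,
        num_inference_steps=8,              # placeholder; few-step setups rely on the loaded LoRAs
        guidance_scale=1.0,                 # placeholder
        generator=torch.Generator(device="cuda").manual_seed(current_seed),
    ).frames[0]

with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
    video_path = tmpfile.name
export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
```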