# VSF / app.py
import os
import uuid

import gradio as gr
import torch
from diffusers import AutoencoderKLWan
from diffusers.utils import export_to_video

from vsfwan.pipeline import WanPipeline
from vsfwan.processor import WanAttnProcessor2_0
# Fall back to a no-op GPU decorator when the `spaces` package (Hugging Face
# Spaces ZeroGPU) is not available, e.g. when running locally.
try:
    import spaces
except ImportError:
    class spaces:
        @staticmethod
        def GPU(fn):
            return fn
model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Wan21_CausVid_bidirect2_T2V_1_3B_lora_rank32.safetensors",
    adapter_name="lora",
)
pipe = pipe.to("cuda")
height = 480
width = 832

os.makedirs("videos", exist_ok=True)
@spaces.GPU
def generate_video(positive_prompt, negative_prompt, guidance_scale, bias, step, frames, seed, progress=gr.Progress(track_tqdm=True)):
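    """Generate a video with VSF negative guidance and save it under videos/.

    Returns the path of the exported .mp4; the generation parameters are
    written to a matching .txt file.
    """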
print(f"Generating video with params: {positive_prompt}, {negative_prompt}, {guidance_scale}, {bias}, {step}, {frames}")
pipe.set_adapters("lora", 0.5) # change this to 0.5 fixed it? it did not restart
prompt = positive_prompt
neg_prompt = negative_prompt
    # Encode the negative prompt unpadded (padding=False appears to be a
    # vsfwan extension) so its exact token length is known.
    neg_prompt_embeds, _ = pipe.encode_prompt(
        prompt=neg_prompt,
        padding=False,
        do_classifier_free_guidance=False,
    )
    # Encode the positive prompt, leaving room for the negative tokens within
    # the 512-token text budget.
    pos_prompt_embeds, _ = pipe.encode_prompt(
        prompt=prompt,
        do_classifier_free_guidance=False,
        max_sequence_length=512 - neg_prompt_embeds.shape[1],
    )
    neg_len = neg_prompt_embeds.shape[1]
    pos_len = pos_prompt_embeds.shape[1]
    print(neg_len, pos_len)
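    # Latent token count for the attention mask. Assuming the standard Wan 2.1
    # setup (VAE downsampling 8x spatially and 4x temporally, then a 2x2
    # spatial patchify in the transformer), the expression below reduces to
    # (height // 16) * (width // 16) * (frames // 4 + 1).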
    img_len = (height // 8) * (width // 8) * 3 * (frames // 4 + 1) // 12
    print(img_len)
    # Additive attention bias over (video-token queries, text-token keys).
    mask = torch.zeros((1, img_len, pos_len + neg_len)).cuda()
    # The bias must be negative: -torch.inf would mask the negative-prompt
    # tokens out entirely, while a finite -bias only down-weights them.
    mask[:, :, -neg_len:] = -bias
    # Install the VSF attention processor on every cross-attention block.
    for block in pipe.transformer.blocks:
        block.attn2.processor = WanAttnProcessor2_0(scale=guidance_scale, neg_prompt_length=neg_len, attn_mask=mask)
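    # (Sketch of the idea, per the linked write-up: the processor flips the
    # sign of the value vectors belonging to the last neg_prompt_length text
    # tokens and scales them by `scale`, so attention to negative concepts
    # pushes the output away from them; see vsfwan/processor.py for details.)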
    # Positive and negative embeddings share one sequence; the processor
    # separates them by neg_prompt_length.
    prompt_embeds = torch.cat([pos_prompt_embeds, neg_prompt_embeds], dim=1)
    output = pipe(
        prompt_embeds=prompt_embeds,
        negative_prompt=neg_prompt,
        height=height,
        width=width,
        num_frames=frames,
        num_inference_steps=step,
        guidance_scale=0.0,  # built-in CFG off; VSF applies guidance in attention
        generator=torch.Generator(device="cuda").manual_seed(seed),
    ).frames[0]
path = f"videos/{uuid.uuid4().hex}.mp4"
export_to_video(output[5:], path, fps=15)
output_path = path
with open(output_path.replace(".mp4", ".txt"), "w") as f:
f.write(f"Positive Prompt: {positive_prompt}\n")
f.write(f"Negative Prompt: {negative_prompt}\n")
f.write(f"Guidance Scale: {guidance_scale}\n")
f.write(f"Bias: {bias}\n")
f.write(f"Steps: {step}\n")
f.write(f"Frames: {frames}\n")
f.write(f"Seed: {seed}\n")
print(f"Video saved to {output_path}")
return output_path
with gr.Blocks(title="Value Sign Flip Wan 2.1 Demo") as demo:
    gr.Markdown(
        "# Value Sign Flip Wan 2.1 Demo\n\n"
        "This demo is based on the Wan 2.1 T2V model and uses the Value Sign Flip technique "
        "to generate videos with different guidance scales and biases. "
        "More on [GitHub](https://github.com/weathon/VSF/blob/main/wan.md)\n\n"
        "The positive prompt should be at least two sentences long, or the results will be weird."
    )
    with gr.Row():
        pos = gr.Textbox(label="Positive Prompt", value="A chef cat and a chef dog in chef suits baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon.")
        neg = gr.Textbox(label="Negative Prompt", value="white dog")
    with gr.Row():
        guidance = gr.Slider(0, 5, step=0.1, label="Guidance Scale", value=1.5)
        bias = gr.Slider(0, 0.5, step=0.01, label="Bias", value=0.1)
        step = gr.Slider(6, 15, step=1, label="Steps", value=10)
        frames = gr.Slider(31, 81, step=1, label="Frames", value=81)
        seed = gr.Number(label="Seed", value=0, precision=0)
    out = gr.Video(label="Generated Video")
    btn = gr.Button("Generate")
    btn.click(fn=generate_video, inputs=[pos, neg, guidance, bias, step, frames, seed], outputs=out)
demo.launch()
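
# A minimal headless usage sketch (hypothetical; assumes a CUDA GPU and that
# the weights above are already downloaded). Comment out demo.launch() first:
#
#   video_path = generate_video(
#       "A chef cat and a chef dog in chef suits baking a cake together in a "
#       "kitchen. The cat is carefully measuring flour, while the dog is "
#       "stirring the batter with a wooden spoon.",
#       "white dog",
#       guidance_scale=1.5, bias=0.1, step=10, frames=81, seed=0,
#   )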