import os
import gc
import time
import random

import torch
import imageio
import gradio as gr
from diffusers.utils import load_image

from skyreels_v2_infer import DiffusionForcingPipeline
from skyreels_v2_infer.modules import download_model
from skyreels_v2_infer.pipelines import PromptEnhancer, resizecrop

# Detect whether we are running in the shared Space or a user's duplicate.
# `.get` avoids a KeyError when SPACE_ID is unset (e.g. running locally).
is_shared_ui = "fffiloni/SkyReels-V2" in os.environ.get("SPACE_ID", "")
is_gpu_associated = torch.cuda.is_available()

# Pre-download the default checkpoint on private instances.
model_id = None
if not is_shared_ui:
    model_id = download_model("Skywork/SkyReels-V2-DF-1.3B-540P")


def generate_diffusion_forced_video(
    prompt,
    image=None,
    target_length="10",
    model_id="Skywork/SkyReels-V2-DF-1.3B-540P",
    resolution="540P",
    num_frames=257,
    ar_step=0,
    causal_attention=False,
    causal_block_size=1,
    base_num_frames=97,
    overlap_history=17,
    addnoise_condition=20,
    guidance_scale=6.0,
    shift=8.0,
    inference_steps=30,
    use_usp=False,
    offload=True,
    fps=24,
    seed=None,
    prompt_enhancer=False,
    teacache=True,
    teacache_thresh=0.2,
    use_ret_steps=True,
):
    start_time = time.perf_counter()
    model_id = download_model(model_id)

    if resolution == "540P":
        height, width = 544, 960
    elif resolution == "720P":
        height, width = 720, 1280
    else:
        raise ValueError(f"Invalid resolution: {resolution}")

    if seed is None:
        random.seed(time.time())
        seed = int(random.randrange(4294967294))

    if num_frames > base_num_frames and overlap_history is None:
        raise ValueError("Specify `overlap_history` for long video generation. Try 17 or 37.")
    if addnoise_condition > 60:
        print("Warning: a large `addnoise_condition` may reduce consistency. Recommended: 20.")

    if image is not None:
        image = load_image(image).convert("RGB")
        image_width, image_height = image.size
        # For portrait inputs, swap to a portrait canvas before resize-cropping.
        if image_height > image_width:
            height, width = width, height
        image = resizecrop(image, height, width)

    negative_prompt = (
        "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, "
        "images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, "
        "incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, "
        "misshapen limbs, fused fingers, still picture, messy background, three legs, many people "
        "in the background, walking backwards"
    )

    prompt_input = prompt
    if prompt_enhancer and image is None:
        enhancer = PromptEnhancer()
        prompt_input = enhancer(prompt_input)
        # Free the enhancer before loading the video pipeline.
        del enhancer
        gc.collect()
        torch.cuda.empty_cache()

    pipe = DiffusionForcingPipeline(
        model_id,
        dit_path=model_id,
        device=torch.device("cuda"),
        weight_dtype=torch.bfloat16,
        use_usp=use_usp,
        offload=offload,
    )

    if causal_attention:
        pipe.transformer.set_ar_attention(causal_block_size)

    if teacache:
        if ar_step > 0:
            # Asynchronous mode runs extra denoising passes per causal block.
            num_steps = (
                inference_steps
                + (((base_num_frames - 1) // 4 + 1) // causal_block_size - 1) * ar_step
            )
        else:
            num_steps = inference_steps
        pipe.transformer.initialize_teacache(
            enable_teacache=True,
            num_steps=num_steps,
            teacache_thresh=teacache_thresh,
            use_ret_steps=use_ret_steps,
            ckpt_dir=model_id,
        )

    with torch.amp.autocast("cuda", dtype=pipe.transformer.dtype), torch.no_grad():
        video_frames = pipe(
            prompt=prompt_input,
            negative_prompt=negative_prompt,
            image=image,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=inference_steps,
            shift=shift,
            guidance_scale=guidance_scale,
            generator=torch.Generator(device="cuda").manual_seed(seed),
            overlap_history=overlap_history,
            addnoise_condition=addnoise_condition,
            base_num_frames=base_num_frames,
            ar_step=ar_step,
            causal_block_size=causal_block_size,
            fps=fps,
        )[0]

    os.makedirs("gradio_df_videos", exist_ok=True)
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    output_path = f"gradio_df_videos/{prompt[:50].replace('/', '')}_{seed}_{timestamp}.mp4"
    imageio.mimwrite(output_path, video_frames, fps=fps, quality=8, output_params=["-loglevel", "error"])

    elapsed = time.perf_counter() - start_time
    print(f"[LOG] Execution time: {elapsed:.4f} seconds")
    return output_path
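
# Direct (non-UI) invocation example: a hypothetical sketch assuming a CUDA GPU
# and the default 1.3B-540P checkpoint; the prompt and values are illustrative only.
#
#   generate_diffusion_forced_video(
#       prompt="A graceful white swan glides across a misty lake at dawn",
#       resolution="540P",
#       num_frames=97,
#       seed=42,
#   )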

# Gradio UI
resolution_options = ["540P", "720P"]

model_options = ["Skywork/SkyReels-V2-DF-1.3B-540P"]  # Update if there are more
if not is_shared_ui:
    model_options = [
        "Skywork/SkyReels-V2-DF-1.3B-540P",
        "Skywork/SkyReels-V2-DF-14B-540P",
        "Skywork/SkyReels-V2-DF-14B-720P",
    ]

if is_shared_ui:
    length_options = ["4"]
else:
    length_options = ["4", "10", "15", "30", "60"]

css = """
div#warning-duplicate {
    background-color: #ebf5ff;
    padding: 0 16px 16px;
    margin: 0px 0;
    color: #030303 !important;
}
div#warning-duplicate > .gr-prose > h2,
div#warning-duplicate > .gr-prose > p {
    color: #0f4592 !important;
}
div#warning-duplicate strong {
    color: #0f4592;
}
p.actions {
    display: flex;
    align-items: center;
    margin: 20px 0;
}
div#warning-duplicate .actions a {
    display: inline-block;
    margin-right: 10px;
}
div#warning-setgpu {
    background-color: #fff4eb;
    padding: 0 16px 16px;
    margin: 0px 0;
    color: #030303 !important;
}
div#warning-setgpu > .gr-prose > h2,
div#warning-setgpu > .gr-prose > p {
    color: #92220f !important;
}
div#warning-setgpu a, div#warning-setgpu b {
    color: #91230f;
}
div#warning-setgpu p.actions > a {
    display: inline-block;
    background: #1f1f23;
    border-radius: 40px;
    padding: 6px 24px;
    color: antiquewhite;
    text-decoration: none;
    font-weight: 600;
    font-size: 1.2em;
}
div#warning-ready {
    background-color: #ecfdf5;
    padding: 0 16px 16px;
    margin: 0px 0;
    color: #030303 !important;
}
div#warning-ready > .gr-prose > h2,
div#warning-ready > .gr-prose > p {
    color: #057857 !important;
}
.custom-color {
    color: #030303 !important;
}
"""
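
# Frame-count note: the diffusion-forcing pipeline appears to work in 4k + 1
# frame counts (17, 37, ..., 257, matching the slider below), and at the default
# 24 fps the shared-UI 4-second target corresponds to 4 * 24 + 1 = 97 frames.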
""") with gr.Row(): with gr.Column(scale=1): prompt = gr.Textbox(label="Prompt") with gr.Row(): if is_shared_ui: target_length = gr.Radio(label="Video length target", choices=length_options, value="4") forbidden_length = gr.Radio(label="Available target on duplicated instance", choices=["10","15","30","60"], value=None, interactive=False) else: target_length = gr.Radio(label="Video length target", choices=length_options, value="4") num_frames = gr.Slider(minimum=17, maximum=257, value=97, step=20, label="Number of Frames", interactive=False) image = gr.Image(type="filepath", label="Input Image (optional)") with gr.Accordion("Advanced Settings", open=False): model_id = gr.Dropdown(choices=model_options, value=model_options[0], label="Model ID") resolution = gr.Radio(choices=resolution_options, value="540P", label="Resolution", interactive=False if is_shared_ui else True) ar_step = gr.Number(label="AR Step", value=0) causal_attention = gr.Checkbox(label="Causal Attention") causal_block_size = gr.Number(label="Causal Block Size", value=1) base_num_frames = gr.Number(label="Base Num Frames", value=97) overlap_history = gr.Number(label="Overlap History (set for long videos)", value=None) addnoise_condition = gr.Number(label="AddNoise Condition", value=0) guidance_scale = gr.Slider(minimum=1.0, maximum=20.0, value=6.0, step=0.1, label="Guidance Scale") shift = gr.Slider(minimum=0.0, maximum=20.0, value=8.0, step=0.1, label="Shift") inference_steps = gr.Slider(minimum=1, maximum=100, value=30, step=1, label="Inference Steps") use_usp = gr.Checkbox(label="Use USP", visible=False if is_shared_ui else True) offload = gr.Checkbox(label="Offload", value=True, interactive=False if is_shared_ui else True) fps = gr.Slider(minimum=1, maximum=60, value=24, step=1, label="FPS") seed = gr.Number(label="Seed (optional)", precision=0) prompt_enhancer = gr.Checkbox(label="Prompt Enhancer", visible=False if is_shared_ui else True) use_teacache = gr.Checkbox(label="Use TeaCache", value=True) teacache_thresh = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.01, label="TeaCache Threshold") use_ret_steps = gr.Checkbox(label="Use Retention Steps", value=True) submit_btn = gr.Button("Generate", interactive=False if is_shared_ui else True) with gr.Column(scale=2): if is_shared_ui: top_description = gr.HTML(f''' ''', elem_id="warning-duplicate") else: if(is_gpu_associated): top_description = gr.HTML(f'''You will be billed by the minute from when you activated the GPU until when it is turned off.
                else:
                    if is_gpu_associated:
                        # elem_id inferred from the matching `div#warning-ready` CSS rule above.
                        top_description = gr.HTML('''
                        You will be billed by the minute from when you activated the GPU until it is turned off.
                        ''', elem_id="warning-ready")
                    else:
                        # elem_id inferred from the matching `div#warning-setgpu` CSS rule above.
                        top_description = gr.HTML('''
                        There's only one step left before you can properly play with this demo: assign a GPU to it via the Settings tab, then run the app below. You will be billed by the minute from when you activate the GPU until it is turned off.
                        ''', elem_id="warning-setgpu")
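
                # A minimal sketch of how `submit_btn` could be wired to the
                # generator (hypothetical reconstruction; `output_video` and the
                # exact input list are assumptions, not the Space's verbatim code):
                #
                #   output_video = gr.Video(label="Generated Video")
                #   submit_btn.click(
                #       fn=generate_diffusion_forced_video,
                #       inputs=[prompt, image, target_length, model_id, resolution,
                #               num_frames, ar_step, causal_attention, causal_block_size,
                #               base_num_frames, overlap_history, addnoise_condition,
                #               guidance_scale, shift, inference_steps, use_usp, offload,
                #               fps, seed, prompt_enhancer, use_teacache, teacache_thresh,
                #               use_ret_steps],
                #       outputs=output_video,
                #   )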