```python
import torch
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video

pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")
pipe_upsample.to("cuda")
pipe.vae.enable_tiling()

prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
expected_height, expected_width = 704, 512
downscale_factor = 2 / 3
num_frames = 121

# Part 1. Generate video at smaller resolution
downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
# Round the intermediate resolution down to a multiple of the VAE's spatial
# compression ratio (LTX requires height/width divisible by 32)
downscaled_height = downscaled_height - (downscaled_height % pipe.vae_spatial_compression_ratio)
downscaled_width = downscaled_width - (downscaled_width % pipe.vae_spatial_compression_ratio)
latents = pipe(
    conditions=None,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=downscaled_width,
    height=downscaled_height,
    num_frames=num_frames,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0),
    output_type="latent",
).frames

# Part 2. Upscale generated video using latent upsampler with fewer inference steps
# The available latent upsampler upscales the height/width by 2x
upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
upscaled_latents = pipe_upsample(
    latents=latents,
    output_type="latent"
).frames

# Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=upscaled_width,
    height=upscaled_height,
    num_frames=num_frames,
    denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
    num_inference_steps=10,
    latents=upscaled_latents,
    decode_timestep=0.05,
    image_cond_noise_scale=0.025,
    generator=torch.Generator().manual_seed(0),
    output_type="pil",
).frames[0]

# Part 4. Downscale the video to the expected resolution
video = [frame.resize((expected_width, expected_height)) for frame in video]

export_to_video(video, "output.mp4", fps=24)
```

The same multi-scale workflow can be wrapped in a small Gradio app so the prompt, resolution, and sampling parameters are adjustable from a browser:

```python
import torch
import gradio as gr
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video


def generate_video(
    prompt,
    negative_prompt,
    expected_height,
    expected_width,
    downscale_factor,
    num_frames,
    num_inference_steps,
    denoise_strength,
    seed,
    progress=gr.Progress()
):
    # Initialize pipelines (move this outside the function for production)
    progress(0.1, desc="Loading models...")
    pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
    pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
    pipe.to("cuda")
    pipe_upsample.to("cuda")
    pipe.vae.enable_tiling()

    # Part 1. Generate video at smaller resolution
    progress(0.2, desc="Generating initial video...")
    downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
    # Round down to a multiple of the VAE's spatial compression ratio
    downscaled_height = downscaled_height - (downscaled_height % pipe.vae_spatial_compression_ratio)
    downscaled_width = downscaled_width - (downscaled_width % pipe.vae_spatial_compression_ratio)
    generator = torch.Generator().manual_seed(seed)
    latents = pipe(
        conditions=None,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=downscaled_width,
        height=downscaled_height,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        generator=generator,
        output_type="latent",
    ).frames

    # Part 2. Upscale generated video
    progress(0.5, desc="Upscaling video...")
    upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
    upscaled_latents = pipe_upsample(
        latents=latents,
        output_type="latent"
    ).frames

    # Part 3. Denoise the upscaled video
    progress(0.7, desc="Refining video quality...")
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=upscaled_width,
        height=upscaled_height,
        num_frames=num_frames,
        denoise_strength=denoise_strength,
        num_inference_steps=10,
        latents=upscaled_latents,
        decode_timestep=0.05,
        image_cond_noise_scale=0.025,
        generator=generator,
        output_type="pil",
    ).frames[0]

    # Part 4. Downscale the video to the expected resolution
    progress(0.9, desc="Finalizing video...")
    video = [frame.resize((expected_width, expected_height)) for frame in video]

    # Save and return video
    output_path = "output.mp4"
    export_to_video(video, output_path, fps=24)
    return output_path


# Create Gradio interface
with gr.Blocks(title="LTX Video Generator") as demo:
    gr.Markdown("# LTX Video Generator")
    gr.Markdown("Generate videos from text prompts using Lightricks' LTX model")

    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Prompt",
                value="The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region.",
                lines=4
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                value="worst quality, inconsistent motion, blurry, jittery, distorted",
                lines=2
            )

            with gr.Row():
                expected_height = gr.Slider(
                    label="Output Height",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=704
                )
                expected_width = gr.Slider(
                    label="Output Width",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=512
                )

            with gr.Row():
                downscale_factor = gr.Slider(
                    label="Initial Downscale Factor",
                    minimum=0.3,
                    maximum=0.9,
                    step=0.05,
                    value=2/3
                )
                num_frames = gr.Slider(
                    label="Number of Frames",
                    minimum=24,
                    maximum=240,
                    step=1,
                    value=121
                )

            with gr.Row():
                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=10,
                    maximum=50,
                    step=1,
                    value=30
                )
                denoise_strength = gr.Slider(
                    label="Denoise Strength",
                    minimum=0.1,
                    maximum=0.9,
                    step=0.05,
                    value=0.4
                )

            seed = gr.Number(
                label="Seed",
                value=0,
                precision=0
            )

            submit_btn = gr.Button("Generate Video", variant="primary")

        with gr.Column():
            output_video = gr.Video(label="Generated Video")

    submit_btn.click(
        fn=generate_video,
        inputs=[
            prompt,
            negative_prompt,
            expected_height,
            expected_width,
            downscale_factor,
            num_frames,
            num_inference_steps,
            denoise_strength,
            seed
        ],
        outputs=output_video
    )

if __name__ == "__main__":
    demo.launch()
```
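As the comment at the top of `generate_video` notes, reloading both pipelines on every request is wasteful; for anything beyond a quick demo, load them once at module level and let the handler reuse them. Below is a minimal sketch of that change, assuming a single CUDA GPU with enough memory to keep both pipelines resident between requests (the Gradio UI wiring stays exactly as above):

```python
import torch
import gradio as gr
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline

# Load both pipelines once at import time, before building the Gradio UI.
# Assumes a single CUDA device that can keep the models resident in memory.
pipe = LTXConditionPipeline.from_pretrained(
    "Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16
)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    "Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16
)
pipe.to("cuda")
pipe_upsample.to("cuda")
pipe.vae.enable_tiling()


def generate_video(prompt, negative_prompt, expected_height, expected_width,
                   downscale_factor, num_frames, num_inference_steps,
                   denoise_strength, seed, progress=gr.Progress()):
    # Same body as the version above, minus the from_pretrained calls and the
    # "Loading models..." progress step; it reuses the module-level pipelines.
    ...
```

Loading at import time front-loads model initialization to server start-up, so each request only pays for inference rather than for rebuilding the pipelines.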