```python
import torch
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video

pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")
pipe_upsample.to("cuda")
pipe.vae.enable_tiling()

prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
expected_height, expected_width = 704, 512
downscale_factor = 2 / 3
num_frames = 121

# Part 1. Generate video at smaller resolution
downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
# Round the intermediate resolution down to a multiple of the VAE's spatial
# compression ratio (LTX requires height/width divisible by 32)
downscaled_height = downscaled_height - (downscaled_height % pipe.vae_spatial_compression_ratio)
downscaled_width = downscaled_width - (downscaled_width % pipe.vae_spatial_compression_ratio)
latents = pipe(
    conditions=None,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=downscaled_width,
    height=downscaled_height,
    num_frames=num_frames,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0),
    output_type="latent",
).frames

# Part 2. Upscale generated video using latent upsampler with fewer inference steps
# The available latent upsampler upscales the height/width by 2x
upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
upscaled_latents = pipe_upsample(
    latents=latents,
    output_type="latent"
).frames

# Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=upscaled_width,
    height=upscaled_height,
    num_frames=num_frames,
    denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
    num_inference_steps=10,
    latents=upscaled_latents,
    decode_timestep=0.05,
    image_cond_noise_scale=0.025,
    generator=torch.Generator().manual_seed(0),
    output_type="pil",
).frames[0]

# Part 4. Downscale the video to the expected resolution
video = [frame.resize((expected_width, expected_height)) for frame in video]

export_to_video(video, "output.mp4", fps=24)
```

The same multi-scale workflow can be wrapped in a small Gradio app so the prompt, resolution, and sampling parameters are adjustable from a browser:

```python
import torch
import gradio as gr
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video


def generate_video(
    prompt,
    negative_prompt,
    expected_height,
    expected_width,
    downscale_factor,
    num_frames,
    num_inference_steps,
    denoise_strength,
    seed,
    progress=gr.Progress()
):
    # Initialize pipelines (move this outside the function for production)
    progress(0.1, desc="Loading models...")
    pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
    pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
    pipe.to("cuda")
    pipe_upsample.to("cuda")
    pipe.vae.enable_tiling()

    # Part 1. Generate video at smaller resolution
    progress(0.2, desc="Generating initial video...")
    downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
    # Round down to a multiple of the VAE's spatial compression ratio
    downscaled_height = downscaled_height - (downscaled_height % pipe.vae_spatial_compression_ratio)
    downscaled_width = downscaled_width - (downscaled_width % pipe.vae_spatial_compression_ratio)
    generator = torch.Generator().manual_seed(seed)
    latents = pipe(
        conditions=None,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=downscaled_width,
        height=downscaled_height,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        generator=generator,
        output_type="latent",
    ).frames

    # Part 2. Upscale generated video
    progress(0.5, desc="Upscaling video...")
    upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
    upscaled_latents = pipe_upsample(
        latents=latents,
        output_type="latent"
    ).frames

    # Part 3. Denoise the upscaled video
    progress(0.7, desc="Refining video quality...")
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=upscaled_width,
        height=upscaled_height,
        num_frames=num_frames,
        denoise_strength=denoise_strength,
        num_inference_steps=10,
        latents=upscaled_latents,
        decode_timestep=0.05,
        image_cond_noise_scale=0.025,
        generator=generator,
        output_type="pil",
    ).frames[0]

    # Part 4. Downscale the video to the expected resolution
    progress(0.9, desc="Finalizing video...")
    video = [frame.resize((expected_width, expected_height)) for frame in video]

    # Save and return video
    output_path = "output.mp4"
    export_to_video(video, output_path, fps=24)
    return output_path


# Create Gradio interface
with gr.Blocks(title="LTX Video Generator") as demo:
    gr.Markdown("# LTX Video Generator")
    gr.Markdown("Generate videos from text prompts using Lightricks' LTX model")

    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Prompt",
                value="The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region.",
                lines=4
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                value="worst quality, inconsistent motion, blurry, jittery, distorted",
                lines=2
            )

            with gr.Row():
                expected_height = gr.Slider(
                    label="Output Height",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=704
                )
                expected_width = gr.Slider(
                    label="Output Width",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=512
                )

            with gr.Row():
                downscale_factor = gr.Slider(
                    label="Initial Downscale Factor",
                    minimum=0.3,
                    maximum=0.9,
                    step=0.05,
                    value=2/3
                )
                num_frames = gr.Slider(
                    label="Number of Frames",
                    minimum=24,
                    maximum=240,
                    step=1,
                    value=121
                )

            with gr.Row():
                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=10,
                    maximum=50,
                    step=1,
                    value=30
                )
                denoise_strength = gr.Slider(
                    label="Denoise Strength",
                    minimum=0.1,
                    maximum=0.9,
                    step=0.05,
                    value=0.4
                )

            seed = gr.Number(
                label="Seed",
                value=0,
                precision=0
            )

            submit_btn = gr.Button("Generate Video", variant="primary")

        with gr.Column():
            output_video = gr.Video(label="Generated Video")

    submit_btn.click(
        fn=generate_video,
        inputs=[
            prompt,
            negative_prompt,
            expected_height,
            expected_width,
            downscale_factor,
            num_frames,
            num_inference_steps,
            denoise_strength,
            seed
        ],
        outputs=output_video
    )

if __name__ == "__main__":
    demo.launch()
```
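As the comment at the top of `generate_video` notes, reloading both pipelines on every request is wasteful; for anything beyond a quick demo, load them once at module level and let the handler reuse them. Below is a minimal sketch of that change, assuming a single CUDA GPU with enough memory to keep both pipelines resident between requests (the Gradio UI wiring stays exactly as above):

```python
import torch
import gradio as gr
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline

# Load both pipelines once at import time, before building the Gradio UI.
# Assumes a single CUDA device that can keep the models resident in memory.
pipe = LTXConditionPipeline.from_pretrained(
    "Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16
)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    "Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16
)
pipe.to("cuda")
pipe_upsample.to("cuda")
pipe.vae.enable_tiling()


def generate_video(prompt, negative_prompt, expected_height, expected_width,
                   downscale_factor, num_frames, num_inference_steps,
                   denoise_strength, seed, progress=gr.Progress()):
    # Same body as the version above, minus the from_pretrained calls and the
    # "Loading models..." progress step; it reuses the module-level pipelines.
    ...
```

Loading at import time front-loads model initialization to server start-up, so each request only pays for inference rather than for rebuilding the pipelines.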