DEMONMO committed on
Commit
b9d776d
·
verified ·
1 Parent(s): 225c8e1

Create video_generator.py

Browse files
Files changed (1) hide show
  1. video_generator.py +240 -0
video_generator.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
3
+ from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
4
+ from diffusers.utils import export_to_video
5
+
6
+ pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
7
+ pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
8
+ pipe.to("cuda")
9
+ pipe_upsample.to("cuda")
10
+ pipe.vae.enable_tiling()
11
+
12
+ prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
13
+ negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
14
+ expected_height, expected_width = 704, 512
15
+ downscale_factor = 2 / 3
16
+ num_frames = 121
17
+
18
+ # Part 1. Generate video at smaller resolution
19
+ downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
20
+ latents = pipe(
21
+ conditions=None,
22
+ prompt=prompt,
23
+ negative_prompt=negative_prompt,
24
+ width=downscaled_width,
25
+ height=downscaled_height,
26
+ num_frames=num_frames,
27
+ num_inference_steps=30,
28
+ generator=torch.Generator().manual_seed(0),
29
+ output_type="latent",
30
+ ).frames
31
+
32
+ # Part 2. Upscale generated video using latent upsampler with fewer inference steps
33
+ # The available latent upsampler upscales the height/width by 2x
34
+ upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
35
+ upscaled_latents = pipe_upsample(
36
+ latents=latents,
37
+ output_type="latent"
38
+ ).frames
39
+
40
+ # Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
41
+ video = pipe(
42
+ prompt=prompt,
43
+ negative_prompt=negative_prompt,
44
+ width=upscaled_width,
45
+ height=upscaled_height,
46
+ num_frames=num_frames,
47
+ denoise_strength=0.4, # Effectively, 4 inference steps out of 10
48
+ num_inference_steps=10,
49
+ latents=upscaled_latents,
50
+ decode_timestep=0.05,
51
+ image_cond_noise_scale=0.025,
52
+ generator=torch.Generator().manual_seed(0),
53
+ output_type="pil",
54
+ ).frames[0]
55
+
56
+ # Part 4. Downscale the video to the expected resolution
57
+ video = [frame.resize((expected_width, expected_height)) for frame in video]
58
+
59
+ export_to_video(video, "output.mp4", fps=24)
60
+ import torch
61
+ import gradio as gr
62
+ from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
63
+ from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
64
+ from diffusers.utils import export_to_video
65
+
66
+ def generate_video(
67
+ prompt,
68
+ negative_prompt,
69
+ expected_height,
70
+ expected_width,
71
+ downscale_factor,
72
+ num_frames,
73
+ num_inference_steps,
74
+ denoise_strength,
75
+ seed,
76
+ progress=gr.Progress()
77
+ ):
78
+ # Initialize pipelines (move this outside the function for production)
79
+ progress(0.1, desc="Loading models...")
80
+ pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
81
+ pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
82
+ pipe.to("cuda")
83
+ pipe_upsample.to("cuda")
84
+ pipe.vae.enable_tiling()
85
+
86
+ # Part 1. Generate video at smaller resolution
87
+ progress(0.2, desc="Generating initial video...")
88
+ downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
89
+ generator = torch.Generator().manual_seed(seed)
90
+
91
+ latents = pipe(
92
+ conditions=None,
93
+ prompt=prompt,
94
+ negative_prompt=negative_prompt,
95
+ width=downscaled_width,
96
+ height=downscaled_height,
97
+ num_frames=num_frames,
98
+ num_inference_steps=num_inference_steps,
99
+ generator=generator,
100
+ output_type="latent",
101
+ ).frames
102
+
103
+ # Part 2. Upscale generated video
104
+ progress(0.5, desc="Upscaling video...")
105
+ upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
106
+ upscaled_latents = pipe_upsample(
107
+ latents=latents,
108
+ output_type="latent"
109
+ ).frames
110
+
111
+ # Part 3. Denoise the upscaled video
112
+ progress(0.7, desc="Refining video quality...")
113
+ video = pipe(
114
+ prompt=prompt,
115
+ negative_prompt=negative_prompt,
116
+ width=upscaled_width,
117
+ height=upscaled_height,
118
+ num_frames=num_frames,
119
+ denoise_strength=denoise_strength,
120
+ num_inference_steps=10,
121
+ latents=upscaled_latents,
122
+ decode_timestep=0.05,
123
+ image_cond_noise_scale=0.025,
124
+ generator=generator,
125
+ output_type="pil",
126
+ ).frames[0]
127
+
128
+ # Part 4. Downscale the video to the expected resolution
129
+ progress(0.9, desc="Finalizing video...")
130
+ video = [frame.resize((expected_width, expected_height)) for frame in video]
131
+
132
+ # Save and return video
133
+ output_path = "output.mp4"
134
+ export_to_video(video, output_path, fps=24)
135
+
136
+ return output_path
137
+
138
+ # Create Gradio interface
139
+ with gr.Blocks(title="LTX Video Generator") as demo:
140
+ gr.Markdown("# LTX Video Generator")
141
+ gr.Markdown("Generate videos from text prompts using Lightricks' LTX model")
142
+
143
+ with gr.Row():
144
+ with gr.Column():
145
+ prompt = gr.Textbox(
146
+ label="Prompt",
147
+ value="The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region.",
148
+ lines=4
149
+ )
150
+ negative_prompt = gr.Textbox(
151
+ label="Negative Prompt",
152
+ value="worst quality, inconsistent motion, blurry, jittery, distorted",
153
+ lines=2
154
+ )
155
+
156
+ with gr.Row():
157
+ expected_height = gr.Slider(
158
+ label="Output Height",
159
+ minimum=256,
160
+ maximum=1024,
161
+ step=64,
162
+ value=704
163
+ )
164
+ expected_width = gr.Slider(
165
+ label="Output Width",
166
+ minimum=256,
167
+ maximum=1024,
168
+ step=64,
169
+ value=512
170
+ )
171
+
172
+ with gr.Row():
173
+ downscale_factor = gr.Slider(
174
+ label="Initial Downscale Factor",
175
+ minimum=0.3,
176
+ maximum=0.9,
177
+ step=0.05,
178
+ value=2/3
179
+ )
180
+ num_frames = gr.Slider(
181
+ label="Number of Frames",
182
+ minimum=24,
183
+ maximum=240,
184
+ step=1,
185
+ value=121
186
+ )
187
+
188
+ with gr.Row():
189
+ num_inference_steps = gr.Slider(
190
+ label="Inference Steps",
191
+ minimum=10,
192
+ maximum=50,
193
+ step=1,
194
+ value=30
195
+ )
196
+ denoise_strength = gr.Slider(
197
+ label="Denoise Strength",
198
+ minimum=0.1,
199
+ maximum=0.9,
200
+ step=0.05,
201
+ value=0.4
202
+ )
203
+ seed = gr.Number(
204
+ label="Seed",
205
+ value=0,
206
+ precision=0
207
+ )
208
+
209
+ submit_btn = gr.Button("Generate Video", variant="primary")
210
+
211
+ with gr.Column():
212
+ output_video = gr.Video(label="Generated Video")
213
+
214
+ submit_btn.click(
215
+ fn=generate_video,
216
+ inputs=[
217
+ prompt,
218
+ negative_prompt,
219
+ expected_height,
220
+ expected_width,
221
+ downscale_factor,
222
+ num_frames,
223
+ num_inference_steps,
224
+ denoise_strength,
225
+ seed
226
+ ],
227
+ outputs=output_video
228
+ )
229
+
230
+ if __name__ == "__main__":
231
+ demo.launch()
232
+
233
+
234
+
235
+
236
+
237
+
238
+
239
+
240
+