Files changed (1)
  1. app.py +371 -474
app.py CHANGED
@@ -5,7 +5,6 @@ import logging
 import os
 from pathlib import Path
 from datetime import datetime
-import re
 
 import torch
 import numpy as np
@@ -16,42 +15,19 @@ from diffusers import AutoModel
 import gradio as gr
 import tempfile
 from huggingface_hub import hf_hub_download
-import traceback
-
-# Patch for scaled_dot_product_attention to fix enable_gqa issue
-import torch.nn.functional as F
-
-original_sdpa = F.scaled_dot_product_attention
-
-def patched_scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None, enable_gqa=None):
-    # Ignore the enable_gqa parameter and pass through only the rest
-    kwargs = {}
-    if attn_mask is not None:
-        kwargs['attn_mask'] = attn_mask
-    if dropout_p != 0.0:
-        kwargs['dropout_p'] = dropout_p
-    if is_causal:
-        kwargs['is_causal'] = is_causal
-    if scale is not None:
-        kwargs['scale'] = scale
-
-    return original_sdpa(query, key, value, **kwargs)
-
-# Apply the patch
-F.scaled_dot_product_attention = patched_scaled_dot_product_attention
 
 from src.pipeline_wan_nag import NAGWanPipeline
 from src.transformer_wan_nag import NagWanTransformer3DModel
 
 # MMAudio imports
 try:
-    import mmaudio
+    import mmaudio
 except ImportError:
-    os.system("pip install -e .")
-    import mmaudio
+    os.system("pip install -e .")
+    import mmaudio
 
 from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate as mmaudio_generate,
-                                load_video, make_video, setup_eval_logging)
+                                load_video, make_video, setup_eval_logging)
 from mmaudio.model.flow_matching import FlowMatching
 from mmaudio.model.networks import MMAudio, get_my_mmaudio
 from mmaudio.model.sequence_config import SequenceConfig
@@ -75,7 +51,7 @@ MIN_FRAMES_MODEL = 8
 MAX_FRAMES_MODEL = 129
 
 DEFAULT_NAG_NEGATIVE_PROMPT = "Static, motionless, still, ugly, bad quality, worst quality, poorly drawn, low resolution, blurry, lack of details"
-DEFAULT_AUDIO_NEGATIVE_PROMPT = "music, speech, voice, singing, narration"
+DEFAULT_AUDIO_NEGATIVE_PROMPT = "music"
 
 # NAG Model Settings
 MODEL_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
@@ -96,500 +72,421 @@ setup_eval_logging()
 
 # Initialize NAG Video Model
 try:
-    vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
-    wan_path = hf_hub_download(repo_id=SUB_MODEL_ID, filename=SUB_MODEL_FILENAME)
-    transformer = NagWanTransformer3DModel.from_single_file(wan_path, torch_dtype=torch.bfloat16)
-    pipe = NAGWanPipeline.from_pretrained(
-        MODEL_ID, vae=vae, transformer=transformer, torch_dtype=torch.bfloat16
-    )
-    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
-    pipe.to("cuda")
-
-    pipe.transformer.__class__.attn_processors = NagWanTransformer3DModel.attn_processors
-    pipe.transformer.__class__.set_attn_processor = NagWanTransformer3DModel.set_attn_processor
-    pipe.transformer.__class__.forward = NagWanTransformer3DModel.forward
-    print("NAG Video Model loaded successfully!")
+    vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
+    wan_path = hf_hub_download(repo_id=SUB_MODEL_ID, filename=SUB_MODEL_FILENAME)
+    transformer = NagWanTransformer3DModel.from_single_file(wan_path, torch_dtype=torch.bfloat16)
+    pipe = NAGWanPipeline.from_pretrained(
+        MODEL_ID, vae=vae, transformer=transformer, torch_dtype=torch.bfloat16
+    )
+    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
+    pipe.to("cuda")
+
+    pipe.transformer.__class__.attn_processors = NagWanTransformer3DModel.attn_processors
+    pipe.transformer.__class__.set_attn_processor = NagWanTransformer3DModel.set_attn_processor
+    pipe.transformer.__class__.forward = NagWanTransformer3DModel.forward
+    print("NAG Video Model loaded successfully!")
 except Exception as e:
-    print(f"Error loading NAG Video Model: {e}")
-    pipe = None
+    print(f"Error loading NAG Video Model: {e}")
+    pipe = None
 
 # Initialize MMAudio Model
 def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
-    seq_cfg = audio_model_config.seq_cfg
-
-    net: MMAudio = get_my_mmaudio(audio_model_config.model_name).to(device, dtype).eval()
-    net.load_weights(torch.load(audio_model_config.model_path, map_location=device, weights_only=True))
-    log.info(f'Loaded MMAudio weights from {audio_model_config.model_path}')
-
-    feature_utils = FeaturesUtils(tod_vae_ckpt=audio_model_config.vae_path,
-                                  synchformer_ckpt=audio_model_config.synchformer_ckpt,
-                                  enable_conditions=True,
-                                  mode=audio_model_config.mode,
-                                  bigvgan_vocoder_ckpt=audio_model_config.bigvgan_16k_path,
-                                  need_vae_encoder=False)
-    feature_utils = feature_utils.to(device, dtype).eval()
-
-    return net, feature_utils, seq_cfg
+    seq_cfg = audio_model_config.seq_cfg
+
+    net: MMAudio = get_my_mmaudio(audio_model_config.model_name).to(device, dtype).eval()
+    net.load_weights(torch.load(audio_model_config.model_path, map_location=device, weights_only=True))
+    log.info(f'Loaded MMAudio weights from {audio_model_config.model_path}')
+
+    feature_utils = FeaturesUtils(tod_vae_ckpt=audio_model_config.vae_path,
+                                  synchformer_ckpt=audio_model_config.synchformer_ckpt,
+                                  enable_conditions=True,
+                                  mode=audio_model_config.mode,
+                                  bigvgan_vocoder_ckpt=audio_model_config.bigvgan_16k_path,
+                                  need_vae_encoder=False)
+    feature_utils = feature_utils.to(device, dtype).eval()
+
+    return net, feature_utils, seq_cfg
 
 try:
-    audio_net, audio_feature_utils, audio_seq_cfg = get_mmaudio_model()
-    print("MMAudio Model loaded successfully!")
+    audio_net, audio_feature_utils, audio_seq_cfg = get_mmaudio_model()
+    print("MMAudio Model loaded successfully!")
 except Exception as e:
-    print(f"Error loading MMAudio Model: {e}")
-    audio_net = None
-
-# Function that converts the video prompt into an audio prompt
-def extract_audio_description(video_prompt):
-    """Extract/convert audio-related descriptions from the video prompt"""
-
-    # Keyword mapping
-    audio_keywords = {
-        'car': 'car engine sound, vehicle noise',
-        'porsche': 'sports car engine roar, exhaust sound',
-        'guitar': 'electric guitar playing, guitar music',
-        'concert': 'crowd cheering, live music, applause',
-        'motorcycle': 'motorcycle engine sound, motor rumble',
-        'highway': 'traffic noise, road ambience',
-        'rain': 'rain sounds, water drops',
-        'wind': 'wind blowing sound',
-        'ocean': 'ocean waves, water sounds',
-        'city': 'urban ambience, city traffic sounds',
-        'singer': 'singing voice, vocals',
-        'crowd': 'crowd noise, people talking',
-        'flames': 'fire crackling sound',
-        'pyro': 'fire whoosh, flame burst sound',
-        'explosion': 'explosion sound, blast',
-        'countryside': 'nature ambience, birds chirping',
-        'wheat fields': 'wind through grass, rural ambience',
-        'engine': 'motor sound, mechanical noise',
-        'flat-six engine': 'sports car engine sound',
-        'roaring': 'loud engine roar',
-        'thunderous': 'loud booming sound',
-        'child': 'children playing sounds',
-        'running': 'footsteps sound',
-        'woman': 'ambient sounds',
-        'phone': 'subtle electronic ambience',
-        'advertisement': 'modern ambient sounds'
-    }
-
-    # Simple keyword-based conversion
-    audio_descriptions = []
-    lower_prompt = video_prompt.lower()
-
-    for key, value in audio_keywords.items():
-        if key in lower_prompt:
-            audio_descriptions.append(value)
-
-    # Set a default
-    if not audio_descriptions:
-        # Check whether the prompt contains an explicit audio description
-        if 'sound' in lower_prompt or 'audio' in lower_prompt or 'noise' in lower_prompt:
-            # Extract only the audio-related parts of the prompt
-            audio_pattern = r'([^.]*(?:sound|audio|noise|music|voice|roar|rumble)[^.]*)'
-            matches = re.findall(audio_pattern, lower_prompt, re.IGNORECASE)
-            if matches:
-                return ', '.join(matches)
-
-        # Fall back to generic ambient sound
-        return "ambient environmental sounds matching the scene"
-
-    return ', '.join(audio_descriptions)
+    print(f"Error loading MMAudio Model: {e}")
+    audio_net = None
 
 # Audio generation function
 @torch.inference_mode()
-def add_audio_to_video(video_path, prompt, audio_custom_prompt, audio_negative_prompt, audio_steps, audio_cfg_strength, duration):
-    """Generate and add audio to video using MMAudio"""
-    if audio_net is None:
-        print("MMAudio model not loaded, returning video without audio")
-        return video_path
-
-    try:
-        # Use the custom audio prompt if provided; otherwise derive one from the video prompt
-        if audio_custom_prompt and audio_custom_prompt.strip():
-            audio_prompt = audio_custom_prompt.strip()
-        else:
-            audio_prompt = extract_audio_description(prompt)
-
-        print(f"Original prompt: {prompt}")
-        print(f"Audio prompt: {audio_prompt}")
-
-        rng = torch.Generator(device=device)
-        rng.manual_seed(random.randint(0, 2**32 - 1))  # Explicitly randomized seed
-        fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=audio_steps)
-
-        video_info = load_video(video_path, duration)
-        clip_frames = video_info.clip_frames
-        sync_frames = video_info.sync_frames
-        duration = video_info.duration_sec
-        clip_frames = clip_frames.unsqueeze(0)
-        sync_frames = sync_frames.unsqueeze(0)
-        audio_seq_cfg.duration = duration
-        audio_net.update_seq_lengths(audio_seq_cfg.latent_seq_len, audio_seq_cfg.clip_seq_len, audio_seq_cfg.sync_seq_len)
-
-        # Enhanced negative prompt
-        enhanced_negative = f"{audio_negative_prompt}, distortion, static noise, silence, random beeps"
-
-        audios = mmaudio_generate(clip_frames,
-                                  sync_frames, [audio_prompt],  # Use the converted audio prompt
-                                  negative_text=[enhanced_negative],
-                                  feature_utils=audio_feature_utils,
-                                  net=audio_net,
-                                  fm=fm,
-                                  rng=rng,
-                                  cfg_strength=audio_cfg_strength)
-        audio = audios.float().cpu()[0]
-
-        # Create video with audio
-        video_with_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
-        make_video(video_info, video_with_audio_path, audio, sampling_rate=audio_seq_cfg.sampling_rate)
-
-        return video_with_audio_path
-    except Exception as e:
-        print(f"Error in audio generation: {e}")
-        traceback.print_exc()
-        return video_path
+def add_audio_to_video(video_path, prompt, audio_negative_prompt, audio_steps, audio_cfg_strength, duration):
+    """Generate and add audio to video using MMAudio"""
+    if audio_net is None:
+        print("MMAudio model not loaded, returning video without audio")
+        return video_path
+
+    try:
+        rng = torch.Generator(device=device)
+        rng.seed()  # Random seed for audio
+        fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=audio_steps)
+
+        video_info = load_video(video_path, duration)
+        clip_frames = video_info.clip_frames
+        sync_frames = video_info.sync_frames
+        duration = video_info.duration_sec
+        clip_frames = clip_frames.unsqueeze(0)
+        sync_frames = sync_frames.unsqueeze(0)
+        audio_seq_cfg.duration = duration
+        audio_net.update_seq_lengths(audio_seq_cfg.latent_seq_len, audio_seq_cfg.clip_seq_len, audio_seq_cfg.sync_seq_len)
+
+        audios = mmaudio_generate(clip_frames,
+                                  sync_frames, [prompt],
+                                  negative_text=[audio_negative_prompt],
+                                  feature_utils=audio_feature_utils,
+                                  net=audio_net,
+                                  fm=fm,
+                                  rng=rng,
+                                  cfg_strength=audio_cfg_strength)
+        audio = audios.float().cpu()[0]
+
+        # Create video with audio
+        video_with_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
+        make_video(video_info, video_with_audio_path, audio, sampling_rate=audio_seq_cfg.sampling_rate)
+
+        return video_with_audio_path
+    except Exception as e:
+        print(f"Error in audio generation: {e}")
+        return video_path
 
 # Combined generation function
 def get_duration(prompt, nag_negative_prompt, nag_scale, height, width, duration_seconds,
-                 steps, seed, randomize_seed, enable_audio, audio_custom_prompt,
-                 audio_negative_prompt, audio_steps, audio_cfg_strength):
-    # Calculate total duration including audio processing if enabled
-    video_duration = int(duration_seconds) * int(steps) * 2.25 + 5
-    audio_duration = 30 if enable_audio else 0  # Additional time for audio processing
-    return video_duration + audio_duration
+                 steps, seed, randomize_seed, enable_audio, audio_negative_prompt,
+                 audio_steps, audio_cfg_strength):
+    # Calculate total duration including audio processing if enabled
+    video_duration = int(duration_seconds) * int(steps) * 2.25 + 5
+    audio_duration = 30 if enable_audio else 0  # Additional time for audio processing
+    return video_duration + audio_duration
 
 @spaces.GPU(duration=get_duration)
 def generate_video_with_audio(
-    prompt,
-    nag_negative_prompt, nag_scale,
-    height=DEFAULT_H_SLIDER_VALUE, width=DEFAULT_W_SLIDER_VALUE, duration_seconds=DEFAULT_DURATION_SECONDS,
-    steps=DEFAULT_STEPS,
-    seed=DEFAULT_SEED, randomize_seed=False,
-    enable_audio=True, audio_custom_prompt="",
-    audio_negative_prompt=DEFAULT_AUDIO_NEGATIVE_PROMPT,
-    audio_steps=30, audio_cfg_strength=4.5,
+    prompt,
+    nag_negative_prompt, nag_scale,
+    height=DEFAULT_H_SLIDER_VALUE, width=DEFAULT_W_SLIDER_VALUE, duration_seconds=DEFAULT_DURATION_SECONDS,
+    steps=DEFAULT_STEPS,
+    seed=DEFAULT_SEED, randomize_seed=False,
+    enable_audio=True, audio_negative_prompt=DEFAULT_AUDIO_NEGATIVE_PROMPT,
+    audio_steps=25, audio_cfg_strength=4.5,
 ):
-    if pipe is None:
-        return None, DEFAULT_SEED
-
-    try:
-        # Generate video first
-        target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
-        target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
-
-        num_frames = np.clip(int(round(int(duration_seconds) * FIXED_FPS) + 1), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
-
-        current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
-
-        print(f"Generating video with: prompt='{prompt}', resolution={target_w}x{target_h}, frames={num_frames}")
-
-        with torch.inference_mode():
-            nag_output_frames_list = pipe(
-                prompt=prompt,
-                nag_negative_prompt=nag_negative_prompt,
-                nag_scale=nag_scale,
-                nag_tau=3.5,
-                nag_alpha=0.5,
-                height=target_h, width=target_w, num_frames=num_frames,
-                guidance_scale=0.,
-                num_inference_steps=int(steps),
-                generator=torch.Generator(device="cuda").manual_seed(current_seed)
-            ).frames[0]
-
-        # Save initial video without audio
-        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
-            temp_video_path = tmpfile.name
-        export_to_video(nag_output_frames_list, temp_video_path, fps=FIXED_FPS)
-        print(f"Video saved to: {temp_video_path}")
-
-        # Add audio if enabled
-        if enable_audio:
-            try:
-                print("Adding audio to video...")
-                final_video_path = add_audio_to_video(
-                    temp_video_path,
-                    prompt,
-                    audio_custom_prompt,
-                    audio_negative_prompt,
-                    audio_steps,
-                    audio_cfg_strength,
-                    duration_seconds
-                )
-                # Clean up temp video
-                if os.path.exists(temp_video_path) and final_video_path != temp_video_path:
-                    os.remove(temp_video_path)
-                print(f"Final video with audio: {final_video_path}")
-            except Exception as e:
-                log.error(f"Audio generation failed: {e}")
-                final_video_path = temp_video_path
-        else:
-            final_video_path = temp_video_path
-
-        return final_video_path, current_seed
-    except Exception as e:
-        print(f"Error in video generation: {e}")
-        return None, current_seed
+    if pipe is None:
+        return None, DEFAULT_SEED
+
+    try:
+        # Generate video first
+        target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
+        target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
+
+        num_frames = np.clip(int(round(int(duration_seconds) * FIXED_FPS) + 1), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
+
+        current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
+
+        print(f"Generating video with: prompt='{prompt}', resolution={target_w}x{target_h}, frames={num_frames}")
+
+        with torch.inference_mode():
+            nag_output_frames_list = pipe(
+                prompt=prompt,
+                nag_negative_prompt=nag_negative_prompt,
+                nag_scale=nag_scale,
+                nag_tau=3.5,
+                nag_alpha=0.5,
+                height=target_h, width=target_w, num_frames=num_frames,
+                guidance_scale=0.,
+                num_inference_steps=int(steps),
+                generator=torch.Generator(device="cuda").manual_seed(current_seed)
+            ).frames[0]
+
+        # Save initial video without audio
+        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
+            temp_video_path = tmpfile.name
+        export_to_video(nag_output_frames_list, temp_video_path, fps=FIXED_FPS)
+        print(f"Video saved to: {temp_video_path}")
+
+        # Add audio if enabled
+        if enable_audio:
+            try:
+                print("Adding audio to video...")
+                final_video_path = add_audio_to_video(
+                    temp_video_path,
+                    prompt,  # Use the same prompt for audio generation
+                    audio_negative_prompt,
+                    audio_steps,
+                    audio_cfg_strength,
+                    duration_seconds
+                )
+                # Clean up temp video
+                if os.path.exists(temp_video_path) and final_video_path != temp_video_path:
+                    os.remove(temp_video_path)
+                print(f"Final video with audio: {final_video_path}")
+            except Exception as e:
+                log.error(f"Audio generation failed: {e}")
+                final_video_path = temp_video_path
+        else:
+            final_video_path = temp_video_path
+
+        return final_video_path, current_seed
+    except Exception as e:
+        print(f"Error in video generation: {e}")
+        return None, current_seed
 
 # Example generation function - simplified
 def set_example(prompt, nag_negative_prompt, nag_scale):
-    """Set example values in the UI without triggering generation"""
-    return (
-        prompt,
-        nag_negative_prompt,
-        nag_scale,
-        DEFAULT_H_SLIDER_VALUE,
-        DEFAULT_W_SLIDER_VALUE,
-        DEFAULT_DURATION_SECONDS,
-        DEFAULT_STEPS,
-        DEFAULT_SEED,
-        True,  # randomize_seed
-        True,  # enable_audio
-        "",  # audio_custom_prompt
-        DEFAULT_AUDIO_NEGATIVE_PROMPT,
-        30,  # audio_steps
-        4.5  # audio_cfg_strength
-    )
+    """Set example values in the UI without triggering generation"""
+    return (
+        prompt,
+        nag_negative_prompt,
+        nag_scale,
+        DEFAULT_H_SLIDER_VALUE,
+        DEFAULT_W_SLIDER_VALUE,
+        DEFAULT_DURATION_SECONDS,
+        DEFAULT_STEPS,
+        DEFAULT_SEED,
+        True,  # randomize_seed
+        True,  # enable_audio
+        DEFAULT_AUDIO_NEGATIVE_PROMPT,
+        25,  # audio_steps
+        4.5  # audio_cfg_strength
+    )
 
 # Examples with audio descriptions
 examples = [
-    ["Midnight highway outside a neon-lit city. A black 1973 Porsche 911 Carrera RS speeds at 120 km/h. Inside, a stylish singer-guitarist sings while driving, vintage sunburst guitar on the passenger seat. Sodium streetlights streak over the hood; RGB panels shift magenta to blue on the driver. Camera: drone dive, Russian-arm low wheel shot, interior gimbal, FPV barrel roll, overhead spiral. Neo-noir palette, rain-slick asphalt reflections, roaring flat-six engine blended with live guitar.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
-    ["Arena rock concert packed with 20 000 fans. A flamboyant lead guitarist in leather jacket and mirrored aviators shreds a cherry-red Flying V on a thrust stage. Pyro flames shoot up on every downbeat, CO₂ jets burst behind. Moving-head spotlights swirl teal and amber, follow-spots rim-light the guitarist's hair. Steadicam 360-orbit, crane shot rising over crowd, ultra-slow-motion pick attack at 1 000 fps. Film-grain teal-orange grade, thunderous crowd roar mixes with screaming guitar solo.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
-    ["Golden-hour countryside road winding through rolling wheat fields. A man and woman ride a vintage café-racer motorcycle, hair and scarf fluttering in the warm breeze. Drone chase shot reveals endless patchwork farmland; low slider along rear wheel captures dust trail. Sun-flare back-lights the riders, lens blooms on highlights. Soft acoustic rock underscore; engine rumble mixed at –8 dB. Warm pastel color grade, gentle film-grain for nostalgic vibe.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
+    ["Midnight highway outside a neon-lit city. A black 1973 Porsche 911 Carrera RS speeds at 120 km/h. Inside, a stylish singer-guitarist sings while driving, vintage sunburst guitar on the passenger seat. Sodium streetlights streak over the hood; RGB panels shift magenta to blue on the driver. Camera: drone dive, Russian-arm low wheel shot, interior gimbal, FPV barrel roll, overhead spiral. Neo-noir palette, rain-slick asphalt reflections, roaring flat-six engine blended with live guitar.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
+    ["Arena rock concert packed with 20 000 fans. A flamboyant lead guitarist in leather jacket and mirrored aviators shreds a cherry-red Flying V on a thrust stage. Pyro flames shoot up on every downbeat, CO₂ jets burst behind. Moving-head spotlights swirl teal and amber, follow-spots rim-light the guitarist's hair. Steadicam 360-orbit, crane shot rising over crowd, ultra-slow-motion pick attack at 1 000 fps. Film-grain teal-orange grade, thunderous crowd roar mixes with screaming guitar solo.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
+    ["Golden-hour countryside road winding through rolling wheat fields. A man and woman ride a vintage café-racer motorcycle, hair and scarf fluttering in the warm breeze. Drone chase shot reveals endless patchwork farmland; low slider along rear wheel captures dust trail. Sun-flare back-lights the riders, lens blooms on highlights. Soft acoustic rock underscore; engine rumble mixed at –8 dB. Warm pastel color grade, gentle film-grain for nostalgic vibe.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
]
 
 # CSS styling - Fixed for better layout
 css = """
 /* Right column - video output */
 .video-output {
-    border-radius: 15px;
-    overflow: hidden;
-    box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
-    width: 100% !important;
-    height: auto !important;
-    min-height: 400px;
+    border-radius: 15px;
+    overflow: hidden;
+    box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
+    width: 100% !important;
+    height: auto !important;
+    min-height: 400px;
 }
 
 /* Ensure video container is responsive */
 .video-output video {
-    width: 100% !important;
-    height: auto !important;
-    max-height: 600px;
-    object-fit: contain;
-    display: block;
+    width: 100% !important;
+    height: auto !important;
+    max-height: 600px;
+    object-fit: contain;
+    display: block;
 }
 
 /* Remove any overlay or background from video container */
 .video-output > div {
-    background: transparent !important;
-    padding: 0 !important;
+    background: transparent !important;
+    padding: 0 !important;
 }
 
 /* Remove gradio's default video player overlay */
 .video-output .wrap {
-    background: transparent !important;
+    background: transparent !important;
 }
 
 /* Ensure no gray overlay on video controls */
 .video-output video::-webkit-media-controls-enclosure {
-    background: transparent;
+    background: transparent;
 }
 """
 
 # Gradio interface - Fixed structure
 with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
-    gr.HTML("""
-    <div class="container">
-        <h1 class="main-title">🎬 VEO3 Free</h1>
-        <p class="subtitle">Wan2.1-T2V-14B + Fast 4-step with NAG + Automatic Audio Generation</p>
-    </div>
-    """)
-
-    gr.HTML("""
-    <div class='container' style='display:flex; justify-content:center; gap:12px; margin-bottom: 20px;'>
-        <a href="https://huggingface.co/spaces/openfree/Best-AI" target="_blank">
-            <img src="https://img.shields.io/static/v1?label=OpenFree&message=BEST%20AI%20Services&color=%230000ff&labelColor=%23000080&logo=huggingface&logoColor=%23ffa500&style=for-the-badge" alt="OpenFree badge">
-        </a>
-
-        <a href="https://discord.gg/openfreeai" target="_blank">
-            <img src="https://img.shields.io/static/v1?label=Discord&message=Openfree%20AI&color=%230000ff&labelColor=%23800080&logo=discord&logoColor=white&style=for-the-badge" alt="Discord badge">
-        </a>
-    </div>
-    """)
-
-    with gr.Row(equal_height=True):
-        with gr.Column(scale=5):
-            with gr.Group(elem_classes="prompt-container"):
-                prompt = gr.Textbox(
-                    label="✨ Video Prompt (also used for audio generation)",
-                    placeholder="Describe your video scene in detail...",
-                    lines=3,
-                    elem_classes="prompt-input"
-                )
-
-                with gr.Accordion("🎨 Advanced Video Settings", open=False):
-                    nag_negative_prompt = gr.Textbox(
-                        label="Video Negative Prompt",
-                        value=DEFAULT_NAG_NEGATIVE_PROMPT,
-                        lines=2,
-                    )
-                    nag_scale = gr.Slider(
-                        label="NAG Scale",
-                        minimum=1.0,
-                        maximum=20.0,
-                        step=0.25,
-                        value=11.0,
-                        info="Higher values = stronger guidance"
-                    )
-
-            with gr.Group(elem_classes="settings-panel"):
-                gr.Markdown("### ⚙️ Video Settings")
-
-                with gr.Row():
-                    duration_seconds_input = gr.Slider(
-                        minimum=1,
-                        maximum=8,
-                        step=1,
-                        value=DEFAULT_DURATION_SECONDS,
-                        label="📱 Duration (seconds)",
-                        elem_classes="slider-container"
-                    )
-                    steps_slider = gr.Slider(
-                        minimum=1,
-                        maximum=8,
-                        step=1,
-                        value=DEFAULT_STEPS,
-                        label="🔄 Inference Steps",
-                        elem_classes="slider-container"
-                    )
-
-                with gr.Row():
-                    height_input = gr.Slider(
-                        minimum=SLIDER_MIN_H,
-                        maximum=SLIDER_MAX_H,
-                        step=MOD_VALUE,
-                        value=DEFAULT_H_SLIDER_VALUE,
-                        label=f"📏 Height (×{MOD_VALUE})",
-                        elem_classes="slider-container"
-                    )
-                    width_input = gr.Slider(
-                        minimum=SLIDER_MIN_W,
-                        maximum=SLIDER_MAX_W,
-                        step=MOD_VALUE,
-                        value=DEFAULT_W_SLIDER_VALUE,
-                        label=f"📏 Width (×{MOD_VALUE})",
-                        elem_classes="slider-container"
-                    )
-
-                with gr.Row():
-                    seed_input = gr.Slider(
-                        label="🌱 Seed",
-                        minimum=0,
-                        maximum=MAX_SEED,
-                        step=1,
-                        value=DEFAULT_SEED,
-                        interactive=True
-                    )
-                    randomize_seed_checkbox = gr.Checkbox(
-                        label="🎲 Random Seed",
-                        value=True,
-                        interactive=True
-                    )
-
-            with gr.Group(elem_classes="audio-settings"):
-                gr.Markdown("### 🎵 Audio Generation Settings")
-
-                enable_audio = gr.Checkbox(
-                    label="🔊 Enable Automatic Audio Generation",
-                    value=True,
-                    interactive=True
-                )
-
-                with gr.Column(visible=True) as audio_settings_group:
-                    audio_custom_prompt = gr.Textbox(
-                        label="Custom Audio Prompt (Optional)",
-                        placeholder="Leave empty to auto-generate from video prompt, or specify custom audio description (e.g., 'car engine sound, traffic noise')",
-                        value="",
-                    )
-                    audio_negative_prompt = gr.Textbox(
-                        label="Audio Negative Prompt",
-                        value=DEFAULT_AUDIO_NEGATIVE_PROMPT,
-                        placeholder="Elements to avoid in audio",
-                    )
-
-                    with gr.Row():
-                        audio_steps = gr.Slider(
-                            minimum=10,
-                            maximum=50,
-                            step=5,
-                            value=30,
-                            label="🎚️ Audio Steps",
-                            info="More steps = better quality"
-                        )
-                        audio_cfg_strength = gr.Slider(
-                            minimum=1.0,
-                            maximum=10.0,
-                            step=0.5,
-                            value=4.5,
-                            label="🎛️ Audio Guidance",
-                            info="Strength of prompt guidance"
-                        )
-
-                # Toggle audio settings visibility
-                enable_audio.change(
-                    fn=lambda x: gr.update(visible=x),
-                    inputs=[enable_audio],
-                    outputs=[audio_settings_group]
-                )
-
-            generate_button = gr.Button(
-                "🎬 Generate Video with Audio",
-                variant="primary",
-                elem_classes="generate-btn"
-            )
-
-        with gr.Column(scale=5):
-            video_output = gr.Video(
-                label="Generated Video with Audio",
-                autoplay=True,
-                interactive=False,
-                elem_classes="video-output",
-                height=600
-            )
-
-            gr.HTML("""
-            <div style="text-align: center; margin-top: 20px; color: #6b7280;">
-                <p>💡 Tip: For better audio, use Custom Audio Prompt with sound descriptions!</p>
-                <p>🎧 Examples: "car engine sound", "crowd cheering", "nature ambience"</p>
-            </div>
-            """)
-
-    # Examples section moved outside of columns
-    with gr.Row():
-        gr.Markdown("### 🎯 Example Prompts")
-
-    gr.Examples(
-        examples=examples,
-        inputs=[prompt, nag_negative_prompt, nag_scale],
-        outputs=None,  # Don't connect outputs to avoid index issues
-        cache_examples=False
-    )
-
-    # Connect UI elements
-    ui_inputs = [
-        prompt,
-        nag_negative_prompt, nag_scale,
-        height_input, width_input, duration_seconds_input,
-        steps_slider,
-        seed_input, randomize_seed_checkbox,
-        enable_audio, audio_custom_prompt, audio_negative_prompt,
-        audio_steps, audio_cfg_strength,
-    ]
-
-    generate_button.click(
-        fn=generate_video_with_audio,
-        inputs=ui_inputs,
-        outputs=[video_output, seed_input],
-    )
+    gr.HTML("""
+    <div class="container">
+        <h1 class="main-title">🎬 VEO3 Free</h1>
+        <p class="subtitle">Wan2.1-T2V-14B + Fast 4-step with NAG + Automatic Audio Generation</p>
+    </div>
+    """)
+
+    gr.HTML("""
+    <div class='container' style='display:flex; justify-content:center; gap:12px; margin-bottom: 20px;'>
+        <a href="https://huggingface.co/spaces/openfree/Best-AI" target="_blank">
+            <img src="https://img.shields.io/static/v1?label=OpenFree&message=BEST%20AI%20Services&color=%230000ff&labelColor=%23000080&logo=huggingface&logoColor=%23ffa500&style=for-the-badge" alt="OpenFree badge">
+        </a>
+
+        <a href="https://discord.gg/openfreeai" target="_blank">
+            <img src="https://img.shields.io/static/v1?label=Discord&message=Openfree%20AI&color=%230000ff&labelColor=%23800080&logo=discord&logoColor=white&style=for-the-badge" alt="Discord badge">
+        </a>
+    </div>
+    """)
+
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=5):
+            with gr.Group(elem_classes="prompt-container"):
+                prompt = gr.Textbox(
+                    label="✨ Video Prompt (also used for audio generation)",
+                    placeholder="Describe your video scene in detail...",
+                    lines=3,
+                    elem_classes="prompt-input"
+                )
+
+                with gr.Accordion("🎨 Advanced Video Settings", open=False):
+                    nag_negative_prompt = gr.Textbox(
+                        label="Video Negative Prompt",
+                        value=DEFAULT_NAG_NEGATIVE_PROMPT,
+                        lines=2,
+                    )
+                    nag_scale = gr.Slider(
+                        label="NAG Scale",
+                        minimum=1.0,
+                        maximum=20.0,
+                        step=0.25,
+                        value=11.0,
+                        info="Higher values = stronger guidance"
+                    )
+
+            with gr.Group(elem_classes="settings-panel"):
+                gr.Markdown("### ⚙️ Video Settings")
+
+                with gr.Row():
+                    duration_seconds_input = gr.Slider(
+                        minimum=1,
+                        maximum=8,
+                        step=1,
+                        value=DEFAULT_DURATION_SECONDS,
+                        label="📱 Duration (seconds)",
+                        elem_classes="slider-container"
+                    )
+                    steps_slider = gr.Slider(
+                        minimum=1,
+                        maximum=8,
+                        step=1,
+                        value=DEFAULT_STEPS,
+                        label="🔄 Inference Steps",
+                        elem_classes="slider-container"
+                    )
+
+                with gr.Row():
+                    height_input = gr.Slider(
+                        minimum=SLIDER_MIN_H,
+                        maximum=SLIDER_MAX_H,
+                        step=MOD_VALUE,
+                        value=DEFAULT_H_SLIDER_VALUE,
+                        label=f"📏 Height (×{MOD_VALUE})",
+                        elem_classes="slider-container"
+                    )
+                    width_input = gr.Slider(
+                        minimum=SLIDER_MIN_W,
+                        maximum=SLIDER_MAX_W,
+                        step=MOD_VALUE,
+                        value=DEFAULT_W_SLIDER_VALUE,
+                        label=f"📏 Width (×{MOD_VALUE})",
+                        elem_classes="slider-container"
+                    )
+
+                with gr.Row():
+                    seed_input = gr.Slider(
+                        label="🌱 Seed",
+                        minimum=0,
+                        maximum=MAX_SEED,
+                        step=1,
+                        value=DEFAULT_SEED,
+                        interactive=True
+                    )
+                    randomize_seed_checkbox = gr.Checkbox(
+                        label="🎲 Random Seed",
+                        value=True,
+                        interactive=True
+                    )
+
+            with gr.Group(elem_classes="audio-settings"):
+                gr.Markdown("### 🎵 Audio Generation Settings")
+
+                enable_audio = gr.Checkbox(
+                    label="🔊 Enable Automatic Audio Generation",
+                    value=True,
+                    interactive=True
+                )
+
+                with gr.Column(visible=True) as audio_settings_group:
+                    audio_negative_prompt = gr.Textbox(
+                        label="Audio Negative Prompt",
+                        value=DEFAULT_AUDIO_NEGATIVE_PROMPT,
+                        placeholder="Elements to avoid in audio (e.g., music, speech)",
+                    )
+
+                    with gr.Row():
+                        audio_steps = gr.Slider(
+                            minimum=10,
+                            maximum=50,
+                            step=5,
+                            value=25,
+                            label="🎚️ Audio Steps",
+                            info="More steps = better quality"
+                        )
+                        audio_cfg_strength = gr.Slider(
+                            minimum=1.0,
+                            maximum=10.0,
+                            step=0.5,
+                            value=4.5,
+                            label="🎛️ Audio Guidance",
+                            info="Strength of prompt guidance"
+                        )
+
+                # Toggle audio settings visibility
+                enable_audio.change(
+                    fn=lambda x: gr.update(visible=x),
+                    inputs=[enable_audio],
+                    outputs=[audio_settings_group]
+                )
+
+            generate_button = gr.Button(
+                "🎬 Generate Video with Audio",
+                variant="primary",
+                elem_classes="generate-btn"
+            )
+
+        with gr.Column(scale=5):
+            video_output = gr.Video(
+                label="Generated Video with Audio",
+                autoplay=True,
+                interactive=False,
+                elem_classes="video-output",
+                height=600
+            )
+
+            gr.HTML("""
+            <div style="text-align: center; margin-top: 20px; color: #6b7280;">
+                <p>💡 Tip: The same prompt is used for both video and audio generation!</p>
+                <p>🎧 Audio is automatically matched to the visual content</p>
+            </div>
+            """)
+
+    # Examples section moved outside of columns
+    with gr.Row():
+        gr.Markdown("### 🎯 Example Prompts")
+
+    gr.Examples(
+        examples=examples,
+        inputs=[prompt, nag_negative_prompt, nag_scale],
+        outputs=None,  # Don't connect outputs to avoid index issues
+        cache_examples=False
+    )
+
+    # Connect UI elements
+    ui_inputs = [
+        prompt,
+        nag_negative_prompt, nag_scale,
+        height_input, width_input, duration_seconds_input,
+        steps_slider,
+        seed_input, randomize_seed_checkbox,
+        enable_audio, audio_negative_prompt, audio_steps, audio_cfg_strength,
+    ]
+
+    generate_button.click(
+        fn=generate_video_with_audio,
+        inputs=ui_inputs,
+        outputs=[video_output, seed_input],
+    )
 
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.queue().launch()
 
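One behavioral detail of this diff worth noting: the old add_audio_to_video seeded its generator with rng.manual_seed(random.randint(0, 2**32 - 1)), while the new version calls rng.seed(). Both produce a fresh random seed per audio generation; the difference is only whether the seed value passes through Python, where it could be logged or reused. A minimal standalone sketch of the two styles (illustration only, not part of app.py; app.py uses its module-level device rather than "cpu"):

    import random
    import torch

    # Old style: draw the seed in Python, then apply it. The drawn value
    # is available for logging and for reproducing a given audio track.
    rng = torch.Generator(device="cpu")
    seed = random.randint(0, 2**32 - 1)
    rng.manual_seed(seed)
    print("old-style seed:", seed)

    # New style: let the generator fetch a non-deterministic seed itself;
    # it can still be recovered afterwards via initial_seed().
    rng.seed()
    print("new-style seed:", rng.initial_seed())

Either way, the audio seed stays independent of current_seed, which only drives the CUDA generator for the video pipeline.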