# Source: Hugging Face Space jbilcke-hf/OmniAvatar (running on L40S)
# File size: 32,493 bytes

import gradio as gr
import os
import sys
import tempfile
import shutil
import spaces
from pathlib import Path
import torch
import logging
from huggingface_hub import snapshot_download
import math
import random
import librosa
import numpy as np
import torch.nn as nn
from tqdm import tqdm
from functools import partial
from datetime import datetime
import torchvision.transforms as TT
from transformers import Wav2Vec2FeatureExtractor
import torchvision.transforms as transforms
import torch.nn.functional as F
from glob import glob

# Configure logging
# Root logger at INFO level; every record is rendered as
# "<time> - <logger name> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
# Directory holding the model checkpoints; can be overridden through the
# MODELS_DIR environment variable, defaults to ./pretrained_models.
MODELS_DIR = Path(os.environ.get('MODELS_DIR', 'pretrained_models'))
# NOTE(review): not referenced anywhere else in the visible part of this file.
DEFAULT_CONFIG_PATH = "configs/inference_1.3B.yaml"

# Import args_config module first
import OmniAvatar.utils.args_config
 
# Create and set global args before any other OmniAvatar imports
class Args:
    """Hard-coded inference settings used in place of parsed command-line args.

    Every setting is stored as a plain instance attribute. The object also
    offers a small read-only mapping facade (``in``, iteration, ``keys()``
    and ``obj[name]``) on top of those attributes.
    """

    def __init__(self):
        wan_dir = MODELS_DIR / "Wan2.1-T2V-1.3B"
        # Insertion order matters: iteration and keys() follow it.
        settings = {
            # Process / device layout
            "rank": 0,
            "world_size": 1,
            "local_rank": 0,
            "device": 'cuda:0',
            "num_nodes": 1,
            "dtype": 'bf16',
            # Checkpoint locations
            "exp_path": str(MODELS_DIR / "OmniAvatar-1.3B"),
            "dit_path": str(wan_dir / "diffusion_pytorch_model.safetensors"),
            "text_encoder_path": str(wan_dir / "models_t5_umt5-xxl-enc-bf16.pth"),
            "vae_path": str(wan_dir / "Wan2.1_VAE.pth"),
            "wav2vec_path": str(MODELS_DIR / "wav2vec2-base-960h"),
            # LoRA configuration
            "train_architecture": 'lora',
            "lora_rank": 128,
            "lora_alpha": 64.0,
            "lora_target_modules": 'q,k,v,o,ffn.0,ffn.2',
            "init_lora_weights": 'kaiming',
            # Parallelism / memory
            "sp_size": 1,
            "num_persistent_param_in_dit": None,
            "use_fsdp": False,
            # Generation behaviour
            "i2v": True,
            "use_audio": True,
            "random_prefix_frames": True,
            "overlap_frame": 13,
            "num_steps": 20,
            "negative_prompt": 'Vivid color tones, background/camera moving quickly, screen switching, subtitles and special effects, mutation, overexposed, static, blurred details, subtitles, style, work, painting, image, still, overall grayish, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image, chaotic background, three legs, crowded background with many people, walking backward',
            "guidance_scale": 4.5,
            "audio_scale": 0,
            "max_tokens": 30000,
            "sample_rate": 16000,
            "fps": 25,
            "max_hw": 720,
            "tea_cache_l1_thresh": 0,
            # Candidate [height, width] output sizes, looked up as image_sizes_<max_hw>
            "image_sizes_720": [[400, 720], [720, 720], [720, 400]],
            "image_sizes_1280": [[720, 720], [528, 960], [960, 528], [720, 1280], [1280, 720]],
            "seq_len": 200,
            "infer": True,
            "debug": False,
            "model_config": None,
        }
        for name, value in settings.items():
            setattr(self, name, value)

    def __contains__(self, key):
        """Return True when ``key`` names an attribute reachable on this object."""
        return hasattr(self, key)

    def __iter__(self):
        """Iterate over the names of the instance attributes."""
        return iter(vars(self))

    def keys(self):
        """Return a view of the instance attribute names."""
        return vars(self).keys()

    def __getitem__(self, key):
        """Return the attribute called ``key`` (``obj[key]`` access)."""
        return getattr(self, key)

# Set the global args before any other OmniAvatar imports
# The remaining OmniAvatar imports are deliberately placed after this
# assignment — presumably those modules read args_config.args when they are
# imported; confirm against the OmniAvatar package.
OmniAvatar.utils.args_config.args = Args()

# Now we can safely import OmniAvatar modules
from OmniAvatar.utils.args_config import parse_args
from OmniAvatar.utils.io_utils import load_state_dict 
from peft import LoraConfig, inject_adapter_in_model
from OmniAvatar.models.model_manager import ModelManager
from OmniAvatar.wan_video import WanVideoPipeline
from OmniAvatar.utils.io_utils import save_video_as_grid_and_mp4
import torch.distributed as dist
from OmniAvatar.utils.audio_preprocess import add_silence_to_audio_ffmpeg
from OmniAvatar.distributed.fsdp import shard_model

def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Args:
        seed: Value handed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def download_models():
    """Download the required model repositories that are not present locally.

    A repository is considered present when its target directory under
    ``MODELS_DIR`` exists and contains at least one entry; it is then skipped.

    Raises:
        gr.Error: if a download fails (the original exception is chained).
    """
    models_to_download = [
        {
            "repo_id": "Wan-AI/Wan2.1-T2V-1.3B",
            "local_dir": MODELS_DIR / "Wan2.1-T2V-1.3B",
            "name": "Wan2.1-T2V-1.3B base model"
        },
        {
            "repo_id": "OmniAvatar/OmniAvatar-1.3B",
            "local_dir": MODELS_DIR / "OmniAvatar-1.3B",
            "name": "OmniAvatar-1.3B LoRA weights"
        },
        {
            "repo_id": "facebook/wav2vec2-base-960h",
            "local_dir": MODELS_DIR / "wav2vec2-base-960h",
            "name": "Wav2Vec2 audio encoder"
        }
    ]

    # Create models directory if it doesn't exist. MODELS_DIR comes from an
    # environment variable and may be a nested path, so create parents too.
    MODELS_DIR.mkdir(parents=True, exist_ok=True)

    for model in models_to_download:
        local_dir = model["local_dir"]

        # Check if model already exists.
        # NOTE(review): any non-empty directory counts as "already downloaded",
        # including one left behind by an interrupted download.
        if local_dir.exists() and any(local_dir.iterdir()):
            logger.info(f"{model['name']} already exists at {local_dir}")
            continue

        logger.info(f"Downloading {model['name']} from {model['repo_id']}...")
        try:
            snapshot_download(
                repo_id=model["repo_id"],
                local_dir=str(local_dir),
                local_dir_use_symlinks=False,
                resume_download=True
            )
            logger.info(f"Successfully downloaded {model['name']}")
        except Exception as e:
            logger.error(f"Failed to download {model['name']}: {str(e)}")
            # Chain the cause so the full traceback survives the re-raise.
            raise gr.Error(f"Failed to download {model['name']}: {str(e)}") from e

# Utility functions from inference.py
def match_size(image_size, h, w):
    """Pick the candidate size that best matches an ``h`` x ``w`` image.

    Candidates are ranked first by how close their height/width ratio is to
    ``h / w``, then (on an exact ratio tie) by how close their longer side is
    to the image's longer side. When both criteria tie, the later candidate
    wins.

    Args:
        image_size: Iterable of ``[height, width]`` candidates.
        h: Image height.
        w: Image width (must be non-zero).

    Returns:
        The selected candidate (the same object taken from ``image_size``),
        or ``None`` when ``image_size`` is empty.
    """
    target_ratio = h / w
    target_long_side = max(w, h)

    select_size = None
    # Start from infinity so that even an extreme aspect ratio still selects
    # the closest candidate instead of falling through to None.
    best_key = (math.inf, math.inf)
    for candidate in image_size:
        ratio_diff = abs(candidate[0] / candidate[1] - target_ratio)
        size_diff = abs(max(candidate) - target_long_side)
        # Lexicographic comparison: a smaller ratio difference wins outright;
        # on an equal ratio the smaller-or-equal size difference wins.
        if (ratio_diff, size_diff) <= best_key:
            best_key = (ratio_diff, size_diff)
            select_size = candidate
    return select_size

def resize_pad(image, ori_size, tgt_size):
    """Resize ``image`` and bring it to ``tgt_size`` with centred constant padding.

    The scale factor is the larger of the two per-axis ratios, so the resized
    image is at least as large as the target on both axes (up to integer
    truncation). The pad amounts are therefore zero or negative in the usual
    case — NOTE(review): this relies on ``F.pad`` cropping for negative
    padding; confirm.

    Args:
        image: Tensor accepted by ``transforms.Resize`` and ``F.pad``; the
            last two dimensions are treated as height and width.
        ori_size: ``(height, width)`` of ``image``.
        tgt_size: Desired ``(height, width)``.

    Returns:
        The resized and padded tensor.
    """
    src_h, src_w = ori_size
    tgt_h, tgt_w = tgt_size[0], tgt_size[1]

    factor = max(tgt_h / src_h, tgt_w / src_w)
    new_h, new_w = int(src_h * factor), int(src_w * factor)
    resized = transforms.Resize(size=[new_h, new_w])(image)

    # Split the height/width difference as evenly as possible between sides.
    extra_h = tgt_h - new_h
    extra_w = tgt_w - new_w
    top = extra_h // 2
    left = extra_w // 2
    borders = (left, extra_w - left, top, extra_h - top)

    return F.pad(resized, borders, mode='constant', value=0)

class WanInferencePipeline(nn.Module):
    """Segment-by-segment OmniAvatar video generation around ``WanVideoPipeline``.

    Construction loads the DiT, text encoder and VAE through ``ModelManager``,
    then either injects the LoRA adapter or loads a full checkpoint. When
    ``args.use_audio`` is set it also loads a Wav2Vec2 feature extractor and
    audio encoder. ``forward`` generates the video in overlapping segments.
    """

    def __init__(self, args):
        """Build the pipeline from ``args`` (an ``Args``-like settings object).

        Reads ``args.dtype`` ('bf16', 'fp16', anything else selects float32),
        ``args.i2v`` and ``args.use_audio``; the remaining settings are
        consumed by ``load_model`` and ``forward``.
        """
        super().__init__()
        self.args = args
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        if args.dtype == 'bf16':
            self.dtype = torch.bfloat16
        elif args.dtype == 'fp16':
            self.dtype = torch.float16
        else:
            self.dtype = torch.float32
        self.pipe = self.load_model()
        if args.i2v:
            # PIL image -> tensor; forward() applies `* 2.0 - 1.0` afterwards.
            self.transform = TT.Compose([TT.ToTensor()])
        if args.use_audio:
            from OmniAvatar.models.wav2vec import Wav2VecModel
            wav2vec_dir = str(MODELS_DIR / "wav2vec2-base-960h")
            self.wav_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(wav2vec_dir)
            self.audio_encoder = Wav2VecModel.from_pretrained(wav2vec_dir, local_files_only=True).to(device=self.device)
            self.audio_encoder.feature_extractor._freeze_parameters()

    def load_model(self):
        """Set up single-process distributed state and build the video pipeline.

        Returns:
            The ``WanVideoPipeline`` with gradients disabled, switched to eval
            mode and with VRAM management enabled.

        Raises:
            FileNotFoundError: if ``pytorch_model.pt`` is missing from
                ``args.exp_path``.
        """
        # Initialize for single GPU
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12355'
        os.environ['RANK'] = '0'
        os.environ['WORLD_SIZE'] = '1'

        # NOTE(review): this is not guarded by dist.is_initialized(); the
        # module-level pipeline cache is what keeps it from running twice.
        dist.init_process_group(backend="nccl", init_method="env://")

        from xfuser.core.distributed import (initialize_model_parallel,
                                            init_distributed_environment)
        init_distributed_environment(rank=0, world_size=1)
        initialize_model_parallel(
            sequence_parallel_degree=self.args.sp_size,
            ring_degree=1,
            ulysses_degree=self.args.sp_size,
        )
        torch.cuda.set_device(0)

        ckpt_path = f'{self.args.exp_path}/pytorch_model.pt'
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not os.path.exists(ckpt_path):
            raise FileNotFoundError(f"pytorch_model.pt not found in {self.args.exp_path}")
        if self.args.train_architecture == 'lora':
            self.args.pretrained_lora_path = pretrained_lora_path = ckpt_path
        else:
            resume_path = ckpt_path

        self.step = 0

        # Load models
        model_manager = ModelManager(device="cpu", infer=True)

        # For OmniAvatar, we need to override the model config to use in_dim=33
        # This is because OmniAvatar uses additional channels for audio/image conditioning
        if self.args.train_architecture == "lora" and "OmniAvatar" in self.args.exp_path:
            # Set model_config in args to override the default
            self.args.model_config = {
                "in_dim": 33  # 16 (base) + 17 (additional channels for OmniAvatar)
            }

        # Weights are loaded on the CPU first; the pipeline below is created
        # for self.device.
        model_manager.load_models(
            [
                self.args.dit_path.split(","),
                self.args.text_encoder_path,
                self.args.vae_path
            ],
            torch_dtype=self.dtype,
            device='cpu',
        )

        pipe = WanVideoPipeline.from_model_manager(model_manager,
                                                torch_dtype=self.dtype,
                                                device=str(self.device),
                                                use_usp=self.args.sp_size > 1,
                                                infer=True)
        if self.args.train_architecture == "lora":
            logger.info(f'Use LoRA: lora rank: {self.args.lora_rank}, lora alpha: {self.args.lora_alpha}')
            self.add_lora_to_model(
                    pipe.denoising_model(),
                    lora_rank=self.args.lora_rank,
                    lora_alpha=self.args.lora_alpha,
                    lora_target_modules=self.args.lora_target_modules,
                    init_lora_weights=self.args.init_lora_weights,
                    pretrained_lora_path=pretrained_lora_path,
                )
        else:
            missing_keys, unexpected_keys = pipe.denoising_model().load_state_dict(load_state_dict(resume_path), strict=True)
            logger.info(f"load from {resume_path}, {len(missing_keys)} missing keys, {len(unexpected_keys)} unexpected keys")
        pipe.requires_grad_(False)
        pipe.eval()
        pipe.enable_vram_management(num_persistent_param_in_dit=self.args.num_persistent_param_in_dit)
        if self.args.use_fsdp:
            shard_fn = partial(shard_model, device_id=self.device)
            pipe.dit = shard_fn(pipe.dit)
        return pipe

    def add_lora_to_model(self, model, lora_rank=4, lora_alpha=4, lora_target_modules="q,k,v,o,ffn.0,ffn.2", init_lora_weights="kaiming", pretrained_lora_path=None, state_dict_converter=None):
        """Inject a LoRA adapter into ``model`` and optionally load its weights.

        Args:
            model: Module that receives the adapter (modified in place by
                ``inject_adapter_in_model``).
            lora_rank: LoRA rank ``r``.
            lora_alpha: LoRA alpha; also stored on ``self.lora_alpha``.
            lora_target_modules: Comma-separated names of the target modules.
            init_lora_weights: ``"kaiming"`` is translated to ``True``; any
                other value is passed to ``LoraConfig`` unchanged.
            pretrained_lora_path: Optional checkpoint loaded non-strictly.
            state_dict_converter: Optional callable applied to the loaded
                state dict before it is loaded into ``model``.
        """
        self.lora_alpha = lora_alpha
        if init_lora_weights == "kaiming":
            init_lora_weights = True

        lora_config = LoraConfig(
            r=lora_rank,
            lora_alpha=lora_alpha,
            init_lora_weights=init_lora_weights,
            target_modules=lora_target_modules.split(","),
        )
        model = inject_adapter_in_model(lora_config, model)

        if pretrained_lora_path is not None:
            state_dict = load_state_dict(pretrained_lora_path)
            if state_dict_converter is not None:
                state_dict = state_dict_converter(state_dict)
            # strict=False: the checkpoint need not cover every parameter.
            missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
            all_keys = [name for name, _ in model.named_parameters()]
            num_updated_keys = len(all_keys) - len(missing_keys)
            num_unexpected_keys = len(unexpected_keys)
            logger.info(f"{num_updated_keys} parameters are loaded from {pretrained_lora_path}. {num_unexpected_keys} parameters are unexpected.")

    def forward(self, prompt,
                image_path=None,
                audio_path=None,
                seq_len=101,
                height=720,
                width=720,
                overlap_frame=None,
                num_steps=None,
                negative_prompt=None,
                guidance_scale=None,
                audio_scale=None,
                progress_callback=None):
        """Generate a video for ``prompt``, one overlapping segment at a time.

        Args:
            prompt: Text prompt passed to ``pipe.log_video``.
            image_path: Reference image; required when ``args.i2v`` is set.
            audio_path: Driving audio; only used when ``args.use_audio`` is set.
            seq_len: Total number of frames; replaced by the padded audio
                length when audio is used.
            height, width: Output size used only when no image is given.
            overlap_frame, num_steps, negative_prompt, guidance_scale,
                audio_scale: Per-call overrides; ``None`` falls back to the
                attribute of the same name on ``self.args``. If the resulting
                ``audio_scale`` is still ``None``, ``guidance_scale`` is used
                as the audio CFG scale.
            progress_callback: Optional ``callback(fraction, description)``
                invoked with overall progress across all segments.

        Returns:
            The frames of all segments concatenated along dim 1. When audio
            was used, the result is trimmed to ``ori_audio_len + 1`` entries
            on that dimension.

        Raises:
            ValueError: if ``args.i2v`` is set but ``image_path`` is missing,
                or if ``overlap_frame`` is not of the form ``1 + 4*n`` while
                ``args.random_prefix_frames`` is enabled.
        """
        overlap_frame = overlap_frame if overlap_frame is not None else self.args.overlap_frame
        num_steps = num_steps if num_steps is not None else self.args.num_steps
        negative_prompt = negative_prompt if negative_prompt is not None else self.args.negative_prompt
        guidance_scale = guidance_scale if guidance_scale is not None else self.args.guidance_scale
        audio_scale = audio_scale if audio_scale is not None else self.args.audio_scale

        # Fail early with a clear message; the i2v branch below encodes the image.
        if self.args.i2v and image_path is None:
            raise ValueError("image_path is required when args.i2v is enabled")

        if image_path is not None:
            from PIL import Image
            image = Image.open(image_path).convert("RGB")
            image = self.transform(image).unsqueeze(0).to(self.device)
            _, _, h, w = image.shape
            # Candidate sizes come from args.image_sizes_<max_hw>.
            select_size = match_size(getattr(self.args, f'image_sizes_{self.args.max_hw}'), h, w)
            image = resize_pad(image, (h, w), select_size)
            image = image * 2.0 - 1.0
            # Insert a length-1 dimension at index 2.
            image = image[:, :, None]
        else:
            image = None
            select_size = [height, width]
        # Frames per segment, derived from the max_tokens budget and frame size.
        L = int(self.args.max_tokens * 16 * 16 * 4 / select_size[0] / select_size[1])
        L = (L // 4 * 4 + 1) if L % 4 != 0 else (L - 3)  # video frames
        T = (L + 3) // 4  # latent frames

        if self.args.i2v:
            if self.args.random_prefix_frames:
                fixed_frame = overlap_frame
                # Explicit check instead of `assert`, which is stripped under -O.
                if fixed_frame % 4 != 1:
                    raise ValueError(f"overlap_frame must be of the form 1 + 4*n, got {fixed_frame}")
            else:
                fixed_frame = 1
            prefix_lat_frame = (3 + fixed_frame) // 4
            first_fixed_frame = 1
        else:
            fixed_frame = 0
            prefix_lat_frame = 0
            first_fixed_frame = 0

        # Stays None when no audio is used, so the final trim can be skipped.
        ori_audio_len = None
        audio_prefix = None
        if audio_path is not None and self.args.use_audio:
            audio, sr = librosa.load(audio_path, sr=self.args.sample_rate)
            # Report the rate the audio was actually loaded at.
            input_values = np.squeeze(
                    self.wav_feature_extractor(audio, sampling_rate=self.args.sample_rate).input_values
                )
            input_values = torch.from_numpy(input_values).float().to(device=self.device)
            # Audio duration expressed in video frames.
            ori_audio_len = audio_len = math.ceil(len(input_values) / self.args.sample_rate * self.args.fps)
            input_values = input_values.unsqueeze(0)
            # padding audio: round the length up so it fills a whole number of
            # segments (the first segment contributes L - first_fixed_frame new
            # frames, every later one L - fixed_frame).
            if audio_len < L - first_fixed_frame:
                audio_len = audio_len + ((L - first_fixed_frame) - audio_len % (L - first_fixed_frame))
            elif (audio_len - (L - first_fixed_frame)) % (L - fixed_frame) != 0:
                audio_len = audio_len + ((L - fixed_frame) - (audio_len - (L - first_fixed_frame)) % (L - fixed_frame))
            input_values = F.pad(input_values, (0, audio_len * int(self.args.sample_rate / self.args.fps) - input_values.shape[1]), mode='constant', value=0)
            with torch.no_grad():
                hidden_states = self.audio_encoder(input_values, seq_len=audio_len, output_hidden_states=True)
                audio_embeddings = hidden_states.last_hidden_state
                # Concatenate every intermediate hidden state on the last dim.
                for mid_hidden_states in hidden_states.hidden_states:
                    audio_embeddings = torch.cat((audio_embeddings, mid_hidden_states), -1)
            seq_len = audio_len
            audio_embeddings = audio_embeddings.squeeze(0)
            audio_prefix = torch.zeros_like(audio_embeddings[:first_fixed_frame])
        else:
            audio_embeddings = None

        # loop: number of segments needed to cover seq_len frames
        times = (seq_len - L + first_fixed_frame) // (L - fixed_frame) + 1
        if times * (L - fixed_frame) + fixed_frame < seq_len:
            times += 1
        video = []
        image_emb = {}
        img_lat = None
        if self.args.i2v:
            self.pipe.load_models_to_device(['vae'])
            img_lat = self.pipe.encode_video(image.to(dtype=self.dtype)).to(self.device)

            # Conditioning "y": the image latent repeated T times on dim 2,
            # plus a one-channel mask that is 0 at index 0 and 1 elsewhere.
            image_cat = img_lat.repeat(1, 1, T, 1, 1)
            msk = torch.zeros_like(image_cat[:, :1])
            msk[:, :, 1:] = 1
            image_emb["y"] = torch.cat([image_cat, msk], dim=1)
        for t in range(times):
            logger.info(f"[{t+1}/{times}]")

            # Create a sub-progress callback for this iteration
            if progress_callback:
                def sub_progress_callback(step, total_steps):
                    # Calculate overall progress including all iterations
                    iteration_progress = t / times
                    step_progress = step / total_steps / times
                    overall_progress = iteration_progress + step_progress
                    desc = f"Generating segment {t+1}/{times} - Step {step}/{total_steps}"
                    progress_callback(overall_progress, desc)
            else:
                sub_progress_callback = None

            audio_emb = {}
            if t == 0:
                overlap = first_fixed_frame
            else:
                overlap = fixed_frame
                # From the second segment on, clear the mask channel over the
                # prefix latent frames.
                image_emb["y"][:, -1:, :prefix_lat_frame] = 0
            prefix_overlap = (3 + overlap) // 4
            if audio_embeddings is not None:
                if t == 0:
                    audio_tensor = audio_embeddings[
                            :min(L - overlap, audio_embeddings.shape[0])
                        ]
                else:
                    audio_start = L - first_fixed_frame + (t - 1) * (L - overlap)
                    audio_tensor = audio_embeddings[
                        audio_start: min(audio_start + L - overlap, audio_embeddings.shape[0])
                    ]

                # Prepend the tail of the previous segment's audio, then keep
                # this segment's tail for the next iteration.
                audio_tensor = torch.cat([audio_prefix, audio_tensor], dim=0)
                audio_prefix = audio_tensor[-fixed_frame:]
                audio_tensor = audio_tensor.unsqueeze(0).to(device=self.device, dtype=self.dtype)
                audio_emb["audio_emb"] = audio_tensor
            else:
                audio_prefix = None
            if image is not None and img_lat is None:
                # Later segments: encode the frames kept from the previous segment.
                self.pipe.load_models_to_device(['vae'])
                img_lat = self.pipe.encode_video(image.to(dtype=self.dtype)).to(self.device)
                assert img_lat.shape[2] == prefix_overlap
            # Pad the latent with zeros on dim 2 up to T latent frames.
            img_lat = torch.cat([img_lat, torch.zeros_like(img_lat[:, :, :1].repeat(1, 1, T - prefix_overlap, 1, 1))], dim=2)
            frames, _, latents = self.pipe.log_video(img_lat, prompt, prefix_overlap, image_emb, audio_emb,
                                                 negative_prompt, num_inference_steps=num_steps,
                                                 cfg_scale=guidance_scale, audio_cfg_scale=audio_scale if audio_scale is not None else guidance_scale,
                                                 return_latent=True,
                                                 tea_cache_l1_thresh=self.args.tea_cache_l1_thresh,tea_cache_model_id="Wan2.1-T2V-1.3B",
                                                 progress_callback=sub_progress_callback)
            img_lat = None
            # The last fixed_frame frames become the next segment's conditioning
            # input, rescaled with `* 2 - 1` after clipping to [0, 1].
            image = (frames[:, -fixed_frame:].clip(0, 1) * 2 - 1).permute(0, 2, 1, 3, 4).contiguous()
            if t == 0:
                video.append(frames)
            else:
                # Drop the frames that overlap with the previous segment.
                video.append(frames[:, overlap:])
        video = torch.cat(video, dim=1)
        # Trim the padding added to the audio; without audio there is nothing to trim.
        if ori_audio_len is not None:
            video = video[:, :ori_audio_len + 1]
        return video

# Initialize the pipeline globally
# Module-level cache: the pipeline is built on first use and reused afterwards.
inference_pipeline = None
args_global = None

def initialize_inference_pipeline():
    """Return the shared ``WanInferencePipeline``, building it on the first call.

    The first call also binds ``args_global`` to the args object installed on
    ``OmniAvatar.utils.args_config``.
    """
    global inference_pipeline, args_global

    if inference_pipeline is None:
        # Use the global args that was already created
        args_global = OmniAvatar.utils.args_config.args

        logger.info("Initializing inference pipeline...")
        inference_pipeline = WanInferencePipeline(args_global)
        logger.info("Inference pipeline initialized successfully")

    return inference_pipeline

def get_duration(reference_image,
                    audio_file,
                    text_prompt,
                    seed=None,
                    use_random_seed=True,
                    num_steps=20,
                    guidance_scale=4.5,
                    audio_scale=None,
                    overlap_frames=13,
                    fps=25,
                    silence_duration=0.3,
                    resolution="720p",
                    progress=None):
    """Return the GPU time budget, in seconds, for one generation request.

    Used as the ``duration`` callable of ``spaces.GPU`` on
    ``generate_avatar_video``. The parameter list and defaults mirror that
    function, so any argument list it accepts is accepted here too. Only
    ``num_steps`` influences the result.

    Returns:
        120 when ``num_steps`` is greater than 15, otherwise 100.
    """
    return 120 if num_steps > 15 else 100


# This task might be too long for ZeroGPU,
# but I need to try it on an H200 to be sure.
@spaces.GPU(duration=get_duration)
def generate_avatar_video(
    reference_image,
    audio_file, 
    text_prompt,
    seed=None,
    use_random_seed=True,
    num_steps=20,
    guidance_scale=4.5,
    audio_scale=None,
    overlap_frames=13,
    fps=25,
    silence_duration=0.3,
    resolution="720p",
    progress=gr.Progress()
):
    """Generate an avatar video using OmniAvatar

    Args:
        reference_image: Path of the reference image file.
        audio_file: Path of the speech audio file.
        text_prompt: Prompt describing the video.
        seed: Seed to use; ignored (a random one is drawn) when
            ``use_random_seed`` is true or ``seed`` is ``None`` / ``-1``.
        use_random_seed: Draw a fresh random seed for this run.
        num_steps: Number of inference steps.
        guidance_scale: Classifier-free guidance scale.
        audio_scale: Audio guidance scale; ``None`` or a value <= 0 means
            "use ``guidance_scale``".
        overlap_frames: Frames shared between consecutive segments.
        fps: Frame rate used for audio alignment and for the saved video.
        silence_duration: Seconds of silence passed to
            ``add_silence_to_audio_ffmpeg`` for the driving audio.
        resolution: ``"480p"`` selects the 720 size table, anything else the
            1280 one.
        progress: Gradio progress tracker.

    Returns:
        ``(output_path, seed)``: path of the generated ``.mp4`` (a temporary
        file that is not deleted automatically) and the seed that was used.

    Raises:
        gr.Error: if an input is missing or generation fails.
    """
    # Validate up front so the user gets a clear message rather than an
    # opaque copy error from deeper inside the function.
    if not reference_image:
        raise gr.Error("Please provide a reference image")
    if not audio_file:
        raise gr.Error("Please provide a speech audio file")

    try:
        progress(0.1, desc="Initializing")
        
        if use_random_seed or seed is None or seed == -1:
            seed = random.randint(0, 2147483647)
        # UI sliders may deliver floats; the consumers below need integers.
        seed = int(seed)
        num_steps = int(num_steps)
        overlap_frames = int(overlap_frames)
        
        set_seed(seed)
        
        # Initialize pipeline if needed
        pipeline = initialize_inference_pipeline()
        
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            
            progress(0.2, desc="Preparing inputs")
            
            # Copy input files to temp directory
            temp_image = temp_path / "input_image.jpeg"
            temp_audio = temp_path / "input_audio.mp3"
            shutil.copy(reference_image, temp_image)
            shutil.copy(audio_file, temp_audio)
            
            # Add silence to audio
            if silence_duration > 0:
                audio_with_silence = temp_path / "audio_with_silence.wav"
                add_silence_to_audio_ffmpeg(str(temp_audio), str(audio_with_silence), silence_duration)
                input_audio_path = str(audio_with_silence)
            else:
                input_audio_path = str(temp_audio)
            
            progress(0.3, desc="Configuring generation parameters")
            
            # Update args for this generation (the pipeline reads its
            # per-call settings from this shared object).
            args_global.seed = seed
            args_global.num_steps = num_steps
            args_global.guidance_scale = guidance_scale
            # A missing or non-positive audio scale means "use the guidance
            # scale": the pipeline applies that fallback only when the value
            # is None, so store None rather than 0.
            args_global.audio_scale = audio_scale if audio_scale is not None and audio_scale > 0 else None
            args_global.overlap_frame = overlap_frames
            args_global.fps = fps
            args_global.silence_duration_s = silence_duration
            args_global.max_hw = 720 if resolution == "480p" else 1280
            
            progress(0.4, desc="Running OmniAvatar generation")
            
            # Create a progress callback that maps pipeline progress to Gradio progress
            def pipeline_progress_callback(pipeline_progress, desc):
                # Map pipeline progress (0-1) to Gradio progress range (0.4-0.8)
                gradio_progress = 0.4 + (pipeline_progress * 0.4)
                progress(gradio_progress, desc=desc)
            
            # Generate video
            video = pipeline(
                prompt=text_prompt,
                image_path=str(temp_image),
                audio_path=input_audio_path,
                seq_len=args_global.seq_len,
                progress_callback=pipeline_progress_callback
            )
            
            progress(0.8, desc="Saving video")
            
            # Create output directory in temp folder
            output_dir = temp_path / "output"
            output_dir.mkdir(exist_ok=True)
            
            # Add audio offset for final output: one extra frame duration on
            # top of the configured silence.
            audio_with_offset = temp_path / "audio_with_offset.wav"
            add_silence_to_audio_ffmpeg(str(temp_audio), str(audio_with_offset), 1.0 / fps + silence_duration)
            
            # Save video
            save_video_as_grid_and_mp4(
                video, 
                str(output_dir), 
                fps, 
                prompt=text_prompt,
                audio_path=str(audio_with_offset) if args_global.use_audio else None, 
                prefix='result_000'
            )
            
            progress(0.9, desc="Finalizing")
            
            # Find the generated video file
            generated_videos = list(output_dir.glob("result_000_*.mp4"))
            if not generated_videos:
                # Also check for result_000.mp4 (without suffix)
                generated_videos = list(output_dir.glob("result_000.mp4"))
            
            if not generated_videos:
                raise gr.Error("No video file was generated")
            
            # Get the first (and should be only) video
            latest_video = generated_videos[0]
            
            # Create a persistent temporary file for Gradio: the working
            # directory above is removed when the `with` block exits.
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_output:
                output_path = tmp_output.name
                
            # Copy the generated video to the persistent temp file
            shutil.copy(latest_video, output_path)
            
            progress(1.0, desc="Generation complete")
            logger.info(f"Video saved to: {output_path}")
            
            return output_path, seed
            
    except Exception as e:
        logger.error(f"Error generating video: {str(e)}", exc_info=True)
        raise gr.Error(f"Error generating video: {str(e)}") from e

# Initialize models on module import (for Hugging Face Spaces)
# This runs at import time, before the UI is built; only the checkpoint files
# are fetched here. The inference pipeline itself is constructed lazily by
# initialize_inference_pipeline() on the first generation request.
logger.info("Initializing OmniAvatar...")
logger.info("Checking and downloading required models...")
download_models()
logger.info("Model initialization complete")

# Create the Gradio interface
# Layout: one row with two columns — inputs and settings on the left, the
# generated video and example inputs on the right — followed by usage notes.
with gr.Blocks(title="OmniAvatar - Lipsynced Avatar Video Generation") as app:
    gr.Markdown("""
    # 🎭 OmniAvatar - Lipsynced Avatar Video Generation
    
    Generate videos with lipsynced avatars using a reference image and audio file.
    Based on Wan2.1 with OmniAvatar enhancements for audio-driven avatar animation.
    Note: this Gradio Space demo uses Wan2.1 1.3B and not Wan 14B.
    It takes about 4 minutes to generate a 4s long video (like in the examples), so we recommend you to duplicate this space.
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            # Input components
            # type="filepath": the handler receives paths on disk, which it
            # copies into its temporary working directory.
            reference_image = gr.Image(
                label="Reference Avatar Image",
                type="filepath",
                elem_id="reference_image"
            )
            
            audio_file = gr.Audio(
                label="Speech Audio File",
                type="filepath",
                elem_id="audio_file"
            )
            
            text_prompt = gr.Textbox(
                label="Video Description",
                placeholder="Describe the video scene and actions...",
                lines=3,
                value="A person speaking naturally with subtle facial expressions"
            )
            
            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    use_random_seed = gr.Checkbox(
                        label="Use random seed",
                        value=True
                    )
                    
                    seed = gr.Slider(
                        label="Seed (ignored if random seed is checked)",
                        minimum=0,
                        maximum=2147483647,
                        step=1,
                        value=42
                    )
                    
                    # The handler maps "480p" to the 720 size table and any
                    # other choice to the 1280 one.
                    resolution = gr.Radio(
                        label="Resolution",
                        choices=["480p", "720p"],
                        value="480p"
                    )
                
                with gr.Row():
                    num_steps = gr.Slider(
                        label="Inference Steps",
                        minimum=10,
                        maximum=50,
                        step=1,
                        value=15
                    )
                    
                    guidance_scale = gr.Slider(
                        label="Guidance Scale",
                        minimum=1.0,
                        maximum=10.0,
                        step=0.5,
                        value=4.5
                    )
                
                with gr.Row():
                    audio_scale = gr.Slider(
                        label="Audio Scale (leave 0 to use guidance scale)",
                        minimum=0.0,
                        maximum=10.0,
                        step=0.5,
                        value=3.0
                    )
                    
                    # minimum=1 with step=4 restricts the choices to 1 + 4*n,
                    # the form the pipeline checks for.
                    overlap_frames = gr.Slider(
                        label="Overlap Frames",
                        minimum=1,
                        maximum=25,
                        step=4,
                        value=13,
                        info="Must be 1 + 4*n"
                    )
                
                with gr.Row():
                    fps = gr.Slider(
                        label="FPS",
                        minimum=10,
                        maximum=30,
                        step=1,
                        value=25
                    )
                    
                    silence_duration = gr.Slider(
                        label="Silence Duration (s)",
                        minimum=0.0,
                        maximum=2.0,
                        step=0.1,
                        value=0.3
                    )
            
            generate_btn = gr.Button(
                "🎬 Generate Avatar Video",
                variant="primary"
            )
            
            # Add seed output display
            seed_output = gr.Number(
                label="Seed used",
                interactive=False
            )
        
        with gr.Column(scale=1):
            # Output component
            output_video = gr.Video(
                label="Generated Avatar Video",
                elem_id="output_video"
            )
            
            # Examples
            # Each example fills the three primary inputs only; the advanced
            # settings keep their current values.
            gr.Examples(
                examples=[
                    [
                        "examples/images/demo1.jpg",
                        "examples/audios/demo1.mp3",
                        "An arrogant and gloomy wizard explains something in a grave tone"
                    ],
                ],
                inputs=[reference_image, audio_file, text_prompt],
                label="Example Inputs"
            )
    
    # Connect the generate button
    # The order of `inputs` must match the positional parameters of
    # generate_avatar_video (its `progress` argument is not listed here).
    generate_btn.click(
        fn=generate_avatar_video,
        inputs=[
            reference_image,
            audio_file,
            text_prompt,
            seed,
            use_random_seed,
            num_steps,
            guidance_scale,
            audio_scale,
            overlap_frames,
            fps,
            silence_duration,
            resolution
        ],
        outputs=[output_video, seed_output]
    )
    
    gr.Markdown("""
    ## 📝 Notes
    - The reference image should be a clear frontal view of the person
    - Audio should be clear speech without background music
    - Generation may take several minutes depending on video length
    - For best results, use high-quality input images and audio
    """)

# Launch the app
# Only when executed as a script; importing this module builds `app` (and
# downloads the models) without starting the server.
if __name__ == "__main__":
    # share=True asks Gradio to create a public share link.
    app.launch(share=True)