File size: 3,367 Bytes
952337e
13089ed
a30d89d
 
 
 
bfd9324
13089ed
a37c88f
a30d89d
0f154d9
a30d89d
 
0f154d9
a30d89d
 
 
a37c88f
a30d89d
0f154d9
a30d89d
 
 
 
952337e
a30d89d
0f154d9
 
 
952337e
c12b434
952337e
a30d89d
0f154d9
a30d89d
bfd9324
a30d89d
bfd9324
a30d89d
0f154d9
a30d89d
 
 
 
 
 
bfd9324
0f154d9
 
 
 
 
 
 
 
 
4f314db
0f154d9
 
 
 
 
 
 
952337e
a30d89d
952337e
a30d89d
0f154d9
a30d89d
 
952337e
0f154d9
 
 
 
a30d89d
0f154d9
 
a30d89d
 
 
 
 
0f154d9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import uuid
import subprocess
from pathlib import Path

import gradio as gr
from PIL import Image
from pydub import AudioSegment

# ──────────────────────────────────────────────
# 1. Download Wav2Lip model checkpoint
# ──────────────────────────────────────────────
MODEL_PATH = Path("wav2lip_gan.pth")
MODEL_URL  = "https://huggingface.co/spaces/fffiloni/wav2lip/resolve/main/wav2lip_gan.pth"

if not MODEL_PATH.exists():
    # Fetch with urllib rather than `os.system("wget …")`: wget may not be
    # installed, and os.system silently ignores failures, which could leave
    # a missing/partial checkpoint that only surfaces later at inference.
    import urllib.request

    try:
        urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)
    except Exception:
        # Drop any partial download so the next startup retries cleanly.
        MODEL_PATH.unlink(missing_ok=True)
        raise

# ──────────────────────────────────────────────
# 2. Preprocess image and audio (no cropping)
# ──────────────────────────────────────────────
def preprocess(image, audio_file):
    """Stage the two inputs on disk for the Wav2Lip subprocess.

    Saves *image* as a JPEG and transcodes *audio_file* to 16 kHz mono WAV
    (the format Wav2Lip's inference script expects). Filenames are derived
    from a fresh UUID so concurrent requests never collide.

    Args:
        image: PIL.Image from the Gradio image input, or None.
        audio_file: filesystem path to the uploaded audio file, or None.

    Returns:
        Tuple of (img_path, wav_path, out_path): staged image, staged audio,
        and the path the rendered MP4 should be written to.

    Raises:
        ValueError: if either input is missing.
    """
    if image is None or audio_file is None:
        raise ValueError("Both an image and an audio file are required.")

    uid = uuid.uuid4().hex
    img_path = f"{uid}.jpg"
    wav_path = f"{uid}.wav"
    out_path = f"{uid}_result.mp4"

    # JPEG cannot carry an alpha channel; PNG/WebP uploads are frequently
    # RGBA, and saving those directly as .jpg raises OSError — normalize
    # to RGB first.
    image.convert("RGB").save(img_path)

    # Wav2Lip expects 16 kHz mono input audio.
    seg = AudioSegment.from_file(audio_file)
    seg = seg.set_frame_rate(16000).set_channels(1)
    seg.export(wav_path, format="wav")

    return img_path, wav_path, out_path

# ──────────────────────────────────────────────
# 3. Main inference function
# ──────────────────────────────────────────────
def generate(image, audio):
    """Stage the inputs, run the Wav2Lip subprocess, return the MP4 path.

    Any failure is reported back to the UI as an error string (prefixed
    with the same marker the original messages use) rather than raised.
    """
    try:
        img, wav, out_vid = preprocess(image, audio)
    except Exception as e:
        return f"❌ {e}"

    # Command assembled once so the argument list is easy to audit.
    cmd = [
        "python", "inference.py",
        "--checkpoint_path", str(MODEL_PATH),
        "--face", img,
        "--audio", wav,
        "--outfile", out_vid,
        "--resize_factor", "1",
        "--pads", "0", "20", "0", "20",
        "--fps", "25",
        "--nosmooth",
    ]

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        return f"❌ Wav2Lip failed: {e}"

    if Path(out_vid).exists():
        return out_vid
    return "❌ Generation failed."

# ──────────────────────────────────────────────
# 4. Gradio interface
# ──────────────────────────────────────────────
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type="pil", label="Image (Full Resolution - Face Visible)"),
        gr.Audio(type="filepath", label="Audio (any format)")
    ],
    outputs=gr.Video(label="Talking-head MP4"),
    # Title previously contained mojibake (UTF-8 emoji decoded as cp1252);
    # restored to the intended character.
    title="🗣️ High-Quality Wav2Lip (No Crop, Full Image)",
    description="Lip-sync using full image resolution. Add padding under the mouth and avoid smoothing for sharper lips.",
    allow_flagging="never",
    # NOTE: `live=True` removed — it re-ran the slow subprocess inference on
    # every input change; generation should only start on explicit Submit.
)

if __name__ == "__main__":
    demo.launch()