import os
import subprocess
import sys
import uuid
from pathlib import Path

import gradio as gr
from PIL import Image
from pydub import AudioSegment

# ──────────────────────────────────────────────
# 1. Download Wav2Lip model checkpoint
# ──────────────────────────────────────────────
# Local filename of the Wav2Lip GAN checkpoint and the URL it is fetched from.
MODEL_PATH = Path("wav2lip_gan.pth")
MODEL_URL = "https://huggingface.co/spaces/fffiloni/wav2lip/resolve/main/wav2lip_gan.pth"

if not MODEL_PATH.exists():
    # Fetch into a ".part" file and rename only after wget reports success.
    # `wget -O` creates its output file up front, so downloading straight to
    # MODEL_PATH would leave an empty/truncated checkpoint behind on failure
    # and the exists() check above would then skip the download on every
    # later start.
    _partial_path = MODEL_PATH.with_name(MODEL_PATH.name + ".part")
    try:
        # Argument list (no shell) so the URL and path are never shell-parsed.
        subprocess.run(
            ["wget", "-q", MODEL_URL, "-O", str(_partial_path)],
            check=True,
        )
        _partial_path.replace(MODEL_PATH)
    except (OSError, subprocess.CalledProcessError) as exc:
        # Startup stays best-effort: report the problem and carry on so the
        # module can still be imported; the next start will retry.
        if _partial_path.exists():
            _partial_path.unlink()
        print(f"Warning: could not download {MODEL_PATH}: {exc}")

# ──────────────────────────────────────────────
# 2. Preprocess image and audio (no cropping)
# ──────────────────────────────────────────────
def preprocess(image, audio_file):
    """Write the uploaded image and audio to uniquely named files on disk.

    The image is saved as ``<uid>.jpg`` and the audio is re-encoded as a
    16 kHz mono ``<uid>.wav``; both land in the current working directory.

    Args:
        image: Image object exposing ``mode``, ``convert`` and ``save``
            (the Gradio image input delivers a PIL image).
        audio_file: Path to an audio file readable by
            ``AudioSegment.from_file``.

    Returns:
        Tuple ``(img_path, wav_path, out_path)``. ``out_path`` is only a
        reserved name for the result video; no file is created for it here.

    Raises:
        ValueError: If ``image`` or ``audio_file`` is ``None``.
        Exception: Whatever the image save or audio decode/export raises;
            files already written for this call are removed first.
    """
    if image is None or audio_file is None:
        raise ValueError("Both an image and an audio file are required.")

    uid = uuid.uuid4().hex
    img_path = f"{uid}.jpg"
    wav_path = f"{uid}.wav"
    out_path = f"{uid}_result.mp4"

    try:
        # JPEG cannot store alpha or palette data, so saving e.g. an RGBA
        # image would raise; flatten anything that is not RGB/greyscale.
        if image.mode not in ("RGB", "L"):
            image = image.convert("RGB")
        image.save(img_path)

        seg = AudioSegment.from_file(audio_file)
        seg = seg.set_frame_rate(16000).set_channels(1)
        # export() hands back the open output file; close it so the handle
        # is not leaked.
        seg.export(wav_path, format="wav").close()
    except Exception:
        # Do not leave half-finished intermediates behind when a step fails.
        for leftover in (img_path, wav_path):
            if Path(leftover).exists():
                Path(leftover).unlink()
        raise

    return img_path, wav_path, out_path

# ──────────────────────────────────────────────
# 3. Main inference function
# ──────────────────────────────────────────────
def generate(image, audio):
    """Run Wav2Lip on one image/audio pair and return the result video path.

    Args:
        image: Image from the Gradio image input (passed to ``preprocess``).
        audio: Path to the uploaded audio file (passed to ``preprocess``).

    Returns:
        Path of the generated MP4 file.

    Raises:
        gr.Error: If preprocessing fails, ``inference.py`` exits with a
            non-zero status, or no output file was produced. The output
            component is a video, so a returned message string would be
            treated as a (non-existent) video path; raising ``gr.Error``
            is how the message is surfaced to the user.
    """
    try:
        img, wav, out_vid = preprocess(image, audio)
    except Exception as e:
        raise gr.Error(f"❌ {e}") from e

    try:
        subprocess.run(
            [
                # Use the interpreter running this app so inference.py sees
                # the same environment; a bare "python" may resolve to a
                # different installation or not exist at all.
                sys.executable, "inference.py",
                "--checkpoint_path", str(MODEL_PATH),
                "--face", img,
                "--audio", wav,
                "--outfile", out_vid,
                "--resize_factor", "1",
                "--pads", "0", "20", "0", "20",
                "--fps", "25",
                "--nosmooth",
            ],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        raise gr.Error(f"❌ Wav2Lip failed: {e}") from e
    finally:
        # The still image and resampled WAV are only inputs to inference.py;
        # remove them so each request does not leave files behind. Cleanup
        # is best-effort and must not mask the real outcome.
        for leftover in (img, wav):
            try:
                os.remove(leftover)
            except OSError:
                pass

    if not Path(out_vid).exists():
        raise gr.Error("❌ Generation failed.")
    return out_vid

# ──────────────────────────────────────────────
# 4. Gradio interface
# ──────────────────────────────────────────────
# Input widgets: the speaker image arrives as a PIL image, the audio clip as
# a path on disk. These match the two parameters of `generate`.
_image_input = gr.Image(type="pil", label="Image (Full Resolution - Face Visible)")
_audio_input = gr.Audio(type="filepath", label="Audio (any format)")

# Output widget that shows the value `generate` produces.
_video_output = gr.Video(label="Talking-head MP4")

demo = gr.Interface(
    fn=generate,
    inputs=[_image_input, _audio_input],
    outputs=_video_output,
    title="🗣️ High-Quality Wav2Lip (No Crop, Full Image)",
    description="Lip-sync using full image resolution. Add padding under the mouth and avoid smoothing for sharper lips.",
    allow_flagging="never",
    # NOTE(review): live mode re-runs `generate` whenever an input changes,
    # including while only one of the two inputs has been supplied.
    live=True,
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()