import tempfile

import gradio as gr
import numpy as np
import scipy.io.wavfile
import torch
from transformers import VitsModel, AutoTokenizer

# Load the fine-tuned Somali VITS model and the matching MMS tokenizer
model = VitsModel.from_pretrained("jellecali8/somali_tts_model")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-som")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# Load the custom speaker embedding used for voice cloning
try:
    speaker_embedding = torch.tensor(np.load("new_speaker_embedding.npy")).unsqueeze(0).to(device)
except Exception as e:
    speaker_embedding = None
    print(f"Embedding load error: {e}")


def tts_fn(text):
    try:
        inputs = tokenizer(text, return_tensors="pt").to(device)

        with torch.no_grad():
            if speaker_embedding is not None:
                try:
                    output = model(**inputs, speaker_embeddings=speaker_embedding)
                except TypeError:
                    # Fall back to the default speaker if this model's forward()
                    # does not accept a speaker_embeddings argument.
                    output = model(**inputs)
            else:
                output = model(**inputs)

        # Guard against an empty waveform
        if output.waveform is None or output.waveform.shape[-1] == 0:
            # Somali: "The model produced no audio. The embedding may be faulty."
            raise gr.Error("❌ Model-ka ma soo saarin cod. Waxaa laga yaabaa in embedding uu cilad leeyahay.")

        audio = output.waveform.squeeze().cpu().numpy()

        # Write 16-bit PCM audio to a temporary WAV file and return its path
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            scipy.io.wavfile.write(
                f.name,
                rate=model.config.sampling_rate,
                data=(audio * 32767).astype(np.int16),
            )
            return f.name
    except gr.Error:
        raise
    except Exception as e:
        # Surface synthesis failures in the UI instead of returning a string
        # to the gr.Audio output, which would be misread as a file path.
        raise gr.Error(f"Error during synthesis: {e}")


gr.Interface(
    fn=tts_fn,
    inputs=gr.Textbox(label="Qor qoraalka Somali"),      # "Enter Somali text"
    outputs=gr.Audio(label="Codka la clone gareeyey"),   # "The cloned voice"
    title="Cod Somali ah oo la clone gareeyay",          # "A cloned Somali voice"
).launch()