# NOTE(review): the original paste began with Hugging Face Spaces page residue
# ("Spaces: Running", file size, commit hashes, and a line-number gutter) that
# is not Python and broke parsing; it has been replaced by this comment.
import gradio as gr
import torch
import numpy as np
from transformers import VitsModel, AutoTokenizer
# Display name -> ISO 639-3 code used by Meta's MMS TTS checkpoints.
_MMS_ISO_CODES = {
    "English": "eng",
    "Hindi": "hin",
    "Tamil": "tam",
    "Malayalam": "mal",
    "Kannada": "kan",
    "Telugu": "tel",
}
# Display name -> Hugging Face repo id (e.g. "facebook/mms-tts-eng").
LANG_MODEL_MAP = {
    name: f"facebook/mms-tts-{code}" for name, code in _MMS_ISO_CODES.items()
}
# Run inference on the GPU when one is present; otherwise fall back to CPU.
_backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_backend)
# Per-model cache of (tokenizer, model) pairs, keyed by HF repo id, so each
# checkpoint is downloaded and moved to the device only once per process.
cache = {}
def load_model_and_tokenizer(language):
    """Return the (tokenizer, model) pair for *language*.

    Looks up the HF repo id in LANG_MODEL_MAP, loading the checkpoint and
    moving it to `device` on first use; later calls hit the `cache` dict.
    """
    repo_id = LANG_MODEL_MAP[language]
    if repo_id in cache:
        return cache[repo_id]
    tok = AutoTokenizer.from_pretrained(repo_id)
    mdl = VitsModel.from_pretrained(repo_id).to(device)
    cache[repo_id] = (tok, mdl)
    return tok, mdl
def tts(language, text):
    """Synthesize speech for *text* in *language*.

    Returns a (sample_rate, waveform) tuple in the numpy format Gradio's
    Audio component expects. Blank/whitespace-only input yields a single
    zero sample without loading any model.

    Fixes vs. original: the sample rate was hard-coded to 16000 — it is now
    read from the loaded model's config (MMS checkpoints declare it there),
    and the silence waveform is float32 to match model output dtype.
    """
    if not text.strip():
        return 16000, np.zeros(1, dtype=np.float32)  # empty waveform if no text
    tokenizer, model = load_model_and_tokenizer(language)
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model(**inputs)
    waveform = output.waveform.squeeze().cpu().numpy()
    # Prefer the checkpoint's declared rate; fall back to MMS's default 16 kHz.
    sample_rate = getattr(model.config, "sampling_rate", 16000)
    return sample_rate, waveform
# Build the Gradio UI: language selector + text box in, synthesized audio out.
_language_dropdown = gr.Dropdown(
    choices=list(LANG_MODEL_MAP.keys()),
    label="Select Language",
)
_text_input = gr.Textbox(label="Enter Text")
_audio_output = gr.Audio(label="Synthesized Speech", type="numpy")

iface = gr.Interface(
    fn=tts,
    inputs=[_language_dropdown, _text_input],
    outputs=_audio_output,
    title="Multilingual Text-to-Speech (MMS)",
    description="Generate speech from text using Meta's MMS models for English, Hindi, Tamil, Malayalam, Kannada and Telugu.",
)
# Launch the web app only when run as a script (not on import).
# Fix: the original line ended with a stray " |" page-scrape artifact,
# which is a Python syntax error.
if __name__ == "__main__":
    iface.launch()