# Spaces: Sleeping
from transformers import VitsModel, AutoTokenizer | |
import torch | |
import scipy.io.wavfile | |
import gradio as gr | |
import numpy as np | |
# Single checkpoint id so the model weights and the tokenizer always match.
MODEL_ID = "Toadoum/swahili-mms-tts-finetuned"

# Both objects are created once at import time as module-level globals.
model = VitsModel.from_pretrained(MODEL_ID, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
def text_to_speech(text):
    """Synthesize speech for ``text`` with the module-level model and tokenizer.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input
            produces no audio.

    Returns:
        A ``(sampling_rate, samples)`` tuple, where ``samples`` is a NumPy
        array, suitable for a Gradio audio output; or ``None`` when there is
        nothing to synthesize.
    """
    # Nothing to say: skip the model instead of feeding it an empty
    # token sequence.
    if not text or not text.strip():
        return None

    # The model is loaded with device_map="auto" and may therefore sit on an
    # accelerator; the input tensors must be moved to the same device or the
    # forward pass fails with a device mismatch.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        waveform = model(**inputs).waveform

    # Drop size-1 dimensions and copy the samples back to host memory.
    samples = waveform.squeeze().cpu().numpy()

    return (model.config.sampling_rate, samples)
# Example text pre-filled in the input box.
_SAMPLE_TEXT = """Neurotech Africa ni kampuni kutoka Tanzania inaongoza mapinduzi ya kidigitali nchini na barani Afrika kwa suluhisho za Akili bandia (AI).
Tunajenga AI ambayo inasaidia biashara kuboresha uzoefu wa wateja kupitia teknolojia za kisasa za mazungumzo."""

# Build the input and output widgets first, then wire them to the
# synthesis function.
_text_input = gr.Textbox(label="Enter Swahili Text", value=_SAMPLE_TEXT)
_audio_output = gr.Audio(label="Generated Speech")

demo = gr.Interface(
    fn=text_to_speech,
    inputs=_text_input,
    outputs=_audio_output,
    title="Swahili Text-to-Speech",
    description="Convert Swahili text to speech using a fine-tuned MMS-TTS model",
    allow_flagging="never",
)

# Start the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()