Spaces:
Sleeping
Sleeping
File size: 1,453 Bytes
91b340e 3d37603 91b340e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile
import gradio as gr
import numpy as np
# Single identifier shared by the model and tokenizer loaders below
MODEL_ID = "Toadoum/swahili-mms-tts-finetuned"

# Load the tokenizer and the VITS model once, at import time, so every
# request handled by text_to_speech reuses the same objects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = VitsModel.from_pretrained(MODEL_ID, device_map="auto")
def text_to_speech(text):
    """Synthesize speech for ``text`` using the module-level model and tokenizer.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only input
            is treated as "nothing to synthesize".

    Returns:
        A ``(sampling_rate, waveform)`` tuple for the Gradio audio component,
        where ``sampling_rate`` comes from ``model.config`` and ``waveform``
        is the model's squeezed ``waveform`` output as a NumPy array.
        Returns ``None`` when ``text`` is blank.
    """
    # Blank input: skip the model entirely instead of synthesizing from nothing.
    if text is None or not text.strip():
        return None

    # The model is loaded with device_map="auto", so it may not live on the
    # CPU; the token tensors must be moved to the same device before the
    # forward pass or PyTorch raises a device-mismatch error.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        waveform = model(**inputs).waveform

    # Drop singleton dimensions and bring the samples back to host memory.
    samples = waveform.squeeze().cpu().numpy()

    return (model.config.sampling_rate, samples)
# Sample Swahili passage pre-filled in the input box; the line break is part
# of the text itself.
DEFAULT_TEXT = """Neurotech Africa ni kampuni kutoka Tanzania inaongoza mapinduzi ya kidigitali nchini na barani Afrika kwa suluhisho za Akili bandia (AI).
Tunajenga AI ambayo inasaidia biashara kuboresha uzoefu wa wateja kupitia teknolojia za kisasa za mazungumzo."""

# Input and output widgets for the demo
text_input = gr.Textbox(label="Enter Swahili Text", value=DEFAULT_TEXT)
audio_output = gr.Audio(label="Generated Speech")

# Wire text_to_speech into a single-function Gradio UI with flagging disabled
demo = gr.Interface(
    fn=text_to_speech,
    inputs=text_input,
    outputs=audio_output,
    title="Swahili Text-to-Speech",
    description="Convert Swahili text to speech using a fine-tuned MMS-TTS model",
    allow_flagging="never",
)

# Start the app only when this file is executed as a script
if __name__ == "__main__":
    demo.launch()