# Page header captured with the file (not part of the program):
#   Ragulravi's picture
#   Update app.py
#   45ffb4a verified
import torch
import soundfile as sf
import gradio as gr
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
# Voice list
# Names offered in the voice dropdown; the selected name is inserted verbatim
# into the style description passed to the TTS model.
speaker_names = [
    "Thoma",
    "Mary",
    "Swapna",
    "Dinesh",
    "Meera",
    "Jatin",
    "Aakash",
    "Sneha",
    "Kabir",
    "Tisha",
    "Chingkhei",
    "Thoiba",
    "Priya",
    "Tarun",
    "Gauri",
    "Nisha",
    "Raghav",
    "Kavya",
    "Ravi",
    "Vikas",
    "Riya",
]
# Load model and tokenizers
# Everything below runs once at import time so each request reuses the
# already-loaded weights.
_MODEL_ID = "ai4bharat/indic-parler-tts"

# Prefer the first CUDA device when one is visible; otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained(_MODEL_ID)
model = model.to(device)

# Tokenizer for the text that will be spoken (the "prompt").
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)

# Tokenizer for the voice/style description; it is loaded from the checkpoint
# name recorded in the model's text-encoder config rather than from _MODEL_ID.
_text_encoder_name = model.config.text_encoder._name_or_path
description_tokenizer = AutoTokenizer.from_pretrained(_text_encoder_name)
def generate_tts(prompt, speaker):
    """Synthesize ``prompt`` in the chosen voice and return a WAV file path.

    Args:
        prompt: Text to be spoken.
        speaker: Voice name; it is interpolated into the style description
            that conditions the model.

    Returns:
        Path of a newly created WAV file containing the generated audio,
        written at ``model.config.sampling_rate``.

    Raises:
        gr.Error: If ``prompt`` is empty or contains only whitespace.
    """
    # Local imports keep this block self-contained within the file.
    import os
    import tempfile

    # Reject blank input up front instead of running the model on nothing;
    # gr.Error is surfaced to the user in the Gradio UI.
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # NOTE(review): the second sentence starts with a lowercase "tone" (looks
    # like a dropped subject). The text is kept unchanged because it
    # conditions the model and any edit could alter the generated voice.
    description = (
        f"{speaker} speaks in a warm, neutral Indian English accent with a moderate pitch and steady pace. "
        "tone is friendly yet professional, making listeners feel welcome and comfortable. "
        "The voice is clear and well-articulated, with no regional inflections, and the recording is high-quality with no background noise."
    )

    # The description and the prompt use separate tokenizers.
    desc_ids = description_tokenizer(description, return_tensors="pt").to(device)
    prompt_ids = tokenizer(prompt, return_tensors="pt").to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        generation = model.generate(
            input_ids=desc_ids.input_ids,
            attention_mask=desc_ids.attention_mask,
            prompt_input_ids=prompt_ids.input_ids,
            prompt_attention_mask=prompt_ids.attention_mask,
        )

    audio_arr = generation.cpu().numpy().squeeze()

    # Write to a unique file per call. A single fixed filename lets
    # overlapping requests overwrite each other's audio before it is served.
    # The file is not deleted here; cleanup is left to the temp directory's
    # owner.
    fd, wav_path = tempfile.mkstemp(prefix="parler_tts_", suffix=".wav")
    os.close(fd)  # soundfile reopens the file by path
    sf.write(wav_path, audio_arr, model.config.sampling_rate)
    return wav_path
def parler_gradio_interface(prompt, speaker):
    """Gradio callback that forwards the UI inputs to ``generate_tts``.

    Args:
        prompt: Text entered in the textbox.
        speaker: Voice name chosen in the dropdown.

    Returns:
        Whatever ``generate_tts`` returns for these inputs.
    """
    audio_path = generate_tts(prompt, speaker)
    return audio_path
# Gradio UI
# Components are created in the same order as before: textbox, dropdown, audio.
_prompt_input = gr.Textbox(label="Enter text (Indian English)", lines=2)
_voice_input = gr.Dropdown(
    label="Choose Voice",
    choices=speaker_names,
    value=speaker_names[0],  # the first listed voice is preselected
)
_audio_output = gr.Audio(type="filepath", label="Generated Audio")

# The inputs list order matches the (prompt, speaker) parameters of the callback.
iface = gr.Interface(
    fn=parler_gradio_interface,
    inputs=[_prompt_input, _voice_input],
    outputs=_audio_output,
    title="Indic Parler-TTS Voice Generator",
    description="Enter your text, select a voice, and click Generate to hear Indian English TTS with chosen style.",
    allow_flagging="never",
)

# Start the web UI only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()