File size: 2,379 Bytes
6eb64c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45ffb4a
6eb64c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import tempfile

import gradio as gr
import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Voice list: the speaker names offered in the voice dropdown. The selected
# name is interpolated verbatim into the style description sent to the model.
speaker_names = [
    "Thoma",
    "Mary",
    "Swapna",
    "Dinesh",
    "Meera",
    "Jatin",
    "Aakash",
    "Sneha",
    "Kabir",
    "Tisha",
    "Chingkhei",
    "Thoiba",
    "Priya",
    "Tarun",
    "Gauri",
    "Nisha",
    "Raghav",
    "Kavya",
    "Ravi",
    "Vikas",
    "Riya",
]

# Load model and tokenizers
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# The TTS model, loaded once at import time and moved to the chosen device.
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts")
model = model.to(device)

# `tokenizer` encodes the text to be spoken; `description_tokenizer` encodes
# the voice/style description and is loaded from the path recorded in the
# model's text-encoder config.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
description_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)

def generate_tts(prompt, speaker):
    """Synthesize ``prompt`` in the voice of ``speaker`` and save it as a WAV file.

    A fixed style description naming ``speaker`` is tokenized with
    ``description_tokenizer`` and ``prompt`` with ``tokenizer``; both are fed
    to ``model.generate`` and the resulting audio is written at
    ``model.config.sampling_rate``.

    Args:
        prompt: Text to be spoken. Must contain at least one non-whitespace
            character.
        speaker: Speaker name interpolated into the style description.

    Returns:
        Path of the newly written WAV file. Every call gets its own file in
        the system temporary directory; it is not deleted automatically.

    Raises:
        ValueError: If ``prompt`` is ``None``, empty, or whitespace only.
    """
    # Fail fast before any tokenization or model work on an empty request.
    if not prompt or not prompt.strip():
        raise ValueError("prompt must contain text to synthesize")

    description = (
        f"{speaker} speaks in a warm, neutral Indian English accent with a moderate pitch and steady pace. "
        "tone is friendly yet professional, making listeners feel welcome and comfortable. "
        "The voice is clear and well-articulated, with no regional inflections, and the recording is high-quality with no background noise."
    )
    desc_ids = description_tokenizer(description, return_tensors="pt").to(device)
    prompt_ids = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        generation = model.generate(
            input_ids=desc_ids.input_ids,
            attention_mask=desc_ids.attention_mask,
            prompt_input_ids=prompt_ids.input_ids,
            prompt_attention_mask=prompt_ids.attention_mask,
        )
    audio_arr = generation.cpu().numpy().squeeze()

    # Reserve a unique path per call so overlapping requests cannot overwrite
    # each other's audio and the working directory need not be writable. The
    # handle is closed before writing because soundfile reopens the path.
    with tempfile.NamedTemporaryFile(
        prefix="parler_tts_", suffix=".wav", delete=False
    ) as wav_file:
        wav_path = wav_file.name
    sf.write(wav_path, audio_arr, model.config.sampling_rate)
    return wav_path

def parler_gradio_interface(prompt, speaker):
    """Gradio callback: turn the entered text into speech in the chosen voice.

    Args:
        prompt: Text taken from the input textbox.
        speaker: Voice name picked in the dropdown.

    Returns:
        Path of the WAV file produced by ``generate_tts``.

    Raises:
        gr.Error: If ``prompt`` is empty or whitespace only, so the UI shows a
            readable message instead of running the model on nothing.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter some text to synthesize.")
    return generate_tts(prompt, speaker)

# Gradio UI
# Input widgets, listed in the positional order parler_gradio_interface expects.
_text_input = gr.Textbox(lines=2, label="Enter text (Indian English)")
_voice_input = gr.Dropdown(
    choices=speaker_names,
    value=speaker_names[0],  # pre-select the first voice in the list
    label="Choose Voice",
)
# The callback returns a file path, so the audio player is set up for paths.
_audio_output = gr.Audio(label="Generated Audio", type="filepath")

iface = gr.Interface(
    fn=parler_gradio_interface,
    inputs=[_text_input, _voice_input],
    outputs=_audio_output,
    title="Indic Parler-TTS Voice Generator",
    description="Enter your text, select a voice, and click Generate to hear Indian English TTS with chosen style.",
    allow_flagging="never",
)

if __name__ == "__main__":
    # Start the web app only when run as a script, not when imported.
    iface.launch()