Spaces:
Sleeping
Sleeping
File size: 5,323 Bytes
4f54df3 2a44583 4f54df3 2a44583 4f54df3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
import gradio as gr
import os
import tempfile
import syn_hifigan as syn
#import syn_vgan as syn
#import syn_k_univnet_multi as syn
# Markdown shown at the top of the demo page (passed to gr.Markdown below).
# This text is user-facing: edit wording with care.
description_text = """
# Multilingual TTS for Sámi languages (+ Finnish and Estonian)
Welcome! This is a demonstration of a multi-lingual and multi-speaker Text-to-Speech (TTS) model.
The demo is related to research on TTS for low-resource languages, and the effect of augmenting the training data with
areally close languages.
Disclaimers:
For convenience, the demo uses pretrained HiFi-GAN vocoder which doesn't work well with male voices.
English does not work well due to small dataset and orthographic transcriptions. Use the demo just for testing, not for frequent or commercial use.
"""
# Voice label shown in the "Voice" dropdown -> speaker id handed to the
# synthesizer as `spkr`.  Labels follow the pattern "name(language tag)".
# The ids are not contiguous (e.g. 6, 9 and 12 are absent).
speakers = {
    "aj(sma)": 2,
    "am(sme)": 3,
    "ms(sme)": 4,
    "ln(sme)": 5,
    "mu(smj)": 7,
    "sa(smj)": 8,
    "bi(smj)": 10,
    "css(fin)": 11,
    "ti(fin)": 13,
    "ta(fin)": 14,
    "liivika(est)": 15,
    "indek(est)": 16,
    "kylli(est)": 17,
    "andreas(est)": 18,
    "peeter(est)": 19,
    "kersti(est)": 20,
    "M6670(eng)": 21,
    "M6097(eng)": 22,
    "F92(eng)": 23,
    "F9136(eng)": 24,
}
# Per-speaker mean pitch values (presumably in Hz -- TODO confirm).
# Not referenced by any active code in this file.  Note that the keys are
# bare speaker names, not the "name(lang)" labels used by `speakers`.
mean_pitch = {
    "aj0": 130,
    "aj1": 130,
    "am": 120,
    "ms": 120,
    "ln": 120,
    "lo": 120,
    "mu": 120,
    "sa": 120,
    "kd": 120,
    "bi": 120,
    "ti": 130,
    "ta": 115,
    "liivika": 120,
    "indek": 90,
    "kylli": 140,
    "andreas": 100,
    "peeter": 80,
    "kersti": 120,
}
# Language label shown in the "Language" dropdown -> language id handed to
# the synthesizer as `lang`.  "guess" maps to -1; speak() additionally turns
# on the synthesizer's `guess_lang` flag for that choice.
languages = {
    "guess": -1,
    "South Sámi": 0,
    "North Sámi": 1,
    "Lule Sámi": 2,
    "Finnish": 3,
    "Estonian": 4,
    "English": 5,
}
# Sample sentence per language label.  The keys mirror those of `languages`;
# update_text_prompt() looks a label up here whenever the language dropdown
# changes, and the North Sámi entry is the text box's initial value.
default_prompts = {
    "guess": "Sáhtta go esso-burgera luohti, Koskenkorva dahje carpool karajoiki gádjut árgabeaivveluođi?",
    "North Sámi": "Riektačállinreaidduid lassin Divvun-joavkkus ovdanit dál maiddái hállanteknologiijareaidduid.",
    "South Sámi": " Buerie aerede gaajhkesh dovnesh jïh buerie båeteme dan bæjhkoehtæmman.",
    "Lule Sámi": "Sáme hållamsyntiesaj baktu máhttá adnegoahtet sáme gielajt ådå aktijvuodajn.",
    "Finnish": "Joka kuuseen kurkottaa, se katajaan kapsahtaa.",
    "Estonian": "Aprilli lõpp pani aiapidajate kannatuse jälle proovile – pärast mõnepäevast sooja saabub ootamatu külmalaine.",
    "English": "This obscure language is not supported by this model.",
}
# Forwarded as `share=` to tts_gui.launch() at the bottom of the file.
public = False
# System temp directory; speak() points the synthesizer's output_file into it.
tempdir = tempfile.gettempdir()
# One synthesizer instance, created at import time and reused by every
# speak() call.  Which backend is used is decided by the `import ... as syn`
# line at the top of the file.
tts = syn.Synthesizer()
def speak(text, language, speaker, l_weight, s_weight, pace, postfilter):
    """Synthesize `text` and return it as a ``(sample_rate, audio)`` tuple.

    This is the click handler of the "Speak" button; its return value is
    routed to the audio output component.

    Args:
        text: Text to synthesize.
        language: A key of `languages`.  The special key ``"guess"`` passes
            language id -1 and sets the synthesizer's ``guess_lang`` flag.
        speaker: A key of `speakers`; its value is passed on as ``spkr``.
        l_weight: Forwarded unchanged as ``l_weight``.
        s_weight: Forwarded unchanged as ``s_weight``.
        pace: Forwarded unchanged as ``pace``.
        postfilter: Forwarded as the synthesizer's ``clarity`` argument.

    Returns:
        ``(22050, audio)`` where ``audio`` is whatever ``tts.speak`` returns.

    Raises:
        KeyError: If `language` or `speaker` is not a known label.
    """
    # No real text frontend exists yet; the only normalization is collapsing
    # a three-dot ellipsis into the single ellipsis character.
    text = text.replace("...", "…")

    # NOTE(review): every call hands the synthesizer the same output path, so
    # concurrent requests share one file -- confirm that nothing relies on it.
    audio = tts.speak(
        text,
        output_file=f'{tempdir}/tmp',
        lang=languages[language],
        spkr=speakers[speaker],
        l_weight=l_weight,
        s_weight=s_weight,
        pace=pace,
        clarity=postfilter,
        guess_lang=(language == "guess"),
    )

    # 22050 is presumably the vocoder's output sample rate -- TODO confirm.
    return (22050, audio)
# Change handler for the language dropdown: swaps in that language's sample text.
def update_text_prompt(language):
    """Build a Textbox whose value is the default prompt for `language`.

    A label with no entry in `default_prompts` yields an empty text box.
    """
    return gr.Textbox(value=default_prompts.get(language, ""))
#
# Build the Gradio UI and wire its events.
# NOTE(review): this copy of the file has lost its leading indentation, so the
# nesting of the `with` blocks below is not visible here; restore it (and
# confirm which sliders belong to the inner row) before running the module.
with gr.Blocks() as tts_gui:
# Page header and disclaimers.
gr.Markdown(description_text) #"## Multilingual TTS for Sámi languages (+ Finnish and Estonian)")
with gr.Row():
with gr.Column(scale=2):
# Input widgets; each is bound to a name so the event wiring below can refer to it.
# Initial state: North Sámi prompt, North Sámi language, voice "ms(sme)".
text_input = gr.Textbox(label="Text", value=default_prompts["North Sámi"])
language_dd = gr.Dropdown(list(languages.keys()), label="Language", value="North Sámi")
speaker_dd = gr.Dropdown(list(speakers.keys()), label="Voice", value="ms(sme)")
with gr.Row():
# Both weights range over 0.5-1.5 in 0.05 steps and start at 1.
l_weight_slider = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1, label="Language Weight")
s_weight_slider = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1, label="Speaker Weight")
# Speech rate uses the same 0.5-1.5 range; post-processing spans 0-2.
pace_slider = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1.0, label="Speech Rate")
postfilter_slider = gr.Slider(minimum=0., maximum=2, step=0.05, value=1.0, label="Post-processing")
with gr.Column(scale=1):
# Button that triggers synthesis, and the player that receives the result.
speak_button = gr.Button("Speak", variant="primary")
audio_output = gr.Audio(label="Output")
# Picking a language replaces the text box content with that language's default prompt.
language_dd.change(
fn=update_text_prompt,
inputs=[language_dd],
outputs=[text_input]
)
# Clicking "Speak" calls speak(); the inputs are listed in the same order as
# speak()'s parameters (text, language, speaker, l_weight, s_weight, pace, postfilter).
speak_button.click(
fn=speak,
inputs=[
text_input,
language_dd,
speaker_dd,
l_weight_slider,
s_weight_slider,
pace_slider,
postfilter_slider
],
outputs=[audio_output]
)
# Launch the UI only when the file is executed as a script (not on import).
# The module-level `public` flag is forwarded as launch()'s `share` argument.
if __name__ == "__main__":
    tts_gui.launch(share=public)
|