Spaces:
Sleeping
Sleeping
import gradio as gr | |
import os | |
import tempfile | |
import syn_hifigan as syn | |
#import syn_vgan as syn | |
#import syn_k_univnet_multi as syn | |
# Markdown shown at the top of the demo page (rendered through gr.Markdown).
description_text = """
# Multilingual TTS for Sámi languages (+ Finnish and Estonian)
Welcome! This is a demonstration of a multi-lingual and multi-speaker Text-to-Speech (TTS) model.
The demo is related to research on TTS for low-resource languages, and the effect of augmenting the training data with
areally close languages.
Disclaimers:
For convenience, the demo uses a pretrained HiFi-GAN vocoder which doesn't work well with male voices.
English does not work well due to a small dataset and orthographic transcriptions. Use the demo just for testing, not for frequent or commercial use.
"""
# Voice label shown in the "Voice" dropdown -> speaker id passed to the
# synthesizer as `spkr`. Labels follow the pattern "<name>(<language code>)".
# The ids are not contiguous (1, 6, 9 and 12 are unused here).
speakers = {
    # South Sámi
    "aj(sma)": 2,
    # North Sámi
    "am(sme)": 3,
    "ms(sme)": 4,
    "ln(sme)": 5,
    # Lule Sámi
    "mu(smj)": 7,
    "sa(smj)": 8,
    "bi(smj)": 10,  # label previously lacked its closing parenthesis
    # Finnish
    "css(fin)": 11,
    "ti(fin)": 13,
    "ta(fin)": 14,
    # Estonian
    "liivika(est)": 15,
    "indek(est)": 16,
    "kylli(est)": 17,
    "andreas(est)": 18,
    "peeter(est)": 19,
    "kersti(est)": 20,
    # English
    "M6670(eng)": 21,
    "M6097(eng)": 22,
    "F92(eng)": 23,
    "F9136(eng)": 24,
}
# Per-speaker mean pitch values (presumably Hz -- TODO confirm the unit).
# NOTE(review): nothing in the active code reads this table; it is only
# mentioned in a commented-out `mean_pitch=` argument of the synthesis call.
# The keys are bare speaker names, not the "<name>(<lang>)" dropdown labels.
mean_pitch = {
    "aj0": 130, "aj1": 130,
    "am": 120, "ms": 120, "ln": 120, "lo": 120,
    "mu": 120, "sa": 120, "kd": 120, "bi": 120,
    "ti": 130, "ta": 115,
    "liivika": 120, "indek": 90, "kylli": 140,
    "andreas": 100, "peeter": 80, "kersti": 120,
}
# Language name shown in the "Language" dropdown -> language id passed to the
# synthesizer as `lang`. "guess" maps to -1 and additionally switches on the
# synthesizer's `guess_lang` flag in speak().
languages = {
    "guess": -1,
    "South Sámi": 0,
    "North Sámi": 1,
    "Lule Sámi": 2,
    "Finnish": 3,
    "Estonian": 4,
    "English": 5,
}
# Example sentence per language; the text box is reset to the matching entry
# whenever the language dropdown changes. Keys mirror those of `languages`.
default_prompts = {
    "guess": "Sáhtta go esso-burgera luohti, Koskenkorva dahje carpool karajoiki gádjut árgabeaivveluođi?",
    "North Sámi": "Riektačállinreaidduid lassin Divvun-joavkkus ovdanit dál maiddái hállanteknologiijareaidduid.",
    # An earlier, now unused, continuation of this prompt was "Guktie datnine?".
    "South Sámi": " Buerie aerede gaajhkesh dovnesh jïh buerie båeteme dan bæjhkoehtæmman.",
    "Lule Sámi": "Sáme hållamsyntiesaj baktu máhttá adnegoahtet sáme gielajt ådå aktijvuodajn.",
    "Finnish": "Joka kuuseen kurkottaa, se katajaan kapsahtaa.",
    "Estonian": "Aprilli lõpp pani aiapidajate kannatuse jälle proovile – pärast mõnepäevast sooja saabub ootamatu külmalaine.",
    "English": "This obscure language is not supported by this model.",
}
# Forwarded to `launch(share=...)` at the bottom of the file.
public = False

# Directory in which the synthesizer's output file is placed.
tempdir = tempfile.gettempdir()

# One synthesizer instance, created at import time and reused by every request.
tts = syn.Synthesizer()
def speak(text, language, speaker, l_weight, s_weight, pace, postfilter):  # pitch_shift, pitch_std):
    """Synthesize `text` and return it for the audio output component.

    Args:
        text: Text to synthesize.
        language: A key of `languages`; "guess" also enables the
            synthesizer's `guess_lang` option.
        speaker: A key of `speakers`.
        l_weight: Forwarded as `l_weight` ("Language Weight" slider).
        s_weight: Forwarded as `s_weight` ("Speaker Weight" slider).
        pace: Forwarded as `pace` ("Speech Rate" slider).
        postfilter: Forwarded as `clarity` ("Post-processing" slider).

    Returns:
        The tuple `(22050, audio)`, where `audio` is whatever `tts.speak`
        returned. NOTE(review): 22050 is presumably the model's sample rate
        in Hz -- confirm against the synthesizer.

    Raises:
        KeyError: If `language` or `speaker` is not a known key.
    """
    # There is no real text frontend yet; the only normalisation is folding
    # three ASCII dots into a single ellipsis character.
    normalized = text.replace("...", "…")

    guess_language = language == "guess"

    audio = tts.speak(
        normalized,
        output_file=f'{tempdir}/tmp',
        lang=languages[language],
        spkr=speakers[speaker],
        l_weight=l_weight,
        s_weight=s_weight,
        pace=pace,
        clarity=postfilter,
        guess_lang=guess_language,
    )  # , mean_pitch=mean_pitch[speaker])

    # Local playback of the generated file through the `play` command (only
    # when not `public`) used to happen here; it is intentionally disabled.
    return (22050, audio)
# update the text box based on language selection | |
def update_text_prompt(language):
    """Build the text box update holding the default prompt for `language`.

    A language without an entry in `default_prompts` yields an empty string.
    """
    return gr.Textbox(value=default_prompts.get(language, ""))
# | |
# Page layout and event wiring for the demo.
# NOTE(review): the nesting below was reconstructed from a copy of the file
# that had lost its indentation. In particular, confirm whether the "Speech
# Rate" and "Post-processing" sliders belong inside the inner Row next to the
# two weight sliders or below it.
with gr.Blocks() as tts_gui:
    gr.Markdown(description_text)  # "## Multilingual TTS for Sámi languages (+ Finnish and Estonian)")

    with gr.Row():
        # Left: text and synthesis controls.
        with gr.Column(scale=2):
            text_input = gr.Textbox(label="Text", value=default_prompts["North Sámi"])
            language_dd = gr.Dropdown(list(languages), label="Language", value="North Sámi")
            speaker_dd = gr.Dropdown(list(speakers), label="Voice", value="ms(sme)")
            with gr.Row():
                l_weight_slider = gr.Slider(
                    minimum=0.5, maximum=1.5, step=0.05, value=1, label="Language Weight"
                )
                s_weight_slider = gr.Slider(
                    minimum=0.5, maximum=1.5, step=0.05, value=1, label="Speaker Weight"
                )
            pace_slider = gr.Slider(
                minimum=0.5, maximum=1.5, step=0.05, value=1.0, label="Speech Rate"
            )
            postfilter_slider = gr.Slider(
                minimum=0., maximum=2, step=0.05, value=1.0, label="Post-processing"
            )

        # Right: trigger button and the resulting audio.
        with gr.Column(scale=1):
            speak_button = gr.Button("Speak", variant="primary")
            audio_output = gr.Audio(label="Output")

    # Picking another language replaces the text with that language's prompt.
    language_dd.change(fn=update_text_prompt, inputs=[language_dd], outputs=[text_input])

    # The order here must match the positional parameters of speak().
    synthesis_inputs = [
        text_input,
        language_dd,
        speaker_dd,
        l_weight_slider,
        s_weight_slider,
        pace_slider,
        postfilter_slider,
    ]
    speak_button.click(fn=speak, inputs=synthesis_inputs, outputs=[audio_output])
if __name__ == "__main__":
    # Start the web UI only when run as a script; `public` is forwarded as
    # the `share` option of launch().
    tts_gui.launch(share=public)