File size: 5,323 Bytes
4f54df3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a44583
4f54df3
 
 
 
 
2a44583
4f54df3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import gradio as gr
import os
import tempfile

import syn_hifigan as syn
#import syn_vgan as syn
#import syn_k_univnet_multi as syn

# Markdown shown at the top of the demo page (rendered through gr.Markdown).
description_text = """
# Multilingual TTS for Sámi languages (+ Finnish and Estonian)

Welcome! This is a demonstration of a multi-lingual and multi-speaker Text-to-Speech (TTS) model.
The demo is related to research on TTS for low-resource languages, and the effect of augmenting the training data with
areally close languages.


Disclaimers:
For convenience, the demo uses a pretrained HiFi-GAN vocoder which doesn't work well with male voices.
English does not work well due to the small dataset and orthographic transcriptions. Use the demo just for testing, not for frequent or commercial use.

 

"""
# Voice label shown in the "Voice" dropdown -> integer speaker id that speak()
# forwards to the synthesizer as `spkr`.
# Labels follow the pattern "name(language-code)"; the ids are not contiguous.
speakers = {
    "aj(sma)": 2,
    "am(sme)": 3,
    "ms(sme)": 4,
    "ln(sme)": 5,
    "mu(smj)": 7,
    "sa(smj)": 8,
    "bi(smj)": 10,  # label was missing its closing parenthesis
    "css(fin)": 11,
    "ti(fin)": 13,
    "ta(fin)": 14,
    "liivika(est)": 15,
    "indek(est)": 16,
    "kylli(est)": 17,
    "andreas(est)": 18,
    "peeter(est)": 19,
    "kersti(est)": 20,
    "M6670(eng)": 21,
    "M6097(eng)": 22,
    "F92(eng)": 23,
    "F9136(eng)": 24,
}

# Per-speaker mean pitch table, keyed by bare speaker name (no language suffix).
# Nothing in this file reads it at the moment: the only reference is the
# commented-out `mean_pitch=` argument in speak().
# NOTE(review): values are presumably in Hz -- confirm before re-enabling.
mean_pitch = dict(
    aj0=130,
    aj1=130,
    am=120,
    ms=120,
    ln=120,
    lo=120,
    mu=120,
    sa=120,
    kd=120,
    bi=120,
    ti=130,
    ta=115,
    liivika=120,
    indek=90,
    kylli=140,
    andreas=100,
    peeter=80,
    kersti=120,
)

# Language label shown in the "Language" dropdown -> integer id that speak()
# forwards to the synthesizer as `lang`. "guess" maps to -1; speak() pairs it
# with guess_lang=True. The remaining labels are numbered 0..5 in the order
# listed here.
languages = dict(
    zip(
        (
            "guess",
            "South Sámi",
            "North Sámi",
            "Lule Sámi",
            "Finnish",
            "Estonian",
            "English",
        ),
        range(-1, 6),
    )
)

# Example sentence pre-filled into the text box for each entry of the
# "Language" dropdown; update_text_prompt() looks the prompt up by that
# dropdown's value, so the keys mirror those of `languages`.
default_prompts = {
    "guess": "Sáhtta go esso-burgera luohti, Koskenkorva dahje carpool karajoiki gádjut árgabeaivveluođi?",

    "North Sámi": "Riektačállinreaidduid lassin Divvun-joavkkus ovdanit dál maiddái hállanteknologiijareaidduid.",

    # The stray leading space this prompt used to carry has been removed so it
    # is formatted like every other prompt.
    "South Sámi": "Buerie aerede gaajhkesh dovnesh jïh buerie båeteme dan bæjhkoehtæmman.",
    "Lule Sámi": "Sáme hållamsyntiesaj baktu máhttá adnegoahtet sáme gielajt ådå aktijvuodajn.",

    "Finnish": "Joka kuuseen kurkottaa, se katajaan kapsahtaa.",
    "Estonian": "Aprilli lõpp pani aiapidajate kannatuse jälle proovile – pärast mõnepäevast sooja saabub ootamatu külmalaine.",

    "English": "This obscure language is not supported by this model.",
}


# Passed as `share=` to tts_gui.launch() in the __main__ guard at the bottom of the file.
public = False

# System temporary directory; speak() builds the synthesizer's output_file path under it.
tempdir = tempfile.gettempdir()

# Module-level synthesizer instance, created once at import time and used by speak().
tts = syn.Synthesizer()


def speak(text, language, speaker, l_weight, s_weight, pace, postfilter):
    """Synthesize ``text`` and return the result for the Gradio audio output.

    Args:
        text: Text to synthesize. No text frontend is implemented; the only
            normalization is replacing "..." with the ellipsis character.
        language: Key of ``languages``. "guess" additionally sets the
            synthesizer's ``guess_lang`` flag.
        speaker: Key of ``speakers``.
        l_weight: Forwarded to the synthesizer as ``l_weight``.
        s_weight: Forwarded to the synthesizer as ``s_weight``.
        pace: Forwarded to the synthesizer as ``pace``.
        postfilter: Forwarded to the synthesizer as ``clarity``.

    Returns:
        A ``(22050, audio)`` tuple, where ``audio`` is whatever
        ``tts.speak`` returned.
        NOTE(review): 22050 is presumably the vocoder's sample rate -- confirm.

    Raises:
        gr.Error: If ``language`` or ``speaker`` is not a known key (for
            example when a dropdown was cleared), so the UI shows a readable
            message instead of a bare ``KeyError``.
    """
    if language not in languages:
        raise gr.Error(f"Unknown language: {language!r}")
    if speaker not in speakers:
        raise gr.Error(f"Unknown voice: {speaker!r}")

    # text frontend not implemented...
    text = text.replace("...", "…")

    # NOTE(review): every call uses the same output_file path, so concurrent
    # requests share it -- confirm whether the synthesizer's file output matters.
    audio = tts.speak(
        text,
        output_file=f'{tempdir}/tmp',
        lang=languages[language],
        spkr=speakers[speaker],
        l_weight=l_weight,
        s_weight=s_weight,
        pace=pace,
        clarity=postfilter,
        guess_lang=(language == "guess"),
    )
    return (22050, audio)

# update the text box based on language selection 
def update_text_prompt(language):
    """Build the Textbox update holding the default prompt for ``language``.

    A language with no entry in ``default_prompts`` yields an empty text value.
    """
    return gr.Textbox(value=default_prompts.get(language, ""))


#
# Assemble the demo page: description on top, input controls in the wider left
# column, the trigger button and the audio player in the right column.
with gr.Blocks() as tts_gui:
    gr.Markdown(description_text)

    with gr.Row():
        with gr.Column(scale=2):
            # Components are created in display order; each is kept in a
            # variable so the event handlers below can refer to it.
            text_input = gr.Textbox(value=default_prompts["North Sámi"], label="Text")
            language_dd = gr.Dropdown(list(languages), value="North Sámi", label="Language")
            speaker_dd = gr.Dropdown(list(speakers), value="ms(sme)", label="Voice")

            with gr.Row():
                l_weight_slider = gr.Slider(
                    label="Language Weight", value=1, minimum=0.5, maximum=1.5, step=0.05
                )
                s_weight_slider = gr.Slider(
                    label="Speaker Weight", value=1, minimum=0.5, maximum=1.5, step=0.05
                )

            pace_slider = gr.Slider(
                label="Speech Rate", value=1.0, minimum=0.5, maximum=1.5, step=0.05
            )
            postfilter_slider = gr.Slider(
                label="Post-processing", value=1.0, minimum=0., maximum=2, step=0.05
            )

        with gr.Column(scale=1):
            # Synthesis only runs when this button is clicked.
            speak_button = gr.Button("Speak", variant="primary")
            audio_output = gr.Audio(label="Output")

    # Selecting another language swaps the text box content for that
    # language's default prompt.
    language_dd.change(fn=update_text_prompt, inputs=[language_dd], outputs=[text_input])

    # The input order must match the positional parameters of speak().
    synthesis_inputs = [
        text_input,
        language_dd,
        speaker_dd,
        l_weight_slider,
        s_weight_slider,
        pace_slider,
        postfilter_slider,
    ]
    speak_button.click(fn=speak, inputs=synthesis_inputs, outputs=[audio_output])


if __name__ == "__main__":
    # `public` decides whether Gradio is asked for a share link.
    tts_gui.launch(share=public)