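"""Gradio demo for German TTS (CaroTTS).

Two voices (Caro and Karlsson) are served either on ZeroGPU via AOT-compiled
PyTorch (.pt2) packages or on CPU via ONNX Runtime. Model files are fetched
from the Hugging Face Hub at startup.
"""
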
import os

import gradio as gr
import numpy as np
import onnxruntime as ort
import spaces
import torch
import torch._inductor
from huggingface_hub import hf_hub_download

from char_tokenizers import GermanCharsTokenizer
from german_text_preprocessor import preprocess_german_text


# --- Download Model Files from Hugging Face ---
def download_models():
    """
    Download model files from Hugging Face repositories at startup.
    Files are downloaded to the aot_package folder.
    """
    os.makedirs("aot_package", exist_ok=True)

    # Define the models and their files
    models_config = {
        "Warholt/CaroTTS-60M-DE-Karlsson": [
            "karlsson_fastpitch_encoder.pt2",
            "karlsson_fastpitch_decoder.pt2",
            "karlsson_hifigan.pt2",
            "karlsson_fastpitch.onnx",
            "karlsson_hifigan.onnx",
        ],
        "Warholt/CaroTTS-60M-DE-Caro": [
            "caro_fastpitch_encoder.pt2",
            "caro_fastpitch_decoder.pt2",
            "caro_hifigan.pt2",
            "caro_fastpitch.onnx",
            "caro_hifigan.onnx",
        ],
    }

    print("Downloading model files from Hugging Face...")
    for repo_id, files in models_config.items():
        for filename in files:
            print(f"  Downloading {filename} from {repo_id}...")
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir="aot_package",
            )
    print("All model files downloaded successfully!")


# Download models at startup
download_models()

# --- 1. Define a Wrapper for Lazy Loading ---
class LazyAotPackage(torch.nn.Module):
    """
    A wrapper that holds the path to an AOT package and loads it
    to the GPU only when forward() is called.
    """

    def __init__(self, package_path):
        super().__init__()
        self.package_path = package_path
        self.runner = None

    def forward(self, *args, **kwargs):
        # By the time forward() is called we are inside the @spaces.GPU-decorated
        # function, so a valid CUDA context exists.

        # If runner is not loaded, load it now.
        if self.runner is None:
            # Load directly to the active CUDA device
            self.runner = torch._inductor.aoti_load_package(self.package_path)

        # Run inference
        # We add a try/except block because if ZeroGPU swaps the underlying hardware
        # between requests, the old runner might be invalid.
        try:
            return self.runner(*args, **kwargs)
        except RuntimeError:
            # Context might be stale, reload
            self.runner = torch._inductor.aoti_load_package(
                self.package_path, device="cuda"
            )
            return self.runner(*args, **kwargs)


# --- 2. Initialize Global Components ---
TOKENIZER = GermanCharsTokenizer()

# Instead of a dict of raw paths, we instantiate our Lazy Loaders immediately.
# These act like standard PyTorch modules but use almost no RAM until inference.
MODELS = {
    "Caro": {
        "encoder": LazyAotPackage("aot_package/caro_fastpitch_encoder.pt2"),
        "decoder": LazyAotPackage("aot_package/caro_fastpitch_decoder.pt2"),
        "vocoder": LazyAotPackage("aot_package/caro_hifigan.pt2"),
    },
    "Karlsson": {
        "encoder": LazyAotPackage("aot_package/karlsson_fastpitch_encoder.pt2"),
        "decoder": LazyAotPackage("aot_package/karlsson_fastpitch_decoder.pt2"),
        "vocoder": LazyAotPackage("aot_package/karlsson_hifigan.pt2"),
    },
}
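
# Usage sketch: each LazyAotPackage is called like a regular module. The first
# call per process loads the .pt2 package onto the GPU; later calls reuse it:
#
#   encoder = MODELS["Karlsson"]["encoder"]
#   len_regulated, dec_lens, spk_emb = encoder(tokens, pitch, pace)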

# Initialize ONNX sessions for CPU inference
ONNX_SESSIONS = {
    "Caro": {
        "fastpitch": ort.InferenceSession("aot_package/caro_fastpitch.onnx"),
        "hifigan": ort.InferenceSession("aot_package/caro_hifigan.onnx"),
    },
    "Karlsson": {
        "fastpitch": ort.InferenceSession("aot_package/karlsson_fastpitch.onnx"),
        "hifigan": ort.InferenceSession("aot_package/karlsson_hifigan.onnx"),
    },
}
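
# Optional tuning: ONNX Runtime sessions can be created with a thread cap,
# which may help on shared CPU hardware (standard onnxruntime API; not applied
# here by default):
#
#   sess_options = ort.SessionOptions()
#   sess_options.intra_op_num_threads = 2
#   ort.InferenceSession("aot_package/caro_fastpitch.onnx", sess_options=sess_options)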


# --- 3. CPU Inference Function (ONNX) ---
def synthesize_speech_cpu(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech using ONNX models on CPU.
    """
    if not text.strip():
        return None

    # Preprocess text
    preprocessed_text = preprocess_german_text(text)

    # Tokenize text
    tokens = TOKENIZER.encode(preprocessed_text)

    # Prepare inputs for FastPitch
    paces = np.full(len(tokens), pace, dtype=np.float32)
    pitches = np.zeros(len(tokens), dtype=np.float32)

    inputs = {
        "text": np.array([tokens], dtype=np.int64),
        "pace": np.array([paces], dtype=np.float32),
        "pitch": np.array([pitches], dtype=np.float32),
    }

    # Get ONNX sessions for the selected voice
    fastpitch_session = ONNX_SESSIONS[voice]["fastpitch"]
    hifigan_session = ONNX_SESSIONS[voice]["hifigan"]

    # Generate spectrogram with FastPitch
    spec = fastpitch_session.run(None, inputs)[0]

    # Generate audio with HiFiGAN
    gan_inputs = {"spec": spec}
    audio = hifigan_session.run(None, gan_inputs)[0]

    # Convert to format expected by Gradio
    sample_rate = 44100
    audio_array = audio.squeeze()

    return (sample_rate, audio_array)
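
# Quick check (assuming the ONNX files have been downloaded), e.g.:
#   sr, wav = synthesize_speech_cpu("Guten Tag.", "Karlsson")
#   # sr == 44100, wav is a 1-D float array of audio samples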


# --- 4. GPU Inference Function ---
@spaces.GPU(duration=60)
def synthesize_speech(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech. The @spaces.GPU decorator ensures a GPU is assigned
    for the duration of this function.
    """
    if not text.strip():
        return None

    # Preprocess text: convert numbers, dates, decimals to spoken form
    preprocessed_text = preprocess_german_text(text)

    # Tokenize text
    tokens = TOKENIZER.encode(preprocessed_text)
    tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")

    # Prepare control parameters
    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32).to("cuda")
    pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32).to("cuda") * pace

    # Retrieve the correct lazy-loaded models
    # The .forward() call inside these objects will trigger the load to GPU
    encoder = MODELS[voice]["encoder"]
    decoder = MODELS[voice]["decoder"]
    vocoder = MODELS[voice]["vocoder"]

    with torch.inference_mode():
        # 1. Run Encoder (Loads .pt2 to GPU if needed -> Runs)
        len_regulated, dec_lens, spk_emb = encoder(
            tokens_tensor, pitch_tensor, pace_tensor
        )

        # 2. Run Decoder (Loads .pt2 to GPU if needed -> Runs)
        spec = decoder(len_regulated, dec_lens, spk_emb)

        # 3. Run Vocoder (Loads .pt2 to GPU if needed -> Runs)
        audio = vocoder(spec)

    # Convert to numpy and return
    sample_rate = 44100
    audio_array = audio.squeeze().cpu().numpy()

    return (sample_rate, audio_array)
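
# Note: the first GPU request per voice pays the one-time aoti_load_package
# cost inside the @spaces.GPU window; subsequent requests reuse the cached
# runners, and LazyAotPackage reloads them if ZeroGPU swapped the hardware.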


# --- 5. Combined Inference Function ---
def synthesize_speech_combined(
    text: str, voice: str, pace: float = 1.0, use_gpu: bool = False
):
    """
    Route to GPU or CPU inference based on user selection.
    """
    if use_gpu:
        return synthesize_speech(text, voice, pace)
    else:
        return synthesize_speech_cpu(text, voice, pace)


# --- 6. Gradio Interface ---
with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
    gr.Markdown(
        """
        # 🎙️ German Text-to-Speech
        Generate German speech using two different voices: **Caro** and **Karlsson**.
        Numbers, dates, and decimals are automatically converted to spoken form.
        """
    )

    with gr.Row():
        with gr.Column():
            default_text = (
                "Guten Tag. Herzlich willkommen zu dieser Demonstration. "
                "Es stehen Ihnen zwei Stimmen zur Auswahl: Caro und Karlsson. "
                "Sie können außerdem die Sprechgeschwindigkeit anpassen. "
                "Unten finden Sie ein paar Beispielsätze. Probieren Sie es aus!"
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                value=default_text,
                lines=3,
                max_length=1024,
            )
            char_counter = gr.Markdown(f"**Characters: {len(default_text)} / 1024**")
            voice_dropdown = gr.Dropdown(
                choices=["Caro", "Karlsson"], label="Voice", value="Karlsson"
            )
            pace_slider = gr.Slider(
                minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
            )
            use_gpu_checkbox = gr.Checkbox(
                label="Use GPU (ZeroGPU)",
                value=True,
                info="Enable for faster inference on GPU. Disable for CPU inference (slower but always available).",
            )
            generate_btn = gr.Button("Generate Speech 🔊", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    # Example sentences section
    gr.Markdown("### 📝 Example Sentences")
    gr.Examples(
        examples=[
            [
                "Die Bundeskanzlerin empfing heute den französischen Präsidenten zu einem Staatsbesuch in Berlin. Die Gespräche dauerten mehr als 3 Stunden."
            ],
            [
                "Am 15. März 2024 wird die neue Ausstellung im Museum eröffnet. Der Eintritt kostet 12,50 Euro für Erwachsene."
            ],
            [
                "In der verzauberten Bibliothek entdeckte die junge Magierin ein uraltes Buch, dessen Seiten im Mondlicht golden schimmerten."
            ],
            [
                "Der mutige Ritter zog sein Schwert und stellte sich dem feuerspeienden Drachen. Ein epischer Kampf begann auf dem Gipfel des Berges."
            ],
            [
                "Wussten Sie, dass die Große Mauer in China über 21000 Kilometer lang ist? Sie wurde über 2000 Jahre hinweg erbaut."
            ],
            [
                "Der menschliche Körper besteht zu etwa 60 Prozent aus Wasser. Ein erwachsener Mensch hat ungefähr 100000 Kilometer Blutgefäße."
            ],
            [
                "Die Temperaturen steigen heute auf bis zu 28 Grad Celsius. Am Wochenende wird mit Schauern und Gewittern gerechnet."
            ],
            [
                "Der Dax schloss heute bei 18456,73 Punkten, ein Plus von 2,3 Prozent. Der Euro notiert bei 1,0892 Dollar."
            ],
            [
                "Es war einmal in einem fernen Königreich, wo die Zeit anders verlief und die Sterne näher schienen. Dort lebte eine weise Eule, die alle Geheimnisse des Waldes kannte."
            ],
        ],
        inputs=text_input,
        label="Try these examples:",
    )

    # Update character counter
    def update_char_count(text):
        count = len(text)
        return f"**Characters: {count} / 1024**"

    text_input.change(
        fn=update_char_count,
        inputs=text_input,
        outputs=char_counter,
    )
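
    # Note: update_char_count runs on every edit of the textbox; the textbox's
    # max_length=1024 enforces the same limit that the counter displays.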

    generate_btn.click(
        fn=synthesize_speech_combined,
        inputs=[text_input, voice_dropdown, pace_slider, use_gpu_checkbox],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch()