import os

import spaces  # ZeroGPU helper; imported early, as the ZeroGPU docs recommend
import gradio as gr
import numpy as np
import onnxruntime as ort
import torch
import torch._inductor  # provides torch._inductor.aoti_load_package
from huggingface_hub import hf_hub_download

from char_tokenizers import GermanCharsTokenizer
from german_text_preprocessor import preprocess_german_text

# --- Download Model Files from Hugging Face ---
def download_models():
    """
    Download model files from Hugging Face repositories at startup.
    Files are downloaded to the aot_package folder.
    """
    os.makedirs("aot_package", exist_ok=True)

    # Define the models and their files
    models_config = {
        "Warholt/CaroTTS-60M-DE-Karlsson": [
            "karlsson_fastpitch_encoder.pt2",
            "karlsson_fastpitch_decoder.pt2",
            "karlsson_hifigan.pt2",
            "karlsson_fastpitch.onnx",
            "karlsson_hifigan.onnx",
        ],
        "Warholt/CaroTTS-60M-DE-Caro": [
            "caro_fastpitch_encoder.pt2",
            "caro_fastpitch_decoder.pt2",
            "caro_hifigan.pt2",
            "caro_fastpitch.onnx",
            "caro_hifigan.onnx",
        ],
    }

    print("Downloading model files from Hugging Face...")
    for repo_id, files in models_config.items():
        for filename in files:
            print(f"  Downloading {filename} from {repo_id}...")
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir="aot_package",
                local_dir_use_symlinks=False,
            )
    print("All model files downloaded successfully!")
# Download models at startup
download_models()

# --- 1. Define a Wrapper for Lazy Loading ---
class LazyAotPackage(torch.nn.Module):
    """
    A wrapper that holds the path to an AOT package and loads it
    to the GPU only when forward() is called.
    """

    def __init__(self, package_path):
        super().__init__()
        self.package_path = package_path
        self.runner = None

    def forward(self, *args, **kwargs):
        # We are now inside the @spaces.GPU decorated function,
        # so a valid GPU context exists.
        # If the runner is not loaded yet, load it now.
        if self.runner is None:
            # Load directly to the active CUDA device
            self.runner = torch._inductor.aoti_load_package(self.package_path)

        # Run inference inside try/except: if ZeroGPU swaps the underlying
        # hardware between requests, the old runner might be invalid.
        try:
            return self.runner(*args, **kwargs)
        except RuntimeError:
            # Context might be stale; reload and retry once.
            self.runner = torch._inductor.aoti_load_package(
                self.package_path, device="cuda"
            )
            return self.runner(*args, **kwargs)
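
# Why lazy loading: on ZeroGPU, CUDA is available only inside functions
# decorated with @spaces.GPU, so loading the AOT packages at import time
# would touch the GPU before one is attached. The wrapper defers the
# aoti_load_package call to the first forward() inside the GPU context:
#
#   encoder = LazyAotPackage("aot_package/caro_fastpitch_encoder.pt2")
#   # ...later, inside a @spaces.GPU-decorated function:
#   out = encoder(tokens, pitch, pace)  # first call triggers the load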

# --- 2. Initialize Global Components ---
TOKENIZER = GermanCharsTokenizer()

# Instead of a dict of raw paths, we instantiate our Lazy Loaders immediately.
# These act like standard PyTorch modules but use almost no RAM until inference.
MODELS = {
    "Caro": {
        "encoder": LazyAotPackage("aot_package/caro_fastpitch_encoder.pt2"),
        "decoder": LazyAotPackage("aot_package/caro_fastpitch_decoder.pt2"),
        "vocoder": LazyAotPackage("aot_package/caro_hifigan.pt2"),
    },
    "Karlsson": {
        "encoder": LazyAotPackage("aot_package/karlsson_fastpitch_encoder.pt2"),
        "decoder": LazyAotPackage("aot_package/karlsson_fastpitch_decoder.pt2"),
        "vocoder": LazyAotPackage("aot_package/karlsson_hifigan.pt2"),
    },
}

# Initialize ONNX sessions for CPU inference
ONNX_SESSIONS = {
    "Caro": {
        "fastpitch": ort.InferenceSession("aot_package/caro_fastpitch.onnx"),
        "hifigan": ort.InferenceSession("aot_package/caro_hifigan.onnx"),
    },
    "Karlsson": {
        "fastpitch": ort.InferenceSession("aot_package/karlsson_fastpitch.onnx"),
        "hifigan": ort.InferenceSession("aot_package/karlsson_hifigan.onnx"),
    },
}
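
# ort.InferenceSession selects the best available execution provider by
# default; to pin these sessions to CPU explicitly, the providers argument
# could be passed, e.g.
# ort.InferenceSession(path, providers=["CPUExecutionProvider"]).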

# --- 3. CPU Inference Function (ONNX) ---
def synthesize_speech_cpu(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech using ONNX models on CPU.
    """
    if not text.strip():
        return None

    # Preprocess text
    preprocessed_text = preprocess_german_text(text)

    # Tokenize text
    tokens = TOKENIZER.encode(preprocessed_text)

    # Prepare inputs for FastPitch
    paces = np.zeros(len(tokens), dtype=np.float32) + pace
    pitches = np.zeros(len(tokens), dtype=np.float32)
    inputs = {
        "text": np.array([tokens], dtype=np.int64),
        "pace": np.array([paces], dtype=np.float32),
        "pitch": np.array([pitches], dtype=np.float32),
    }
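
    # Expected shapes: "text" is (1, T) int64; "pace" and "pitch" are (1, T)
    # float32, carrying one control value per token.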
    # Get ONNX sessions for the selected voice
    fastpitch_session = ONNX_SESSIONS[voice]["fastpitch"]
    hifigan_session = ONNX_SESSIONS[voice]["hifigan"]

    # Generate spectrogram with FastPitch
    spec = fastpitch_session.run(None, inputs)[0]

    # Generate audio with HiFiGAN
    gan_inputs = {"spec": spec}
    audio = hifigan_session.run(None, gan_inputs)[0]

    # Convert to the format expected by Gradio
    sample_rate = 44100
    audio_array = audio.squeeze()
    return (sample_rate, audio_array)
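
# Both synthesis functions return the (sample_rate, numpy_array) tuple that
# gr.Audio(type="numpy") expects.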
# --- 4. GPU Inference Function ---
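# @spaces.GPU requests a ZeroGPU allocation per call; duration=60 caps the
# requested GPU time at roughly 60 seconds per invocation (assumed to be
# ample for a single synthesis).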
@spaces.GPU(duration=60)
def synthesize_speech(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech. The @spaces.GPU decorator ensures a GPU is assigned
    for the duration of this function.
    """
    if not text.strip():
        return None

    # Preprocess text: convert numbers, dates, decimals to spoken form
    preprocessed_text = preprocess_german_text(text)

    # Tokenize text
    tokens = TOKENIZER.encode(preprocessed_text)
    tokens_tensor = torch.tensor([tokens], dtype=torch.int64, device="cuda")

    # Prepare control parameters (zeros_like/ones_like inherit the CUDA device)
    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32)
    pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32) * pace

    # Retrieve the correct lazy-loaded models.
    # The forward() call inside these objects will trigger the load to GPU.
    encoder = MODELS[voice]["encoder"]
    decoder = MODELS[voice]["decoder"]
    vocoder = MODELS[voice]["vocoder"]

    with torch.inference_mode():
        # 1. Run Encoder (loads the .pt2 to the GPU if needed, then runs)
        len_regulated, dec_lens, spk_emb = encoder(
            tokens_tensor, pitch_tensor, pace_tensor
        )
        # 2. Run Decoder (loads the .pt2 to the GPU if needed, then runs)
        spec = decoder(len_regulated, dec_lens, spk_emb)
        # 3. Run Vocoder (loads the .pt2 to the GPU if needed, then runs)
        audio = vocoder(spec)

    # Convert to numpy and return
    sample_rate = 44100
    audio_array = audio.squeeze().cpu().numpy()
    return (sample_rate, audio_array)

# --- 5. Combined Inference Function ---
def synthesize_speech_combined(
    text: str, voice: str, pace: float = 1.0, use_gpu: bool = False
):
    """
    Route to GPU or CPU inference based on user selection.
    """
    if use_gpu:
        return synthesize_speech(text, voice, pace)
    else:
        return synthesize_speech_cpu(text, voice, pace)

# --- 6. Gradio Interface ---
with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
    gr.Markdown(
        """
        # 🎙️ German Text-to-Speech

        Generate German speech using two different voices: **Caro** and **Karlsson**.
        Numbers, dates, and decimals are automatically converted to spoken form.
        """
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to synthesize",
                value="Guten Tag. Herzlich willkommen zu dieser Demonstration. Es stehen Ihnen zwei Stimmen zur Auswahl: Caro und Karlsson. Sie können außerdem die Sprechgeschwindigkeit anpassen. Unten finden Sie ein paar Beispielsätze. Probieren Sie es aus!",
                lines=3,
                max_length=1024,
            )
            char_counter = gr.Markdown("**Characters: 0 / 1024**")
            voice_dropdown = gr.Dropdown(
                choices=["Caro", "Karlsson"], label="Voice", value="Karlsson"
            )
            pace_slider = gr.Slider(
                minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
            )
            use_gpu_checkbox = gr.Checkbox(
                label="Use GPU (ZeroGPU)",
                value=True,
                info="Enable for faster inference on GPU. Disable for CPU inference (slower but always available).",
            )
            generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")
    # Example sentences section
    gr.Markdown("### 📝 Example Sentences")
    gr.Examples(
        examples=[
            [
                "Die Bundeskanzlerin empfing heute den französischen Präsidenten zu einem Staatsbesuch in Berlin. Die Gespräche dauerten mehr als 3 Stunden."
            ],
            [
                "Am 15. März 2024 wird die neue Ausstellung im Museum eröffnet. Der Eintritt kostet 12,50 Euro für Erwachsene."
            ],
            [
                "In der verzauberten Bibliothek entdeckte die junge Magierin ein uraltes Buch, dessen Seiten im Mondlicht golden schimmerten."
            ],
            [
                "Der mutige Ritter zog sein Schwert und stellte sich dem feuerspeienden Drachen. Ein epischer Kampf begann auf dem Gipfel des Berges."
            ],
            [
                "Wussten Sie, dass die Große Mauer in China über 21000 Kilometer lang ist? Sie wurde über 2000 Jahre hinweg erbaut."
            ],
            [
                "Der menschliche Körper besteht zu etwa 60 Prozent aus Wasser. Ein erwachsener Mensch hat ungefähr 100000 Kilometer Blutgefäße."
            ],
            [
                "Die Temperaturen steigen heute auf bis zu 28 Grad Celsius. Am Wochenende wird mit Schauern und Gewittern gerechnet."
            ],
            [
                "Der Dax schloss heute bei 18456,73 Punkten, ein Plus von 2,3 Prozent. Der Euro notiert bei 1,0892 Dollar."
            ],
            [
                "Es war einmal in einem fernen Königreich, wo die Zeit anders verlief und die Sterne näher schienen. Dort lebte eine weise Eule, die alle Geheimnisse des Waldes kannte."
            ],
        ],
        inputs=text_input,
        label="Try these examples:",
    )
    # Update character counter
    def update_char_count(text):
        count = len(text)
        return f"**Characters: {count} / 1024**"

    text_input.change(
        fn=update_char_count,
        inputs=text_input,
        outputs=char_counter,
    )

    generate_btn.click(
        fn=synthesize_speech_combined,
        inputs=[text_input, voice_dropdown, pace_slider, use_gpu_checkbox],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch()