import os

import gradio as gr
import numpy as np
import onnxruntime as ort
import spaces
import torch
import torch._inductor
from huggingface_hub import hf_hub_download

from char_tokenizers import GermanCharsTokenizer
from german_text_preprocessor import preprocess_german_text
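# char_tokenizers and german_text_preprocessor are presumably local modules
# shipped alongside app.py in this Space (they are not PyPI packages).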

# --- Download Model Files from Hugging Face ---
def download_models():
    """
    Download model files from Hugging Face repositories at startup.
    Files are downloaded to the aot_package folder.
    """
    os.makedirs("aot_package", exist_ok=True)

    # Define the models and their files
    models_config = {
        "Warholt/CaroTTS-60M-DE-Karlsson": [
            "karlsson_fastpitch_encoder.pt2",
            "karlsson_fastpitch_decoder.pt2",
            "karlsson_hifigan.pt2",
            "karlsson_fastpitch.onnx",
            "karlsson_hifigan.onnx",
        ],
        "Warholt/CaroTTS-60M-DE-Caro": [
            "caro_fastpitch_encoder.pt2",
            "caro_fastpitch_decoder.pt2",
            "caro_hifigan.pt2",
            "caro_fastpitch.onnx",
            "caro_hifigan.onnx",
        ],
    }

    print("Downloading model files from Hugging Face...")
    for repo_id, files in models_config.items():
        for filename in files:
            print(f"  Downloading {filename} from {repo_id}...")
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir="aot_package",
            )
    print("All model files downloaded successfully!")

# Download models at startup
download_models()
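# Note: hf_hub_download reuses files that are already present and up to date
# in local_dir, so repeated Space restarts should not re-download everything.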

# --- 1. Define a Wrapper for Lazy Loading ---
class LazyAotPackage(torch.nn.Module):
    """
    A wrapper that holds the path to an AOT package and loads it
    to the GPU only when forward() is called.
    """

    def __init__(self, package_path):
        super().__init__()
        self.package_path = package_path
        self.runner = None

    def forward(self, *args, **kwargs):
        # We are now inside the @spaces.GPU decorated function,
        # so a valid GPU context exists.
        # If the runner is not loaded, load it now.
        if self.runner is None:
            # Load directly to the active CUDA device
            self.runner = torch._inductor.aoti_load_package(self.package_path)

        # Run inference. The try/except is here because if ZeroGPU swaps the
        # underlying hardware between requests, the old runner might be invalid.
        try:
            return self.runner(*args, **kwargs)
        except RuntimeError:
            # Context might be stale, reload
            self.runner = torch._inductor.aoti_load_package(
                self.package_path, device="cuda"
            )
            return self.runner(*args, **kwargs)

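# For reference, a rough sketch of how a .pt2 AOTInductor package like the
# ones loaded above is typically produced offline (PyTorch >= 2.5 API;
# `model` and `example_inputs` are hypothetical placeholders, not part of
# this app):
#
#   ep = torch.export.export(model, example_inputs)
#   torch._inductor.aoti_compile_and_package(ep, package_path="model.pt2")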

# --- 2. Initialize Global Components ---
TOKENIZER = GermanCharsTokenizer()

# Instead of a dict of raw paths, we instantiate our lazy loaders immediately.
# These act like standard PyTorch modules but use almost no RAM until inference.
MODELS = {
    "Caro": {
        "encoder": LazyAotPackage("aot_package/caro_fastpitch_encoder.pt2"),
        "decoder": LazyAotPackage("aot_package/caro_fastpitch_decoder.pt2"),
        "vocoder": LazyAotPackage("aot_package/caro_hifigan.pt2"),
    },
    "Karlsson": {
        "encoder": LazyAotPackage("aot_package/karlsson_fastpitch_encoder.pt2"),
        "decoder": LazyAotPackage("aot_package/karlsson_fastpitch_decoder.pt2"),
        "vocoder": LazyAotPackage("aot_package/karlsson_hifigan.pt2"),
    },
}

# Initialize ONNX sessions for CPU inference
ONNX_SESSIONS = {
    "Caro": {
        "fastpitch": ort.InferenceSession("aot_package/caro_fastpitch.onnx"),
        "hifigan": ort.InferenceSession("aot_package/caro_hifigan.onnx"),
    },
    "Karlsson": {
        "fastpitch": ort.InferenceSession("aot_package/karlsson_fastpitch.onnx"),
        "hifigan": ort.InferenceSession("aot_package/karlsson_hifigan.onnx"),
    },
}
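# By default onnxruntime picks from the providers available in the build; the
# execution provider could also be pinned explicitly, e.g.
#   ort.InferenceSession(path, providers=["CPUExecutionProvider"])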

# --- 3. CPU Inference Function (ONNX) ---
def synthesize_speech_cpu(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech using ONNX models on CPU.
    """
    if not text.strip():
        return None

    # Preprocess text
    preprocessed_text = preprocess_german_text(text)

    # Tokenize text
    tokens = TOKENIZER.encode(preprocessed_text)

    # Prepare inputs for FastPitch
    paces = np.full(len(tokens), pace, dtype=np.float32)
    pitches = np.zeros(len(tokens), dtype=np.float32)
    inputs = {
        "text": np.array([tokens], dtype=np.int64),
        "pace": np.array([paces], dtype=np.float32),
        "pitch": np.array([pitches], dtype=np.float32),
    }

    # Get ONNX sessions for the selected voice
    fastpitch_session = ONNX_SESSIONS[voice]["fastpitch"]
    hifigan_session = ONNX_SESSIONS[voice]["hifigan"]

    # Generate spectrogram with FastPitch
    spec = fastpitch_session.run(None, inputs)[0]

    # Generate audio with HiFiGAN
    gan_inputs = {"spec": spec}
    audio = hifigan_session.run(None, gan_inputs)[0]

    # Convert to format expected by Gradio
    sample_rate = 44100
    audio_array = audio.squeeze()
    return (sample_rate, audio_array)
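# Quick local smoke test (a sketch; assumes the ONNX files have been
# downloaded into aot_package/):
#   sr, wav = synthesize_speech_cpu("Guten Tag.", "Karlsson")
#   print(sr, wav.shape)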

# --- 4. GPU Inference Function ---
@spaces.GPU(duration=60)
def synthesize_speech(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech. The @spaces.GPU decorator ensures a GPU is assigned
    for the duration of this function.
    """
    if not text.strip():
        return None

    # Preprocess text: convert numbers, dates, decimals to spoken form
    preprocessed_text = preprocess_german_text(text)

    # Tokenize text
    tokens = TOKENIZER.encode(preprocessed_text)
    tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")

    # Prepare control parameters
    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32).to("cuda")
    pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32).to("cuda") * pace

    # Retrieve the correct lazy-loaded models.
    # The forward() call inside these objects will trigger the load to GPU.
    encoder = MODELS[voice]["encoder"]
    decoder = MODELS[voice]["decoder"]
    vocoder = MODELS[voice]["vocoder"]

    with torch.inference_mode():
        # 1. Run encoder (loads .pt2 to GPU if needed -> runs)
        len_regulated, dec_lens, spk_emb = encoder(
            tokens_tensor, pitch_tensor, pace_tensor
        )
        # 2. Run decoder (loads .pt2 to GPU if needed -> runs)
        spec = decoder(len_regulated, dec_lens, spk_emb)
        # 3. Run vocoder (loads .pt2 to GPU if needed -> runs)
        audio = vocoder(spec)

    # Convert to numpy and return
    sample_rate = 44100
    audio_array = audio.squeeze().cpu().numpy()
    return (sample_rate, audio_array)
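# On ZeroGPU Spaces, the GPU is attached when a @spaces.GPU function is called
# and released when it returns; duration=60 tells the scheduler the expected
# upper bound in seconds, which is used for queueing and quota accounting.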

# --- 5. Combined Inference Function ---
def synthesize_speech_combined(
    text: str, voice: str, pace: float = 1.0, use_gpu: bool = False
):
    """
    Route to GPU or CPU inference based on user selection.
    """
    if use_gpu:
        return synthesize_speech(text, voice, pace)
    else:
        return synthesize_speech_cpu(text, voice, pace)

# --- 6. Gradio Interface ---
with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
    gr.Markdown(
        """
# 🎙️ German Text-to-Speech
Generate German speech using two different voices: **Caro** and **Karlsson**.
Numbers, dates, and decimals are automatically converted to spoken form.
"""
    )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to synthesize",
                value="Guten Tag. Herzlich willkommen zu dieser Demonstration. Es stehen Ihnen zwei Stimmen zur Auswahl: Caro und Karlsson. Sie können außerdem die Sprechgeschwindigkeit anpassen. Unten finden Sie ein paar Beispielsätze. Probieren Sie es aus!",
                lines=3,
                max_length=1024,
            )
            char_counter = gr.Markdown("**Characters: 0 / 1024**")
            voice_dropdown = gr.Dropdown(
                choices=["Caro", "Karlsson"], label="Voice", value="Karlsson"
            )
            pace_slider = gr.Slider(
                minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
            )
            use_gpu_checkbox = gr.Checkbox(
                label="Use GPU (ZeroGPU)",
                value=True,
                info="Enable for faster inference on GPU. Disable for CPU inference (slower but always available).",
            )
            generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    # Example sentences section
    gr.Markdown("### 📝 Example Sentences")
    gr.Examples(
        examples=[
            [
                "Die Bundeskanzlerin empfing heute den französischen Präsidenten zu einem Staatsbesuch in Berlin. Die Gespräche dauerten mehr als 3 Stunden."
            ],
            [
                "Am 15. März 2024 wird die neue Ausstellung im Museum eröffnet. Der Eintritt kostet 12,50 Euro für Erwachsene."
            ],
            [
                "In der verzauberten Bibliothek entdeckte die junge Magierin ein uraltes Buch, dessen Seiten im Mondlicht golden schimmerten."
            ],
            [
                "Der mutige Ritter zog sein Schwert und stellte sich dem feuerspeienden Drachen. Ein epischer Kampf begann auf dem Gipfel des Berges."
            ],
            [
                "Wussten Sie, dass die Große Mauer in China über 21000 Kilometer lang ist? Sie wurde über 2000 Jahre hinweg erbaut."
            ],
            [
                "Der menschliche Körper besteht zu etwa 60 Prozent aus Wasser. Ein erwachsener Mensch hat ungefähr 100000 Kilometer Blutgefäße."
            ],
            [
                "Die Temperaturen steigen heute auf bis zu 28 Grad Celsius. Am Wochenende wird mit Schauern und Gewittern gerechnet."
            ],
            [
                "Der Dax schloss heute bei 18456,73 Punkten, ein Plus von 2,3 Prozent. Der Euro notiert bei 1,0892 Dollar."
            ],
            [
                "Es war einmal in einem fernen Königreich, wo die Zeit anders verlief und die Sterne näher schienen. Dort lebte eine weise Eule, die alle Geheimnisse des Waldes kannte."
            ],
        ],
        inputs=text_input,
        label="Try these examples:",
    )

    # Update character counter
    def update_char_count(text):
        count = len(text)
        return f"**Characters: {count} / 1024**"

    text_input.change(
        fn=update_char_count,
        inputs=text_input,
        outputs=char_counter,
    )

    generate_btn.click(
        fn=synthesize_speech_combined,
        inputs=[text_input, voice_dropdown, pace_slider, use_gpu_checkbox],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch()