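"""Gradio demo for German TTS (CaroTTS).

Two voices (Caro and Karlsson) are served either on ZeroGPU via AOT-compiled
PyTorch (.pt2) packages or on CPU via ONNX Runtime. Model files are fetched
from the Hugging Face Hub at startup.
"""
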
import os

import gradio as gr
import numpy as np
import onnxruntime as ort
import spaces
import torch
import torch._inductor
from huggingface_hub import hf_hub_download

from char_tokenizers import GermanCharsTokenizer
from german_text_preprocessor import preprocess_german_text


# --- Download Model Files from Hugging Face ---
def download_models():
    """
    Download model files from Hugging Face repositories at startup.
    Files are downloaded to the aot_package folder.
    """
    os.makedirs("aot_package", exist_ok=True)

    # Define the models and their files
    models_config = {
        "Warholt/CaroTTS-60M-DE-Karlsson": [
            "karlsson_fastpitch_encoder.pt2",
            "karlsson_fastpitch_decoder.pt2",
            "karlsson_hifigan.pt2",
            "karlsson_fastpitch.onnx",
            "karlsson_hifigan.onnx",
        ],
        "Warholt/CaroTTS-60M-DE-Caro": [
            "caro_fastpitch_encoder.pt2",
            "caro_fastpitch_decoder.pt2",
            "caro_hifigan.pt2",
            "caro_fastpitch.onnx",
            "caro_hifigan.onnx",
        ],
    }

    print("Downloading model files from Hugging Face...")
    for repo_id, files in models_config.items():
        for filename in files:
            print(f"  Downloading {filename} from {repo_id}...")
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir="aot_package",
            )
    print("All model files downloaded successfully!")


# Download models at startup
download_models()

# --- 1. Define a Wrapper for Lazy Loading ---
class LazyAotPackage(torch.nn.Module):
    """
    A wrapper that holds the path to an AOT package and loads it
    to the GPU only when forward() is called.
    """

    def __init__(self, package_path):
        super().__init__()
        self.package_path = package_path
        self.runner = None

    def forward(self, *args, **kwargs):
        # By the time forward() is called we are inside the @spaces.GPU-decorated
        # function, so a valid CUDA context exists.

        # If runner is not loaded, load it now.
        if self.runner is None:
            # Load directly to the active CUDA device
            self.runner = torch._inductor.aoti_load_package(self.package_path)

        # Run inference
        # We add a try/except block because if ZeroGPU swaps the underlying hardware
        # between requests, the old runner might be invalid.
        try:
            return self.runner(*args, **kwargs)
        except RuntimeError:
            # Context might be stale, reload
            self.runner = torch._inductor.aoti_load_package(
                self.package_path, device="cuda"
            )
            return self.runner(*args, **kwargs)


# --- 2. Initialize Global Components ---
TOKENIZER = GermanCharsTokenizer()

# Instead of a dict of raw paths, we instantiate our Lazy Loaders immediately.
# These act like standard PyTorch modules but use almost no RAM until inference.
MODELS = {
    "Caro": {
        "encoder": LazyAotPackage("aot_package/caro_fastpitch_encoder.pt2"),
        "decoder": LazyAotPackage("aot_package/caro_fastpitch_decoder.pt2"),
        "vocoder": LazyAotPackage("aot_package/caro_hifigan.pt2"),
    },
    "Karlsson": {
        "encoder": LazyAotPackage("aot_package/karlsson_fastpitch_encoder.pt2"),
        "decoder": LazyAotPackage("aot_package/karlsson_fastpitch_decoder.pt2"),
        "vocoder": LazyAotPackage("aot_package/karlsson_hifigan.pt2"),
    },
}
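
# Usage sketch: each LazyAotPackage is called like a regular module. The first
# call per process loads the .pt2 package onto the GPU; later calls reuse it:
#
#   encoder = MODELS["Karlsson"]["encoder"]
#   len_regulated, dec_lens, spk_emb = encoder(tokens, pitch, pace)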

# Initialize ONNX sessions for CPU inference
ONNX_SESSIONS = {
    "Caro": {
        "fastpitch": ort.InferenceSession("aot_package/caro_fastpitch.onnx"),
        "hifigan": ort.InferenceSession("aot_package/caro_hifigan.onnx"),
    },
    "Karlsson": {
        "fastpitch": ort.InferenceSession("aot_package/karlsson_fastpitch.onnx"),
        "hifigan": ort.InferenceSession("aot_package/karlsson_hifigan.onnx"),
    },
}
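
# Optional tuning: ONNX Runtime sessions can be created with a thread cap,
# which may help on shared CPU hardware (standard onnxruntime API; not applied
# here by default):
#
#   sess_options = ort.SessionOptions()
#   sess_options.intra_op_num_threads = 2
#   ort.InferenceSession("aot_package/caro_fastpitch.onnx", sess_options=sess_options)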


# --- 3. CPU Inference Function (ONNX) ---
def synthesize_speech_cpu(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech using ONNX models on CPU.
    """
    if not text.strip():
        return None

    # Preprocess text
    preprocessed_text = preprocess_german_text(text)

    # Tokenize text
    tokens = TOKENIZER.encode(preprocessed_text)

    # Prepare inputs for FastPitch
    paces = np.full(len(tokens), pace, dtype=np.float32)
    pitches = np.zeros(len(tokens), dtype=np.float32)

    inputs = {
        "text": np.array([tokens], dtype=np.int64),
        "pace": np.array([paces], dtype=np.float32),
        "pitch": np.array([pitches], dtype=np.float32),
    }

    # Get ONNX sessions for the selected voice
    fastpitch_session = ONNX_SESSIONS[voice]["fastpitch"]
    hifigan_session = ONNX_SESSIONS[voice]["hifigan"]

    # Generate spectrogram with FastPitch
    spec = fastpitch_session.run(None, inputs)[0]

    # Generate audio with HiFiGAN
    gan_inputs = {"spec": spec}
    audio = hifigan_session.run(None, gan_inputs)[0]

    # Convert to format expected by Gradio
    sample_rate = 44100
    audio_array = audio.squeeze()

    return (sample_rate, audio_array)
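
# Quick check (assuming the ONNX files have been downloaded), e.g.:
#   sr, wav = synthesize_speech_cpu("Guten Tag.", "Karlsson")
#   # sr == 44100, wav is a 1-D float array of audio samples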


# --- 4. GPU Inference Function ---
@spaces.GPU(duration=60)
def synthesize_speech(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech. The @spaces.GPU decorator ensures a GPU is assigned
    for the duration of this function.
    """
    if not text.strip():
        return None

    # Preprocess text: convert numbers, dates, decimals to spoken form
    preprocessed_text = preprocess_german_text(text)

    # Tokenize text
    tokens = TOKENIZER.encode(preprocessed_text)
    tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")

    # Prepare control parameters
    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32).to("cuda")
    pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32).to("cuda") * pace

    # Retrieve the correct lazy-loaded models
    # The .forward() call inside these objects will trigger the load to GPU
    encoder = MODELS[voice]["encoder"]
    decoder = MODELS[voice]["decoder"]
    vocoder = MODELS[voice]["vocoder"]

    with torch.inference_mode():
        # 1. Run Encoder (Loads .pt2 to GPU if needed -> Runs)
        len_regulated, dec_lens, spk_emb = encoder(
            tokens_tensor, pitch_tensor, pace_tensor
        )

        # 2. Run Decoder (Loads .pt2 to GPU if needed -> Runs)
        spec = decoder(len_regulated, dec_lens, spk_emb)

        # 3. Run Vocoder (Loads .pt2 to GPU if needed -> Runs)
        audio = vocoder(spec)

    # Convert to numpy and return
    sample_rate = 44100
    audio_array = audio.squeeze().cpu().numpy()

    return (sample_rate, audio_array)
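
# Note: the first GPU request per voice pays the one-time aoti_load_package
# cost inside the @spaces.GPU window; subsequent requests reuse the cached
# runners, and LazyAotPackage reloads them if ZeroGPU swapped the hardware.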


# --- 5. Combined Inference Function ---
def synthesize_speech_combined(
    text: str, voice: str, pace: float = 1.0, use_gpu: bool = False
):
    """
    Route to GPU or CPU inference based on user selection.
    """
    if use_gpu:
        return synthesize_speech(text, voice, pace)
    else:
        return synthesize_speech_cpu(text, voice, pace)


# --- 6. Gradio Interface ---
with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
    gr.Markdown(
        """
        # 🎙️ German Text-to-Speech
        Generate German speech using two different voices: **Caro** and **Karlsson**.
        Numbers, dates, and decimals are automatically converted to spoken form.
        """
    )

    with gr.Row():
        with gr.Column():
            default_text = (
                "Guten Tag. Herzlich willkommen zu dieser Demonstration. "
                "Es stehen Ihnen zwei Stimmen zur Auswahl: Caro und Karlsson. "
                "Sie können außerdem die Sprechgeschwindigkeit anpassen. "
                "Unten finden Sie ein paar Beispielsätze. Probieren Sie es aus!"
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                value=default_text,
                lines=3,
                max_length=1024,
            )
            char_counter = gr.Markdown(f"**Characters: {len(default_text)} / 1024**")
            voice_dropdown = gr.Dropdown(
                choices=["Caro", "Karlsson"], label="Voice", value="Karlsson"
            )
            pace_slider = gr.Slider(
                minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
            )
            use_gpu_checkbox = gr.Checkbox(
                label="Use GPU (ZeroGPU)",
                value=True,
                info="Enable for faster inference on GPU. Disable for CPU inference (slower but always available).",
            )
            generate_btn = gr.Button("Generate Speech 🔊", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    # Example sentences section
    gr.Markdown("### 📝 Example Sentences")
    gr.Examples(
        examples=[
            [
                "Die Bundeskanzlerin empfing heute den französischen Präsidenten zu einem Staatsbesuch in Berlin. Die Gespräche dauerten mehr als 3 Stunden."
            ],
            [
                "Am 15. März 2024 wird die neue Ausstellung im Museum eröffnet. Der Eintritt kostet 12,50 Euro für Erwachsene."
            ],
            [
                "In der verzauberten Bibliothek entdeckte die junge Magierin ein uraltes Buch, dessen Seiten im Mondlicht golden schimmerten."
            ],
            [
                "Der mutige Ritter zog sein Schwert und stellte sich dem feuerspeienden Drachen. Ein epischer Kampf begann auf dem Gipfel des Berges."
            ],
            [
                "Wussten Sie, dass die Große Mauer in China über 21000 Kilometer lang ist? Sie wurde über 2000 Jahre hinweg erbaut."
            ],
            [
                "Der menschliche Körper besteht zu etwa 60 Prozent aus Wasser. Ein erwachsener Mensch hat ungefähr 100000 Kilometer Blutgefäße."
            ],
            [
                "Die Temperaturen steigen heute auf bis zu 28 Grad Celsius. Am Wochenende wird mit Schauern und Gewittern gerechnet."
            ],
            [
                "Der Dax schloss heute bei 18456,73 Punkten, ein Plus von 2,3 Prozent. Der Euro notiert bei 1,0892 Dollar."
            ],
            [
                "Es war einmal in einem fernen Königreich, wo die Zeit anders verlief und die Sterne näher schienen. Dort lebte eine weise Eule, die alle Geheimnisse des Waldes kannte."
            ],
        ],
        inputs=text_input,
        label="Try these examples:",
    )

    # Update character counter
    def update_char_count(text):
        count = len(text)
        return f"**Characters: {count} / 1024**"

    text_input.change(
        fn=update_char_count,
        inputs=text_input,
        outputs=char_counter,
    )
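
    # Note: update_char_count runs on every edit of the textbox; the textbox's
    # max_length=1024 enforces the same limit that the counter displays.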

    generate_btn.click(
        fn=synthesize_speech_combined,
        inputs=[text_input, voice_dropdown, pace_slider, use_gpu_checkbox],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch()