import gradio as gr import torch import torch._inductor import spaces from char_tokenizers import GermanCharsTokenizer from german_text_preprocessor import preprocess_german_text from huggingface_hub import hf_hub_download import os import onnxruntime as ort import numpy as np # --- Download Model Files from Hugging Face --- def download_models(): """ Download model files from Hugging Face repositories at startup. Files are downloaded to the aot_package folder. """ os.makedirs("aot_package", exist_ok=True) # Define the models and their files models_config = { "Warholt/CaroTTS-60M-DE-Karlsson": [ "karlsson_fastpitch_encoder.pt2", "karlsson_fastpitch_decoder.pt2", "karlsson_hifigan.pt2", "karlsson_fastpitch.onnx", "karlsson_hifigan.onnx", ], "Warholt/CaroTTS-60M-DE-Caro": [ "caro_fastpitch_encoder.pt2", "caro_fastpitch_decoder.pt2", "caro_hifigan.pt2", "caro_fastpitch.onnx", "caro_hifigan.onnx", ], } print("Downloading model files from Hugging Face...") for repo_id, files in models_config.items(): for filename in files: print(f" Downloading {filename} from {repo_id}...") hf_hub_download( repo_id=repo_id, filename=filename, local_dir="aot_package", local_dir_use_symlinks=False, ) print("All model files downloaded successfully!") # Download models at startup download_models() # --- 1. Define a Wrapper for Lazy Loading --- class LazyAotPackage(torch.nn.Module): """ A wrapper that holds the path to an AOT package and loads it to the GPU only when forward() is called. """ def __init__(self, package_path): super().__init__() self.package_path = package_path self.runner = None def forward(self, *args, **kwargs): # We are now inside the @spaces.GPU decorated function. # Valid GPU context exists. # If runner is not loaded, load it now. if self.runner is None: # Load directly to the active CUDA device self.runner = torch._inductor.aoti_load_package(self.package_path) # Run inference # We add a try/except block because if ZeroGPU swaps the underlying hardware # between requests, the old runner might be invalid. try: return self.runner(*args, **kwargs) except RuntimeError: # Context might be stale, reload self.runner = torch._inductor.aoti_load_package( self.package_path, device="cuda" ) return self.runner(*args, **kwargs) # --- 2. Initialize Global Components --- TOKENIZER = GermanCharsTokenizer() # Instead of a dict of raw paths, we instantiate our Lazy Loaders immediately. # These act like standard PyTorch modules but use almost no RAM until inference. MODELS = { "Caro": { "encoder": LazyAotPackage("aot_package/caro_fastpitch_encoder.pt2"), "decoder": LazyAotPackage("aot_package/caro_fastpitch_decoder.pt2"), "vocoder": LazyAotPackage("aot_package/caro_hifigan.pt2"), }, "Karlsson": { "encoder": LazyAotPackage("aot_package/karlsson_fastpitch_encoder.pt2"), "decoder": LazyAotPackage("aot_package/karlsson_fastpitch_decoder.pt2"), "vocoder": LazyAotPackage("aot_package/karlsson_hifigan.pt2"), }, } # Initialize ONNX sessions for CPU inference ONNX_SESSIONS = { "Caro": { "fastpitch": ort.InferenceSession("aot_package/caro_fastpitch.onnx"), "hifigan": ort.InferenceSession("aot_package/caro_hifigan.onnx"), }, "Karlsson": { "fastpitch": ort.InferenceSession("aot_package/karlsson_fastpitch.onnx"), "hifigan": ort.InferenceSession("aot_package/karlsson_hifigan.onnx"), }, } # --- 3. CPU Inference Function (ONNX) --- def synthesize_speech_cpu(text: str, voice: str, pace: float = 1.0): """ Synthesize speech using ONNX models on CPU. """ if not text.strip(): return None # Preprocess text preprocessed_text = preprocess_german_text(text) # Tokenize text tokens = TOKENIZER.encode(preprocessed_text) # Prepare inputs for FastPitch paces = np.zeros(len(tokens), dtype=np.float32) + pace pitches = np.zeros(len(tokens), dtype=np.float32) inputs = { "text": np.array([tokens], dtype=np.int64), "pace": np.array([paces], dtype=np.float32), "pitch": np.array([pitches], dtype=np.float32), } # Get ONNX sessions for the selected voice fastpitch_session = ONNX_SESSIONS[voice]["fastpitch"] hifigan_session = ONNX_SESSIONS[voice]["hifigan"] # Generate spectrogram with FastPitch spec = fastpitch_session.run(None, inputs)[0] # Generate audio with HiFiGAN gan_inputs = {"spec": spec} audio = hifigan_session.run(None, gan_inputs)[0] # Convert to format expected by Gradio sample_rate = 44100 audio_array = audio.squeeze() return (sample_rate, audio_array) # --- 4. GPU Inference Function --- @spaces.GPU(duration=60) def synthesize_speech(text: str, voice: str, pace: float = 1.0): """ Synthesize speech. The @spaces.GPU decorator ensures a GPU is assigned for the duration of this function. """ if not text.strip(): return None # Preprocess text: convert numbers, dates, decimals to spoken form preprocessed_text = preprocess_german_text(text) # Tokenize text tokens = TOKENIZER.encode(preprocessed_text) tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda") # Prepare control parameters pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32).to("cuda") pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32).to("cuda") * pace # Retrieve the correct lazy-loaded models # The .forward() call inside these objects will trigger the load to GPU encoder = MODELS[voice]["encoder"] decoder = MODELS[voice]["decoder"] vocoder = MODELS[voice]["vocoder"] with torch.inference_mode(): # 1. Run Encoder (Loads .pt2 to GPU if needed -> Runs) len_regulated, dec_lens, spk_emb = encoder( tokens_tensor, pitch_tensor, pace_tensor ) # 2. Run Decoder (Loads .pt2 to GPU if needed -> Runs) spec = decoder(len_regulated, dec_lens, spk_emb) # 3. Run Vocoder (Loads .pt2 to GPU if needed -> Runs) audio = vocoder(spec) # Convert to numpy and return sample_rate = 44100 audio_array = audio.squeeze().cpu().numpy() return (sample_rate, audio_array) # --- 5. Combined Inference Function --- def synthesize_speech_combined( text: str, voice: str, pace: float = 1.0, use_gpu: bool = False ): """ Route to GPU or CPU inference based on user selection. """ if use_gpu: return synthesize_speech(text, voice, pace) else: return synthesize_speech_cpu(text, voice, pace) # --- 6. Gradio Interface --- with gr.Blocks(title="German TTS - Caro & Karlsson") as demo: gr.Markdown( """ # 🎙️ German Text-to-Speech Generate German speech using two different voices: **Caro** and **Karlsson**. Numbers, dates, and decimals are automatically converted to spoken form. """ ) with gr.Row(): with gr.Column(): text_input = gr.Textbox( label="Text to synthesize", value="Guten Tag. Herzlich Willkommen zu dieser Demonstration. Es stehen Ihnen zwei Stimmen zur Auswahl: Caro und Karlsson. Sie können außerdem die Sprechgeschwindigkeit anpassen. Unten finden Sie ein Paar Beispielsätze. Probieren Sie es aus!", lines=3, max_length=1024, ) char_counter = gr.Markdown("**Characters: 0 / 1024**") voice_dropdown = gr.Dropdown( choices=["Caro", "Karlsson"], label="Voice", value="Karlsson" ) pace_slider = gr.Slider( minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate" ) use_gpu_checkbox = gr.Checkbox( label="Use GPU (ZeroGPU)", value=True, info="Enable for faster inference on GPU. Disable for CPU inference (slower but always available).", ) generate_btn = gr.Button("Generate Speech 🔊", variant="primary") with gr.Column(): audio_output = gr.Audio(label="Generated Audio", type="numpy") # Example sentences section gr.Markdown("### 📝 Example Sentences") gr.Examples( examples=[ [ "Die Bundeskanzlerin empfing heute den französischen Präsidenten zu einem Staatsbesuch in Berlin. Die Gespräche dauerten mehr als 3 Stunden." ], [ "Am 15. März 2024 wird die neue Ausstellung im Museum eröffnet. Der Eintritt kostet 12,50 Euro für Erwachsene." ], [ "In der verzauberten Bibliothek entdeckte die junge Magierin ein uraltes Buch, dessen Seiten im Mondlicht golden schimmerten." ], [ "Der mutige Ritter zog sein Schwert und stellte sich dem feuerspeienden Drachen. Ein epischer Kampf begann auf dem Gipfel des Berges." ], [ "Wussten Sie, dass die Große Mauer in China über 21000 Kilometer lang ist? Sie wurde über 2000 Jahre hinweg erbaut." ], [ "Der menschliche Körper besteht zu etwa 60 Prozent aus Wasser. Ein erwachsener Mensch hat ungefähr 100000 Kilometer Blutgefäße." ], [ "Die Temperaturen steigen heute auf bis zu 28 Grad Celsius. Am Wochenende wird mit Schauern und Gewittern gerechnet." ], [ "Der Dax schloss heute bei 18456,73 Punkten, ein Plus von 2,3 Prozent. Der Euro notiert bei 1,0892 Dollar." ], [ "Es war einmal in einem fernen Königreich, wo die Zeit anders verlief und die Sterne näher schienen. Dort lebte eine weise Eule, die alle Geheimnisse des Waldes kannte." ], ], inputs=text_input, label="Try these examples:", ) # Update character counter def update_char_count(text): count = len(text) return f"**Characters: {count} / 1024**" text_input.change( fn=update_char_count, inputs=text_input, outputs=char_counter, ) generate_btn.click( fn=synthesize_speech_combined, inputs=[text_input, voice_dropdown, pace_slider, use_gpu_checkbox], outputs=audio_output, ) if __name__ == "__main__": demo.launch()