Spaces:
Running
on
Zero
Running
on
Zero
| import gradio as gr | |
| import torch | |
| import torch._inductor | |
| import spaces | |
| from char_tokenizers import GermanCharsTokenizer | |
| from german_text_preprocessor import preprocess_german_text | |
| from huggingface_hub import hf_hub_download | |
| import os | |
| import onnxruntime as ort | |
| import numpy as np | |
| # --- Download Model Files from Hugging Face --- | |
def download_models():
    """
    Fetch all model artifacts from the Hugging Face Hub into ``aot_package``.

    For each of the two voice repositories, the listed ``.pt2`` packages and
    ``.onnx`` files are downloaded one by one with ``hf_hub_download``.
    The target directory is created if it does not exist, and progress is
    printed to stdout.
    """
    target_dir = "aot_package"
    os.makedirs(target_dir, exist_ok=True)

    # (repository id, files to pull from that repository), in download order.
    repo_files = (
        (
            "Warholt/CaroTTS-60M-DE-Karlsson",
            (
                "karlsson_fastpitch_encoder.pt2",
                "karlsson_fastpitch_decoder.pt2",
                "karlsson_hifigan.pt2",
                "karlsson_fastpitch.onnx",
                "karlsson_hifigan.onnx",
            ),
        ),
        (
            "Warholt/CaroTTS-60M-DE-Caro",
            (
                "caro_fastpitch_encoder.pt2",
                "caro_fastpitch_decoder.pt2",
                "caro_hifigan.pt2",
                "caro_fastpitch.onnx",
                "caro_hifigan.onnx",
            ),
        ),
    )

    print("Downloading model files from Hugging Face...")
    for repo, names in repo_files:
        for name in names:
            print(f" Downloading {name} from {repo}...")
            hf_hub_download(
                repo_id=repo,
                filename=name,
                local_dir=target_dir,
                local_dir_use_symlinks=False,
            )
    print("All model files downloaded successfully!")
# Download models at startup. This runs at import time and must come before
# the module-level ONNX sessions further down, which open files from aot_package/.
download_models()
| # --- 1. Define a Wrapper for Lazy Loading --- | |
class LazyAotPackage(torch.nn.Module):
    """
    Lazy wrapper around an AOTInductor ``.pt2`` package.

    Only the package path is stored at construction time. The package itself
    is loaded on the first ``forward()`` call, which is expected to happen
    inside a GPU-enabled context (e.g. a ``@spaces.GPU`` function).

    Args:
        package_path: Filesystem path of the ``.pt2`` package to load.
    """

    def __init__(self, package_path):
        super().__init__()
        self.package_path = package_path
        # Loaded callable; stays None until the first forward() call.
        self.runner = None

    def _load_runner(self):
        """Load the AOT package from ``self.package_path`` and return its runner."""
        # aoti_load_package has no `device` keyword (it takes the package path,
        # plus optional run_single_threaded / device_index in newer PyTorch
        # releases). Passing device="cuda" raises TypeError, so every load,
        # including the recovery reload, goes through this one call.
        return torch._inductor.aoti_load_package(self.package_path)

    def forward(self, *args, **kwargs):
        """
        Run the packaged model, loading it first if necessary.

        All positional and keyword arguments are forwarded to the runner
        unchanged and its result is returned as-is.

        Raises:
            RuntimeError: If inference still fails after one reload attempt.
        """
        if self.runner is None:
            self.runner = self._load_runner()

        try:
            return self.runner(*args, **kwargs)
        except RuntimeError:
            # ZeroGPU may swap the underlying hardware between requests, which
            # can leave a previously loaded runner stale. Reload once and retry;
            # a second failure propagates to the caller.
            self.runner = self._load_runner()
            return self.runner(*args, **kwargs)
| # --- 2. Initialize Global Components --- | |
# Single tokenizer instance shared by the CPU and GPU inference paths.
TOKENIZER = GermanCharsTokenizer()

# Location of each AOT package, per voice and pipeline stage.
_AOT_PACKAGE_PATHS = {
    "Caro": {
        "encoder": "aot_package/caro_fastpitch_encoder.pt2",
        "decoder": "aot_package/caro_fastpitch_decoder.pt2",
        "vocoder": "aot_package/caro_hifigan.pt2",
    },
    "Karlsson": {
        "encoder": "aot_package/karlsson_fastpitch_encoder.pt2",
        "decoder": "aot_package/karlsson_fastpitch_decoder.pt2",
        "vocoder": "aot_package/karlsson_hifigan.pt2",
    },
}

# Wrap every path in a lazy loader right away. The wrappers are callable like
# ordinary modules but only remember the path until their first forward().
MODELS = {
    voice: {stage: LazyAotPackage(path) for stage, path in stages.items()}
    for voice, stages in _AOT_PACKAGE_PATHS.items()
}

# Location of each ONNX export, per voice and model.
_ONNX_MODEL_PATHS = {
    "Caro": {
        "fastpitch": "aot_package/caro_fastpitch.onnx",
        "hifigan": "aot_package/caro_hifigan.onnx",
    },
    "Karlsson": {
        "fastpitch": "aot_package/karlsson_fastpitch.onnx",
        "hifigan": "aot_package/karlsson_hifigan.onnx",
    },
}

# ONNX sessions for CPU inference are opened eagerly at import time.
ONNX_SESSIONS = {
    voice: {model: ort.InferenceSession(path) for model, path in models.items()}
    for voice, models in _ONNX_MODEL_PATHS.items()
}
| # --- 3. CPU Inference Function (ONNX) --- | |
def synthesize_speech_cpu(text: str, voice: str, pace: float = 1.0):
    """
    Run the ONNX FastPitch -> HiFi-GAN pipeline for ``voice`` on the CPU.

    Args:
        text: Raw input text; it is preprocessed and tokenized here.
        voice: Key into ``ONNX_SESSIONS`` selecting the voice.
        pace: Pace value applied uniformly to every token.

    Returns:
        ``None`` when ``text`` is blank, otherwise a ``(sample_rate, samples)``
        tuple in the format expected by Gradio.
    """
    if text.strip() == "":
        return None

    token_ids = TOKENIZER.encode(preprocess_german_text(text))
    n_tokens = len(token_ids)

    # FastPitch inputs: a batch of one sequence, with one pace and one pitch
    # value per token (pitch left at zero).
    fastpitch_feed = {
        "text": np.array([token_ids], dtype=np.int64),
        "pace": np.full((1, n_tokens), pace, dtype=np.float32),
        "pitch": np.zeros((1, n_tokens), dtype=np.float32),
    }

    voice_sessions = ONNX_SESSIONS[voice]
    acoustic_model = voice_sessions["fastpitch"]
    vocoder = voice_sessions["hifigan"]

    # Text -> spectrogram -> waveform; only the first output of each model is used.
    spectrogram = acoustic_model.run(None, fastpitch_feed)[0]
    waveform = vocoder.run(None, {"spec": spectrogram})[0]

    return (44100, waveform.squeeze())
| # --- 4. GPU Inference Function --- | |
@spaces.GPU
def synthesize_speech(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech on the GPU with the AOT-compiled models.

    The ``@spaces.GPU`` decorator is what makes ZeroGPU attach a GPU for the
    duration of the call; without it the CUDA work below has no device.

    Args:
        text: Raw input text; numbers, dates and decimals are converted to
            spoken form by ``preprocess_german_text`` before tokenizing.
        voice: Key into ``MODELS`` selecting the voice.
        pace: Pace value applied uniformly to every token.

    Returns:
        ``None`` when ``text`` is blank, otherwise a ``(sample_rate, samples)``
        tuple with the samples as a NumPy array.
    """
    if not text.strip():
        return None

    # Preprocess text: convert numbers, dates, decimals to spoken form
    preprocessed_text = preprocess_german_text(text)

    # Tokenize and build the batch-of-one inputs directly on the GPU.
    tokens = TOKENIZER.encode(preprocessed_text)
    tokens_tensor = torch.tensor([tokens], dtype=torch.int64, device="cuda")

    # Control parameters: one value per token. The *_like constructors inherit
    # the device of tokens_tensor, so no extra transfer is needed.
    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32)
    pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32) * pace

    # Retrieve the lazy-loaded models; calling each one triggers its load on
    # first use.
    voice_models = MODELS[voice]
    encoder = voice_models["encoder"]
    decoder = voice_models["decoder"]
    vocoder = voice_models["vocoder"]

    with torch.inference_mode():
        # 1. Encoder
        len_regulated, dec_lens, spk_emb = encoder(
            tokens_tensor, pitch_tensor, pace_tensor
        )
        # 2. Decoder -> spectrogram
        spec = decoder(len_regulated, dec_lens, spk_emb)
        # 3. Vocoder -> waveform
        audio = vocoder(spec)

    # Convert to numpy and return
    sample_rate = 44100
    audio_array = audio.squeeze().cpu().numpy()
    return (sample_rate, audio_array)
| # --- 5. Combined Inference Function --- | |
def synthesize_speech_combined(
    text: str, voice: str, pace: float = 1.0, use_gpu: bool = False
):
    """
    Dispatch a synthesis request to the GPU or the CPU backend.

    ``use_gpu`` selects ``synthesize_speech`` when truthy and
    ``synthesize_speech_cpu`` otherwise; the chosen backend's result is
    returned unchanged.
    """
    backend = synthesize_speech if use_gpu else synthesize_speech_cpu
    return backend(text, voice, pace)
| # --- 6. Gradio Interface --- | |
# Single source of truth for the input length limit shown in the UI.
_MAX_TEXT_LENGTH = 1024

# Text the input box is pre-filled with when the page loads.
_DEFAULT_TEXT = "Guten Tag. Herzlich Willkommen zu dieser Demonstration. Es stehen Ihnen zwei Stimmen zur Auswahl: Caro und Karlsson. Sie können außerdem die Sprechgeschwindigkeit anpassen. Unten finden Sie ein Paar Beispielsätze. Probieren Sie es aus!"


def update_char_count(text):
    """Return the Markdown label showing how many characters ``text`` has.

    ``None`` is treated as empty text so the callback never raises.
    """
    count = len(text or "")
    return f"**Characters: {count} / {_MAX_TEXT_LENGTH}**"


with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
    gr.Markdown(
        """
        # 🎙️ German Text-to-Speech
        Generate German speech using two different voices: **Caro** and **Karlsson**.
        Numbers, dates, and decimals are automatically converted to spoken form.
        """
    )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to synthesize",
                value=_DEFAULT_TEXT,
                lines=3,
                max_length=_MAX_TEXT_LENGTH,
            )
            # The textbox starts pre-filled, so the counter must start from the
            # default text's length rather than 0; the change event only fires
            # once the user edits the text.
            char_counter = gr.Markdown(update_char_count(_DEFAULT_TEXT))
            voice_dropdown = gr.Dropdown(
                choices=["Caro", "Karlsson"], label="Voice", value="Karlsson"
            )
            pace_slider = gr.Slider(
                minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
            )
            use_gpu_checkbox = gr.Checkbox(
                label="Use GPU (ZeroGPU)",
                value=True,
                info="Enable for faster inference on GPU. Disable for CPU inference (slower but always available).",
            )
            generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    # Example sentences section
    gr.Markdown("### 📝 Example Sentences")
    gr.Examples(
        examples=[
            [
                "Die Bundeskanzlerin empfing heute den französischen Präsidenten zu einem Staatsbesuch in Berlin. Die Gespräche dauerten mehr als 3 Stunden."
            ],
            [
                "Am 15. März 2024 wird die neue Ausstellung im Museum eröffnet. Der Eintritt kostet 12,50 Euro für Erwachsene."
            ],
            [
                "In der verzauberten Bibliothek entdeckte die junge Magierin ein uraltes Buch, dessen Seiten im Mondlicht golden schimmerten."
            ],
            [
                "Der mutige Ritter zog sein Schwert und stellte sich dem feuerspeienden Drachen. Ein epischer Kampf begann auf dem Gipfel des Berges."
            ],
            [
                "Wussten Sie, dass die Große Mauer in China über 21000 Kilometer lang ist? Sie wurde über 2000 Jahre hinweg erbaut."
            ],
            [
                "Der menschliche Körper besteht zu etwa 60 Prozent aus Wasser. Ein erwachsener Mensch hat ungefähr 100000 Kilometer Blutgefäße."
            ],
            [
                "Die Temperaturen steigen heute auf bis zu 28 Grad Celsius. Am Wochenende wird mit Schauern und Gewittern gerechnet."
            ],
            [
                "Der Dax schloss heute bei 18456,73 Punkten, ein Plus von 2,3 Prozent. Der Euro notiert bei 1,0892 Dollar."
            ],
            [
                "Es war einmal in einem fernen Königreich, wo die Zeit anders verlief und die Sterne näher schienen. Dort lebte eine weise Eule, die alle Geheimnisse des Waldes kannte."
            ],
        ],
        inputs=text_input,
        label="Try these examples:",
    )

    # Keep the character counter in sync with the textbox.
    text_input.change(
        fn=update_char_count,
        inputs=text_input,
        outputs=char_counter,
    )

    # Route the request to the GPU or CPU backend depending on the checkbox.
    generate_btn.click(
        fn=synthesize_speech_combined,
        inputs=[text_input, voice_dropdown, pace_slider, use_gpu_checkbox],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch()