Spaces:

Warholt
/

CaroTTS-DE

Running on Zero

App Files Files Community

Warholt committed 16 days ago

Commit

5ecf3a1

1 Parent(s): b24d0d8

add text preprocessing and onnx cpu inference alternative

Browse files

Files changed (2) hide show

app.py +134 -9
german_text_preprocessor.py +397 -0

app.py CHANGED Viewed

@@ -3,8 +3,11 @@ import torch
 import torch._inductor
 import spaces
 from char_tokenizers import GermanCharsTokenizer
 from huggingface_hub import hf_hub_download
 import os
 # --- Download Model Files from Hugging Face ---
@@ -21,11 +24,15 @@ def download_models():
             "karlsson_fastpitch_encoder.pt2",
             "karlsson_fastpitch_decoder.pt2",
             "karlsson_hifigan.pt2",
         ],
         "Warholt/CaroTTS-60M-DE-Caro": [
             "caro_fastpitch_encoder.pt2",
             "caro_fastpitch_decoder.pt2",
             "caro_hifigan.pt2",
         ],
     }
@@ -97,7 +104,62 @@ MODELS = {
     },
 }
-# --- 3. Inference Function ---
 @spaces.GPU(duration=60)
 def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     """
@@ -105,10 +167,13 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     for the duration of this function.
     """
     if not text.strip():
-        return None
     # Tokenize text
-    tokens = TOKENIZER.encode(text)
     tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")
     # Prepare control parameters
@@ -137,15 +202,29 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     sample_rate = 44100
     audio_array = audio.squeeze().cpu().numpy()
-    return (sample_rate, audio_array)
-# --- 4. Gradio Interface ---
 with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
     gr.Markdown(
         """
         # 🎙️ German Text-to-Speech
         Generate German speech using two different voices: **Caro** and **Karlsson**.
         """
     )
@@ -153,7 +232,7 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
         with gr.Column():
             text_input = gr.Textbox(
                 label="Text to synthesize",
-                value="Hallo! Willkommen zur deutschen Sprachsynthese.",
                 lines=3,
             )
             voice_dropdown = gr.Dropdown(
@@ -162,15 +241,61 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
             pace_slider = gr.Slider(
                 minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
             )
             generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(label="Generated Audio", type="numpy")
     generate_btn.click(
-        fn=synthesize_speech,
-        inputs=[text_input, voice_dropdown, pace_slider],
-        outputs=audio_output,
     )
 if __name__ == "__main__":

 import torch._inductor
 import spaces
 from char_tokenizers import GermanCharsTokenizer
+from german_text_preprocessor import preprocess_german_text
 from huggingface_hub import hf_hub_download
 import os
+import onnxruntime as ort
+import numpy as np
 # --- Download Model Files from Hugging Face ---
             "karlsson_fastpitch_encoder.pt2",
             "karlsson_fastpitch_decoder.pt2",
             "karlsson_hifigan.pt2",
+            "karlsson_fastpitch.onnx",
+            "karlsson_hifigan.onnx",
         ],
         "Warholt/CaroTTS-60M-DE-Caro": [
             "caro_fastpitch_encoder.pt2",
             "caro_fastpitch_decoder.pt2",
             "caro_hifigan.pt2",
+            "caro_fastpitch.onnx",
+            "caro_hifigan.onnx",
         ],
     }
     },
 }
# ONNX Runtime sessions used for CPU inference, keyed first by voice name and
# then by model stage ("fastpitch" / "hifigan"). They are created eagerly,
# once, when this module is imported.
ONNX_SESSIONS = {
    "Caro": dict(
        fastpitch=ort.InferenceSession("aot_package/caro_fastpitch.onnx"),
        hifigan=ort.InferenceSession("aot_package/caro_hifigan.onnx"),
    ),
    "Karlsson": dict(
        fastpitch=ort.InferenceSession("aot_package/karlsson_fastpitch.onnx"),
        hifigan=ort.InferenceSession("aot_package/karlsson_hifigan.onnx"),
    ),
}
+# --- 3. CPU Inference Function (ONNX) ---
def synthesize_speech_cpu(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech using ONNX models on CPU.

    Args:
        text: Raw input text. ``None``, empty and whitespace-only input are
            all treated as "nothing to say".
        voice: Key into ``ONNX_SESSIONS`` selecting the voice; an unknown key
            raises ``KeyError``.
        pace: Value written to every position of the FastPitch ``pace`` input.

    Returns:
        A pair ``(audio, preprocessed_text)``. ``audio`` is
        ``(sample_rate, samples)`` with ``sample_rate == 44100``, or ``None``
        when there is no text; ``preprocessed_text`` is the normalized text
        that was tokenized (``""`` when there is no text).
    """
    # Guard against None as well as blank strings: calling .strip() on None
    # would raise AttributeError before any useful work is done.
    if not text or not text.strip():
        return None, ""

    # Normalize the text, then tokenize the normalized form
    preprocessed_text = preprocess_german_text(text)
    tokens = TOKENIZER.encode(preprocessed_text)

    # Per-token control inputs, built directly with a leading batch axis of 1
    token_count = len(tokens)
    inputs = {
        "text": np.array([tokens], dtype=np.int64),
        "pace": np.full((1, token_count), pace, dtype=np.float32),
        "pitch": np.zeros((1, token_count), dtype=np.float32),
    }

    # Look up the ONNX sessions for the selected voice
    voice_sessions = ONNX_SESSIONS[voice]

    # FastPitch: tokens -> spectrogram (first output of the session)
    spec = voice_sessions["fastpitch"].run(None, inputs)[0]

    # HiFiGAN: spectrogram -> audio (first output of the session)
    audio = voice_sessions["hifigan"].run(None, {"spec": spec})[0]

    # Gradio's numpy audio format: (sample_rate, samples)
    sample_rate = 44100
    return (sample_rate, audio.squeeze()), preprocessed_text
+# --- 4. GPU Inference Function ---
 @spaces.GPU(duration=60)
 def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     """
     for the duration of this function.
     """
     if not text.strip():
+        return None, ""
+    # Preprocess text: convert numbers, dates, decimals to spoken form
+    preprocessed_text = preprocess_german_text(text)
     # Tokenize text
+    tokens = TOKENIZER.encode(preprocessed_text)
     tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")
     # Prepare control parameters
     sample_rate = 44100
     audio_array = audio.squeeze().cpu().numpy()
+    return (sample_rate, audio_array), preprocessed_text
+# --- 5. Combined Inference Function ---
def synthesize_speech_combined(
    text: str, voice: str, pace: float = 1.0, use_gpu: bool = False
):
    """
    Dispatch a synthesis request to the GPU or the CPU implementation.

    When ``use_gpu`` is truthy the request is handled by
    ``synthesize_speech``; otherwise by ``synthesize_speech_cpu``. The chosen
    function's return value is passed through unchanged.
    """
    backend = synthesize_speech if use_gpu else synthesize_speech_cpu
    return backend(text, voice, pace)
+# --- 6. Gradio Interface ---
 with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
     gr.Markdown(
         """
         # 🎙️ German Text-to-Speech
         Generate German speech using two different voices: **Caro** and **Karlsson**.
+        Numbers, dates, and decimals are automatically converted to spoken form.
         """
     )
         with gr.Column():
             text_input = gr.Textbox(
                 label="Text to synthesize",
+                value="Guten Tag. Herzlich Willkommen zu dieser Demonstration deutscher Sprachsynthese-Modelle. Es stehen Ihnen zwei Stimmen zur Auswahl: Caro und Karlsson. Probieren Sie es aus!",
                 lines=3,
             )
             voice_dropdown = gr.Dropdown(
             pace_slider = gr.Slider(
                 minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
             )
+            use_gpu_checkbox = gr.Checkbox(
+                label="Use GPU (ZeroGPU)",
+                value=False,
+                info="Enable for faster inference on GPU. Disable for CPU inference (slower but always available).",
+            )
             generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
         with gr.Column():
+            preprocessed_output = gr.Textbox(
+                label="Preprocessed Text (what will be spoken)",
+                lines=3,
+                interactive=False,
+            )
             audio_output = gr.Audio(label="Generated Audio", type="numpy")
+    # Example sentences section
+    gr.Markdown("### 📝 Example Sentences")
+    gr.Examples(
+        examples=[
+            [
+                "Die Bundeskanzlerin empfing heute den französischen Präsidenten zu einem Staatsbesuch in Berlin. Die Gespräche dauerten mehr als 3 Stunden."
+            ],
+            [
+                "Am 15. März 2024 wird die neue Ausstellung im Museum eröffnet. Der Eintritt kostet 12,50 Euro für Erwachsene."
+            ],
+            [
+                "In der verzauberten Bibliothek entdeckte die junge Magierin ein uraltes Buch, dessen Seiten im Mondlicht golden schimmerten."
+            ],
+            [
+                "Der mutige Ritter zog sein Schwert und stellte sich dem feuerspeienden Drachen. Ein epischer Kampf begann auf dem Gipfel des Berges."
+            ],
+            [
+                "Wussten Sie, dass die Große Mauer in China über 21000 Kilometer lang ist? Sie wurde über 2000 Jahre hinweg erbaut."
+            ],
+            [
+                "Der menschliche Körper besteht zu etwa 60 Prozent aus Wasser. Ein erwachsener Mensch hat ungefähr 100000 Kilometer Blutgefäße."
+            ],
+            [
+                "Die Temperaturen steigen heute auf bis zu 28 Grad Celsius. Am Wochenende wird mit Schauern und Gewittern gerechnet."
+            ],
+            [
+                "Der Dax schloss heute bei 18456,73 Punkten, ein Plus von 2,3 Prozent. Der Euro notiert bei 1,0892 Dollar."
+            ],
+            [
+                "Es war einmal in einem fernen Königreich, wo die Zeit anders verlief und die Sterne näher schienen. Dort lebte eine weise Eule, die alle Geheimnisse des Waldes kannte."
+            ],
+        ],
+        inputs=text_input,
+        label="Try these examples:",
+    )
     generate_btn.click(
+        fn=synthesize_speech_combined,
+        inputs=[text_input, voice_dropdown, pace_slider, use_gpu_checkbox],
+        outputs=[audio_output, preprocessed_output],
     )
 if __name__ == "__main__":

german_text_preprocessor.py ADDED Viewed

	@@ -0,0 +1,397 @@

+"""
+German Text Preprocessing Module for TTS
+Handles normalization of numbers, dates, decimal numbers, and other text elements
+to their spoken form in German.
+"""
+import re
class GermanTextPreprocessor:
    """
    Preprocesses German text for TTS by converting numbers, dates, and special
    characters into their spoken equivalents.

    The public entry point is :meth:`preprocess`; the ``_``-prefixed methods
    are the individual conversion steps and regex callbacks it relies on.
    """

    # Number words for German (0 is empty: it is never spoken inside compounds)
    ONES = {
        0: "", 1: "eins", 2: "zwei", 3: "drei", 4: "vier",
        5: "fünf", 6: "sechs", 7: "sieben", 8: "acht", 9: "neun"
    }
    # Digit names for reading individual digits (including zero)
    DIGITS = {
        0: "null", 1: "eins", 2: "zwei", 3: "drei", 4: "vier",
        5: "fünf", 6: "sechs", 7: "sieben", 8: "acht", 9: "neun"
    }
    TEENS = {
        10: "zehn", 11: "elf", 12: "zwölf", 13: "dreizehn",
        14: "vierzehn", 15: "fünfzehn", 16: "sechzehn",
        17: "siebzehn", 18: "achtzehn", 19: "neunzehn"
    }
    TENS = {
        2: "zwanzig", 3: "dreißig", 4: "vierzig",
        5: "fünfzig", 6: "sechzig", 7: "siebzig",
        8: "achtzig", 9: "neunzig"
    }
    # (value, singular, plural), largest first so the first hit is the right one
    SCALES = [
        (1000000000, "Milliarde", "Milliarden"),
        (1000000, "Million", "Millionen"),
        (1000, "tausend", "tausend")
    ]
    # Ordinal number words for 1-19
    ORDINAL_ONES = {
        1: "erster", 2: "zweiter", 3: "dritter", 4: "vierter",
        5: "fünfter", 6: "sechster", 7: "siebter", 8: "achter", 9: "neunter"
    }
    ORDINAL_TEENS = {
        10: "zehnter", 11: "elfter", 12: "zwölfter", 13: "dreizehnter",
        14: "vierzehnter", 15: "fünfzehnter", 16: "sechzehnter",
        17: "siebzehnter", 18: "achtzehnter", 19: "neunzehnter"
    }
    # Month names
    MONTHS = {
        1: "Januar", 2: "Februar", 3: "März", 4: "April",
        5: "Mai", 6: "Juni", 7: "Juli", 8: "August",
        9: "September", 10: "Oktober", 11: "November", 12: "Dezember"
    }
    MONTH_ABBREV = {
        "jan": "Januar", "feb": "Februar", "mär": "März", "apr": "April",
        "mai": "Mai", "jun": "Juni", "jul": "Juli", "aug": "August",
        "sep": "September", "sept": "September", "okt": "Oktober",
        "nov": "November", "dez": "Dezember"
    }
    # Lower-cased abbreviation OR full month name -> canonical month name, so
    # that "Jan", "jan." and "Januar" all resolve to "Januar".
    _MONTH_LOOKUP = {
        **MONTH_ABBREV,
        **{name.lower(): name for name in MONTHS.values()},
    }

    # Patterns used by preprocess() to locate candidates in running text.
    _RE_DATE_FULL = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b')
    _RE_DATE_NAMED = re.compile(
        r'\b(\d{1,2})\.\s*([A-Za-zäöüÄÖÜ]+)\.?\s*(\d{4})\b'
    )
    _RE_DATE_SHORT = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.')
    # "1.234" / "1.000.000": digit groups joined by the German thousands dot
    _RE_THOUSANDS = re.compile(r'(?<![\d.])\d{1,3}(?:\.\d{3})+(?!\d)')
    # A minus sign (ASCII or U+2212) at the start of a token, right before a digit
    _RE_MINUS = re.compile(r'(?<!\S)[-\u2212](?=\d)')
    _RE_DECIMAL = re.compile(r'\b\d+,\d+\b')
    _RE_INTEGER = re.compile(r'\b\d+\b')
    _RE_SPACES = re.compile(r'\s+')

    # Patterns used by _process_date() to take an already-located date apart.
    _DATE_FULL_PARTS = re.compile(r'(\d{1,2})\.(\d{1,2})\.(\d{4})')
    _DATE_NAMED_PARTS = re.compile(
        r'(\d{1,2})\.\s*([A-Za-zäöüÄÖÜ]+)\.?\s*(\d{4})'
    )
    _DATE_SHORT_PARTS = re.compile(r'(\d{1,2})\.(\d{1,2})\.')

    def __init__(self):
        """Initialize the German text preprocessor (it holds no state)."""
        pass

    def _number_to_words(self, num: int) -> str:
        """
        Convert a cardinal number to its German word form.

        Args:
            num: Integer to convert (negative values get a "minus " prefix)

        Returns:
            German word representation of the number
        """
        if num == 0:
            return "null"
        if num < 0:
            return "minus " + self._number_to_words(-num)
        # 1-9
        if num < 10:
            return self.ONES[num]
        # 10-19
        if num < 20:
            return self.TEENS[num]
        # 20-99: ones come first, e.g. 42 -> "zweiundvierzig"
        if num < 100:
            tens, ones = divmod(num, 10)
            if ones == 0:
                return self.TENS[tens]
            # "eins" becomes "ein" in compound numbers
            ones_word = "ein" if ones == 1 else self.ONES[ones]
            return f"{ones_word}und{self.TENS[tens]}"
        # 100-999
        if num < 1000:
            hundreds, remainder = divmod(num, 100)
            hundreds_word = (
                "einhundert" if hundreds == 1
                else f"{self.ONES[hundreds]}hundert"
            )
            if remainder == 0:
                return hundreds_word
            return f"{hundreds_word}{self._number_to_words(remainder)}"
        # 1000 and above: split at the largest scale that fits
        for scale, singular, plural in self.SCALES:
            if num < scale:
                continue
            quotient, remainder = divmod(num, scale)
            rest = self._number_to_words(remainder) if remainder else ""
            if scale == 1000:
                # Thousands are written as one word. A trailing "eins" in the
                # multiplier loses its "s": 1000 -> "eintausend",
                # 101000 -> "einhunderteintausend".
                prefix = self._number_to_words(quotient)
                if prefix.endswith("eins"):
                    prefix = prefix[:-1]
                return f"{prefix}tausend{rest}"
            # Millions / billions are separate nouns with singular and plural.
            if quotient == 1:
                head = f"eine {singular}"
            else:
                head = f"{self._number_to_words(quotient)} {plural}"
            return f"{head} {rest}" if rest else head
        return str(num)

    def _year_to_words(self, year: int) -> str:
        """
        Convert a year to its German spoken form.

        Years 1100-1999 are read in hundreds ("neunzehnhundertvierundneunzig",
        "neunzehnhundert"); every other year is read as a plain cardinal
        number (1066 -> "eintausendsechsundsechzig", 2019 ->
        "zweitausendneunzehn").

        Args:
            year: Year as integer (e.g., 1994, 2019)

        Returns:
            German spoken form of the year
        """
        if 1100 <= year <= 1999:
            hundreds, remainder = divmod(year, 100)
            tail = self._number_to_words(remainder) if remainder else ""
            return f"{self._number_to_words(hundreds)}hundert{tail}"
        return self._number_to_words(year)

    def _ordinal_to_words(self, num: int) -> str:
        """
        Convert a number to its German ordinal form.

        1-19 come from the ORDINAL_ONES / ORDINAL_TEENS tables; from 20 on the
        ordinal is the cardinal plus "ster" ("zwanzigster",
        "einunddreißigster").

        Args:
            num: Integer to convert to ordinal

        Returns:
            German ordinal word
        """
        if num < 1:
            return self._number_to_words(num) + "ter"
        if num < 10:
            return self.ORDINAL_ONES.get(num, self._number_to_words(num) + "ter")
        if num < 20:
            return self.ORDINAL_TEENS.get(num, self._number_to_words(num) + "ter")
        # Above 99 a trailing 1-19 keeps its table form ("einhunderterster").
        # NOTE(review): only day/month values (two digits) reach this method
        # from preprocess(); larger ordinals are best effort.
        remainder = num % 100
        if num >= 100 and 1 <= remainder < 20:
            return (
                self._number_to_words(num - remainder)
                + self._ordinal_to_words(remainder)
            )
        return self._number_to_words(num) + "ster"

    @staticmethod
    def _is_calendar_day(day: int, month: int) -> bool:
        """Return True when day is 1-31 and month is 1-12."""
        return 1 <= day <= 31 and 1 <= month <= 12

    def _process_decimal(self, match: re.Match) -> str:
        """
        Process decimal numbers like "3,1415" -> "drei komma eins vier eins fünf"

        Args:
            match: Regex match object containing the decimal number

        Returns:
            Spoken form of the decimal number
        """
        integer_text, _, fraction_text = match.group(0).partition(',')
        words = [self._number_to_words(int(integer_text) if integer_text else 0)]
        # The fractional part is read digit by digit (including zeros)
        if fraction_text:
            words.append("komma")
            words.extend(self.DIGITS[int(digit)] for digit in fraction_text)
        return " ".join(words)

    def _process_date(self, match: re.Match) -> str:
        """
        Process dates in various formats:
        - "20.11.2019" -> "zwanzigster elfter zweitausendneunzehn"
        - "1. Jan. 1994" -> "erster Januar neunzehnhundertvierundneunzig"
        - "5.12." -> "fünfter zwölfter"

        Purely numeric matches whose day is not 1-31 or whose month is not
        1-12 are returned unchanged, so that things like "3.14." are left to
        the plain number handling.

        Args:
            match: Regex match object containing the date

        Returns:
            Spoken form of the date, or the matched text if it is not a date
        """
        date_str = match.group(0)

        # Pattern 1: DD.MM.YYYY or D.M.YYYY
        full = self._DATE_FULL_PARTS.match(date_str)
        if full:
            day, month, year = (int(part) for part in full.groups())
            if not self._is_calendar_day(day, month):
                return date_str
            return (
                f"{self._ordinal_to_words(day)} "
                f"{self._ordinal_to_words(month)} "
                f"{self._year_to_words(year)}"
            )

        # Pattern 2: D. Mon. YYYY or DD. Month YYYY
        named = self._DATE_NAMED_PARTS.match(date_str)
        if named:
            day = int(named.group(1))
            word = named.group(2)
            year = int(named.group(3))
            # Known month names/abbreviations are expanded; any other word is
            # kept exactly as written (including its capitalization).
            month_word = self._MONTH_LOOKUP.get(word.lower(), word)
            return (
                f"{self._ordinal_to_words(day)} {month_word} "
                f"{self._year_to_words(year)}"
            )

        # Pattern 3: Just DD.MM. or D.M. (without year)
        short = self._DATE_SHORT_PARTS.match(date_str)
        if short:
            day, month = (int(part) for part in short.groups())
            if not self._is_calendar_day(day, month):
                return date_str
            return f"{self._ordinal_to_words(day)} {self._ordinal_to_words(month)}"

        return date_str

    def _process_standalone_number(self, match: re.Match) -> str:
        """
        Process standalone cardinal numbers.

        Args:
            match: Regex match object containing the number

        Returns:
            Spoken form of the number
        """
        return self._number_to_words(int(match.group(0)))

    def preprocess(self, text: str) -> str:
        """
        Main preprocessing function that applies all transformations.

        Args:
            text: Input German text

        Returns:
            Preprocessed text with numbers, dates, etc. converted to spoken
            form and runs of whitespace collapsed to single spaces
        """
        # Order matters! More specific patterns first.
        # 1. Dates (must come before decimal and integer processing)
        text = self._RE_DATE_FULL.sub(self._process_date, text)
        text = self._RE_DATE_NAMED.sub(self._process_date, text)
        text = self._RE_DATE_SHORT.sub(self._process_date, text)

        # 2. Drop German thousands separators: "1.234,50" -> "1234,50"
        text = self._RE_THOUSANDS.sub(
            lambda m: m.group(0).replace(".", ""), text
        )

        # 3. Spell out a leading minus sign: "-5" -> "minus 5"
        text = self._RE_MINUS.sub("minus ", text)

        # 4. Decimal numbers (before integers), e.g. 3,1415 or 0,5
        text = self._RE_DECIMAL.sub(self._process_decimal, text)

        # 5. Remaining standalone integers (cardinal numbers)
        text = self._RE_INTEGER.sub(self._process_standalone_number, text)

        # 6. Clean up any extra whitespace
        return self._RE_SPACES.sub(' ', text).strip()
+# Convenience function for easy import and use
def preprocess_german_text(text: str) -> str:
    """
    Run ``text`` through a freshly created ``GermanTextPreprocessor``.

    Args:
        text: Input German text

    Returns:
        The result of ``GermanTextPreprocessor.preprocess`` for ``text``
    """
    return GermanTextPreprocessor().preprocess(text)
+# Example usage and testing
if __name__ == "__main__":
    # Small manual demo: print each sample sentence next to its normalized form.
    demo_sentences = (
        "Die Zahl ist 3",
        "Heute ist der 20.11.2019",
        "Geboren am 1. Jan. 1994",
        "Pi ist ungefähr 3,1415",
        "Es sind 42 Studenten in der Klasse",
        "Das Jahr 2023 war interessant",
        "Der Preis beträgt 19,99 Euro",
        "Am 5.12. ist Nikolaus",
        "Die Temperatur ist -5 Grad",
        "Es gibt 1000000 Möglichkeiten",
        "Im Jahr 1789 begann die Revolution",
    )
    normalizer = GermanTextPreprocessor()
    separator = "-" * 80

    print("German Text Preprocessing Examples:")
    print("=" * 80)
    for sentence in demo_sentences:
        spoken = normalizer.preprocess(sentence)
        print(f"Input:  {sentence}")
        print(f"Output: {spoken}")
        print(separator)