Spaces:
Running
on
Zero
Running
on
Zero
"""
German Text Preprocessing Module for TTS.

Handles normalization of numbers, dates, decimal numbers, and other text
elements to their spoken form in German.
"""
| import re | |
class GermanTextPreprocessor:
    """
    Preprocesses German text for TTS by converting numbers, dates, and special
    characters into their spoken equivalents.

    Handled elements, in the order they are applied by ``preprocess``:
      1. numeric dates with a year      ("20.11.2019")
      2. dates with a month name        ("1. Jan. 1994", "1. Januar 1994")
      3. numeric dates without a year   ("5.12.")
      4. decimal numbers                ("3,1415", "-0,5")
      5. integers                       ("42", "-5")

    Ordinals are produced with the "-er" ending ("erster", "zwanzigster").
    """

    # Number words for German. 0 maps to "" because it is never looked up on
    # its own: the standalone zero is returned explicitly as "null".
    ONES = {
        0: "", 1: "eins", 2: "zwei", 3: "drei", 4: "vier",
        5: "fünf", 6: "sechs", 7: "sieben", 8: "acht", 9: "neun"
    }
    # Digit names for reading individual digits (including zero)
    DIGITS = {
        0: "null", 1: "eins", 2: "zwei", 3: "drei", 4: "vier",
        5: "fünf", 6: "sechs", 7: "sieben", 8: "acht", 9: "neun"
    }
    TEENS = {
        10: "zehn", 11: "elf", 12: "zwölf", 13: "dreizehn",
        14: "vierzehn", 15: "fünfzehn", 16: "sechzehn",
        17: "siebzehn", 18: "achtzehn", 19: "neunzehn"
    }
    TENS = {
        2: "zwanzig", 3: "dreißig", 4: "vierzig",
        5: "fünfzig", 6: "sechzig", 7: "siebzig",
        8: "achtzig", 9: "neunzig"
    }
    # (value, singular, plural), largest first: _number_to_words relies on
    # this descending order.
    SCALES = [
        (1000000000, "Milliarde", "Milliarden"),
        (1000000, "Million", "Millionen"),
        (1000, "tausend", "tausend")
    ]
    # Irregular / short ordinal forms
    ORDINAL_ONES = {
        1: "erster", 2: "zweiter", 3: "dritter", 4: "vierter",
        5: "fünfter", 6: "sechster", 7: "siebter", 8: "achter", 9: "neunter"
    }
    ORDINAL_TEENS = {
        10: "zehnter", 11: "elfter", 12: "zwölfter", 13: "dreizehnter",
        14: "vierzehnter", 15: "fünfzehnter", 16: "sechzehnter",
        17: "siebzehnter", 18: "achtzehnter", 19: "neunzehnter"
    }
    # Month names
    MONTHS = {
        1: "Januar", 2: "Februar", 3: "März", 4: "April",
        5: "Mai", 6: "Juni", 7: "Juli", 8: "August",
        9: "September", 10: "Oktober", 11: "November", 12: "Dezember"
    }
    MONTH_ABBREV = {
        "jan": "Januar", "feb": "Februar", "mär": "März", "apr": "April",
        "mai": "Mai", "jun": "Juni", "jul": "Juli", "aug": "August",
        "sep": "September", "sept": "September", "okt": "Oktober",
        "nov": "November", "dez": "Dezember"
    }

    # Lower-cased spelling (full name or abbreviation) -> canonical month name.
    _MONTH_LOOKUP = {
        **{name.lower(): name for name in MONTHS.values()},
        **MONTH_ABBREV,
    }

    # Compiled once at class creation instead of on every call.
    # DD.MM.YYYY or D.M.YYYY
    _NUMERIC_DATE_RE = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b')
    # D. Mon. YYYY or DD. Month YYYY. Only known month spellings match, so an
    # arbitrary word between a number and a year is not mistaken for a date.
    # Longest spellings come first so "september" wins over "sept"/"sep".
    _NAMED_DATE_RE = re.compile(
        r'\b(\d{1,2})\.\s*('
        + '|'.join(map(re.escape, sorted(_MONTH_LOOKUP, key=len, reverse=True)))
        + r')\.?\s*(\d{4})\b',
        re.IGNORECASE,
    )
    # DD.MM. or D.M. (no year)
    _SHORT_DATE_RE = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.')
    # A leading "-" counts as a sign only when it does not directly follow a
    # word character, so ranges such as "10-20" keep their hyphen.
    _DECIMAL_RE = re.compile(r'(?:(?<!\w)-)?\b\d+,\d+\b')
    _INTEGER_RE = re.compile(r'(?:(?<!\w)-)?\b\d+\b')

    def __init__(self):
        """Initialize the German text preprocessor (no state is required)."""

    def _number_to_words(self, num: int) -> str:
        """
        Convert a cardinal number to its German word form.

        Args:
            num: Integer to convert (negative values are prefixed with "minus")
        Returns:
            German word representation of the number
        """
        if num == 0:
            return "null"
        if num < 0:
            return "minus " + self._number_to_words(-num)
        # 1-9
        if num < 10:
            return self.ONES[num]
        # 10-19
        if num < 20:
            return self.TEENS[num]
        # 20-99: ones digit is spoken first ("einundzwanzig")
        if num < 100:
            tens, ones = divmod(num, 10)
            if ones == 0:
                return self.TENS[tens]
            # "eins" becomes "ein" inside a compound
            ones_word = "ein" if ones == 1 else self.ONES[ones]
            return f"{ones_word}und{self.TENS[tens]}"
        # 100-999
        if num < 1000:
            hundreds, remainder = divmod(num, 100)
            hundreds_word = (
                "einhundert" if hundreds == 1 else f"{self.ONES[hundreds]}hundert"
            )
            if remainder == 0:
                return hundreds_word
            return f"{hundreds_word}{self._number_to_words(remainder)}"
        # 1000 and above: use the largest scale that fits
        for scale, singular, plural in self.SCALES:
            if num < scale:
                continue
            quotient, remainder = divmod(num, scale)
            quotient_words = self._number_to_words(quotient)
            if scale == 1000:
                # Thousands are written as one word, and a trailing "eins"
                # drops its "s": "eintausend", "einhunderteintausend".
                if quotient_words.endswith("eins"):
                    quotient_words = quotient_words[:-1]
                head = f"{quotient_words}tausend"
                separator = ""
            else:
                # Million/Milliarde are separate nouns with singular/plural.
                if quotient == 1:
                    head = f"eine {singular}"
                else:
                    head = f"{quotient_words} {plural}"
                separator = " "
            if remainder == 0:
                return head
            return f"{head}{separator}{self._number_to_words(remainder)}"
        # Only reachable if SCALES has been overridden without a 1000 entry.
        return str(num)

    def _year_to_words(self, year: int) -> str:
        """
        Convert a year to its German spoken form.

        Years 1100-1999 are read in hundreds ("neunzehnhundertvierundneunzig",
        "neunzehnhundert"); every other year is read as a plain cardinal
        ("eintausendsechsundsechzig", "zweitausendneunzehn").

        Args:
            year: Year as integer (e.g., 1994, 2019)
        Returns:
            German spoken form of the year
        """
        if 1100 <= year <= 1999:
            hundreds, remainder = divmod(year, 100)
            spoken = f"{self._number_to_words(hundreds)}hundert"
            if remainder:
                spoken += self._number_to_words(remainder)
            return spoken
        return self._number_to_words(year)

    def _ordinal_to_words(self, num: int) -> str:
        """
        Convert a number to its German ordinal form (ending in "-er").

        1-19 use the lookup tables. From 20 upwards the suffix is "ster"
        ("zwanzigster"), except that a number whose last two digits are 1-19
        ends in that small ordinal ("einhunderterster"). Scale nouns from a
        million upwards receive no special inflection.

        Args:
            num: Integer to convert to ordinal
        Returns:
            German ordinal word
        """
        if num < 1:
            return self._number_to_words(num) + "ter"
        if num < 10:
            return self.ORDINAL_ONES[num]
        if num < 20:
            return self.ORDINAL_TEENS[num]
        last_two = num % 100
        if 1 <= last_two < 20:
            base = num - last_two
            # The cardinal ends in a separate noun ("eine Million") only when
            # the base is a whole multiple of a million; keep the space then.
            separator = " " if base % 1000000 == 0 else ""
            return (
                f"{self._number_to_words(base)}{separator}"
                f"{self._ordinal_to_words(last_two)}"
            )
        return self._number_to_words(num) + "ster"

    def _process_decimal(self, match: re.Match) -> str:
        """
        Process decimal numbers like "3,1415" -> "drei komma eins vier eins fünf"

        A leading "-" is spoken as "minus", also when the integer part is zero
        ("-0,5" -> "minus null komma fünf").

        Args:
            match: Regex match object containing the decimal number
        Returns:
            Spoken form of the decimal number
        """
        full_number = match.group(0)
        is_negative = full_number.startswith('-')
        parts = full_number.lstrip('-').split(',')

        # The sign is handled here rather than through int(), because
        # int("-0") is 0 and would silently lose the minus.
        words = ["minus"] if is_negative else []
        words.append(self._number_to_words(int(parts[0]) if parts[0] else 0))

        # Decimal part - read digit by digit (including zeros)
        if len(parts) > 1 and parts[1]:
            words.append("komma")
            words.extend(self.DIGITS[int(digit)] for digit in parts[1])
        return " ".join(words)

    def _process_date(self, match: re.Match) -> str:
        """
        Process dates in various formats:
        - "20.11.2019" -> "zwanzigster elfter zweitausendneunzehn"
        - "1. Jan. 1994" -> "erster Januar neunzehnhundertvierundneunzig"
        - "5.12." -> "fünfter zwölfter"

        Text that is not a plausible date (day outside 1-31, month outside
        1-12, unknown month name) is returned unchanged so the later number
        rules can handle it.

        Args:
            match: Regex match object containing the date
        Returns:
            Spoken form of the date, or the matched text unchanged
        """
        date_str = match.group(0)

        # Pattern 1: DD.MM.YYYY or D.M.YYYY
        numeric = self._NUMERIC_DATE_RE.match(date_str)
        if numeric:
            day, month, year = (int(part) for part in numeric.groups())
            if not (1 <= day <= 31 and 1 <= month <= 12):
                return date_str
            return (
                f"{self._ordinal_to_words(day)} "
                f"{self._ordinal_to_words(month)} "
                f"{self._year_to_words(year)}"
            )

        # Pattern 2: D. Mon. YYYY or DD. Month YYYY
        named = self._NAMED_DATE_RE.match(date_str)
        if named:
            day = int(named.group(1))
            month_word = self._MONTH_LOOKUP.get(named.group(2).lower())
            if month_word is None or not 1 <= day <= 31:
                return date_str
            year_word = self._year_to_words(int(named.group(3)))
            return f"{self._ordinal_to_words(day)} {month_word} {year_word}"

        # Pattern 3: Just DD.MM. or D.M. (without year)
        short = self._SHORT_DATE_RE.match(date_str)
        if short:
            day, month = int(short.group(1)), int(short.group(2))
            if not (1 <= day <= 31 and 1 <= month <= 12):
                return date_str
            return f"{self._ordinal_to_words(day)} {self._ordinal_to_words(month)}"

        return date_str

    def _process_standalone_number(self, match: re.Match) -> str:
        """
        Process standalone cardinal numbers.

        Args:
            match: Regex match object containing the number (optionally with
                a leading "-")
        Returns:
            Spoken form of the number
        """
        return self._number_to_words(int(match.group(0)))

    def preprocess(self, text: str) -> str:
        """
        Main preprocessing function that applies all transformations.

        Args:
            text: Input German text
        Returns:
            Preprocessed text with numbers, dates, etc. converted to spoken
            form and runs of whitespace collapsed to single spaces
        """
        # Order matters! More specific patterns first.
        # 1. Dates (must come before decimal and integer processing)
        text = self._NUMERIC_DATE_RE.sub(self._process_date, text)
        text = self._NAMED_DATE_RE.sub(self._process_date, text)
        text = self._SHORT_DATE_RE.sub(self._process_date, text)
        # 2. Decimal numbers (before integers), e.g. 3,1415 or 0,5
        text = self._DECIMAL_RE.sub(self._process_decimal, text)
        # 3. Remaining integers not consumed by the date/decimal rules
        text = self._INTEGER_RE.sub(self._process_standalone_number, text)
        # 4. Clean up any extra whitespace
        return re.sub(r'\s+', ' ', text).strip()
| # Convenience function for easy import and use | |
def preprocess_german_text(text: str) -> str:
    """
    Normalize German text in one call, without managing a preprocessor object.

    A fresh GermanTextPreprocessor is created for each call and its
    ``preprocess`` method is applied to the input.

    Args:
        text: Input German text
    Returns:
        The result of GermanTextPreprocessor.preprocess for ``text``
    """
    return GermanTextPreprocessor().preprocess(text)
| # Example usage and testing | |
if __name__ == "__main__":
    # Demo: print each sample sentence next to its normalized form.
    demo_inputs = (
        "Die Zahl ist 3",
        "Heute ist der 20.11.2019",
        "Geboren am 1. Jan. 1994",
        "Pi ist ungefähr 3,1415",
        "Es sind 42 Studenten in der Klasse",
        "Das Jahr 2023 war interessant",
        "Der Preis beträgt 19,99 Euro",
        "Am 5.12. ist Nikolaus",
        "Die Temperatur ist -5 Grad",
        "Es gibt 1000000 Möglichkeiten",
        "Im Jahr 1789 begann die Revolution",
    )
    normalizer = GermanTextPreprocessor()
    entry_rule = "-" * 80

    print("German Text Preprocessing Examples:")
    print("=" * 80)
    for sample in demo_inputs:
        spoken = normalizer.preprocess(sample)
        print(f"Input: {sample}")
        print(f"Output: {spoken}")
        print(entry_rule)