Warholt committed on
Commit
5ecf3a1
·
1 Parent(s): b24d0d8

add text preprocessing and onnx cpu inference alternative

Browse files
Files changed (2) hide show
  1. app.py +134 -9
  2. german_text_preprocessor.py +397 -0
app.py CHANGED
@@ -3,8 +3,11 @@ import torch
3
  import torch._inductor
4
  import spaces
5
  from char_tokenizers import GermanCharsTokenizer
 
6
  from huggingface_hub import hf_hub_download
7
  import os
 
 
8
 
9
 
10
  # --- Download Model Files from Hugging Face ---
@@ -21,11 +24,15 @@ def download_models():
21
  "karlsson_fastpitch_encoder.pt2",
22
  "karlsson_fastpitch_decoder.pt2",
23
  "karlsson_hifigan.pt2",
 
 
24
  ],
25
  "Warholt/CaroTTS-60M-DE-Caro": [
26
  "caro_fastpitch_encoder.pt2",
27
  "caro_fastpitch_decoder.pt2",
28
  "caro_hifigan.pt2",
 
 
29
  ],
30
  }
31
 
@@ -97,7 +104,62 @@ MODELS = {
97
  },
98
  }
99
 
100
- # --- 3. Inference Function ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  @spaces.GPU(duration=60)
102
  def synthesize_speech(text: str, voice: str, pace: float = 1.0):
103
  """
@@ -105,10 +167,13 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
105
  for the duration of this function.
106
  """
107
  if not text.strip():
108
- return None
 
 
 
109
 
110
  # Tokenize text
111
- tokens = TOKENIZER.encode(text)
112
  tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")
113
 
114
  # Prepare control parameters
@@ -137,15 +202,29 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
137
  sample_rate = 44100
138
  audio_array = audio.squeeze().cpu().numpy()
139
 
140
- return (sample_rate, audio_array)
141
 
142
 
143
- # --- 4. Gradio Interface ---
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
145
  gr.Markdown(
146
  """
147
  # 🎙️ German Text-to-Speech
148
  Generate German speech using two different voices: **Caro** and **Karlsson**.
 
149
  """
150
  )
151
 
@@ -153,7 +232,7 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
153
  with gr.Column():
154
  text_input = gr.Textbox(
155
  label="Text to synthesize",
156
- value="Hallo! Willkommen zur deutschen Sprachsynthese.",
157
  lines=3,
158
  )
159
  voice_dropdown = gr.Dropdown(
@@ -162,15 +241,61 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
162
  pace_slider = gr.Slider(
163
  minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
164
  )
 
 
 
 
 
165
  generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
166
 
167
  with gr.Column():
 
 
 
 
 
168
  audio_output = gr.Audio(label="Generated Audio", type="numpy")
169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  generate_btn.click(
171
- fn=synthesize_speech,
172
- inputs=[text_input, voice_dropdown, pace_slider],
173
- outputs=audio_output,
174
  )
175
 
176
  if __name__ == "__main__":
 
3
  import torch._inductor
4
  import spaces
5
  from char_tokenizers import GermanCharsTokenizer
6
+ from german_text_preprocessor import preprocess_german_text
7
  from huggingface_hub import hf_hub_download
8
  import os
9
+ import onnxruntime as ort
10
+ import numpy as np
11
 
12
 
13
  # --- Download Model Files from Hugging Face ---
 
24
  "karlsson_fastpitch_encoder.pt2",
25
  "karlsson_fastpitch_decoder.pt2",
26
  "karlsson_hifigan.pt2",
27
+ "karlsson_fastpitch.onnx",
28
+ "karlsson_hifigan.onnx",
29
  ],
30
  "Warholt/CaroTTS-60M-DE-Caro": [
31
  "caro_fastpitch_encoder.pt2",
32
  "caro_fastpitch_decoder.pt2",
33
  "caro_hifigan.pt2",
34
+ "caro_fastpitch.onnx",
35
+ "caro_hifigan.onnx",
36
  ],
37
  }
38
 
 
104
  },
105
  }
106
 
107
# Initialize ONNX sessions for CPU inference.
# The model file for every (voice, stage) pair is listed once here; the
# sessions are then created from this table in the same order as listed.
_ONNX_MODEL_FILES = {
    "Caro": {
        "fastpitch": "aot_package/caro_fastpitch.onnx",
        "hifigan": "aot_package/caro_hifigan.onnx",
    },
    "Karlsson": {
        "fastpitch": "aot_package/karlsson_fastpitch.onnx",
        "hifigan": "aot_package/karlsson_hifigan.onnx",
    },
}

# voice name -> {"fastpitch": session, "hifigan": session}
ONNX_SESSIONS = {
    voice_name: {
        stage_name: ort.InferenceSession(model_path)
        for stage_name, model_path in stage_files.items()
    }
    for voice_name, stage_files in _ONNX_MODEL_FILES.items()
}
118
+
119
+
120
+ # --- 3. CPU Inference Function (ONNX) ---
121
def synthesize_speech_cpu(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech on the CPU with the ONNX FastPitch + HiFiGAN models.

    Args:
        text: Raw input text; it is normalized with ``preprocess_german_text``
            before tokenization.
        voice: Key into ``ONNX_SESSIONS`` selecting the voice.
        pace: Value written into every position of the "pace" input.

    Returns:
        ``((sample_rate, audio_array), preprocessed_text)``, or ``(None, "")``
        when ``text`` is empty or whitespace only.
    """
    if text.strip() == "":
        return None, ""

    # Normalize first so the returned text is exactly what gets tokenized.
    spoken_text = preprocess_german_text(text)
    token_ids = TOKENIZER.encode(spoken_text)
    n_tokens = len(token_ids)

    # One batch row; pace is constant per token and pitch is all zeros.
    fastpitch_feed = {
        "text": np.array([token_ids], dtype=np.int64),
        "pace": np.full((1, n_tokens), pace, dtype=np.float32),
        "pitch": np.zeros((1, n_tokens), dtype=np.float32),
    }

    voice_sessions = ONNX_SESSIONS[voice]

    # Stage 1: FastPitch; its first output is fed to HiFiGAN as "spec".
    spectrogram = voice_sessions["fastpitch"].run(None, fastpitch_feed)[0]

    # Stage 2: HiFiGAN; its first output is used as the audio.
    waveform = voice_sessions["hifigan"].run(None, {"spec": spectrogram})[0]

    # Gradio's numpy audio format is a (sample_rate, samples) tuple.
    return (44100, waveform.squeeze()), spoken_text
160
+
161
+
162
+ # --- 4. GPU Inference Function ---
163
  @spaces.GPU(duration=60)
164
  def synthesize_speech(text: str, voice: str, pace: float = 1.0):
165
  """
 
167
  for the duration of this function.
168
  """
169
  if not text.strip():
170
+ return None, ""
171
+
172
+ # Preprocess text: convert numbers, dates, decimals to spoken form
173
+ preprocessed_text = preprocess_german_text(text)
174
 
175
  # Tokenize text
176
+ tokens = TOKENIZER.encode(preprocessed_text)
177
  tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")
178
 
179
  # Prepare control parameters
 
202
  sample_rate = 44100
203
  audio_array = audio.squeeze().cpu().numpy()
204
 
205
+ return (sample_rate, audio_array), preprocessed_text
206
 
207
 
208
+ # --- 5. Combined Inference Function ---
209
def synthesize_speech_combined(
    text: str, voice: str, pace: float = 1.0, use_gpu: bool = False
):
    """
    Dispatch a synthesis request to the GPU or the CPU implementation.

    Args:
        text: Text to synthesize.
        voice: Voice name forwarded to the selected backend.
        pace: Speaking-rate value forwarded to the selected backend.
        use_gpu: When truthy, ``synthesize_speech`` is used; otherwise
            ``synthesize_speech_cpu``.

    Returns:
        Whatever the selected backend returns.
    """
    backend = synthesize_speech if use_gpu else synthesize_speech_cpu
    return backend(text, voice, pace)
219
+
220
+
221
+ # --- 6. Gradio Interface ---
222
  with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
223
  gr.Markdown(
224
  """
225
  # 🎙️ German Text-to-Speech
226
  Generate German speech using two different voices: **Caro** and **Karlsson**.
227
+ Numbers, dates, and decimals are automatically converted to spoken form.
228
  """
229
  )
230
 
 
232
  with gr.Column():
233
  text_input = gr.Textbox(
234
  label="Text to synthesize",
235
+ value="Guten Tag. Herzlich Willkommen zu dieser Demonstration deutscher Sprachsynthese-Modelle. Es stehen Ihnen zwei Stimmen zur Auswahl: Caro und Karlsson. Probieren Sie es aus!",
236
  lines=3,
237
  )
238
  voice_dropdown = gr.Dropdown(
 
241
  pace_slider = gr.Slider(
242
  minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
243
  )
244
+ use_gpu_checkbox = gr.Checkbox(
245
+ label="Use GPU (ZeroGPU)",
246
+ value=False,
247
+ info="Enable for faster inference on GPU. Disable for CPU inference (slower but always available).",
248
+ )
249
  generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
250
 
251
  with gr.Column():
252
+ preprocessed_output = gr.Textbox(
253
+ label="Preprocessed Text (what will be spoken)",
254
+ lines=3,
255
+ interactive=False,
256
+ )
257
  audio_output = gr.Audio(label="Generated Audio", type="numpy")
258
 
259
+ # Example sentences section
260
+ gr.Markdown("### 📝 Example Sentences")
261
+ gr.Examples(
262
+ examples=[
263
+ [
264
+ "Die Bundeskanzlerin empfing heute den französischen Präsidenten zu einem Staatsbesuch in Berlin. Die Gespräche dauerten mehr als 3 Stunden."
265
+ ],
266
+ [
267
+ "Am 15. März 2024 wird die neue Ausstellung im Museum eröffnet. Der Eintritt kostet 12,50 Euro für Erwachsene."
268
+ ],
269
+ [
270
+ "In der verzauberten Bibliothek entdeckte die junge Magierin ein uraltes Buch, dessen Seiten im Mondlicht golden schimmerten."
271
+ ],
272
+ [
273
+ "Der mutige Ritter zog sein Schwert und stellte sich dem feuerspeienden Drachen. Ein epischer Kampf begann auf dem Gipfel des Berges."
274
+ ],
275
+ [
276
+ "Wussten Sie, dass die Große Mauer in China über 21000 Kilometer lang ist? Sie wurde über 2000 Jahre hinweg erbaut."
277
+ ],
278
+ [
279
+ "Der menschliche Körper besteht zu etwa 60 Prozent aus Wasser. Ein erwachsener Mensch hat ungefähr 100000 Kilometer Blutgefäße."
280
+ ],
281
+ [
282
+ "Die Temperaturen steigen heute auf bis zu 28 Grad Celsius. Am Wochenende wird mit Schauern und Gewittern gerechnet."
283
+ ],
284
+ [
285
+ "Der Dax schloss heute bei 18456,73 Punkten, ein Plus von 2,3 Prozent. Der Euro notiert bei 1,0892 Dollar."
286
+ ],
287
+ [
288
+ "Es war einmal in einem fernen Königreich, wo die Zeit anders verlief und die Sterne näher schienen. Dort lebte eine weise Eule, die alle Geheimnisse des Waldes kannte."
289
+ ],
290
+ ],
291
+ inputs=text_input,
292
+ label="Try these examples:",
293
+ )
294
+
295
  generate_btn.click(
296
+ fn=synthesize_speech_combined,
297
+ inputs=[text_input, voice_dropdown, pace_slider, use_gpu_checkbox],
298
+ outputs=[audio_output, preprocessed_output],
299
  )
300
 
301
  if __name__ == "__main__":
german_text_preprocessor.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ German Text Preprocessing Module for TTS
3
+ Handles normalization of numbers, dates, decimal numbers, and other text elements
4
+ to their spoken form in German.
5
+ """
6
+
7
+ import re
8
+
9
+
10
class GermanTextPreprocessor:
    """
    Preprocesses German text for TTS by converting numbers, dates, and special
    characters into their spoken equivalents.

    The public entry point is :meth:`preprocess`; the underscore-prefixed
    methods are the individual conversion steps it is built from.
    """

    # Number words for German
    ONES = {
        0: "", 1: "eins", 2: "zwei", 3: "drei", 4: "vier",
        5: "fünf", 6: "sechs", 7: "sieben", 8: "acht", 9: "neun"
    }

    # Digit names for reading individual digits (including zero)
    DIGITS = {
        0: "null", 1: "eins", 2: "zwei", 3: "drei", 4: "vier",
        5: "fünf", 6: "sechs", 7: "sieben", 8: "acht", 9: "neun"
    }

    TEENS = {
        10: "zehn", 11: "elf", 12: "zwölf", 13: "dreizehn",
        14: "vierzehn", 15: "fünfzehn", 16: "sechzehn",
        17: "siebzehn", 18: "achtzehn", 19: "neunzehn"
    }

    TENS = {
        2: "zwanzig", 3: "dreißig", 4: "vierzig",
        5: "fünfzig", 6: "sechzig", 7: "siebzig",
        8: "achtzig", 9: "neunzig"
    }

    # (value, singular, plural), ordered from largest to smallest
    SCALES = [
        (1000000000, "Milliarde", "Milliarden"),
        (1000000, "Million", "Millionen"),
        (1000, "tausend", "tausend")
    ]

    # Ordinal number endings
    ORDINAL_ONES = {
        1: "erster", 2: "zweiter", 3: "dritter", 4: "vierter",
        5: "fünfter", 6: "sechster", 7: "siebter", 8: "achter", 9: "neunter"
    }

    ORDINAL_TEENS = {
        10: "zehnter", 11: "elfter", 12: "zwölfter", 13: "dreizehnter",
        14: "vierzehnter", 15: "fünfzehnter", 16: "sechzehnter",
        17: "siebzehnter", 18: "achtzehnter", 19: "neunzehnter"
    }

    # Month names
    MONTHS = {
        1: "Januar", 2: "Februar", 3: "März", 4: "April",
        5: "Mai", 6: "Juni", 7: "Juli", 8: "August",
        9: "September", 10: "Oktober", 11: "November", 12: "Dezember"
    }

    MONTH_ABBREV = {
        "jan": "Januar", "feb": "Februar", "mär": "März", "apr": "April",
        "mai": "Mai", "jun": "Juni", "jul": "Juli", "aug": "August",
        "sep": "September", "sept": "September", "okt": "Oktober",
        "nov": "November", "dez": "Dezember"
    }

    # Lower-cased lookup covering full month names AND abbreviations, so that
    # "15. März 2024" is recognised just like "15. Mär. 2024".
    _MONTH_LOOKUP = {
        **{name.lower(): name for name in MONTHS.values()},
        **MONTH_ABBREV,
    }

    # Patterns used by _process_date to work out which date shape it was given.
    _NUMERIC_DATE_RE = re.compile(r'(\d{1,2})\.(\d{1,2})\.(\d{4})')
    _NAMED_DATE_RE = re.compile(r'(\d{1,2})\.\s*([A-Za-zäöüÄÖÜ]+)\.?\s*(\d{4})')
    _SHORT_DATE_RE = re.compile(r'(\d{1,2})\.(\d{1,2})\.')

    # A minus sign (ASCII hyphen or U+2212) that starts a token and is directly
    # followed by a digit. Requiring "start of token" keeps ranges such as
    # "10-20" and hyphenated words untouched.
    _LEADING_MINUS_RE = re.compile(r'(?<!\S)[-\u2212](?=\d)')

    def __init__(self):
        """Initialize the German text preprocessor."""
        pass

    def _number_to_words(self, num: int) -> str:
        """
        Convert a cardinal number to its German word form.

        Args:
            num: Integer to convert

        Returns:
            German word representation of the number
        """
        if num == 0:
            return "null"

        if num < 0:
            return "minus " + self._number_to_words(-num)

        # Handle 1-9
        if num < 10:
            return self.ONES[num]

        # Handle 10-19
        if num < 20:
            return self.TEENS[num]

        # Handle 20-99
        if num < 100:
            tens, ones = divmod(num, 10)
            if ones == 0:
                return self.TENS[tens]
            # Special case: "eins" becomes "ein" in compound numbers
            ones_word = "ein" if ones == 1 else self.ONES[ones]
            return f"{ones_word}und{self.TENS[tens]}"

        # Handle 100-999
        if num < 1000:
            hundreds, remainder = divmod(num, 100)
            hundreds_word = (
                "einhundert" if hundreds == 1 else f"{self.ONES[hundreds]}hundert"
            )
            if remainder == 0:
                return hundreds_word
            return f"{hundreds_word}{self._number_to_words(remainder)}"

        # Handle larger numbers using scales (largest scale first)
        for scale, singular, plural in self.SCALES:
            if num < scale:
                continue

            quotient, remainder = divmod(num, scale)
            quotient_words = self._number_to_words(quotient)

            if scale == 1000:
                # Thousands are written as one word. A leading part that ends
                # in "eins" loses its "s" before "tausend":
                # 1000 -> "eintausend", 101000 -> "einhunderteintausend".
                if quotient_words.endswith("eins"):
                    quotient_words = quotient_words[:-1]
                head = f"{quotient_words}{singular}"
                joiner = ""
            else:
                # Million/Milliarde are nouns: separate words, with plural.
                if quotient == 1:
                    head = f"eine {singular}"
                else:
                    head = f"{quotient_words} {plural}"
                joiner = " "

            if remainder == 0:
                return head
            return f"{head}{joiner}{self._number_to_words(remainder)}"

        # Not reachable for integers; kept as a defensive fallback.
        return str(num)

    def _year_to_words(self, year: int) -> str:
        """
        Convert a year to its German spoken form.

        Years 1100-1999 are read in hundreds ("neunzehnhundertvierundneunzig",
        "neunzehnhundert"); every other year is read as a plain cardinal
        ("zweitausendneunzehn", "eintausendsechsundsechzig").

        Args:
            year: Year as integer (e.g., 1994, 2019)

        Returns:
            German spoken form of the year
        """
        if 1100 <= year <= 1999:
            hundreds, remainder = divmod(year, 100)
            spoken = f"{self._number_to_words(hundreds)}hundert"
            if remainder:
                spoken += self._number_to_words(remainder)
            return spoken

        return self._number_to_words(year)

    def _ordinal_to_words(self, num: int) -> str:
        """
        Convert a number to its German ordinal form (ending in "-er").

        Values 1-19 come from the ordinal tables. From 20 upwards the suffix
        is "ster" ("zwanzigster"), except when the last two digits are 1-19,
        in which case that tail takes its own ordinal form
        ("einhunderterster").

        Args:
            num: Integer to convert to ordinal

        Returns:
            German ordinal word
        """
        if num < 1:
            return self._number_to_words(num) + "ter"

        # Handle 1-9
        if num < 10:
            return self.ORDINAL_ONES.get(num, self._number_to_words(num) + "ter")

        # Handle 10-19
        if num < 20:
            return self.ORDINAL_TEENS.get(num, self._number_to_words(num) + "ter")

        tail = num % 100
        if 1 <= tail < 20:
            # e.g. 101 -> "einhundert" + "erster"
            return self._number_to_words(num - tail) + self._ordinal_to_words(tail)

        # 20 and above (tail 0 or 20-99): suffix is "ster", not "ter"
        return self._number_to_words(num) + "ster"

    def _process_decimal(self, match: re.Match) -> str:
        """
        Process decimal numbers like "3,1415" -> "drei komma eins vier eins fünf"

        Args:
            match: Regex match object containing the decimal number

        Returns:
            Spoken form of the decimal number
        """
        integer_digits, _, fraction_digits = match.group(0).partition(',')

        # Integer part is read as a normal cardinal
        words = [self._number_to_words(int(integer_digits) if integer_digits else 0)]

        # Decimal part - read digit by digit (including zeros)
        if fraction_digits:
            words.append("komma")
            words.extend(self.DIGITS[int(digit)] for digit in fraction_digits)

        return " ".join(words)

    def _process_date(self, match: re.Match) -> str:
        """
        Process dates in various formats:
        - "20.11.2019" -> "zwanzigster elfter zweitausendneunzehn"
        - "1. Jan. 1994" -> "erster Januar neunzehnhundertvierundneunzig"
        - "5.12." -> "fünfter zwölfter"

        Text that only looks like a date (day outside 1-31, month outside
        1-12, or a word that is not a month name/abbreviation) is returned
        unchanged so the later number steps can handle it.

        Args:
            match: Regex match object containing the date

        Returns:
            Spoken form of the date, or the matched text unchanged
        """
        date_str = match.group(0)

        # Pattern 1: DD.MM.YYYY or D.M.YYYY
        numeric = self._NUMERIC_DATE_RE.match(date_str)
        if numeric:
            day, month, year = (int(part) for part in numeric.groups())
            if not (1 <= day <= 31 and 1 <= month <= 12):
                return date_str
            return (
                f"{self._ordinal_to_words(day)} "
                f"{self._ordinal_to_words(month)} "
                f"{self._year_to_words(year)}"
            )

        # Pattern 2: D. Mon. YYYY or DD. Month YYYY
        named = self._NAMED_DATE_RE.match(date_str)
        if named:
            day = int(named.group(1))
            month_word = self._MONTH_LOOKUP.get(named.group(2).lower())
            if month_word is None or not 1 <= day <= 31:
                return date_str
            year = int(named.group(3))
            return (
                f"{self._ordinal_to_words(day)} {month_word} "
                f"{self._year_to_words(year)}"
            )

        # Pattern 3: Just DD.MM. or D.M. (without year)
        short = self._SHORT_DATE_RE.match(date_str)
        if short:
            day, month = int(short.group(1)), int(short.group(2))
            if not (1 <= day <= 31 and 1 <= month <= 12):
                return date_str
            return f"{self._ordinal_to_words(day)} {self._ordinal_to_words(month)}"

        return date_str

    def _process_standalone_number(self, match: re.Match) -> str:
        """
        Process standalone cardinal numbers.

        Args:
            match: Regex match object containing the number

        Returns:
            Spoken form of the number
        """
        return self._number_to_words(int(match.group(0)))

    def preprocess(self, text: str) -> str:
        """
        Main preprocessing function that applies all transformations.

        Args:
            text: Input German text

        Returns:
            Preprocessed text with numbers, dates, etc. converted to spoken form
        """
        # Order matters! More specific patterns first

        # 0. Spell out a leading minus sign ("-5 Grad" -> "minus 5 Grad").
        #    The digit patterns below never include the sign themselves.
        text = self._LEADING_MINUS_RE.sub("minus ", text)

        # 1. Process dates (must come before decimal and integer processing)
        # Pattern: DD.MM.YYYY or D.M.YYYY
        text = re.sub(
            r'\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b',
            self._process_date,
            text
        )

        # Pattern: D. Month YYYY or DD. Mon. YYYY
        text = re.sub(
            r'\b(\d{1,2})\.\s*([A-Za-zäöüÄÖÜ]+)\.?\s*(\d{4})\b',
            self._process_date,
            text
        )

        # Pattern: DD.MM. or D.M.
        text = re.sub(
            r'\b(\d{1,2})\.(\d{1,2})\.',
            self._process_date,
            text
        )

        # 2. Process decimal numbers (before integers)
        # Pattern: number,digits (e.g., 3,1415 or 0,5)
        text = re.sub(
            r'\b\d+,\d+\b',
            self._process_decimal,
            text
        )

        # 3. Process standalone integers (cardinal numbers)
        # This will catch remaining numbers not processed by date/decimal patterns
        text = re.sub(
            r'\b\d+\b',
            self._process_standalone_number,
            text
        )

        # 4. Clean up any extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text
356
+
357
+
358
+ # Convenience function for easy import and use
359
def preprocess_german_text(text: str) -> str:
    """
    Normalize German text with a freshly created ``GermanTextPreprocessor``.

    Shortcut for callers that do not want to manage a preprocessor instance
    themselves.

    Args:
        text: Input German text

    Returns:
        The result of ``GermanTextPreprocessor.preprocess`` for ``text``
    """
    return GermanTextPreprocessor().preprocess(text)
371
+
372
+
373
# Example usage and testing: running this module directly prints each sample
# sentence next to its preprocessed form.
if __name__ == "__main__":
    sample_sentences = (
        "Die Zahl ist 3",
        "Heute ist der 20.11.2019",
        "Geboren am 1. Jan. 1994",
        "Pi ist ungefähr 3,1415",
        "Es sind 42 Studenten in der Klasse",
        "Das Jahr 2023 war interessant",
        "Der Preis beträgt 19,99 Euro",
        "Am 5.12. ist Nikolaus",
        "Die Temperatur ist -5 Grad",
        "Es gibt 1000000 Möglichkeiten",
        "Im Jahr 1789 begann die Revolution",
    )
    normalizer = GermanTextPreprocessor()

    print("German Text Preprocessing Examples:")
    print("=" * 80)
    for sentence in sample_sentences:
        # Convert before printing so a failure shows no partial entry.
        spoken = normalizer.preprocess(sentence)
        print(f"Input: {sentence}")
        print(f"Output: {spoken}")
        print("-" * 80)